arm: vp9itxfm16: Make the larger core transforms standalone functions
This work is sponsored by, and copyright, Google.

This reduces the code size of libavcodec/arm/vp9itxfm_16bpp_neon.o from 17500 to 14516 bytes.

This gives a small slowdown of a couple tens of cycles, up to around 150 cycles for the full case of the largest transform, but makes it more feasible to add more optimized versions of these transforms.

Before:                                    Cortex A7       A8       A9      A53
vp9_inv_dct_dct_16x16_sub4_add_10_neon:        4237.4   3561.5   3971.8   2525.3
vp9_inv_dct_dct_16x16_sub16_add_10_neon:       6371.9   5452.0   5779.3   3910.5
vp9_inv_dct_dct_32x32_sub4_add_10_neon:       22068.8  17867.5  19555.2  13871.6
vp9_inv_dct_dct_32x32_sub32_add_10_neon:      37268.9  38684.2  32314.2  23969.0

After:
vp9_inv_dct_dct_16x16_sub4_add_10_neon:        4375.1   3571.9   4283.8   2567.2
vp9_inv_dct_dct_16x16_sub16_add_10_neon:       6415.6   5578.9   5844.6   3948.3
vp9_inv_dct_dct_32x32_sub4_add_10_neon:       22653.7  18079.7  19603.7  13905.3
vp9_inv_dct_dct_32x32_sub32_add_10_neon:      37593.2  38862.2  32235.8  24070.9

Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
parent
b76533f105
commit
0ea603203d
@ -807,7 +807,7 @@ function idct16x16_dc_add_neon
|
|||||||
endfunc
|
endfunc
|
||||||
.ltorg
|
.ltorg
|
||||||
|
|
||||||
.macro idct16
|
function idct16
|
||||||
mbutterfly0 d16, d24, d16, d24, d8, d10, q4, q5 @ d16 = t0a, d24 = t1a
|
mbutterfly0 d16, d24, d16, d24, d8, d10, q4, q5 @ d16 = t0a, d24 = t1a
|
||||||
mbutterfly d20, d28, d1[0], d1[1], q4, q5 @ d20 = t2a, d28 = t3a
|
mbutterfly d20, d28, d1[0], d1[1], q4, q5 @ d20 = t2a, d28 = t3a
|
||||||
mbutterfly d18, d30, d2[0], d2[1], q4, q5 @ d18 = t4a, d30 = t7a
|
mbutterfly d18, d30, d2[0], d2[1], q4, q5 @ d18 = t4a, d30 = t7a
|
||||||
@ -853,9 +853,10 @@ endfunc
|
|||||||
vmov d8, d21 @ d8 = t10a
|
vmov d8, d21 @ d8 = t10a
|
||||||
butterfly d20, d27, d10, d27 @ d20 = out[4], d27 = out[11]
|
butterfly d20, d27, d10, d27 @ d20 = out[4], d27 = out[11]
|
||||||
butterfly d21, d26, d26, d8 @ d21 = out[5], d26 = out[10]
|
butterfly d21, d26, d26, d8 @ d21 = out[5], d26 = out[10]
|
||||||
.endm
|
bx lr
|
||||||
|
endfunc
|
||||||
|
|
||||||
.macro iadst16
|
function iadst16
|
||||||
movrel r12, iadst16_coeffs
|
movrel r12, iadst16_coeffs
|
||||||
vld1.16 {q0}, [r12,:128]!
|
vld1.16 {q0}, [r12,:128]!
|
||||||
vmovl.s16 q1, d1
|
vmovl.s16 q1, d1
|
||||||
@ -933,7 +934,8 @@ endfunc
|
|||||||
|
|
||||||
vmov d16, d2
|
vmov d16, d2
|
||||||
vmov d30, d4
|
vmov d30, d4
|
||||||
.endm
|
bx lr
|
||||||
|
endfunc
|
||||||
|
|
||||||
.macro itxfm16_1d_funcs txfm
|
.macro itxfm16_1d_funcs txfm
|
||||||
@ Read a vertical 2x16 slice out of a 16x16 matrix, do a transform on it,
|
@ Read a vertical 2x16 slice out of a 16x16 matrix, do a transform on it,
|
||||||
@ -941,6 +943,8 @@ endfunc
|
|||||||
@ r0 = dst (temp buffer)
|
@ r0 = dst (temp buffer)
|
||||||
@ r2 = src
|
@ r2 = src
|
||||||
function \txfm\()16_1d_2x16_pass1_neon
|
function \txfm\()16_1d_2x16_pass1_neon
|
||||||
|
push {lr}
|
||||||
|
|
||||||
mov r12, #64
|
mov r12, #64
|
||||||
vmov.s32 q4, #0
|
vmov.s32 q4, #0
|
||||||
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
|
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
|
||||||
@ -948,7 +952,7 @@ function \txfm\()16_1d_2x16_pass1_neon
|
|||||||
vst1.32 {d8}, [r2,:64], r12
|
vst1.32 {d8}, [r2,:64], r12
|
||||||
.endr
|
.endr
|
||||||
|
|
||||||
\txfm\()16
|
bl \txfm\()16
|
||||||
|
|
||||||
@ Do eight 2x2 transposes. Originally, d16-d31 contain the
|
@ Do eight 2x2 transposes. Originally, d16-d31 contain the
|
||||||
@ 16 rows. Afterwards, d16-d17, d18-d19 etc contain the eight
|
@ 16 rows. Afterwards, d16-d17, d18-d19 etc contain the eight
|
||||||
@ -959,7 +963,7 @@ function \txfm\()16_1d_2x16_pass1_neon
|
|||||||
.irp i, 16, 18, 20, 22, 24, 26, 28, 30, 17, 19, 21, 23, 25, 27, 29, 31
|
.irp i, 16, 18, 20, 22, 24, 26, 28, 30, 17, 19, 21, 23, 25, 27, 29, 31
|
||||||
vst1.32 {d\i}, [r0,:64]!
|
vst1.32 {d\i}, [r0,:64]!
|
||||||
.endr
|
.endr
|
||||||
bx lr
|
pop {pc}
|
||||||
endfunc
|
endfunc
|
||||||
|
|
||||||
@ Read a vertical 2x16 slice out of a 16x16 matrix, do a transform on it,
|
@ Read a vertical 2x16 slice out of a 16x16 matrix, do a transform on it,
|
||||||
@ -968,6 +972,8 @@ endfunc
|
|||||||
@ r1 = dst stride
|
@ r1 = dst stride
|
||||||
@ r2 = src (temp buffer)
|
@ r2 = src (temp buffer)
|
||||||
function \txfm\()16_1d_2x16_pass2_neon
|
function \txfm\()16_1d_2x16_pass2_neon
|
||||||
|
push {lr}
|
||||||
|
|
||||||
mov r12, #64
|
mov r12, #64
|
||||||
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
|
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
|
||||||
vld1.16 {d\i}, [r2,:64], r12
|
vld1.16 {d\i}, [r2,:64], r12
|
||||||
@ -975,7 +981,7 @@ function \txfm\()16_1d_2x16_pass2_neon
|
|||||||
|
|
||||||
add r3, r0, r1
|
add r3, r0, r1
|
||||||
lsl r1, r1, #1
|
lsl r1, r1, #1
|
||||||
\txfm\()16
|
bl \txfm\()16
|
||||||
|
|
||||||
.macro load_add_store coef0, coef1, coef2, coef3
|
.macro load_add_store coef0, coef1, coef2, coef3
|
||||||
vrshr.s32 \coef0, \coef0, #6
|
vrshr.s32 \coef0, \coef0, #6
|
||||||
@ -1019,7 +1025,7 @@ function \txfm\()16_1d_2x16_pass2_neon
|
|||||||
load_add_store q12, q13, q14, q15
|
load_add_store q12, q13, q14, q15
|
||||||
.purgem load_add_store
|
.purgem load_add_store
|
||||||
|
|
||||||
bx lr
|
pop {pc}
|
||||||
endfunc
|
endfunc
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
@ -1193,7 +1199,7 @@ function idct32x32_dc_add_neon
|
|||||||
pop {r4-r9,pc}
|
pop {r4-r9,pc}
|
||||||
endfunc
|
endfunc
|
||||||
|
|
||||||
.macro idct32_odd
|
function idct32_odd
|
||||||
movrel r12, idct_coeffs
|
movrel r12, idct_coeffs
|
||||||
|
|
||||||
@ Overwrite the idct16 coeffs with the stored ones for idct32
|
@ Overwrite the idct16 coeffs with the stored ones for idct32
|
||||||
@ -1262,7 +1268,8 @@ endfunc
|
|||||||
mbutterfly0 d26, d21, d26, d21, d8, d10, q4, q5 @ d26 = t26a, d21 = t21a
|
mbutterfly0 d26, d21, d26, d21, d8, d10, q4, q5 @ d26 = t26a, d21 = t21a
|
||||||
mbutterfly0 d25, d22, d25, d22, d8, d10, q4, q5 @ d25 = t25, d22 = t22
|
mbutterfly0 d25, d22, d25, d22, d8, d10, q4, q5 @ d25 = t25, d22 = t22
|
||||||
mbutterfly0 d24, d23, d24, d23, d8, d10, q4, q5 @ d24 = t24a, d23 = t23a
|
mbutterfly0 d24, d23, d24, d23, d8, d10, q4, q5 @ d24 = t24a, d23 = t23a
|
||||||
.endm
|
bx lr
|
||||||
|
endfunc
|
||||||
|
|
||||||
@ Do a 32-point IDCT of a 2x32 slice out of a 32x32 matrix.
|
@ Do a 32-point IDCT of a 2x32 slice out of a 32x32 matrix.
|
||||||
@ We don't have register space to do a single pass IDCT of 2x32 though,
|
@ We don't have register space to do a single pass IDCT of 2x32 though,
|
||||||
@ -1274,6 +1281,8 @@ endfunc
|
|||||||
@ r1 = unused
|
@ r1 = unused
|
||||||
@ r2 = src
|
@ r2 = src
|
||||||
function idct32_1d_2x32_pass1_neon
|
function idct32_1d_2x32_pass1_neon
|
||||||
|
push {lr}
|
||||||
|
|
||||||
@ Double stride of the input, since we only read every other line
|
@ Double stride of the input, since we only read every other line
|
||||||
mov r12, #256
|
mov r12, #256
|
||||||
vmov.s32 d8, #0
|
vmov.s32 d8, #0
|
||||||
@ -1284,7 +1293,7 @@ function idct32_1d_2x32_pass1_neon
|
|||||||
vst1.32 {d8}, [r2,:64], r12
|
vst1.32 {d8}, [r2,:64], r12
|
||||||
.endr
|
.endr
|
||||||
|
|
||||||
idct16
|
bl idct16
|
||||||
|
|
||||||
@ Do eight 2x2 transposes. Originally, d16-d31 contain the
|
@ Do eight 2x2 transposes. Originally, d16-d31 contain the
|
||||||
@ 16 rows. Afterwards, d16-d17, d18-d19 etc contain the eight
|
@ 16 rows. Afterwards, d16-d17, d18-d19 etc contain the eight
|
||||||
@ -1319,7 +1328,7 @@ function idct32_1d_2x32_pass1_neon
|
|||||||
vst1.16 {d8}, [r2,:64], r12
|
vst1.16 {d8}, [r2,:64], r12
|
||||||
.endr
|
.endr
|
||||||
|
|
||||||
idct32_odd
|
bl idct32_odd
|
||||||
|
|
||||||
transpose32_8x_2x2 d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16
|
transpose32_8x_2x2 d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16
|
||||||
|
|
||||||
@ -1343,7 +1352,7 @@ function idct32_1d_2x32_pass1_neon
|
|||||||
store_rev 31, 29, 27, 25, 23, 21, 19, 17
|
store_rev 31, 29, 27, 25, 23, 21, 19, 17
|
||||||
store_rev 30, 28, 26, 24, 22, 20, 18, 16
|
store_rev 30, 28, 26, 24, 22, 20, 18, 16
|
||||||
.purgem store_rev
|
.purgem store_rev
|
||||||
bx lr
|
pop {pc}
|
||||||
endfunc
|
endfunc
|
||||||
.ltorg
|
.ltorg
|
||||||
|
|
||||||
@ -1354,6 +1363,8 @@ endfunc
|
|||||||
@ r1 = dst stride
|
@ r1 = dst stride
|
||||||
@ r2 = src (temp buffer)
|
@ r2 = src (temp buffer)
|
||||||
function idct32_1d_2x32_pass2_neon
|
function idct32_1d_2x32_pass2_neon
|
||||||
|
push {lr}
|
||||||
|
|
||||||
mov r12, #256
|
mov r12, #256
|
||||||
@ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
|
@ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
|
||||||
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
|
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
|
||||||
@ -1361,7 +1372,7 @@ function idct32_1d_2x32_pass2_neon
|
|||||||
.endr
|
.endr
|
||||||
sub r2, r2, r12, lsl #4
|
sub r2, r2, r12, lsl #4
|
||||||
|
|
||||||
idct16
|
bl idct16
|
||||||
|
|
||||||
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
|
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
|
||||||
vst1.32 {d\i}, [r2,:64], r12
|
vst1.32 {d\i}, [r2,:64], r12
|
||||||
@ -1377,7 +1388,7 @@ function idct32_1d_2x32_pass2_neon
|
|||||||
sub r2, r2, r12, lsl #4
|
sub r2, r2, r12, lsl #4
|
||||||
sub r2, r2, #128
|
sub r2, r2, #128
|
||||||
|
|
||||||
idct32_odd
|
bl idct32_odd
|
||||||
|
|
||||||
@ Narrow the idct16 coefficients in q0-q3 into q0-q1, to
|
@ Narrow the idct16 coefficients in q0-q3 into q0-q1, to
|
||||||
@ allow clobbering q2-q3 below.
|
@ allow clobbering q2-q3 below.
|
||||||
@ -1439,7 +1450,7 @@ function idct32_1d_2x32_pass2_neon
|
|||||||
vmovl.s16 q3, d3
|
vmovl.s16 q3, d3
|
||||||
vmovl.s16 q1, d1
|
vmovl.s16 q1, d1
|
||||||
vmovl.s16 q0, d0
|
vmovl.s16 q0, d0
|
||||||
bx lr
|
pop {pc}
|
||||||
endfunc
|
endfunc
|
||||||
|
|
||||||
const min_eob_idct_idct_32, align=4
|
const min_eob_idct_idct_32, align=4
|
||||||
|
Loading…
x
Reference in New Issue
Block a user