arm: vp9itxfm16: Make the larger core transforms standalone functions

This work is sponsored by, and copyright, Google.

This reduces the code size of libavcodec/arm/vp9itxfm_16bpp_neon.o from
17500 to 14516 bytes.

This gives a small slowdown of a couple of tens of cycles, up to around
150 cycles for the full case of the largest transform, but makes
it more feasible to add more optimized versions of these transforms.

Before:                                 Cortex A7       A8       A9      A53
vp9_inv_dct_dct_16x16_sub4_add_10_neon:    4237.4   3561.5   3971.8   2525.3
vp9_inv_dct_dct_16x16_sub16_add_10_neon:   6371.9   5452.0   5779.3   3910.5
vp9_inv_dct_dct_32x32_sub4_add_10_neon:   22068.8  17867.5  19555.2  13871.6
vp9_inv_dct_dct_32x32_sub32_add_10_neon:  37268.9  38684.2  32314.2  23969.0

After:                                  Cortex A7       A8       A9      A53
vp9_inv_dct_dct_16x16_sub4_add_10_neon:    4375.1   3571.9   4283.8   2567.2
vp9_inv_dct_dct_16x16_sub16_add_10_neon:   6415.6   5578.9   5844.6   3948.3
vp9_inv_dct_dct_32x32_sub4_add_10_neon:   22653.7  18079.7  19603.7  13905.3
vp9_inv_dct_dct_32x32_sub32_add_10_neon:  37593.2  38862.2  32235.8  24070.9

Signed-off-by: Martin Storsjö <martin@martin.st>
2017-02-24 16:02:23 +02:00 · 2017-02-24 16:02:23 +02:00 · 0ea603203d
commit 0ea603203d
parent b76533f105
1 changed files with 27 additions and 16 deletions
--- a/libavcodec/arm/vp9itxfm_16bpp_neon.S
+++ b/libavcodec/arm/vp9itxfm_16bpp_neon.S
@ -807,7 +807,7 @@ function idct16x16_dc_add_neon
 endfunc
 .ltorg
-.macro idct16
+function idct16
        mbutterfly0     d16, d24, d16, d24, d8, d10, q4,  q5 @ d16 = t0a,  d24 = t1a
        mbutterfly      d20, d28, d1[0], d1[1], q4,  q5  @ d20 = t2a,  d28 = t3a
        mbutterfly      d18, d30, d2[0], d2[1], q4,  q5  @ d18 = t4a,  d30 = t7a
@ -853,9 +853,10 @@ endfunc
        vmov            d8,  d21                         @ d8  = t10a
        butterfly       d20, d27, d10, d27               @ d20 = out[4], d27 = out[11]
        butterfly       d21, d26, d26, d8                @ d21 = out[5], d26 = out[10]
-.endm
+        bx              lr
 endfunc
-.macro iadst16
+function iadst16
        movrel          r12, iadst16_coeffs
        vld1.16         {q0},  [r12,:128]!
        vmovl.s16       q1,  d1
@ -933,7 +934,8 @@ endfunc
        vmov            d16, d2
        vmov            d30, d4
-.endm
+        bx              lr
 endfunc
 .macro itxfm16_1d_funcs txfm
@ Read a vertical 2x16 slice out of a 16x16 matrix, do a transform on it,
@ -941,6 +943,8 @@ endfunc
@ r0 = dst (temp buffer)
@ r2 = src
 function \txfm\()16_1d_2x16_pass1_neon
        push            {lr}
        mov             r12, #64
        vmov.s32        q4,  #0
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
@ -948,7 +952,7 @@ function \txfm\()16_1d_2x16_pass1_neon
        vst1.32         {d8},  [r2,:64], r12
 .endr
-        \txfm\()16
+        bl              \txfm\()16
        @ Do eight 2x2 transposes. Originally, d16-d31 contain the
        @ 16 rows. Afterwards, d16-d17, d18-d19 etc contain the eight
@ -959,7 +963,7 @@ function \txfm\()16_1d_2x16_pass1_neon
 .irp i, 16, 18, 20, 22, 24, 26, 28, 30, 17, 19, 21, 23, 25, 27, 29, 31
        vst1.32         {d\i}, [r0,:64]!
 .endr
-        bx              lr
+        pop             {pc}
 endfunc
@ Read a vertical 2x16 slice out of a 16x16 matrix, do a transform on it,
@ -968,6 +972,8 @@ endfunc
@ r1 = dst stride
@ r2 = src (temp buffer)
 function \txfm\()16_1d_2x16_pass2_neon
        push            {lr}
        mov             r12, #64
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        vld1.16         {d\i}, [r2,:64], r12
@ -975,7 +981,7 @@ function \txfm\()16_1d_2x16_pass2_neon
        add             r3,  r0,  r1
        lsl             r1,  r1,  #1
-        \txfm\()16
+        bl              \txfm\()16
 .macro load_add_store coef0, coef1, coef2, coef3
        vrshr.s32       \coef0, \coef0, #6
@ -1019,7 +1025,7 @@ function \txfm\()16_1d_2x16_pass2_neon
        load_add_store  q12, q13, q14, q15
 .purgem load_add_store
-        bx              lr
+        pop             {pc}
 endfunc
 .endm
@ -1193,7 +1199,7 @@ function idct32x32_dc_add_neon
        pop             {r4-r9,pc}
 endfunc
-.macro idct32_odd
+function idct32_odd
        movrel          r12, idct_coeffs
        @ Overwrite the idct16 coeffs with the stored ones for idct32
@ -1262,7 +1268,8 @@ endfunc
        mbutterfly0     d26, d21, d26, d21, d8, d10, q4, q5 @ d26 = t26a, d21 = t21a
        mbutterfly0     d25, d22, d25, d22, d8, d10, q4, q5 @ d25 = t25,  d22 = t22
        mbutterfly0     d24, d23, d24, d23, d8, d10, q4, q5 @ d24 = t24a, d23 = t23a
-.endm
+        bx              lr
 endfunc
@ Do a 32-point IDCT of a 2x32 slice out of a 32x32 matrix.
@ We don't have register space to do a single pass IDCT of 2x32 though,
@ -1274,6 +1281,8 @@ endfunc
@ r1 = unused
@ r2 = src
 function idct32_1d_2x32_pass1_neon
        push            {lr}
        @ Double stride of the input, since we only read every other line
        mov             r12, #256
        vmov.s32        d8,  #0
@ -1284,7 +1293,7 @@ function idct32_1d_2x32_pass1_neon
        vst1.32         {d8},  [r2,:64], r12
 .endr
-        idct16
+        bl              idct16
        @ Do eight 2x2 transposes. Originally, d16-d31 contain the
        @ 16 rows. Afterwards, d16-d17, d18-d19 etc contain the eight
@ -1319,7 +1328,7 @@ function idct32_1d_2x32_pass1_neon
        vst1.16         {d8},  [r2,:64], r12
 .endr
-        idct32_odd
+        bl              idct32_odd
        transpose32_8x_2x2 d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16
@ -1343,7 +1352,7 @@ function idct32_1d_2x32_pass1_neon
        store_rev       31, 29, 27, 25, 23, 21, 19, 17
        store_rev       30, 28, 26, 24, 22, 20, 18, 16
 .purgem store_rev
-        bx              lr
+        pop             {pc}
 endfunc
 .ltorg
@ -1354,6 +1363,8 @@ endfunc
@ r1 = dst stride
@ r2 = src (temp buffer)
 function idct32_1d_2x32_pass2_neon
        push            {lr}
        mov             r12, #256
        @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
@ -1361,7 +1372,7 @@ function idct32_1d_2x32_pass2_neon
 .endr
        sub             r2,  r2,  r12, lsl #4
-        idct16
+        bl              idct16
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        vst1.32         {d\i}, [r2,:64], r12
@ -1377,7 +1388,7 @@ function idct32_1d_2x32_pass2_neon
        sub             r2,  r2,  r12, lsl #4
        sub             r2,  r2,  #128
-        idct32_odd
+        bl              idct32_odd
        @ Narrow the idct16 coefficients in q0-q3 into q0-q1, to
        @ allow clobbering q2-q3 below.
@ -1439,7 +1450,7 @@ function idct32_1d_2x32_pass2_neon
        vmovl.s16       q3,  d3
        vmovl.s16       q1,  d1
        vmovl.s16       q0,  d0
-        bx              lr
+        pop             {pc}
 endfunc
 const min_eob_idct_idct_32, align=4