292 lines
		
	
	
		
			6.8 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
			
		
		
	
	
			292 lines
		
	
	
		
			6.8 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
/*
 | 
						|
 * Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com>
 | 
						|
 *
 | 
						|
 * This file is part of FFmpeg.
 | 
						|
 *
 | 
						|
 * FFmpeg is free software; you can redistribute it and/or
 | 
						|
 * modify it under the terms of the GNU Lesser General Public
 | 
						|
 * License as published by the Free Software Foundation; either
 | 
						|
 * version 2.1 of the License, or (at your option) any later version.
 | 
						|
 *
 | 
						|
 * FFmpeg is distributed in the hope that it will be useful,
 | 
						|
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
						|
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | 
						|
 * Lesser General Public License for more details.
 | 
						|
 *
 | 
						|
 * You should have received a copy of the GNU Lesser General Public
 | 
						|
 * License along with FFmpeg; if not, write to the Free Software
 | 
						|
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 | 
						|
 */
 | 
						|
 | 
						|
#include "libavutil/arm/asm.S"
 | 
						|
 | 
						|
/*
 * alias: bind or release a symbolic register name.
 *   name  symbolic name to define (or to drop)
 *   tgt   register, or existing register name, to bind it to
 *   set   non-zero (default): define name with .req
 *         zero: remove name with .unreq (tgt is not used)
 */
.macro alias name, tgt, set=1
.if \set == 0
    .unreq  \name
.else
    \name   .req    \tgt
.endif
.endm
 | 
						|
 | 
						|
/*
 * Define qN_l / qN_h as names for the two D registers that make up each
 * Q register: qN_l = d(2N), qN_h = d(2N+1), for N = 0..15.
 *
 * alias_dw_all defines the pair for one Q register and then recurses with
 * the Q index advanced by one and both D indices advanced by two, stopping
 * after q15. The %(...) arguments are evaluated to numbers before being
 * substituted, which requires .altmacro mode; the mode is switched back
 * off once the table has been generated.
 */
.altmacro

.macro alias_dw_all qw, dw_l, dw_h
    alias   q\qw\()_h, d\dw_h
    alias   q\qw\()_l, d\dw_l
    .if \qw < 15
        alias_dw_all  %(\qw + 1), %(\dw_l + 2), %(\dw_h + 2)
    .endif
.endm

alias_dw_all    0, 0, 1

.noaltmacro
 | 
						|
 | 
						|
/*
 * alias_qw: bind (set != 0, default) or release (set == 0) three names at
 * once for a Q register:
 *   name     -> qw
 *   name_l   -> qw_l
 *   name_h   -> qw_h
 * qw must be a name for which the _l and _h suffixed names already exist.
 */
.macro alias_qw     name, qw, set=1
    alias   \name\()_l, \qw\()_l, \set
    alias   \name\()_h, \qw\()_h, \set
    alias   \name\(), \qw, \set
.endm
 | 
						|
 | 
						|
/*
 * prologue: save r4-r12 and lr (10 words) followed by q4-q7, written here
 * as d8-d15 (64 bytes). load_arg relies on this exact frame size.
 */
.macro prologue
    push            {r4-r12, lr}
    vpush           {d8-d15}
.endm
 | 
						|
 | 
						|
/*
 * epilogue: restore q4-q7 (written here as d8-d15) and r4-r12, and return
 * by popping the saved lr straight into pc. Mirrors prologue.
 */
.macro epilogue
    vpop            {d8-d15}
    pop             {r4-r12, pc}
.endm
 | 
						|
 | 
						|
/*
 * load_arg: load stack-passed argument number ix (ix >= 4) into reg.
 * Only valid after prologue: the offset skips the 10 core registers
 * (10 * 4 bytes) and the 4 Q registers (4 * 16 bytes) saved there, then
 * indexes 4-byte slots starting at argument 4.
 */
.macro  load_arg    reg, ix
    ldr     \reg,   [sp, #(10 * 4 + 4 * 16 + 4 * (\ix - 4))]
.endm
 | 
						|
 | 
						|
 | 
						|
/* ()_to_()_neon(const uint8_t *src, uint8_t *y, uint8_t *chroma,
 *                  int width, int height,
 *                  int y_stride, int c_stride, int src_stride,
 *                  int32_t coeff_table[9]);
 */
 | 
						|
/*
 * alias_loop_420sp: bind (set != 0, default) or release (set == 0) the
 * core-register names used by loop_420sp.
 *
 * Several names deliberately share a register:
 *   src0 = src (r0), y0 = y (r1), header = width (r3),
 *   c_padding = c_stride (r6), coeff_table = y1 (r12).
 * Names that refer to another name are bound after the name they refer to.
 */
.macro  alias_loop_420sp set=1
    /* r0-r3: register-passed arguments */
    alias   src,        r0, \set
    alias   y,          r1, \set
    alias   chroma,     r2, \set
    alias   width,      r3, \set

    /* r4-r7: arguments fetched from the stack */
    alias   height,     r4, \set
    alias   y_stride,   r5, \set
    alias   c_stride,   r6, \set
    alias   src_stride, r7, \set

    /* r8-r12: values derived inside the loop */
    alias   y0_end,     r8, \set
    alias   src_padding,r9, \set
    alias   y_padding,  r10, \set
    alias   src1,       r11, \set
    alias   y1,         r12, \set
    alias   coeff_table,r12, \set

    /* second names for registers bound above */
    alias   src0,       src, \set
    alias   y0,         y, \set
    alias   header,     width, \set
    alias   c_padding,  c_stride, \set
.endm
 | 
						|
 | 
						|
 | 
						|
/*
 * loop_420sp: emit the exported function
 *     <s_fmt>_to_<d_fmt>_neon_<precision>
 * which walks the image two rows per iteration and invokes a kernel macro
 * on each chunk of a row pair.
 *
 * Macro parameters:
 *   s_fmt, d_fmt  format names; used in the symbol name and forwarded to
 *                 the kernel
 *   init          macro invoked once with the coeff_table register name
 *   kernel        macro invoked as
 *                     kernel s_fmt, d_fmt, src0, src1, y0, y1, chroma[, count]
 *                 The optional count is only passed for the first chunk of a
 *                 row pair, when width is not a multiple of 16.
 *   precision     suffix of the symbol name
 *
 * Register roles are those set up by alias_loop_420sp. Callee-saved state is
 * handled by prologue / epilogue.
 *
 * NOTE(review): the full-width kernel at label 1 always runs at least once
 * per row pair, so this presumably requires width >= 16 - confirm against
 * the C caller.
 */
.macro  loop_420sp s_fmt, d_fmt, init, kernel, precision

function \s_fmt\()_to_\d_fmt\()_neon_\precision, export=1
    prologue

    alias_loop_420sp

    /* Arguments 4..8 are passed on the stack. */
    load_arg    height,         4
    load_arg    y_stride,       5
    load_arg    c_stride,       6
    load_arg    src_stride,     7
    load_arg    coeff_table,    8

    /* coeff_table shares r12 with y1, so it must be consumed before y1 is set. */
    \init       coeff_table

    /* Distance from the end of one row to the start of the next.
     * c_padding overwrites c_stride (same register); the source row is
     * width * 4 bytes long. */
    sub         y_padding,      y_stride,       width
    sub         c_padding,      c_stride,       width
    sub         src_padding,    src_stride,     width, LSL #2

    add         y0_end,         y0,             width
    /* header overwrites width (same register): width modulo 16. */
    and         header,         width,          #15

    /* Second row of the pair. */
    add         y1,             y0,             y_stride
    add         src1,           src0,           src_stride

0:
    /* Leading partial chunk, only when width is not a multiple of 16. */
    cmp         header,     #0
    beq         1f

    \kernel     \s_fmt, \d_fmt, src0, src1, y0, y1, chroma, header

1:
    \kernel     \s_fmt, \d_fmt, src0, src1, y0, y1, chroma

    /* y0 and y0_end are addresses, so the comparison must be unsigned:
     * a signed test ends the row early when the row straddles 0x80000000. */
    cmp         y0,         y0_end
    blo         1b
2:
    /* Advance to the next row pair. y1 and src1 now point at the end of the
     * second row, so adding the padding yields the start of the row after. */
    add         y0,         y1,         y_padding
    add         y0_end,     y1,         y_stride
    add         chroma,     chroma,     c_padding
    add         src0,       src1,       src_padding

    add         y1,         y0,         y_stride
    add         src1,       src0,       src_stride

    /* Two rows consumed per iteration. */
    subs        height,     height,     #2

    bgt         0b

    epilogue

    alias_loop_420sp 0

endfunc
.endm
 | 
						|
 | 
						|
/*
 * downsample: for each of the r, g and b channels, add horizontally
 * adjacent unsigned bytes of the 8x16 register pairwise and write the
 * widened sums to the matching 16x8 register (previous contents are
 * overwritten). The register names must be bound by the caller.
 */
.macro downsample
    vpaddl.u8   r16x8,  r8x16
    vpaddl.u8   g16x8,  g8x16
    vpaddl.u8   b16x8,  b8x16
.endm
 | 
						|
 | 
						|
 | 
						|
/* accumulate and right shift by 2 */
 | 
						|
/*
 * downsample_ars2: for each of the r, g and b channels, add the pairwise
 * sums of adjacent unsigned bytes of the 8x16 register onto the existing
 * contents of the 16x8 register, then apply a rounding right shift by 2
 * to every 16-bit lane. The register names must be bound by the caller.
 */
.macro downsample_ars2
    /* accumulate pairwise byte sums */
    vpadal.u8   r16x8,  r8x16
    vpadal.u8   g16x8,  g8x16
    vpadal.u8   b16x8,  b8x16

    /* rounding shift right by 2 */
    vrshr.u16   r16x8,  r16x8,  #2
    vrshr.u16   g16x8,  g16x8,  #2
    vrshr.u16   b16x8,  b16x8,  #2
.endm
 | 
						|
 | 
						|
/*
 * store_y8_16x1: store the y8x16 register (16 bytes) at dst.
 *   dst    core register holding the destination address; updated
 *   count  optional core register. When given, dst advances by count
 *          instead of by the 16 bytes written.
 */
.macro store_y8_16x1            dst, count
.ifnc "\count",""
    vstmia      \dst,   {y8x16}
    add         \dst,   \dst,           \count
.else
    vstmia      \dst!,  {y8x16}
.endif
.endm
 | 
						|
 | 
						|
/*
 * store_chroma_nv12_8x1: store u8x8 and v8x8 interleaved, u first, at dst.
 *   dst    core register holding the destination address; updated
 *   count  optional core register. When given, dst is post-incremented by
 *          count instead of by the number of bytes written.
 */
.macro store_chroma_nv12_8x1    dst, count
.ifnc "\count",""
    vst2.i8     {u8x8, v8x8},   [\dst], \count
.else
    vst2.i8     {u8x8, v8x8},   [\dst]!
.endif
.endm
 | 
						|
 | 
						|
/*
 * store_chroma_nv21_8x1: store v8x8 and u8x8 interleaved, v first, at dst.
 *   dst    core register holding the destination address; updated
 *   count  optional core register. When given, dst is post-incremented by
 *          count instead of by the number of bytes written.
 */
.macro store_chroma_nv21_8x1    dst, count
.ifnc "\count",""
    vst2.i8     {v8x8, u8x8},   [\dst], \count
.else
    vst2.i8     {v8x8, u8x8},   [\dst]!
.endif
.endm
 | 
						|
 | 
						|
/*
 * load_8888_16x1: load 16 four-byte pixels from src, de-interleaving the
 * four bytes of each pixel into the registers <a>8x16, <b>8x16, <c>8x16
 * and <d>8x16 (low halves from the first 8 pixels, high halves from the
 * next 8).
 *   a, b, c, d  channel name prefixes, in memory byte order
 *   src         core register holding the source address; updated
 *   count       optional core register. When given, src ends up advanced
 *               by count * 4 bytes instead of by the 64 bytes read.
 */
.macro load_8888_16x1   a, b, c, d, src, count
    /* first 8 pixels: identical in both variants */
    vld4.8      {\a\()8x16_l, \b\()8x16_l, \c\()8x16_l, \d\()8x16_l},  [\src]!
.ifc "\count",""
    vld4.8      {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h},  [\src]!
.else
    vld4.8      {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h},  [\src]
    /* undo the 32-byte writeback of the first load, then step by count pixels */
    sub         \src,   \src,   #32
    add         \src,   \src,   \count, LSL #2
.endif
.endm
 | 
						|
 | 
						|
/*
 * load_rgbx_16x1: load_8888_16x1 with the byte order r, g, b, x.
 * src and the optional count are forwarded unchanged.
 */
.macro load_rgbx_16x1   src, count
    load_8888_16x1  r, g, b, x, \src, \count
.endm
 | 
						|
 | 
						|
/*
 * load_bgrx_16x1: load_8888_16x1 with the byte order b, g, r, x.
 * src and the optional count are forwarded unchanged.
 */
.macro load_bgrx_16x1   src, count
    load_8888_16x1  b, g, r, x, \src, \count
.endm
 | 
						|
 | 
						|
/*
 * alias_src_rgbx: bind (set != 0, default) or release (set == 0) the
 * source channel register names for the byte order r, g, b, x.
 */
.macro alias_src_rgbx   set=1
    alias_src_8888  r, g, b, x, \set
.endm
 | 
						|
 | 
						|
/*
 * alias_src_bgrx: bind (set != 0, default) or release (set == 0) the
 * source channel register names for the byte order b, g, r, x.
 */
.macro alias_src_bgrx   set=1
    alias_src_8888  b, g, r, x, \set
.endm
 | 
						|
 | 
						|
/*
 * alias_dst_nv12: bind (set != 0, default) or release (set == 0) the
 * chroma output names with u in the low half and v in the high half of
 * c8x8x2. c8x8x2_l / c8x8x2_h must be bound by the caller.
 */
.macro alias_dst_nv12   set=1
    alias   v8x8, c8x8x2_h, \set
    alias   u8x8, c8x8x2_l, \set
.endm
 | 
						|
 | 
						|
/*
 * alias_dst_nv21: bind (set != 0, default) or release (set == 0) the
 * chroma output names with v in the low half and u in the high half of
 * c8x8x2. c8x8x2_l / c8x8x2_h must be bound by the caller.
 */
.macro alias_dst_nv21   set=1
    alias   u8x8, c8x8x2_h, \set
    alias   v8x8, c8x8x2_l, \set
.endm
 | 
						|
 | 
						|
 | 
						|
// common aliases

/*
 * d0, d1, d2: one D register per input channel (R, G, B). Lanes 0, 1, 2 of
 * each, viewed as signed 16-bit scalars, are named for the Y, U and V
 * contribution of that channel.
 */
alias   CO_R,   d0
CO_RY   .dn     d0.s16[0]
CO_RU   .dn     d0.s16[1]
CO_RV   .dn     d0.s16[2]

alias   CO_G,   d1
CO_GY   .dn     d1.s16[0]
CO_GU   .dn     d1.s16[1]
CO_GV   .dn     d1.s16[2]

alias   CO_B,   d2
CO_BY   .dn     d2.s16[0]
CO_BU   .dn     d2.s16[1]
CO_BV   .dn     d2.s16[2]

/* BIAS_U and BIAS_V are two names for the same register, d3. */
alias   BIAS_U, d3
alias   BIAS_V, BIAS_U

alias   BIAS_Y, q2
 | 
						|
 | 
						|
 | 
						|
/* q3-q6 R8G8B8X8 x16 */
 | 
						|
 | 
						|
/*
 * alias_src_8888: bind (set != 0) or release (set == 0) the names
 * <a>8x16, <b>8x16, <c>8x16, <d>8x16, together with their _l / _h halves,
 * to q3, q4, q5 and q6 respectively.
 *   a, b, c, d  channel name prefixes, in memory byte order
 *   set         forwarded to alias_qw; has no default here
 */
.macro alias_src_8888   a, b, c, d, set
    alias_qw  \a\()8x16, q3, \set
    alias_qw  \b\()8x16, q4, \set
    alias_qw  \c\()8x16, q5, \set
    alias_qw  \d\()8x16, q6, \set
.endm
 | 
						|
 | 
						|
/*
 * kernel_420_16x2: convert a block of 16 pixels on each of two rows into
 * 16 luma bytes per row plus one row of 8 interleaved chroma pairs.
 *
 *   rgb_fmt     source format name; selects alias_src_<rgb_fmt> and
 *               load_<rgb_fmt>_16x1
 *   yuv_fmt     destination format name; selects alias_dst_<yuv_fmt> and
 *               store_chroma_<yuv_fmt>_8x1
 *   rgb0, rgb1  core registers: source address of the first / second row
 *   y0, y1      core registers: luma address of the first / second row
 *   chroma      core register: chroma destination address
 *   count       optional core register forwarded to every load and store;
 *               when given, the pointers advance by count pixels rather
 *               than by the full 16
 *
 * compute_y_16x1 and compute_chroma_8x1 are defined outside this section.
 * The register names bound at the top are released again at the end.
 */
.macro kernel_420_16x2  rgb_fmt, yuv_fmt, rgb0, rgb1, y0, y1, chroma, count
    alias_dst_\yuv_fmt
    alias_src_\rgb_fmt

    /* first row: start the horizontal pair sums, emit luma */
    load_\rgb_fmt\()_16x1   \rgb0, \count

    downsample
    compute_y_16x1
    store_y8_16x1   \y0, \count


    /* second row: add its pair sums, round and shift by 2, emit luma */
    load_\rgb_fmt\()_16x1   \rgb1, \count
    downsample_ars2
    compute_y_16x1
    store_y8_16x1   \y1, \count

    /* chroma from the downsampled values */
    compute_chroma_8x1  u, U
    compute_chroma_8x1  v, V

    store_chroma_\yuv_fmt\()_8x1 \chroma, \count

    alias_src_\rgb_fmt 0
    alias_dst_\yuv_fmt 0
.endm
 |