aarch64: vp9: Implement NEON loop filters
This work is sponsored by, and copyright, Google.
These are ported from the ARM version; thanks to the larger
amount of registers available, we can do the loop filters with
16 pixels at a time. The implementation is fully templated, with
a single macro which can generate versions that are either 8 or
16 pixels wide, for the 4, 8 and 16 pixel loop filters
(and the 4/8 mixed versions as well).
For the 8 pixel wide versions, the speed is pretty close to that of the
32 bit version (the v_4_8 and v_8_8 filters are the best examples of
this; the h_4_8 and h_8_8 filters seem to get some gain in the
load/transpose/store part). For the 16 pixel wide ones, we get a
speedup of around 1.2-1.4x compared to the 32 bit version.
Examples of runtimes vs the 32 bit version, on a Cortex A53:
                                       ARM AArch64
vp9_loop_filter_h_4_8_neon:          144.0   127.2
vp9_loop_filter_h_8_8_neon:          207.0   182.5
vp9_loop_filter_h_16_8_neon:         415.0   328.7
vp9_loop_filter_h_16_16_neon:        672.0   558.6
vp9_loop_filter_mix2_h_44_16_neon:   302.0   203.5
vp9_loop_filter_mix2_h_48_16_neon:   365.0   305.2
vp9_loop_filter_mix2_h_84_16_neon:   365.0   305.2
vp9_loop_filter_mix2_h_88_16_neon:   376.0   305.2
vp9_loop_filter_mix2_v_44_16_neon:   193.2   128.2
vp9_loop_filter_mix2_v_48_16_neon:   246.7   218.4
vp9_loop_filter_mix2_v_84_16_neon:   248.0   218.5
vp9_loop_filter_mix2_v_88_16_neon:   302.0   218.2
vp9_loop_filter_v_4_8_neon:           89.0    88.7
vp9_loop_filter_v_8_8_neon:          141.0   137.7
vp9_loop_filter_v_16_8_neon:         295.0   272.7
vp9_loop_filter_v_16_16_neon:        546.0   453.7
The speedup vs C code in checkasm tests is around 2-7x, which is
pretty much the same as for the 32 bit version. Even though these functions
are faster than their 32 bit equivalents, the C version that we compare
to also became around 1.3-1.7x faster than the C version in 32 bit.
Based on START_TIMER/STOP_TIMER wrapping around a few individual
functions, the speedup vs C code is around 4-5x.
Examples of runtimes vs C on a Cortex A57 (for a slightly older version
of the patch):
                         A57 gcc-5.3  neon
loop_filter_h_4_8_neon:        256.6  93.4
loop_filter_h_8_8_neon:        307.3 139.1
loop_filter_h_16_8_neon:       340.1 254.1
loop_filter_h_16_16_neon:      827.0 407.9
loop_filter_mix2_h_44_16_neon: 524.5 155.4
loop_filter_mix2_h_48_16_neon: 644.5 173.3
loop_filter_mix2_h_84_16_neon: 630.5 222.0
loop_filter_mix2_h_88_16_neon: 697.3 222.0
loop_filter_mix2_v_44_16_neon: 598.5 100.6
loop_filter_mix2_v_48_16_neon: 651.5 127.0
loop_filter_mix2_v_84_16_neon: 591.5 167.1
loop_filter_mix2_v_88_16_neon: 855.1 166.7
loop_filter_v_4_8_neon:        271.7  65.3
loop_filter_v_8_8_neon:        312.5 106.9
loop_filter_v_16_8_neon:       473.3 206.5
loop_filter_v_16_16_neon:      976.1 327.8
The speed-up compared to the C functions is 2.5 to 6 and the cortex-a57
is again 30-50% faster than the cortex-a53.
This is an adapted cherry-pick from libav commits
9d2afd1eb8c5cc0633062430e66326dbf98c99e0 and
31756abe29eb039a11c59a42cb12e0cc2aef3b97.
Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
			
			
This commit is contained in:
		
							parent
							
								
									f43079e11c
								
							
						
					
					
						commit
						f1212e472b
					
				@ -43,4 +43,5 @@ NEON-OBJS-$(CONFIG_MPEGAUDIODSP)        += aarch64/mpegaudiodsp_neon.o
 | 
			
		||||
NEON-OBJS-$(CONFIG_DCA_DECODER)         += aarch64/synth_filter_neon.o
 | 
			
		||||
NEON-OBJS-$(CONFIG_VORBIS_DECODER)      += aarch64/vorbisdsp_neon.o
 | 
			
		||||
NEON-OBJS-$(CONFIG_VP9_DECODER)         += aarch64/vp9itxfm_neon.o             \
 | 
			
		||||
                                           aarch64/vp9lpf_neon.o               \
 | 
			
		||||
                                           aarch64/vp9mc_neon.o
 | 
			
		||||
 | 
			
		||||
@ -201,8 +201,56 @@ static av_cold void vp9dsp_itxfm_init_aarch64(VP9DSPContext *dsp, int bpp)
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/*
 * Prototypes for the NEON loop filter entry points.
 *
 * define_loop_filter(dir, wd, len) declares one function named
 * ff_vp9_loop_filter_<dir>_<wd>_<len>_neon.  Only the declarations live
 * here; no definitions are provided in this file.
 * NOTE(review): the definitions presumably come from vp9lpf_neon.S, and
 * <wd>/<len> presumably denote the filter size (44/48/84/88 being the
 * mixed 4/8 variants) and the number of pixels processed -- confirm
 * against the assembly source.
 *
 * define_loop_filters(wd, len) declares both the "h" and the "v" variant
 * for one wd/len pair.  The expansion deliberately omits the final
 * semicolon so that each use below reads like an ordinary declaration.
 */
#define define_loop_filter(dir, wd, len) \
void ff_vp9_loop_filter_##dir##_##wd##_##len##_neon(uint8_t *dst, ptrdiff_t stride, int E, int I, int H)

#define define_loop_filters(wd, len) \
    define_loop_filter(h, wd, len);  \
    define_loop_filter(v, wd, len)

define_loop_filters(4, 8);
define_loop_filters(8, 8);
define_loop_filters(16, 8);

define_loop_filters(16, 16);

define_loop_filters(44, 16);
define_loop_filters(48, 16);
define_loop_filters(84, 16);
define_loop_filters(88, 16);
 | 
			
		||||
 | 
			
		||||
/**
 * Install the NEON loop filter functions into a VP9 DSP context.
 *
 * Nothing is installed unless bpp is 8 and the runtime CPU flags report
 * NEON support; in every other case dsp is left untouched.
 *
 * Table layout, as filled in below:
 *   loop_filter_8[i][d]      i: 0 = 4, 1 = 8, 2 = 16 (the "wd" in the
 *                            function name); d: 0 = h, 1 = v
 *   loop_filter_16[d]        d: 0 = h, 1 = v
 *   loop_filter_mix2[a][b][d] a, b: 0 = 4, 1 = 8 (first and second digit
 *                            of the 44/48/84/88 suffix); d: 0 = h, 1 = v
 *
 * @param dsp DSP context whose loop filter tables are updated
 * @param bpp bit depth; only 8 is handled here
 */
static av_cold void vp9dsp_loopfilter_init_aarch64(VP9DSPContext *dsp, int bpp)
{
    int cpu_flags = av_get_cpu_flags();

    /* Only 8 bpp is handled by this function. */
    if (bpp != 8)
        return;

    if (have_neon(cpu_flags)) {
        dsp->loop_filter_8[0][1] = ff_vp9_loop_filter_v_4_8_neon;
        dsp->loop_filter_8[0][0] = ff_vp9_loop_filter_h_4_8_neon;
        dsp->loop_filter_8[1][1] = ff_vp9_loop_filter_v_8_8_neon;
        dsp->loop_filter_8[1][0] = ff_vp9_loop_filter_h_8_8_neon;
        dsp->loop_filter_8[2][1] = ff_vp9_loop_filter_v_16_8_neon;
        dsp->loop_filter_8[2][0] = ff_vp9_loop_filter_h_16_8_neon;

        dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_neon;
        dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_neon;

        dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_neon;
        dsp->loop_filter_mix2[0][0][1] = ff_vp9_loop_filter_v_44_16_neon;
        dsp->loop_filter_mix2[0][1][0] = ff_vp9_loop_filter_h_48_16_neon;
        dsp->loop_filter_mix2[0][1][1] = ff_vp9_loop_filter_v_48_16_neon;
        dsp->loop_filter_mix2[1][0][0] = ff_vp9_loop_filter_h_84_16_neon;
        dsp->loop_filter_mix2[1][0][1] = ff_vp9_loop_filter_v_84_16_neon;
        dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_neon;
        dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_neon;
    }
}
 | 
			
		||||
 | 
			
		||||
/**
 * AArch64 entry point for VP9 DSP initialisation.
 *
 * Forwards dsp and bpp, in this order, to the motion compensation,
 * loop filter and inverse transform init helpers of this file.
 *
 * @param dsp DSP context passed on to each helper
 * @param bpp bit depth passed on to each helper
 */
av_cold void ff_vp9dsp_init_aarch64(VP9DSPContext *dsp, int bpp)
{
    vp9dsp_mc_init_aarch64(dsp, bpp);
    vp9dsp_loopfilter_init_aarch64(dsp, bpp);
    vp9dsp_itxfm_init_aarch64(dsp, bpp);
}
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										1355
									
								
								libavcodec/aarch64/vp9lpf_neon.S
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1355
									
								
								libavcodec/aarch64/vp9lpf_neon.S
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user