avcodec/x86/hevc_mc: move put/put_uni to h26x/h2656_inter.asm
This enables the asm optimization to be reused by VVC. Signed-off-by: Wu Jianhua <toqsxw@outlook.com>
This commit is contained in:
		
							parent
							
								
									04c2e246a3
								
							
						
					
					
						commit
						7d9f1f5485
					
				@ -167,6 +167,7 @@ X86ASM-OBJS-$(CONFIG_HEVC_DECODER)     += x86/hevc_add_res.o            \
 | 
				
			|||||||
                                          x86/hevc_deblock.o            \
 | 
					                                          x86/hevc_deblock.o            \
 | 
				
			||||||
                                          x86/hevc_idct.o               \
 | 
					                                          x86/hevc_idct.o               \
 | 
				
			||||||
                                          x86/hevc_mc.o                 \
 | 
					                                          x86/hevc_mc.o                 \
 | 
				
			||||||
 | 
					                                          x86/h26x/h2656_inter.o        \
 | 
				
			||||||
                                          x86/hevc_sao.o                \
 | 
					                                          x86/hevc_sao.o                \
 | 
				
			||||||
                                          x86/hevc_sao_10bit.o
 | 
					                                          x86/hevc_sao_10bit.o
 | 
				
			||||||
X86ASM-OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp.o
 | 
					X86ASM-OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp.o
 | 
				
			||||||
 | 
				
			|||||||
							
								
								
									
										1145
									
								
								libavcodec/x86/h26x/h2656_inter.asm
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1145
									
								
								libavcodec/x86/h26x/h2656_inter.asm
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										98
									
								
								libavcodec/x86/h26x/h2656dsp.c
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										98
									
								
								libavcodec/x86/h26x/h2656dsp.c
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,98 @@
 | 
				
			|||||||
 | 
					/*
 | 
				
			||||||
 | 
					 * DSP for HEVC/VVC
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * Copyright (C) 2022-2024 Nuo Mi
 | 
				
			||||||
 | 
					 * Copyright (c) 2023-2024 Wu Jianhua
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * This file is part of FFmpeg.
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * FFmpeg is free software; you can redistribute it and/or
 | 
				
			||||||
 | 
					 * modify it under the terms of the GNU Lesser General Public
 | 
				
			||||||
 | 
					 * License as published by the Free Software Foundation; either
 | 
				
			||||||
 | 
					 * version 2.1 of the License, or (at your option) any later version.
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * FFmpeg is distributed in the hope that it will be useful,
 | 
				
			||||||
 | 
					 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
				
			||||||
 | 
					 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | 
				
			||||||
 | 
					 * Lesser General Public License for more details.
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * You should have received a copy of the GNU Lesser General Public
 | 
				
			||||||
 | 
					 * License along with FFmpeg; if not, write to the Free Software
 | 
				
			||||||
 | 
					 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 | 
				
			||||||
 | 
					 */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#include "h2656dsp.h"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/*
					 * mc_rep_func(name, bitd, step, W, opt)
					 *
					 * Defines ff_h2656_put_<name><W>_<bitd>_<opt>(), a W-column wrapper built on
					 * the <step>-column kernel ff_h2656_put_<name><step>_<bitd>_<opt>().
					 * The kernel is called once per strip of <step> columns. For a strip that
					 * starts at column col, the source pointer is advanced by
					 * col * ((bitd + 7) / 8) bytes and the int16_t destination by col elements.
					 * _srcstride, height, hf, vf and width are forwarded to every call unchanged.
					 */
					#define mc_rep_func(name, bitd, step, W, opt) \
					void ff_h2656_put_##name##W##_##bitd##_##opt(int16_t *_dst, \
					    const uint8_t *_src, ptrdiff_t _srcstride, int height, const int8_t *hf, const int8_t *vf, int width) \
					{ \
					    const int sample_bytes = (bitd + 7) / 8; \
					    for (int col = 0; col < W; col += step) \
					        ff_h2656_put_##name##step##_##bitd##_##opt(_dst + col, _src + col * sample_bytes, \
					                                                   _srcstride, height, hf, vf, width); \
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/*
					 * mc_rep_uni_func(name, bitd, step, W, opt)
					 *
					 * Defines ff_h2656_put_uni_<name><W>_<bitd>_<opt>(), a W-column wrapper built
					 * on the <step>-column kernel ff_h2656_put_uni_<name><step>_<bitd>_<opt>().
					 * The kernel is called once per strip of <step> columns. For a strip that
					 * starts at column col, both the source and the destination pointer are
					 * advanced by col * ((bitd + 7) / 8) bytes. dststride, _srcstride, height,
					 * hf, vf and width are forwarded to every call unchanged.
					 */
					#define mc_rep_uni_func(name, bitd, step, W, opt) \
					void ff_h2656_put_uni_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, \
					    const uint8_t *_src, ptrdiff_t _srcstride, int height, const int8_t *hf, const int8_t *vf, int width) \
					{ \
					    const int sample_bytes = (bitd + 7) / 8; \
					    for (int col = 0; col < W; col += step) { \
					        const int byte_off = col * sample_bytes; \
					        ff_h2656_put_uni_##name##step##_##bitd##_##opt(_dst + byte_off, dststride, \
					                                                       _src + byte_off, _srcstride, \
					                                                       height, hf, vf, width); \
					    } \
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/*
					 * Emit both W-wide wrappers (put and put_uni) for one
					 * kernel / bit depth / step / width / instruction-set combination.
					 */
					#define mc_rep_funcs(name, bitd, step, W, opt) \
					    mc_rep_func(name, bitd, step, W, opt) \
					    mc_rep_uni_func(name, bitd, step, W, opt)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/*
					 * Instantiate every SSE4 wide-block wrapper pair for kernel family fname:
					 *   8-bit     : widths 128, 64, 32     built from the 16-wide kernel
					 *   10/12-bit : widths 128, 64, 32, 16 built from the 8-wide kernel
					 *
					 * The last line carries no line continuation, so the definition ends here
					 * regardless of what follows it in the file.
					 */
					#define MC_REP_FUNCS_SSE4(fname) \
					    mc_rep_funcs(fname,  8, 16, 128, sse4) \
					    mc_rep_funcs(fname,  8, 16,  64, sse4) \
					    mc_rep_funcs(fname,  8, 16,  32, sse4) \
					    mc_rep_funcs(fname, 10,  8, 128, sse4) \
					    mc_rep_funcs(fname, 10,  8,  64, sse4) \
					    mc_rep_funcs(fname, 10,  8,  32, sse4) \
					    mc_rep_funcs(fname, 10,  8,  16, sse4) \
					    mc_rep_funcs(fname, 12,  8, 128, sse4) \
					    mc_rep_funcs(fname, 12,  8,  64, sse4) \
					    mc_rep_funcs(fname, 12,  8,  32, sse4) \
					    mc_rep_funcs(fname, 12,  8,  16, sse4)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					MC_REP_FUNCS_SSE4(pixels)
 | 
				
			||||||
 | 
					MC_REP_FUNCS_SSE4(4tap_h)
 | 
				
			||||||
 | 
					MC_REP_FUNCS_SSE4(4tap_v)
 | 
				
			||||||
 | 
					MC_REP_FUNCS_SSE4(4tap_hv)
 | 
				
			||||||
 | 
					MC_REP_FUNCS_SSE4(8tap_h)
 | 
				
			||||||
 | 
					MC_REP_FUNCS_SSE4(8tap_v)
 | 
				
			||||||
 | 
					MC_REP_FUNCS_SSE4(8tap_hv)
 | 
				
			||||||
 | 
					mc_rep_funcs(8tap_hv, 8, 8, 16, sse4)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#if HAVE_AVX2_EXTERNAL
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/*
					 * Instantiate every AVX2 wide-block wrapper pair for kernel family fname:
					 *   8-bit     : widths 64, 128     built from the 32-wide kernel
					 *   10/12-bit : widths 32, 64, 128 built from the 16-wide kernel
					 *
					 * The last line carries no line continuation, so the definition ends here
					 * regardless of what follows it in the file.
					 */
					#define MC_REP_FUNCS_AVX2(fname) \
					    mc_rep_funcs(fname,  8, 32,  64, avx2) \
					    mc_rep_funcs(fname,  8, 32, 128, avx2) \
					    mc_rep_funcs(fname, 10, 16,  32, avx2) \
					    mc_rep_funcs(fname, 10, 16,  64, avx2) \
					    mc_rep_funcs(fname, 10, 16, 128, avx2) \
					    mc_rep_funcs(fname, 12, 16,  32, avx2) \
					    mc_rep_funcs(fname, 12, 16,  64, avx2) \
					    mc_rep_funcs(fname, 12, 16, 128, avx2)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					MC_REP_FUNCS_AVX2(pixels)
 | 
				
			||||||
 | 
					MC_REP_FUNCS_AVX2(8tap_h)
 | 
				
			||||||
 | 
					MC_REP_FUNCS_AVX2(8tap_v)
 | 
				
			||||||
 | 
					MC_REP_FUNCS_AVX2(8tap_hv)
 | 
				
			||||||
 | 
					MC_REP_FUNCS_AVX2(4tap_h)
 | 
				
			||||||
 | 
					MC_REP_FUNCS_AVX2(4tap_v)
 | 
				
			||||||
 | 
					MC_REP_FUNCS_AVX2(4tap_hv)
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
							
								
								
									
										103
									
								
								libavcodec/x86/h26x/h2656dsp.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										103
									
								
								libavcodec/x86/h26x/h2656dsp.h
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,103 @@
 | 
				
			|||||||
 | 
					/*
 | 
				
			||||||
 | 
					 * DSP for HEVC/VVC
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * Copyright (C) 2022-2024 Nuo Mi
 | 
				
			||||||
 | 
					 * Copyright (c) 2023-2024 Wu Jianhua
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * This file is part of FFmpeg.
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * FFmpeg is free software; you can redistribute it and/or
 | 
				
			||||||
 | 
					 * modify it under the terms of the GNU Lesser General Public
 | 
				
			||||||
 | 
					 * License as published by the Free Software Foundation; either
 | 
				
			||||||
 | 
					 * version 2.1 of the License, or (at your option) any later version.
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * FFmpeg is distributed in the hope that it will be useful,
 | 
				
			||||||
 | 
					 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
				
			||||||
 | 
					 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | 
				
			||||||
 | 
					 * Lesser General Public License for more details.
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * You should have received a copy of the GNU Lesser General Public
 | 
				
			||||||
 | 
					 * License along with FFmpeg; if not, write to the Free Software
 | 
				
			||||||
 | 
					 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 | 
				
			||||||
 | 
					 */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#ifndef AVCODEC_X86_H26X_H2656DSP_H
 | 
				
			||||||
 | 
					#define AVCODEC_X86_H26X_H2656DSP_H
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#include "config.h"
 | 
				
			||||||
 | 
					#include "libavutil/x86/asm.h"
 | 
				
			||||||
 | 
					#include "libavutil/x86/cpu.h"
 | 
				
			||||||
 | 
					#include <stdlib.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/*
					 * Declare the two entry points of one kernel:
					 *   ff_h2656_put_<name>_<D>_<opt>()     - writes to an int16_t destination
					 *   ff_h2656_put_uni_<name>_<D>_<opt>() - writes to a uint8_t destination
					 *                                         with its own stride
					 *
					 * The expansion intentionally ends WITHOUT a semicolon: every use in this
					 * header supplies its own, and a second one would leave a stray ';' at file
					 * scope, which ISO C does not allow. The last line also carries no line
					 * continuation, so the definition cannot absorb the line that follows it.
					 */
					#define H2656_PEL_PROTOTYPE(name, D, opt) \
					void ff_h2656_put_ ## name ## _ ## D ## _##opt(int16_t *dst, const uint8_t *_src, ptrdiff_t _srcstride, int height, const int8_t *hf, const int8_t *vf, int width); \
					void ff_h2656_put_uni_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, ptrdiff_t _srcstride, int height, const int8_t *hf, const int8_t *vf, int width)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/*
					 * Declare the put/put_uni prototypes of kernel family fname for the widths
					 * 4, 6, 8, 12, 16, 32, 64 and 128 at bit depth bitd for instruction set opt.
					 * The caller supplies the terminating semicolon.
					 */
					#define H2656_MC_8TAP_PROTOTYPES(fname, bitd, opt) \
					    H2656_PEL_PROTOTYPE(fname##4,   bitd, opt); \
					    H2656_PEL_PROTOTYPE(fname##6,   bitd, opt); \
					    H2656_PEL_PROTOTYPE(fname##8,   bitd, opt); \
					    H2656_PEL_PROTOTYPE(fname##12,  bitd, opt); \
					    H2656_PEL_PROTOTYPE(fname##16,  bitd, opt); \
					    H2656_PEL_PROTOTYPE(fname##32,  bitd, opt); \
					    H2656_PEL_PROTOTYPE(fname##64,  bitd, opt); \
					    H2656_PEL_PROTOTYPE(fname##128, bitd, opt)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					H2656_MC_8TAP_PROTOTYPES(pixels  ,  8, sse4);
 | 
				
			||||||
 | 
					H2656_MC_8TAP_PROTOTYPES(pixels  , 10, sse4);
 | 
				
			||||||
 | 
					H2656_MC_8TAP_PROTOTYPES(pixels  , 12, sse4);
 | 
				
			||||||
 | 
					H2656_MC_8TAP_PROTOTYPES(8tap_h  ,  8, sse4);
 | 
				
			||||||
 | 
					H2656_MC_8TAP_PROTOTYPES(8tap_h  , 10, sse4);
 | 
				
			||||||
 | 
					H2656_MC_8TAP_PROTOTYPES(8tap_h  , 12, sse4);
 | 
				
			||||||
 | 
					H2656_MC_8TAP_PROTOTYPES(8tap_v  ,  8, sse4);
 | 
				
			||||||
 | 
					H2656_MC_8TAP_PROTOTYPES(8tap_v  , 10, sse4);
 | 
				
			||||||
 | 
					H2656_MC_8TAP_PROTOTYPES(8tap_v  , 12, sse4);
 | 
				
			||||||
 | 
					H2656_MC_8TAP_PROTOTYPES(8tap_hv ,  8, sse4);
 | 
				
			||||||
 | 
					H2656_MC_8TAP_PROTOTYPES(8tap_hv , 10, sse4);
 | 
				
			||||||
 | 
					H2656_MC_8TAP_PROTOTYPES(8tap_hv , 12, sse4);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/*
					 * Declare the put/put_uni prototypes of kernel family fname for the widths
					 * 2, 4, 6, 8, 12, 16, 32, 64 and 128 at bit depth bitd for instruction set
					 * opt. The caller supplies the terminating semicolon.
					 */
					#define H2656_MC_4TAP_PROTOTYPES(fname, bitd, opt) \
					    H2656_PEL_PROTOTYPE(fname##2,   bitd, opt); \
					    H2656_PEL_PROTOTYPE(fname##4,   bitd, opt); \
					    H2656_PEL_PROTOTYPE(fname##6,   bitd, opt); \
					    H2656_PEL_PROTOTYPE(fname##8,   bitd, opt); \
					    H2656_PEL_PROTOTYPE(fname##12,  bitd, opt); \
					    H2656_PEL_PROTOTYPE(fname##16,  bitd, opt); \
					    H2656_PEL_PROTOTYPE(fname##32,  bitd, opt); \
					    H2656_PEL_PROTOTYPE(fname##64,  bitd, opt); \
					    H2656_PEL_PROTOTYPE(fname##128, bitd, opt)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/*
					 * Declare all SSE4 prototypes used by the 4-tap path at bit depth bitd:
					 * the 2-wide full-sample copy plus the 4-tap h / v / hv families.
					 *
					 * The expansion ends with its own semicolon because this macro is invoked
					 * without one. The last line carries no line continuation, so the
					 * definition ends here regardless of what follows it in the file.
					 */
					#define H2656_MC_4TAP_PROTOTYPES_SSE4(bitd) \
					    H2656_PEL_PROTOTYPE(pixels2, bitd, sse4); \
					    H2656_MC_4TAP_PROTOTYPES(4tap_h,  bitd, sse4); \
					    H2656_MC_4TAP_PROTOTYPES(4tap_v,  bitd, sse4); \
					    H2656_MC_4TAP_PROTOTYPES(4tap_hv, bitd, sse4);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/* SSE4 4-tap prototypes at 8, 10 and 12 bits (the macro ends with its own ';'). */
					H2656_MC_4TAP_PROTOTYPES_SSE4( 8)
					H2656_MC_4TAP_PROTOTYPES_SSE4(10)
					H2656_MC_4TAP_PROTOTYPES_SSE4(12)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/*
					 * Declare the AVX2 put/put_uni prototypes of kernel family fname:
					 *   8-bit     : widths 32, 64, 128
					 *   10/12-bit : widths 16, 32, 64, 128
					 * Despite the name, this header also applies it to the pixels and 4-tap
					 * families. The caller supplies the terminating semicolon.
					 *
					 * The last line carries no line continuation, so the definition ends here
					 * regardless of what follows it in the file.
					 */
					#define H2656_MC_8TAP_PROTOTYPES_AVX2(fname) \
					    H2656_PEL_PROTOTYPE(fname##32 ,  8, avx2); \
					    H2656_PEL_PROTOTYPE(fname##64 ,  8, avx2); \
					    H2656_PEL_PROTOTYPE(fname##128,  8, avx2); \
					    H2656_PEL_PROTOTYPE(fname##16 , 10, avx2); \
					    H2656_PEL_PROTOTYPE(fname##32 , 10, avx2); \
					    H2656_PEL_PROTOTYPE(fname##64 , 10, avx2); \
					    H2656_PEL_PROTOTYPE(fname##128, 10, avx2); \
					    H2656_PEL_PROTOTYPE(fname##16 , 12, avx2); \
					    H2656_PEL_PROTOTYPE(fname##32 , 12, avx2); \
					    H2656_PEL_PROTOTYPE(fname##64 , 12, avx2); \
					    H2656_PEL_PROTOTYPE(fname##128, 12, avx2)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					H2656_MC_8TAP_PROTOTYPES_AVX2(pixels);
 | 
				
			||||||
 | 
					H2656_MC_8TAP_PROTOTYPES_AVX2(8tap_h);
 | 
				
			||||||
 | 
					H2656_MC_8TAP_PROTOTYPES_AVX2(8tap_v);
 | 
				
			||||||
 | 
					H2656_MC_8TAP_PROTOTYPES_AVX2(8tap_hv);
 | 
				
			||||||
 | 
					H2656_PEL_PROTOTYPE(8tap_hv16, 8, avx2);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					H2656_MC_8TAP_PROTOTYPES_AVX2(4tap_h);
 | 
				
			||||||
 | 
					H2656_MC_8TAP_PROTOTYPES_AVX2(4tap_v);
 | 
				
			||||||
 | 
					H2656_MC_8TAP_PROTOTYPES_AVX2(4tap_hv);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
@ -715,35 +715,6 @@ SECTION .text
 | 
				
			|||||||
;                         int height, int mx, int my)
 | 
					;                         int height, int mx, int my)
 | 
				
			||||||
; ******************************
 | 
					; ******************************
 | 
				
			||||||
 | 
					
 | 
				
			||||||
%macro HEVC_PUT_HEVC_PEL_PIXELS 2
 | 
					 | 
				
			||||||
HEVC_PEL_PIXELS     %1, %2
 | 
					 | 
				
			||||||
HEVC_UNI_PEL_PIXELS %1, %2
 | 
					 | 
				
			||||||
HEVC_BI_PEL_PIXELS  %1, %2
 | 
					 | 
				
			||||||
%endmacro
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
%macro HEVC_PEL_PIXELS 2
 | 
					 | 
				
			||||||
cglobal hevc_put_hevc_pel_pixels%1_%2, 4, 4, 3, dst, src, srcstride,height
 | 
					 | 
				
			||||||
    pxor               m2, m2
 | 
					 | 
				
			||||||
.loop:
 | 
					 | 
				
			||||||
    SIMPLE_LOAD       %1, %2, srcq, m0
 | 
					 | 
				
			||||||
    MC_PIXEL_COMPUTE  %1, %2, 1
 | 
					 | 
				
			||||||
    PEL_10STORE%1     dstq, m0, m1
 | 
					 | 
				
			||||||
    LOOP_END         dst, src, srcstride
 | 
					 | 
				
			||||||
    RET
 | 
					 | 
				
			||||||
 %endmacro
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
%macro HEVC_UNI_PEL_PIXELS 2
 | 
					 | 
				
			||||||
cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 2, dst, dststride, src, srcstride,height
 | 
					 | 
				
			||||||
.loop:
 | 
					 | 
				
			||||||
    SIMPLE_LOAD       %1, %2, srcq, m0
 | 
					 | 
				
			||||||
    PEL_%2STORE%1   dstq, m0, m1
 | 
					 | 
				
			||||||
    add             dstq, dststrideq             ; dst += dststride
 | 
					 | 
				
			||||||
    add             srcq, srcstrideq             ; src += srcstride
 | 
					 | 
				
			||||||
    dec          heightd                         ; cmp height
 | 
					 | 
				
			||||||
    jnz               .loop                      ; height loop
 | 
					 | 
				
			||||||
    RET
 | 
					 | 
				
			||||||
%endmacro
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
%macro HEVC_BI_PEL_PIXELS 2
 | 
					%macro HEVC_BI_PEL_PIXELS 2
 | 
				
			||||||
cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstride, src2, height
 | 
					cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstride, src2, height
 | 
				
			||||||
    pxor              m2, m2
 | 
					    pxor              m2, m2
 | 
				
			||||||
@ -777,32 +748,8 @@ cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstrid
 | 
				
			|||||||
%define XMM_REGS  8
 | 
					%define XMM_REGS  8
 | 
				
			||||||
%endif
 | 
					%endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
cglobal hevc_put_hevc_epel_h%1_%2, 5, 6, XMM_REGS, dst, src, srcstride, height, mx, rfilter
 | 
					 | 
				
			||||||
%assign %%stride ((%2 + 7)/8)
 | 
					 | 
				
			||||||
    EPEL_FILTER       %2, mx, m4, m5, rfilter
 | 
					 | 
				
			||||||
.loop:
 | 
					 | 
				
			||||||
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
 | 
					 | 
				
			||||||
    EPEL_COMPUTE      %2, %1, m4, m5, 1
 | 
					 | 
				
			||||||
    PEL_10STORE%1      dstq, m0, m1
 | 
					 | 
				
			||||||
    LOOP_END         dst, src, srcstride
 | 
					 | 
				
			||||||
    RET
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
cglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 7, XMM_REGS, dst, dststride, src, srcstride, height, mx, rfilter
 | 
					 | 
				
			||||||
%assign %%stride ((%2 + 7)/8)
 | 
					 | 
				
			||||||
    movdqa            m6, [pw_%2]
 | 
					 | 
				
			||||||
    EPEL_FILTER       %2, mx, m4, m5, rfilter
 | 
					 | 
				
			||||||
.loop:
 | 
					 | 
				
			||||||
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
 | 
					 | 
				
			||||||
    EPEL_COMPUTE      %2, %1, m4, m5
 | 
					 | 
				
			||||||
    UNI_COMPUTE       %1, %2, m0, m1, m6
 | 
					 | 
				
			||||||
    PEL_%2STORE%1   dstq, m0, m1
 | 
					 | 
				
			||||||
    add             dstq, dststrideq             ; dst += dststride
 | 
					 | 
				
			||||||
    add             srcq, srcstrideq             ; src += srcstride
 | 
					 | 
				
			||||||
    dec          heightd                         ; cmp height
 | 
					 | 
				
			||||||
    jnz               .loop                      ; height loop
 | 
					 | 
				
			||||||
    RET
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
cglobal hevc_put_hevc_bi_epel_h%1_%2, 7, 8, XMM_REGS, dst, dststride, src, srcstride, src2, height, mx, rfilter
 | 
					cglobal hevc_put_hevc_bi_epel_h%1_%2, 7, 8, XMM_REGS, dst, dststride, src, srcstride, src2, height, mx, rfilter
 | 
				
			||||||
 | 
					%assign %%stride ((%2 + 7)/8)
 | 
				
			||||||
    movdqa            m6, [pw_bi_%2]
 | 
					    movdqa            m6, [pw_bi_%2]
 | 
				
			||||||
    EPEL_FILTER       %2, mx, m4, m5, rfilter
 | 
					    EPEL_FILTER       %2, mx, m4, m5, rfilter
 | 
				
			||||||
.loop:
 | 
					.loop:
 | 
				
			||||||
@ -824,36 +771,6 @@ cglobal hevc_put_hevc_bi_epel_h%1_%2, 7, 8, XMM_REGS, dst, dststride, src, srcst
 | 
				
			|||||||
;                      int height, int mx, int my, int width)
 | 
					;                      int height, int mx, int my, int width)
 | 
				
			||||||
; ******************************
 | 
					; ******************************
 | 
				
			||||||
 | 
					
 | 
				
			||||||
cglobal hevc_put_hevc_epel_v%1_%2, 4, 6, XMM_REGS, dst, src, srcstride, height, r3src, my
 | 
					 | 
				
			||||||
    movifnidn        myd, mym
 | 
					 | 
				
			||||||
    sub             srcq, srcstrideq
 | 
					 | 
				
			||||||
    EPEL_FILTER       %2, my, m4, m5, r3src
 | 
					 | 
				
			||||||
    lea           r3srcq, [srcstrideq*3]
 | 
					 | 
				
			||||||
.loop:
 | 
					 | 
				
			||||||
    EPEL_LOAD         %2, srcq, srcstride, %1
 | 
					 | 
				
			||||||
    EPEL_COMPUTE      %2, %1, m4, m5, 1
 | 
					 | 
				
			||||||
    PEL_10STORE%1     dstq, m0, m1
 | 
					 | 
				
			||||||
    LOOP_END          dst, src, srcstride
 | 
					 | 
				
			||||||
    RET
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
cglobal hevc_put_hevc_uni_epel_v%1_%2, 5, 7, XMM_REGS, dst, dststride, src, srcstride, height, r3src, my
 | 
					 | 
				
			||||||
    movifnidn        myd, mym
 | 
					 | 
				
			||||||
    movdqa            m6, [pw_%2]
 | 
					 | 
				
			||||||
    sub             srcq, srcstrideq
 | 
					 | 
				
			||||||
    EPEL_FILTER       %2, my, m4, m5, r3src
 | 
					 | 
				
			||||||
    lea           r3srcq, [srcstrideq*3]
 | 
					 | 
				
			||||||
.loop:
 | 
					 | 
				
			||||||
    EPEL_LOAD         %2, srcq, srcstride, %1
 | 
					 | 
				
			||||||
    EPEL_COMPUTE      %2, %1, m4, m5
 | 
					 | 
				
			||||||
    UNI_COMPUTE       %1, %2, m0, m1, m6
 | 
					 | 
				
			||||||
    PEL_%2STORE%1   dstq, m0, m1
 | 
					 | 
				
			||||||
    add             dstq, dststrideq             ; dst += dststride
 | 
					 | 
				
			||||||
    add             srcq, srcstrideq             ; src += srcstride
 | 
					 | 
				
			||||||
    dec          heightd                         ; cmp height
 | 
					 | 
				
			||||||
    jnz               .loop                      ; height loop
 | 
					 | 
				
			||||||
    RET
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
cglobal hevc_put_hevc_bi_epel_v%1_%2, 6, 8, XMM_REGS, dst, dststride, src, srcstride, src2, height, r3src, my
 | 
					cglobal hevc_put_hevc_bi_epel_v%1_%2, 6, 8, XMM_REGS, dst, dststride, src, srcstride, src2, height, r3src, my
 | 
				
			||||||
    movifnidn        myd, mym
 | 
					    movifnidn        myd, mym
 | 
				
			||||||
    movdqa            m6, [pw_bi_%2]
 | 
					    movdqa            m6, [pw_bi_%2]
 | 
				
			||||||
@ -882,135 +799,6 @@ cglobal hevc_put_hevc_bi_epel_v%1_%2, 6, 8, XMM_REGS, dst, dststride, src, srcst
 | 
				
			|||||||
; ******************************
 | 
					; ******************************
 | 
				
			||||||
 | 
					
 | 
				
			||||||
%macro HEVC_PUT_HEVC_EPEL_HV 2
 | 
					%macro HEVC_PUT_HEVC_EPEL_HV 2
 | 
				
			||||||
cglobal hevc_put_hevc_epel_hv%1_%2, 6, 7, 16 , dst, src, srcstride, height, mx, my, r3src
 | 
					 | 
				
			||||||
%assign %%stride ((%2 + 7)/8)
 | 
					 | 
				
			||||||
    sub             srcq, srcstrideq
 | 
					 | 
				
			||||||
    EPEL_HV_FILTER    %2
 | 
					 | 
				
			||||||
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
 | 
					 | 
				
			||||||
    EPEL_COMPUTE      %2, %1, m14, m15
 | 
					 | 
				
			||||||
%if (%1 > 8 && (%2 == 8))
 | 
					 | 
				
			||||||
    SWAP              m8, m1
 | 
					 | 
				
			||||||
%endif
 | 
					 | 
				
			||||||
    SWAP              m4, m0
 | 
					 | 
				
			||||||
    add             srcq, srcstrideq
 | 
					 | 
				
			||||||
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
 | 
					 | 
				
			||||||
    EPEL_COMPUTE      %2, %1, m14, m15
 | 
					 | 
				
			||||||
%if (%1 > 8 && (%2 == 8))
 | 
					 | 
				
			||||||
    SWAP              m9, m1
 | 
					 | 
				
			||||||
%endif
 | 
					 | 
				
			||||||
    SWAP              m5, m0
 | 
					 | 
				
			||||||
    add             srcq, srcstrideq
 | 
					 | 
				
			||||||
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
 | 
					 | 
				
			||||||
    EPEL_COMPUTE      %2, %1, m14, m15
 | 
					 | 
				
			||||||
%if (%1 > 8 && (%2 == 8))
 | 
					 | 
				
			||||||
    SWAP             m10, m1
 | 
					 | 
				
			||||||
%endif
 | 
					 | 
				
			||||||
    SWAP              m6, m0
 | 
					 | 
				
			||||||
    add             srcq, srcstrideq
 | 
					 | 
				
			||||||
.loop:
 | 
					 | 
				
			||||||
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
 | 
					 | 
				
			||||||
    EPEL_COMPUTE      %2, %1, m14, m15
 | 
					 | 
				
			||||||
%if (%1 > 8 && (%2 == 8))
 | 
					 | 
				
			||||||
    SWAP             m11, m1
 | 
					 | 
				
			||||||
%endif
 | 
					 | 
				
			||||||
    SWAP              m7, m0
 | 
					 | 
				
			||||||
    punpcklwd         m0, m4, m5
 | 
					 | 
				
			||||||
    punpcklwd         m2, m6, m7
 | 
					 | 
				
			||||||
%if %1 > 4
 | 
					 | 
				
			||||||
    punpckhwd         m1, m4, m5
 | 
					 | 
				
			||||||
    punpckhwd         m3, m6, m7
 | 
					 | 
				
			||||||
%endif
 | 
					 | 
				
			||||||
    EPEL_COMPUTE      14, %1, m12, m13
 | 
					 | 
				
			||||||
%if (%1 > 8 && (%2 == 8))
 | 
					 | 
				
			||||||
    punpcklwd         m4, m8, m9
 | 
					 | 
				
			||||||
    punpcklwd         m2, m10, m11
 | 
					 | 
				
			||||||
    punpckhwd         m8, m8, m9
 | 
					 | 
				
			||||||
    punpckhwd         m3, m10, m11
 | 
					 | 
				
			||||||
    EPEL_COMPUTE      14, %1, m12, m13, m4, m2, m8, m3
 | 
					 | 
				
			||||||
%if cpuflag(avx2)
 | 
					 | 
				
			||||||
    vinserti128       m2, m0, xm4, 1
 | 
					 | 
				
			||||||
    vperm2i128        m3, m0, m4, q0301
 | 
					 | 
				
			||||||
    PEL_10STORE%1     dstq, m2, m3
 | 
					 | 
				
			||||||
%else
 | 
					 | 
				
			||||||
    PEL_10STORE%1     dstq, m0, m4
 | 
					 | 
				
			||||||
%endif
 | 
					 | 
				
			||||||
%else
 | 
					 | 
				
			||||||
    PEL_10STORE%1     dstq, m0, m1
 | 
					 | 
				
			||||||
%endif
 | 
					 | 
				
			||||||
    movdqa            m4, m5
 | 
					 | 
				
			||||||
    movdqa            m5, m6
 | 
					 | 
				
			||||||
    movdqa            m6, m7
 | 
					 | 
				
			||||||
%if (%1 > 8 && (%2 == 8))
 | 
					 | 
				
			||||||
    mova              m8, m9
 | 
					 | 
				
			||||||
    mova              m9, m10
 | 
					 | 
				
			||||||
    mova             m10, m11
 | 
					 | 
				
			||||||
%endif
 | 
					 | 
				
			||||||
    LOOP_END         dst, src, srcstride
 | 
					 | 
				
			||||||
    RET
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
cglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 8, 16 , dst, dststride, src, srcstride, height, mx, my, r3src
 | 
					 | 
				
			||||||
%assign %%stride ((%2 + 7)/8)
 | 
					 | 
				
			||||||
    sub             srcq, srcstrideq
 | 
					 | 
				
			||||||
    EPEL_HV_FILTER    %2
 | 
					 | 
				
			||||||
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
 | 
					 | 
				
			||||||
    EPEL_COMPUTE      %2, %1, m14, m15
 | 
					 | 
				
			||||||
%if (%1 > 8 && (%2 == 8))
 | 
					 | 
				
			||||||
    SWAP              m8, m1
 | 
					 | 
				
			||||||
%endif
 | 
					 | 
				
			||||||
    SWAP              m4, m0
 | 
					 | 
				
			||||||
    add             srcq, srcstrideq
 | 
					 | 
				
			||||||
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
 | 
					 | 
				
			||||||
    EPEL_COMPUTE      %2, %1, m14, m15
 | 
					 | 
				
			||||||
%if (%1 > 8 && (%2 == 8))
 | 
					 | 
				
			||||||
    SWAP              m9, m1
 | 
					 | 
				
			||||||
%endif
 | 
					 | 
				
			||||||
    SWAP              m5, m0
 | 
					 | 
				
			||||||
    add             srcq, srcstrideq
 | 
					 | 
				
			||||||
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
 | 
					 | 
				
			||||||
    EPEL_COMPUTE      %2, %1, m14, m15
 | 
					 | 
				
			||||||
%if (%1 > 8 && (%2 == 8))
 | 
					 | 
				
			||||||
    SWAP             m10, m1
 | 
					 | 
				
			||||||
%endif
 | 
					 | 
				
			||||||
    SWAP              m6, m0
 | 
					 | 
				
			||||||
    add             srcq, srcstrideq
 | 
					 | 
				
			||||||
.loop:
 | 
					 | 
				
			||||||
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
 | 
					 | 
				
			||||||
    EPEL_COMPUTE      %2, %1, m14, m15
 | 
					 | 
				
			||||||
%if (%1 > 8 && (%2 == 8))
 | 
					 | 
				
			||||||
    SWAP             m11, m1
 | 
					 | 
				
			||||||
%endif
 | 
					 | 
				
			||||||
    mova              m7, m0
 | 
					 | 
				
			||||||
    punpcklwd         m0, m4, m5
 | 
					 | 
				
			||||||
    punpcklwd         m2, m6, m7
 | 
					 | 
				
			||||||
%if %1 > 4
 | 
					 | 
				
			||||||
    punpckhwd         m1, m4, m5
 | 
					 | 
				
			||||||
    punpckhwd         m3, m6, m7
 | 
					 | 
				
			||||||
%endif
 | 
					 | 
				
			||||||
    EPEL_COMPUTE      14, %1, m12, m13
 | 
					 | 
				
			||||||
%if (%1 > 8 && (%2 == 8))
 | 
					 | 
				
			||||||
    punpcklwd         m4, m8, m9
 | 
					 | 
				
			||||||
    punpcklwd         m2, m10, m11
 | 
					 | 
				
			||||||
    punpckhwd         m8, m8, m9
 | 
					 | 
				
			||||||
    punpckhwd         m3, m10, m11
 | 
					 | 
				
			||||||
    EPEL_COMPUTE      14, %1, m12, m13, m4, m2, m8, m3
 | 
					 | 
				
			||||||
    UNI_COMPUTE       %1, %2, m0, m4, [pw_%2]
 | 
					 | 
				
			||||||
%else
 | 
					 | 
				
			||||||
    UNI_COMPUTE       %1, %2, m0, m1, [pw_%2]
 | 
					 | 
				
			||||||
%endif
 | 
					 | 
				
			||||||
    PEL_%2STORE%1   dstq, m0, m1
 | 
					 | 
				
			||||||
    mova              m4, m5
 | 
					 | 
				
			||||||
    mova              m5, m6
 | 
					 | 
				
			||||||
    mova              m6, m7
 | 
					 | 
				
			||||||
%if (%1 > 8 && (%2 == 8))
 | 
					 | 
				
			||||||
    mova              m8, m9
 | 
					 | 
				
			||||||
    mova              m9, m10
 | 
					 | 
				
			||||||
    mova             m10, m11
 | 
					 | 
				
			||||||
%endif
 | 
					 | 
				
			||||||
    add             dstq, dststrideq             ; dst += dststride
 | 
					 | 
				
			||||||
    add             srcq, srcstrideq             ; src += srcstride
 | 
					 | 
				
			||||||
    dec          heightd                         ; cmp height
 | 
					 | 
				
			||||||
    jnz               .loop                      ; height loop
 | 
					 | 
				
			||||||
    RET
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
cglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 9, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src
 | 
					cglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 9, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src
 | 
				
			||||||
%assign %%stride ((%2 + 7)/8)
 | 
					%assign %%stride ((%2 + 7)/8)
 | 
				
			||||||
@ -1093,34 +881,6 @@ cglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 9, 16, dst, dststride, src, srcstride,
 | 
				
			|||||||
; ******************************
 | 
					; ******************************
 | 
				
			||||||
 | 
					
 | 
				
			||||||
%macro HEVC_PUT_HEVC_QPEL 2
 | 
					%macro HEVC_PUT_HEVC_QPEL 2
 | 
				
			||||||
cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 16, dst, src, srcstride, height, mx, rfilter
 | 
					 | 
				
			||||||
    QPEL_FILTER       %2, mx
 | 
					 | 
				
			||||||
.loop:
 | 
					 | 
				
			||||||
    QPEL_H_LOAD       %2, srcq, %1, 10
 | 
					 | 
				
			||||||
    QPEL_COMPUTE      %1, %2, 1
 | 
					 | 
				
			||||||
%if %2 > 8
 | 
					 | 
				
			||||||
    packssdw          m0, m1
 | 
					 | 
				
			||||||
%endif
 | 
					 | 
				
			||||||
    PEL_10STORE%1     dstq, m0, m1
 | 
					 | 
				
			||||||
    LOOP_END          dst, src, srcstride
 | 
					 | 
				
			||||||
    RET
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
cglobal hevc_put_hevc_uni_qpel_h%1_%2, 6, 7, 16 , dst, dststride, src, srcstride, height, mx, rfilter
 | 
					 | 
				
			||||||
    mova              m9, [pw_%2]
 | 
					 | 
				
			||||||
    QPEL_FILTER       %2, mx
 | 
					 | 
				
			||||||
.loop:
 | 
					 | 
				
			||||||
    QPEL_H_LOAD       %2, srcq, %1, 10
 | 
					 | 
				
			||||||
    QPEL_COMPUTE      %1, %2
 | 
					 | 
				
			||||||
%if %2 > 8
 | 
					 | 
				
			||||||
    packssdw          m0, m1
 | 
					 | 
				
			||||||
%endif
 | 
					 | 
				
			||||||
    UNI_COMPUTE       %1, %2, m0, m1, m9
 | 
					 | 
				
			||||||
    PEL_%2STORE%1   dstq, m0, m1
 | 
					 | 
				
			||||||
    add             dstq, dststrideq             ; dst += dststride
 | 
					 | 
				
			||||||
    add             srcq, srcstrideq             ; src += srcstride
 | 
					 | 
				
			||||||
    dec          heightd                         ; cmp height
 | 
					 | 
				
			||||||
    jnz               .loop                      ; height loop
 | 
					 | 
				
			||||||
    RET
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
cglobal hevc_put_hevc_bi_qpel_h%1_%2, 7, 8, 16 , dst, dststride, src, srcstride, src2, height, mx, rfilter
 | 
					cglobal hevc_put_hevc_bi_qpel_h%1_%2, 7, 8, 16 , dst, dststride, src, srcstride, src2, height, mx, rfilter
 | 
				
			||||||
    movdqa            m9, [pw_bi_%2]
 | 
					    movdqa            m9, [pw_bi_%2]
 | 
				
			||||||
@ -1148,38 +908,6 @@ cglobal hevc_put_hevc_bi_qpel_h%1_%2, 7, 8, 16 , dst, dststride, src, srcstride,
 | 
				
			|||||||
;                       int height, int mx, int my, int width)
 | 
					;                       int height, int mx, int my, int width)
 | 
				
			||||||
; ******************************
 | 
					; ******************************
 | 
				
			||||||
 | 
					
 | 
				
			||||||
cglobal hevc_put_hevc_qpel_v%1_%2, 4, 8, 16, dst, src, srcstride, height, r3src, my, rfilter
 | 
					 | 
				
			||||||
    movifnidn        myd, mym
 | 
					 | 
				
			||||||
    lea           r3srcq, [srcstrideq*3]
 | 
					 | 
				
			||||||
    QPEL_FILTER       %2, my
 | 
					 | 
				
			||||||
.loop:
 | 
					 | 
				
			||||||
    QPEL_V_LOAD       %2, srcq, srcstride, %1, r7
 | 
					 | 
				
			||||||
    QPEL_COMPUTE      %1, %2, 1
 | 
					 | 
				
			||||||
%if %2 > 8
 | 
					 | 
				
			||||||
    packssdw          m0, m1
 | 
					 | 
				
			||||||
%endif
 | 
					 | 
				
			||||||
    PEL_10STORE%1     dstq, m0, m1
 | 
					 | 
				
			||||||
    LOOP_END         dst, src, srcstride
 | 
					 | 
				
			||||||
    RET
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
cglobal hevc_put_hevc_uni_qpel_v%1_%2, 5, 9, 16, dst, dststride, src, srcstride, height, r3src, my, rfilter
 | 
					 | 
				
			||||||
    movifnidn        myd, mym
 | 
					 | 
				
			||||||
    movdqa            m9, [pw_%2]
 | 
					 | 
				
			||||||
    lea           r3srcq, [srcstrideq*3]
 | 
					 | 
				
			||||||
    QPEL_FILTER       %2, my
 | 
					 | 
				
			||||||
.loop:
 | 
					 | 
				
			||||||
    QPEL_V_LOAD       %2, srcq, srcstride, %1, r8
 | 
					 | 
				
			||||||
    QPEL_COMPUTE      %1, %2
 | 
					 | 
				
			||||||
%if %2 > 8
 | 
					 | 
				
			||||||
    packssdw          m0, m1
 | 
					 | 
				
			||||||
%endif
 | 
					 | 
				
			||||||
    UNI_COMPUTE       %1, %2, m0, m1, m9
 | 
					 | 
				
			||||||
    PEL_%2STORE%1   dstq, m0, m1
 | 
					 | 
				
			||||||
    add             dstq, dststrideq             ; dst += dststride
 | 
					 | 
				
			||||||
    add             srcq, srcstrideq             ; src += srcstride
 | 
					 | 
				
			||||||
    dec          heightd                         ; cmp height
 | 
					 | 
				
			||||||
    jnz               .loop                      ; height loop
 | 
					 | 
				
			||||||
    RET
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
cglobal hevc_put_hevc_bi_qpel_v%1_%2, 6, 10, 16, dst, dststride, src, srcstride, src2, height, r3src, my, rfilter
 | 
					cglobal hevc_put_hevc_bi_qpel_v%1_%2, 6, 10, 16, dst, dststride, src, srcstride, src2, height, r3src, my, rfilter
 | 
				
			||||||
    movifnidn        myd, mym
 | 
					    movifnidn        myd, mym
 | 
				
			||||||
@ -1210,162 +938,6 @@ cglobal hevc_put_hevc_bi_qpel_v%1_%2, 6, 10, 16, dst, dststride, src, srcstride,
 | 
				
			|||||||
;                       int height, int mx, int my)
 | 
					;                       int height, int mx, int my)
 | 
				
			||||||
; ******************************
 | 
					; ******************************
 | 
				
			||||||
%macro HEVC_PUT_HEVC_QPEL_HV 2
 | 
					%macro HEVC_PUT_HEVC_QPEL_HV 2
 | 
				
			||||||
cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 8, 16, dst, src, srcstride, height, mx, my, r3src, rfilter
 | 
					 | 
				
			||||||
%if cpuflag(avx2)
 | 
					 | 
				
			||||||
%assign %%shift  4
 | 
					 | 
				
			||||||
%else
 | 
					 | 
				
			||||||
%assign %%shift  3
 | 
					 | 
				
			||||||
%endif
 | 
					 | 
				
			||||||
    sub              mxq, 1
 | 
					 | 
				
			||||||
    sub              myq, 1
 | 
					 | 
				
			||||||
    shl              mxq, %%shift                ; multiply by 16 (avx2) or 8 (otherwise)
 | 
					 | 
				
			||||||
    shl              myq, %%shift                ; multiply by 16 (avx2) or 8 (otherwise)
 | 
					 | 
				
			||||||
    lea           r3srcq, [srcstrideq*3]
 | 
					 | 
				
			||||||
    sub             srcq, r3srcq
 | 
					 | 
				
			||||||
    QPEL_H_LOAD       %2, srcq, %1, 15
 | 
					 | 
				
			||||||
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
 | 
					 | 
				
			||||||
    SWAP              m8, m0
 | 
					 | 
				
			||||||
    add             srcq, srcstrideq
 | 
					 | 
				
			||||||
    QPEL_H_LOAD       %2, srcq, %1, 15
 | 
					 | 
				
			||||||
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
 | 
					 | 
				
			||||||
    SWAP              m9, m0
 | 
					 | 
				
			||||||
    add             srcq, srcstrideq
 | 
					 | 
				
			||||||
    QPEL_H_LOAD       %2, srcq, %1, 15
 | 
					 | 
				
			||||||
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
 | 
					 | 
				
			||||||
    SWAP             m10, m0
 | 
					 | 
				
			||||||
    add             srcq, srcstrideq
 | 
					 | 
				
			||||||
    QPEL_H_LOAD       %2, srcq, %1, 15
 | 
					 | 
				
			||||||
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
 | 
					 | 
				
			||||||
    SWAP             m11, m0
 | 
					 | 
				
			||||||
    add             srcq, srcstrideq
 | 
					 | 
				
			||||||
    QPEL_H_LOAD       %2, srcq, %1, 15
 | 
					 | 
				
			||||||
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
 | 
					 | 
				
			||||||
    SWAP             m12, m0
 | 
					 | 
				
			||||||
    add             srcq, srcstrideq
 | 
					 | 
				
			||||||
    QPEL_H_LOAD       %2, srcq, %1, 15
 | 
					 | 
				
			||||||
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
 | 
					 | 
				
			||||||
    SWAP             m13, m0
 | 
					 | 
				
			||||||
    add             srcq, srcstrideq
 | 
					 | 
				
			||||||
    QPEL_H_LOAD       %2, srcq, %1, 15
 | 
					 | 
				
			||||||
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
 | 
					 | 
				
			||||||
    SWAP             m14, m0
 | 
					 | 
				
			||||||
    add             srcq, srcstrideq
 | 
					 | 
				
			||||||
.loop:
 | 
					 | 
				
			||||||
    QPEL_H_LOAD       %2, srcq, %1, 15
 | 
					 | 
				
			||||||
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
 | 
					 | 
				
			||||||
    SWAP             m15, m0
 | 
					 | 
				
			||||||
    punpcklwd         m0, m8, m9
 | 
					 | 
				
			||||||
    punpcklwd         m2, m10, m11
 | 
					 | 
				
			||||||
    punpcklwd         m4, m12, m13
 | 
					 | 
				
			||||||
    punpcklwd         m6, m14, m15
 | 
					 | 
				
			||||||
%if %1 > 4
 | 
					 | 
				
			||||||
    punpckhwd         m1, m8, m9
 | 
					 | 
				
			||||||
    punpckhwd         m3, m10, m11
 | 
					 | 
				
			||||||
    punpckhwd         m5, m12, m13
 | 
					 | 
				
			||||||
    punpckhwd         m7, m14, m15
 | 
					 | 
				
			||||||
%endif
 | 
					 | 
				
			||||||
    QPEL_HV_COMPUTE   %1, 14, my, ackssdw
 | 
					 | 
				
			||||||
    PEL_10STORE%1     dstq, m0, m1
 | 
					 | 
				
			||||||
%if %1 <= 4
 | 
					 | 
				
			||||||
    movq              m8, m9
 | 
					 | 
				
			||||||
    movq              m9, m10
 | 
					 | 
				
			||||||
    movq             m10, m11
 | 
					 | 
				
			||||||
    movq             m11, m12
 | 
					 | 
				
			||||||
    movq             m12, m13
 | 
					 | 
				
			||||||
    movq             m13, m14
 | 
					 | 
				
			||||||
    movq             m14, m15
 | 
					 | 
				
			||||||
%else
 | 
					 | 
				
			||||||
    movdqa            m8, m9
 | 
					 | 
				
			||||||
    movdqa            m9, m10
 | 
					 | 
				
			||||||
    movdqa           m10, m11
 | 
					 | 
				
			||||||
    movdqa           m11, m12
 | 
					 | 
				
			||||||
    movdqa           m12, m13
 | 
					 | 
				
			||||||
    movdqa           m13, m14
 | 
					 | 
				
			||||||
    movdqa           m14, m15
 | 
					 | 
				
			||||||
%endif
 | 
					 | 
				
			||||||
    LOOP_END         dst, src, srcstride
 | 
					 | 
				
			||||||
    RET
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 16 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
 | 
					 | 
				
			||||||
%if cpuflag(avx2)
 | 
					 | 
				
			||||||
%assign %%shift  4
 | 
					 | 
				
			||||||
%else
 | 
					 | 
				
			||||||
%assign %%shift  3
 | 
					 | 
				
			||||||
%endif
 | 
					 | 
				
			||||||
    sub              mxq, 1
 | 
					 | 
				
			||||||
    sub              myq, 1
 | 
					 | 
				
			||||||
    shl              mxq, %%shift                ; multiply by 16 (avx2) or 8 (otherwise)
 | 
					 | 
				
			||||||
    shl              myq, %%shift                ; multiply by 16 (avx2) or 8 (otherwise)
 | 
					 | 
				
			||||||
    lea           r3srcq, [srcstrideq*3]
 | 
					 | 
				
			||||||
    sub             srcq, r3srcq
 | 
					 | 
				
			||||||
    QPEL_H_LOAD       %2, srcq, %1, 15
 | 
					 | 
				
			||||||
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
 | 
					 | 
				
			||||||
    SWAP              m8, m0
 | 
					 | 
				
			||||||
    add             srcq, srcstrideq
 | 
					 | 
				
			||||||
    QPEL_H_LOAD       %2, srcq, %1, 15
 | 
					 | 
				
			||||||
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
 | 
					 | 
				
			||||||
    SWAP              m9, m0
 | 
					 | 
				
			||||||
    add             srcq, srcstrideq
 | 
					 | 
				
			||||||
    QPEL_H_LOAD       %2, srcq, %1, 15
 | 
					 | 
				
			||||||
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
 | 
					 | 
				
			||||||
    SWAP             m10, m0
 | 
					 | 
				
			||||||
    add             srcq, srcstrideq
 | 
					 | 
				
			||||||
    QPEL_H_LOAD       %2, srcq, %1, 15
 | 
					 | 
				
			||||||
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
 | 
					 | 
				
			||||||
    SWAP             m11, m0
 | 
					 | 
				
			||||||
    add             srcq, srcstrideq
 | 
					 | 
				
			||||||
    QPEL_H_LOAD       %2, srcq, %1, 15
 | 
					 | 
				
			||||||
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
 | 
					 | 
				
			||||||
    SWAP             m12, m0
 | 
					 | 
				
			||||||
    add             srcq, srcstrideq
 | 
					 | 
				
			||||||
    QPEL_H_LOAD       %2, srcq, %1, 15
 | 
					 | 
				
			||||||
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
 | 
					 | 
				
			||||||
    SWAP             m13, m0
 | 
					 | 
				
			||||||
    add             srcq, srcstrideq
 | 
					 | 
				
			||||||
    QPEL_H_LOAD       %2, srcq, %1, 15
 | 
					 | 
				
			||||||
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
 | 
					 | 
				
			||||||
    SWAP             m14, m0
 | 
					 | 
				
			||||||
    add             srcq, srcstrideq
 | 
					 | 
				
			||||||
.loop:
 | 
					 | 
				
			||||||
    QPEL_H_LOAD       %2, srcq, %1, 15
 | 
					 | 
				
			||||||
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
 | 
					 | 
				
			||||||
    SWAP             m15, m0
 | 
					 | 
				
			||||||
    punpcklwd         m0, m8, m9
 | 
					 | 
				
			||||||
    punpcklwd         m2, m10, m11
 | 
					 | 
				
			||||||
    punpcklwd         m4, m12, m13
 | 
					 | 
				
			||||||
    punpcklwd         m6, m14, m15
 | 
					 | 
				
			||||||
%if %1 > 4
 | 
					 | 
				
			||||||
    punpckhwd         m1, m8, m9
 | 
					 | 
				
			||||||
    punpckhwd         m3, m10, m11
 | 
					 | 
				
			||||||
    punpckhwd         m5, m12, m13
 | 
					 | 
				
			||||||
    punpckhwd         m7, m14, m15
 | 
					 | 
				
			||||||
%endif
 | 
					 | 
				
			||||||
    QPEL_HV_COMPUTE   %1, 14, my, ackusdw
 | 
					 | 
				
			||||||
    UNI_COMPUTE       %1, %2, m0, m1, [pw_%2]
 | 
					 | 
				
			||||||
    PEL_%2STORE%1   dstq, m0, m1
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
%if %1 <= 4
 | 
					 | 
				
			||||||
    movq              m8, m9
 | 
					 | 
				
			||||||
    movq              m9, m10
 | 
					 | 
				
			||||||
    movq             m10, m11
 | 
					 | 
				
			||||||
    movq             m11, m12
 | 
					 | 
				
			||||||
    movq             m12, m13
 | 
					 | 
				
			||||||
    movq             m13, m14
 | 
					 | 
				
			||||||
    movq             m14, m15
 | 
					 | 
				
			||||||
%else
 | 
					 | 
				
			||||||
    mova            m8, m9
 | 
					 | 
				
			||||||
    mova            m9, m10
 | 
					 | 
				
			||||||
    mova           m10, m11
 | 
					 | 
				
			||||||
    mova           m11, m12
 | 
					 | 
				
			||||||
    mova           m12, m13
 | 
					 | 
				
			||||||
    mova           m13, m14
 | 
					 | 
				
			||||||
    mova           m14, m15
 | 
					 | 
				
			||||||
%endif
 | 
					 | 
				
			||||||
    add             dstq, dststrideq             ; dst += dststride
 | 
					 | 
				
			||||||
    add             srcq, srcstrideq             ; src += srcstride
 | 
					 | 
				
			||||||
    dec          heightd                         ; cmp height
 | 
					 | 
				
			||||||
    jnz               .loop                      ; height loop
 | 
					 | 
				
			||||||
    RET
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter
 | 
					cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter
 | 
				
			||||||
%if cpuflag(avx2)
 | 
					%if cpuflag(avx2)
 | 
				
			||||||
@ -1613,22 +1185,22 @@ WEIGHTING_FUNCS 4, 12
 | 
				
			|||||||
WEIGHTING_FUNCS 6, 12
 | 
					WEIGHTING_FUNCS 6, 12
 | 
				
			||||||
WEIGHTING_FUNCS 8, 12
 | 
					WEIGHTING_FUNCS 8, 12
 | 
				
			||||||
 | 
					
 | 
				
			||||||
HEVC_PUT_HEVC_PEL_PIXELS  2, 8
 | 
					HEVC_BI_PEL_PIXELS  2, 8
 | 
				
			||||||
HEVC_PUT_HEVC_PEL_PIXELS  4, 8
 | 
					HEVC_BI_PEL_PIXELS  4, 8
 | 
				
			||||||
HEVC_PUT_HEVC_PEL_PIXELS  6, 8
 | 
					HEVC_BI_PEL_PIXELS  6, 8
 | 
				
			||||||
HEVC_PUT_HEVC_PEL_PIXELS  8, 8
 | 
					HEVC_BI_PEL_PIXELS  8, 8
 | 
				
			||||||
HEVC_PUT_HEVC_PEL_PIXELS 12, 8
 | 
					HEVC_BI_PEL_PIXELS 12, 8
 | 
				
			||||||
HEVC_PUT_HEVC_PEL_PIXELS 16, 8
 | 
					HEVC_BI_PEL_PIXELS 16, 8
 | 
				
			||||||
 | 
					
 | 
				
			||||||
HEVC_PUT_HEVC_PEL_PIXELS 2, 10
 | 
					HEVC_BI_PEL_PIXELS 2, 10
 | 
				
			||||||
HEVC_PUT_HEVC_PEL_PIXELS 4, 10
 | 
					HEVC_BI_PEL_PIXELS 4, 10
 | 
				
			||||||
HEVC_PUT_HEVC_PEL_PIXELS 6, 10
 | 
					HEVC_BI_PEL_PIXELS 6, 10
 | 
				
			||||||
HEVC_PUT_HEVC_PEL_PIXELS 8, 10
 | 
					HEVC_BI_PEL_PIXELS 8, 10
 | 
				
			||||||
 | 
					
 | 
				
			||||||
HEVC_PUT_HEVC_PEL_PIXELS 2, 12
 | 
					HEVC_BI_PEL_PIXELS 2, 12
 | 
				
			||||||
HEVC_PUT_HEVC_PEL_PIXELS 4, 12
 | 
					HEVC_BI_PEL_PIXELS 4, 12
 | 
				
			||||||
HEVC_PUT_HEVC_PEL_PIXELS 6, 12
 | 
					HEVC_BI_PEL_PIXELS 6, 12
 | 
				
			||||||
HEVC_PUT_HEVC_PEL_PIXELS 8, 12
 | 
					HEVC_BI_PEL_PIXELS 8, 12
 | 
				
			||||||
 | 
					
 | 
				
			||||||
HEVC_PUT_HEVC_EPEL 2,  8
 | 
					HEVC_PUT_HEVC_EPEL 2,  8
 | 
				
			||||||
HEVC_PUT_HEVC_EPEL 4,  8
 | 
					HEVC_PUT_HEVC_EPEL 4,  8
 | 
				
			||||||
@ -1693,8 +1265,8 @@ HEVC_PUT_HEVC_QPEL_HV 8, 12
 | 
				
			|||||||
%if HAVE_AVX2_EXTERNAL
 | 
					%if HAVE_AVX2_EXTERNAL
 | 
				
			||||||
INIT_YMM avx2  ; adds ff_ and _avx2 to function name & enables 256b registers : m0 for 256b, xm0 for 128b. cpuflag(avx2) = 1 / notcpuflag(avx) = 0
 | 
					INIT_YMM avx2  ; adds ff_ and _avx2 to function name & enables 256b registers : m0 for 256b, xm0 for 128b. cpuflag(avx2) = 1 / notcpuflag(avx) = 0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
HEVC_PUT_HEVC_PEL_PIXELS 32, 8
 | 
					HEVC_BI_PEL_PIXELS 32, 8
 | 
				
			||||||
HEVC_PUT_HEVC_PEL_PIXELS 16, 10
 | 
					HEVC_BI_PEL_PIXELS 16, 10
 | 
				
			||||||
 | 
					
 | 
				
			||||||
HEVC_PUT_HEVC_EPEL 32, 8
 | 
					HEVC_PUT_HEVC_EPEL 32, 8
 | 
				
			||||||
HEVC_PUT_HEVC_EPEL 16, 10
 | 
					HEVC_PUT_HEVC_EPEL 16, 10
 | 
				
			||||||
 | 
				
			|||||||
@ -1,6 +1,7 @@
 | 
				
			|||||||
/*
 | 
					/*
 | 
				
			||||||
 * Copyright (c) 2013 Seppo Tomperi
 | 
					 * Copyright (c) 2013 Seppo Tomperi
 | 
				
			||||||
 * Copyright (c) 2013 - 2014 Pierre-Edouard Lepere
 | 
					 * Copyright (c) 2013-2014 Pierre-Edouard Lepere
 | 
				
			||||||
 | 
					 * Copyright (c) 2023-2024 Wu Jianhua
 | 
				
			||||||
 *
 | 
					 *
 | 
				
			||||||
 * This file is part of FFmpeg.
 | 
					 * This file is part of FFmpeg.
 | 
				
			||||||
 *
 | 
					 *
 | 
				
			||||||
@ -27,6 +28,7 @@
 | 
				
			|||||||
#include "libavutil/x86/cpu.h"
 | 
					#include "libavutil/x86/cpu.h"
 | 
				
			||||||
#include "libavcodec/hevcdsp.h"
 | 
					#include "libavcodec/hevcdsp.h"
 | 
				
			||||||
#include "libavcodec/x86/hevcdsp.h"
 | 
					#include "libavcodec/x86/hevcdsp.h"
 | 
				
			||||||
 | 
					#include "libavcodec/x86/h26x/h2656dsp.h"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define LFC_FUNC(DIR, DEPTH, OPT) \
 | 
					#define LFC_FUNC(DIR, DEPTH, OPT) \
 | 
				
			||||||
void ff_hevc_ ## DIR ## _loop_filter_chroma_ ## DEPTH ## _ ## OPT(uint8_t *pix, ptrdiff_t stride, const int *tc, const uint8_t *no_p, const uint8_t *no_q);
 | 
					void ff_hevc_ ## DIR ## _loop_filter_chroma_ ## DEPTH ## _ ## OPT(uint8_t *pix, ptrdiff_t stride, const int *tc, const uint8_t *no_p, const uint8_t *no_q);
 | 
				
			||||||
@ -83,6 +85,110 @@ void ff_hevc_idct_32x32_10_ ## opt(int16_t *coeffs, int col_limit);
 | 
				
			|||||||
IDCT_FUNCS(sse2)
 | 
					IDCT_FUNCS(sse2)
 | 
				
			||||||
IDCT_FUNCS(avx)
 | 
					IDCT_FUNCS(avx)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/*
					 * Alias so that DECL_HV_FILTER(pel) resolves to the qpel filter table:
					 * the macro below builds the table name as ff_hevc_<kind>_filters.
					 */
					#define ff_hevc_pel_filters ff_hevc_qpel_filters

					/*
					 * Declares two locals in the enclosing function:
					 *   hf = ff_hevc_<kind>_filters[mx - 1]
					 *   vf = ff_hevc_<kind>_filters[my - 1]
					 * `mx` and `my` must already be in scope at the expansion site.
					 * NOTE(review): mx == 0 or my == 0 yields index -1; presumably the callee
					 * never reads the corresponding pointer in that case -- confirm.
					 */
					#define DECL_HV_FILTER(kind)                                    \
					    const uint8_t *hf = ff_hevc_ ## kind ## _filters[mx - 1];   \
					    const uint8_t *vf = ff_hevc_ ## kind ## _filters[my - 1];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/*
					 * Emits ff_hevc_put_hevc_<hevc_fn>_<bits>_<isa>(): a wrapper taking mx/my
					 * that obtains the hf/vf filter pointers through DECL_HV_FILTER(filt) and
					 * forwards every other argument unchanged to
					 * ff_h2656_put_<h2656_fn>_<bits>_<isa>(), which takes hf/vf in place of
					 * mx/my.
					 */
					#define FW_PUT(filt, hevc_fn, h2656_fn, bits, isa)                                  \
					void ff_hevc_put_hevc_ ## hevc_fn ## _ ## bits ## _ ## isa(int16_t *dst,            \
					                                                           const uint8_t *src,      \
					                                                           ptrdiff_t srcstride,     \
					                                                           int height,              \
					                                                           intptr_t mx,             \
					                                                           intptr_t my,             \
					                                                           int width)               \
					{                                                                                   \
					    DECL_HV_FILTER(filt)                                                            \
					    ff_h2656_put_ ## h2656_fn ## _ ## bits ## _ ## isa(dst, src, srcstride, height, \
					                                                       hf, vf, width);              \
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/*
					 * Emits ff_hevc_put_hevc_uni_<hevc_fn>_<bits>_<isa>(): the "uni" counterpart
					 * of FW_PUT. It takes a uint8_t destination with its own stride, obtains
					 * hf/vf through DECL_HV_FILTER(filt) and forwards to
					 * ff_h2656_put_uni_<h2656_fn>_<bits>_<isa>() with hf/vf in place of mx/my.
					 */
					#define FW_PUT_UNI(filt, hevc_fn, h2656_fn, bits, isa)                              \
					void ff_hevc_put_hevc_uni_ ## hevc_fn ## _ ## bits ## _ ## isa(uint8_t *dst,        \
					                                                               ptrdiff_t dststride, \
					                                                               const uint8_t *src,  \
					                                                               ptrdiff_t srcstride, \
					                                                               int height,          \
					                                                               intptr_t mx,         \
					                                                               intptr_t my,         \
					                                                               int width)           \
					{                                                                                   \
					    DECL_HV_FILTER(filt)                                                            \
					    ff_h2656_put_uni_ ## h2656_fn ## _ ## bits ## _ ## isa(dst, dststride,          \
					                                                           src, srcstride, height,  \
					                                                           hf, vf, width);          \
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/*
					 * Forwarding wrappers from the HEVC put/put_uni entry points to the
					 * shared ff_h2656_* assembly. Only instantiated on x86-64 builds that
					 * have external SSE4 assembly.
					 */
					#if ARCH_X86_64 && HAVE_SSE4_EXTERNAL

					/* Instantiate both the put and the put_uni wrapper for one function. */
					#define FW_PUT_FUNCS(p, a, b, depth, opt) \
					    FW_PUT(p, a, b, depth, opt) \
					    FW_PUT_UNI(p, a, b, depth, opt)

					/* Full-pel copy: HEVC name pel_pixels<w> forwards to h2656 name pixels<w>. */
					#define FW_PEL(w, depth, opt) FW_PUT_FUNCS(pel, pel_pixels##w, pixels##w, depth, opt)

					/*
					 * Horizontal-only and vertical-only wrappers:
					 * HEVC <npel>_h<w> / <npel>_v<w> forward to h2656 <n>tap_h<w> / <n>tap_v<w>.
					 */
					#define FW_DIR(npel, n, w, depth, opt) \
					    FW_PUT_FUNCS(npel, npel ## _h##w,  n ## tap_h##w,  depth, opt) \
					    FW_PUT_FUNCS(npel, npel ## _v##w,  n ## tap_v##w,  depth, opt)

					/* Combined wrapper: HEVC <npel>_hv<w> forwards to h2656 <n>tap_hv<w>. */
					#define FW_DIR_HV(npel, n, w, depth, opt) \
					    FW_PUT_FUNCS(npel, npel ## _hv##w,  n ## tap_hv##w,  depth, opt)

					FW_PEL(4,   8, sse4);
					FW_PEL(6,   8, sse4);
					FW_PEL(8,   8, sse4);
					FW_PEL(12,  8, sse4);
					FW_PEL(16,  8, sse4);
					FW_PEL(4,  10, sse4);
					FW_PEL(6,  10, sse4);
					FW_PEL(8,  10, sse4);
					FW_PEL(4,  12, sse4);
					FW_PEL(6,  12, sse4);
					FW_PEL(8,  12, sse4);

					/* epel wrappers map to the 4-tap h2656 functions. */
					#define FW_EPEL(w, depth, opt) FW_DIR(epel, 4, w, depth, opt)
					#define FW_EPEL_HV(w, depth, opt) FW_DIR_HV(epel, 4, w, depth, opt)
					#define FW_EPEL_FUNCS(w, depth, opt) \
					    FW_EPEL(w, depth, opt)           \
					    FW_EPEL_HV(w, depth, opt)

					/* Width 12 at 8 bit gets h/v wrappers only (no hv wrapper here). */
					FW_EPEL(12,  8, sse4);

					FW_EPEL_FUNCS(4,   8, sse4);
					FW_EPEL_FUNCS(6,   8, sse4);
					FW_EPEL_FUNCS(8,   8, sse4);
					FW_EPEL_FUNCS(16,  8, sse4);
					FW_EPEL_FUNCS(4,  10, sse4);
					FW_EPEL_FUNCS(6,  10, sse4);
					FW_EPEL_FUNCS(8,  10, sse4);
					FW_EPEL_FUNCS(4,  12, sse4);
					FW_EPEL_FUNCS(6,  12, sse4);
					FW_EPEL_FUNCS(8,  12, sse4);

					/* qpel wrappers map to the 8-tap h2656 functions. */
					#define FW_QPEL(w, depth, opt) FW_DIR(qpel, 8, w, depth, opt)
					#define FW_QPEL_HV(w, depth, opt) FW_DIR_HV(qpel, 8, w, depth, opt)
					#define FW_QPEL_FUNCS(w, depth, opt) \
					    FW_QPEL(w, depth, opt)           \
					    FW_QPEL_HV(w, depth, opt)

					/* Widths 12 and 16 at 8 bit get h/v wrappers only (no hv wrapper here). */
					FW_QPEL(12, 8, sse4);
					FW_QPEL(16, 8, sse4);

					FW_QPEL_FUNCS(4,   8, sse4);
					FW_QPEL_FUNCS(8,   8, sse4);
					FW_QPEL_FUNCS(4,  10, sse4);
					FW_QPEL_FUNCS(8,  10, sse4);
					FW_QPEL_FUNCS(4,  12, sse4);
					FW_QPEL_FUNCS(8,  12, sse4);

					/*
					 * AVX2 wrappers. HAVE_* feature macros are tested by value elsewhere in
					 * this change (#if ... HAVE_SSE4_EXTERNAL above, %if HAVE_AVX2_EXTERNAL in
					 * the assembly), so this guard must use #if as well: #ifdef is true for a
					 * macro defined as 0 and would emit wrappers that reference AVX2 symbols
					 * even when the AVX2 assembly is not built.
					 */
					#if HAVE_AVX2_EXTERNAL

					FW_PEL(32,  8, avx2);
					/* 10-bit width 16: put wrapper only, no put_uni wrapper. */
					FW_PUT(pel, pel_pixels16, pixels16, 10, avx2);

					FW_EPEL(32,  8, avx2);
					FW_EPEL(16, 10, avx2);

					FW_EPEL_HV(32,  8, avx2);
					FW_EPEL_HV(16, 10, avx2);

					FW_QPEL(32,  8, avx2);
					FW_QPEL(16, 10, avx2);

					FW_QPEL_HV(16, 10, avx2);

					#endif /* HAVE_AVX2_EXTERNAL */
					#endif /* ARCH_X86_64 && HAVE_SSE4_EXTERNAL */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define mc_rep_func(name, bitd, step, W, opt) \
 | 
					#define mc_rep_func(name, bitd, step, W, opt) \
 | 
				
			||||||
void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *_dst,                                                 \
 | 
					void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *_dst,                                                 \
 | 
				
			||||||
                                                 const uint8_t *_src, ptrdiff_t _srcstride, int height,         \
 | 
					                                                 const uint8_t *_src, ptrdiff_t _srcstride, int height,         \
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user