avcodec/x86/hevc_mc: move put/put_uni to h26x/h2656_inter.asm

This enables the asm optimizations to be reused by VVC.

Signed-off-by: Wu Jianhua <toqsxw@outlook.com>
Author: Wu Jianhua, 2024-01-24 02:17:06 +08:00 (committed by Nuo Mi)
parent 04c2e246a3
commit 7d9f1f5485
6 changed files with 1471 additions and 446 deletions
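The shared kernels are exported with a codec-agnostic signature: instead of HEVC's fractional mx/my positions they take the horizontal and vertical filter coefficient arrays directly, so another H.26x decoder only needs to pass its own filter tables. As a rough sketch, compare one pair of prototypes taken from the diffs below (sse4, 8-bit, width 4 chosen as an example):

    /* shared kernel, exported from h2656dsp.h */
    void ff_h2656_put_8tap_h4_8_sse4(int16_t *dst, const uint8_t *src, ptrdiff_t srcstride,
                                     int height, const int8_t *hf, const int8_t *vf, int width);

    /* HEVC keeps its existing entry point as a thin wrapper (see the FW_PUT macros in hevcdsp_init.c) */
    void ff_hevc_put_hevc_qpel_h4_8_sse4(int16_t *dst, const uint8_t *src, ptrdiff_t srcstride,
                                         int height, intptr_t mx, intptr_t my, int width);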

libavcodec/x86/Makefile

@@ -167,6 +167,7 @@ X86ASM-OBJS-$(CONFIG_HEVC_DECODER) += x86/hevc_add_res.o \
x86/hevc_deblock.o \
x86/hevc_idct.o \
x86/hevc_mc.o \
x86/h26x/h2656_inter.o \
x86/hevc_sao.o \
x86/hevc_sao_10bit.o
X86ASM-OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp.o

libavcodec/x86/h26x/h2656_inter.asm (new file): diff suppressed because it is too large.

libavcodec/x86/h26x/h2656dsp.c

@@ -0,0 +1,98 @@
/*
* DSP for HEVC/VVC
*
* Copyright (C) 2022-2024 Nuo Mi
* Copyright (c) 2023-2024 Wu Jianhua
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "h2656dsp.h"
#define mc_rep_func(name, bitd, step, W, opt) \
void ff_h2656_put_##name##W##_##bitd##_##opt(int16_t *_dst, \
const uint8_t *_src, ptrdiff_t _srcstride, int height, const int8_t *hf, const int8_t *vf, int width) \
{ \
int i; \
int16_t *dst; \
for (i = 0; i < W; i += step) { \
const uint8_t *src = _src + (i * ((bitd + 7) / 8)); \
dst = _dst + i; \
ff_h2656_put_##name##step##_##bitd##_##opt(dst, src, _srcstride, height, hf, vf, width); \
} \
}
#define mc_rep_uni_func(name, bitd, step, W, opt) \
void ff_h2656_put_uni_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, \
const uint8_t *_src, ptrdiff_t _srcstride, int height, const int8_t *hf, const int8_t *vf, int width) \
{ \
int i; \
uint8_t *dst; \
for (i = 0; i < W; i += step) { \
const uint8_t *src = _src + (i * ((bitd + 7) / 8)); \
dst = _dst + (i * ((bitd + 7) / 8)); \
ff_h2656_put_uni_##name##step##_##bitd##_##opt(dst, dststride, src, _srcstride, \
height, hf, vf, width); \
} \
}
#define mc_rep_funcs(name, bitd, step, W, opt) \
mc_rep_func(name, bitd, step, W, opt) \
mc_rep_uni_func(name, bitd, step, W, opt)
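/*
 * The mc_rep_* macros synthesize the wide-block variants from the widest
 * native asm kernel. For example, mc_rep_funcs(8tap_h, 8, 16, 32, sse4)
 * defines ff_h2656_put_8tap_h32_8_sse4() and ff_h2656_put_uni_8tap_h32_8_sse4(),
 * each of which calls the corresponding 16-wide sse4 kernel twice per row block,
 * advancing the source (and, for the uni variant, the byte destination) by
 * i * bytes-per-sample and the int16_t intermediate destination by i samples.
 */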
#define MC_REP_FUNCS_SSE4(fname) \
mc_rep_funcs(fname, 8, 16,128, sse4) \
mc_rep_funcs(fname, 8, 16, 64, sse4) \
mc_rep_funcs(fname, 8, 16, 32, sse4) \
mc_rep_funcs(fname, 10, 8,128, sse4) \
mc_rep_funcs(fname, 10, 8, 64, sse4) \
mc_rep_funcs(fname, 10, 8, 32, sse4) \
mc_rep_funcs(fname, 10, 8, 16, sse4) \
mc_rep_funcs(fname, 12, 8,128, sse4) \
mc_rep_funcs(fname, 12, 8, 64, sse4) \
mc_rep_funcs(fname, 12, 8, 32, sse4) \
mc_rep_funcs(fname, 12, 8, 16, sse4)

MC_REP_FUNCS_SSE4(pixels)
MC_REP_FUNCS_SSE4(4tap_h)
MC_REP_FUNCS_SSE4(4tap_v)
MC_REP_FUNCS_SSE4(4tap_hv)
MC_REP_FUNCS_SSE4(8tap_h)
MC_REP_FUNCS_SSE4(8tap_v)
MC_REP_FUNCS_SSE4(8tap_hv)
mc_rep_funcs(8tap_hv, 8, 8, 16, sse4)
#if HAVE_AVX2_EXTERNAL
#define MC_REP_FUNCS_AVX2(fname) \
mc_rep_funcs(fname, 8, 32, 64, avx2) \
mc_rep_funcs(fname, 8, 32,128, avx2) \
mc_rep_funcs(fname,10, 16, 32, avx2) \
mc_rep_funcs(fname,10, 16, 64, avx2) \
mc_rep_funcs(fname,10, 16,128, avx2) \
mc_rep_funcs(fname,12, 16, 32, avx2) \
mc_rep_funcs(fname,12, 16, 64, avx2) \
mc_rep_funcs(fname,12, 16,128, avx2)

MC_REP_FUNCS_AVX2(pixels)
MC_REP_FUNCS_AVX2(8tap_h)
MC_REP_FUNCS_AVX2(8tap_v)
MC_REP_FUNCS_AVX2(8tap_hv)
MC_REP_FUNCS_AVX2(4tap_h)
MC_REP_FUNCS_AVX2(4tap_v)
MC_REP_FUNCS_AVX2(4tap_hv)
#endif

libavcodec/x86/h26x/h2656dsp.h

@@ -0,0 +1,103 @@
/*
* DSP for HEVC/VVC
*
* Copyright (C) 2022-2024 Nuo Mi
* Copyright (c) 2023-2024 Wu Jianhua
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_H26X_H2656DSP_H
#define AVCODEC_X86_H26X_H2656DSP_H
#include "config.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include <stdlib.h>
#define H2656_PEL_PROTOTYPE(name, D, opt) \
void ff_h2656_put_ ## name ## _ ## D ## _##opt(int16_t *dst, const uint8_t *_src, ptrdiff_t _srcstride, int height, const int8_t *hf, const int8_t *vf, int width); \
void ff_h2656_put_uni_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, ptrdiff_t _srcstride, int height, const int8_t *hf, const int8_t *vf, int width);

#define H2656_MC_8TAP_PROTOTYPES(fname, bitd, opt) \
H2656_PEL_PROTOTYPE(fname##4, bitd, opt); \
H2656_PEL_PROTOTYPE(fname##6, bitd, opt); \
H2656_PEL_PROTOTYPE(fname##8, bitd, opt); \
H2656_PEL_PROTOTYPE(fname##12, bitd, opt); \
H2656_PEL_PROTOTYPE(fname##16, bitd, opt); \
H2656_PEL_PROTOTYPE(fname##32, bitd, opt); \
H2656_PEL_PROTOTYPE(fname##64, bitd, opt); \
H2656_PEL_PROTOTYPE(fname##128, bitd, opt)
H2656_MC_8TAP_PROTOTYPES(pixels , 8, sse4);
H2656_MC_8TAP_PROTOTYPES(pixels , 10, sse4);
H2656_MC_8TAP_PROTOTYPES(pixels , 12, sse4);
H2656_MC_8TAP_PROTOTYPES(8tap_h , 8, sse4);
H2656_MC_8TAP_PROTOTYPES(8tap_h , 10, sse4);
H2656_MC_8TAP_PROTOTYPES(8tap_h , 12, sse4);
H2656_MC_8TAP_PROTOTYPES(8tap_v , 8, sse4);
H2656_MC_8TAP_PROTOTYPES(8tap_v , 10, sse4);
H2656_MC_8TAP_PROTOTYPES(8tap_v , 12, sse4);
H2656_MC_8TAP_PROTOTYPES(8tap_hv , 8, sse4);
H2656_MC_8TAP_PROTOTYPES(8tap_hv , 10, sse4);
H2656_MC_8TAP_PROTOTYPES(8tap_hv , 12, sse4);
#define H2656_MC_4TAP_PROTOTYPES(fname, bitd, opt) \
H2656_PEL_PROTOTYPE(fname##2, bitd, opt); \
H2656_PEL_PROTOTYPE(fname##4, bitd, opt); \
H2656_PEL_PROTOTYPE(fname##6, bitd, opt); \
H2656_PEL_PROTOTYPE(fname##8, bitd, opt); \
H2656_PEL_PROTOTYPE(fname##12, bitd, opt); \
H2656_PEL_PROTOTYPE(fname##16, bitd, opt); \
H2656_PEL_PROTOTYPE(fname##32, bitd, opt); \
H2656_PEL_PROTOTYPE(fname##64, bitd, opt); \
H2656_PEL_PROTOTYPE(fname##128, bitd, opt)
#define H2656_MC_4TAP_PROTOTYPES_SSE4(bitd) \
H2656_PEL_PROTOTYPE(pixels2, bitd, sse4); \
H2656_MC_4TAP_PROTOTYPES(4tap_h, bitd, sse4); \
H2656_MC_4TAP_PROTOTYPES(4tap_v, bitd, sse4); \
H2656_MC_4TAP_PROTOTYPES(4tap_hv, bitd, sse4);

H2656_MC_4TAP_PROTOTYPES_SSE4(8)
H2656_MC_4TAP_PROTOTYPES_SSE4(10)
H2656_MC_4TAP_PROTOTYPES_SSE4(12)
#define H2656_MC_8TAP_PROTOTYPES_AVX2(fname) \
H2656_PEL_PROTOTYPE(fname##32 , 8, avx2); \
H2656_PEL_PROTOTYPE(fname##64 , 8, avx2); \
H2656_PEL_PROTOTYPE(fname##128, 8, avx2); \
H2656_PEL_PROTOTYPE(fname##16 ,10, avx2); \
H2656_PEL_PROTOTYPE(fname##32 ,10, avx2); \
H2656_PEL_PROTOTYPE(fname##64 ,10, avx2); \
H2656_PEL_PROTOTYPE(fname##128,10, avx2); \
H2656_PEL_PROTOTYPE(fname##16 ,12, avx2); \
H2656_PEL_PROTOTYPE(fname##32 ,12, avx2); \
H2656_PEL_PROTOTYPE(fname##64 ,12, avx2); \
H2656_PEL_PROTOTYPE(fname##128,12, avx2)

H2656_MC_8TAP_PROTOTYPES_AVX2(pixels);
H2656_MC_8TAP_PROTOTYPES_AVX2(8tap_h);
H2656_MC_8TAP_PROTOTYPES_AVX2(8tap_v);
H2656_MC_8TAP_PROTOTYPES_AVX2(8tap_hv);
H2656_PEL_PROTOTYPE(8tap_hv16, 8, avx2);
H2656_MC_8TAP_PROTOTYPES_AVX2(4tap_h);
H2656_MC_8TAP_PROTOTYPES_AVX2(4tap_v);
H2656_MC_8TAP_PROTOTYPES_AVX2(4tap_hv);
#endif

libavcodec/x86/hevc_mc.asm

@@ -715,35 +715,6 @@ SECTION .text
; int height, int mx, int my)
; ******************************
%macro HEVC_PUT_HEVC_PEL_PIXELS 2
HEVC_PEL_PIXELS %1, %2
HEVC_UNI_PEL_PIXELS %1, %2
HEVC_BI_PEL_PIXELS %1, %2
%endmacro
%macro HEVC_PEL_PIXELS 2
cglobal hevc_put_hevc_pel_pixels%1_%2, 4, 4, 3, dst, src, srcstride,height
pxor m2, m2
.loop:
SIMPLE_LOAD %1, %2, srcq, m0
MC_PIXEL_COMPUTE %1, %2, 1
PEL_10STORE%1 dstq, m0, m1
LOOP_END dst, src, srcstride
RET
%endmacro
%macro HEVC_UNI_PEL_PIXELS 2
cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 2, dst, dststride, src, srcstride,height
.loop:
SIMPLE_LOAD %1, %2, srcq, m0
PEL_%2STORE%1 dstq, m0, m1
add dstq, dststrideq ; dst += dststride
add srcq, srcstrideq ; src += srcstride
dec heightd ; cmp height
jnz .loop ; height loop
RET
%endmacro
%macro HEVC_BI_PEL_PIXELS 2
cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstride, src2, height
pxor m2, m2
@@ -777,32 +748,8 @@ cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstrid
%define XMM_REGS 8
%endif
cglobal hevc_put_hevc_epel_h%1_%2, 5, 6, XMM_REGS, dst, src, srcstride, height, mx, rfilter
%assign %%stride ((%2 + 7)/8)
EPEL_FILTER %2, mx, m4, m5, rfilter
.loop:
EPEL_LOAD %2, srcq-%%stride, %%stride, %1
EPEL_COMPUTE %2, %1, m4, m5, 1
PEL_10STORE%1 dstq, m0, m1
LOOP_END dst, src, srcstride
RET
cglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 7, XMM_REGS, dst, dststride, src, srcstride, height, mx, rfilter
%assign %%stride ((%2 + 7)/8)
movdqa m6, [pw_%2]
EPEL_FILTER %2, mx, m4, m5, rfilter
.loop:
EPEL_LOAD %2, srcq-%%stride, %%stride, %1
EPEL_COMPUTE %2, %1, m4, m5
UNI_COMPUTE %1, %2, m0, m1, m6
PEL_%2STORE%1 dstq, m0, m1
add dstq, dststrideq ; dst += dststride
add srcq, srcstrideq ; src += srcstride
dec heightd ; cmp height
jnz .loop ; height loop
RET
cglobal hevc_put_hevc_bi_epel_h%1_%2, 7, 8, XMM_REGS, dst, dststride, src, srcstride, src2, height, mx, rfilter
%assign %%stride ((%2 + 7)/8)
movdqa m6, [pw_bi_%2]
EPEL_FILTER %2, mx, m4, m5, rfilter
.loop:
@@ -824,36 +771,6 @@ cglobal hevc_put_hevc_bi_epel_h%1_%2, 7, 8, XMM_REGS, dst, dststride, src, srcst
; int height, int mx, int my, int width)
; ******************************
cglobal hevc_put_hevc_epel_v%1_%2, 4, 6, XMM_REGS, dst, src, srcstride, height, r3src, my
movifnidn myd, mym
sub srcq, srcstrideq
EPEL_FILTER %2, my, m4, m5, r3src
lea r3srcq, [srcstrideq*3]
.loop:
EPEL_LOAD %2, srcq, srcstride, %1
EPEL_COMPUTE %2, %1, m4, m5, 1
PEL_10STORE%1 dstq, m0, m1
LOOP_END dst, src, srcstride
RET
cglobal hevc_put_hevc_uni_epel_v%1_%2, 5, 7, XMM_REGS, dst, dststride, src, srcstride, height, r3src, my
movifnidn myd, mym
movdqa m6, [pw_%2]
sub srcq, srcstrideq
EPEL_FILTER %2, my, m4, m5, r3src
lea r3srcq, [srcstrideq*3]
.loop:
EPEL_LOAD %2, srcq, srcstride, %1
EPEL_COMPUTE %2, %1, m4, m5
UNI_COMPUTE %1, %2, m0, m1, m6
PEL_%2STORE%1 dstq, m0, m1
add dstq, dststrideq ; dst += dststride
add srcq, srcstrideq ; src += srcstride
dec heightd ; cmp height
jnz .loop ; height loop
RET
cglobal hevc_put_hevc_bi_epel_v%1_%2, 6, 8, XMM_REGS, dst, dststride, src, srcstride, src2, height, r3src, my
movifnidn myd, mym
movdqa m6, [pw_bi_%2]
@@ -882,135 +799,6 @@ cglobal hevc_put_hevc_bi_epel_v%1_%2, 6, 8, XMM_REGS, dst, dststride, src, srcst
; ******************************
%macro HEVC_PUT_HEVC_EPEL_HV 2
cglobal hevc_put_hevc_epel_hv%1_%2, 6, 7, 16 , dst, src, srcstride, height, mx, my, r3src
%assign %%stride ((%2 + 7)/8)
sub srcq, srcstrideq
EPEL_HV_FILTER %2
EPEL_LOAD %2, srcq-%%stride, %%stride, %1
EPEL_COMPUTE %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
SWAP m8, m1
%endif
SWAP m4, m0
add srcq, srcstrideq
EPEL_LOAD %2, srcq-%%stride, %%stride, %1
EPEL_COMPUTE %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
SWAP m9, m1
%endif
SWAP m5, m0
add srcq, srcstrideq
EPEL_LOAD %2, srcq-%%stride, %%stride, %1
EPEL_COMPUTE %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
SWAP m10, m1
%endif
SWAP m6, m0
add srcq, srcstrideq
.loop:
EPEL_LOAD %2, srcq-%%stride, %%stride, %1
EPEL_COMPUTE %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
SWAP m11, m1
%endif
SWAP m7, m0
punpcklwd m0, m4, m5
punpcklwd m2, m6, m7
%if %1 > 4
punpckhwd m1, m4, m5
punpckhwd m3, m6, m7
%endif
EPEL_COMPUTE 14, %1, m12, m13
%if (%1 > 8 && (%2 == 8))
punpcklwd m4, m8, m9
punpcklwd m2, m10, m11
punpckhwd m8, m8, m9
punpckhwd m3, m10, m11
EPEL_COMPUTE 14, %1, m12, m13, m4, m2, m8, m3
%if cpuflag(avx2)
vinserti128 m2, m0, xm4, 1
vperm2i128 m3, m0, m4, q0301
PEL_10STORE%1 dstq, m2, m3
%else
PEL_10STORE%1 dstq, m0, m4
%endif
%else
PEL_10STORE%1 dstq, m0, m1
%endif
movdqa m4, m5
movdqa m5, m6
movdqa m6, m7
%if (%1 > 8 && (%2 == 8))
mova m8, m9
mova m9, m10
mova m10, m11
%endif
LOOP_END dst, src, srcstride
RET
cglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 8, 16 , dst, dststride, src, srcstride, height, mx, my, r3src
%assign %%stride ((%2 + 7)/8)
sub srcq, srcstrideq
EPEL_HV_FILTER %2
EPEL_LOAD %2, srcq-%%stride, %%stride, %1
EPEL_COMPUTE %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
SWAP m8, m1
%endif
SWAP m4, m0
add srcq, srcstrideq
EPEL_LOAD %2, srcq-%%stride, %%stride, %1
EPEL_COMPUTE %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
SWAP m9, m1
%endif
SWAP m5, m0
add srcq, srcstrideq
EPEL_LOAD %2, srcq-%%stride, %%stride, %1
EPEL_COMPUTE %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
SWAP m10, m1
%endif
SWAP m6, m0
add srcq, srcstrideq
.loop:
EPEL_LOAD %2, srcq-%%stride, %%stride, %1
EPEL_COMPUTE %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
SWAP m11, m1
%endif
mova m7, m0
punpcklwd m0, m4, m5
punpcklwd m2, m6, m7
%if %1 > 4
punpckhwd m1, m4, m5
punpckhwd m3, m6, m7
%endif
EPEL_COMPUTE 14, %1, m12, m13
%if (%1 > 8 && (%2 == 8))
punpcklwd m4, m8, m9
punpcklwd m2, m10, m11
punpckhwd m8, m8, m9
punpckhwd m3, m10, m11
EPEL_COMPUTE 14, %1, m12, m13, m4, m2, m8, m3
UNI_COMPUTE %1, %2, m0, m4, [pw_%2]
%else
UNI_COMPUTE %1, %2, m0, m1, [pw_%2]
%endif
PEL_%2STORE%1 dstq, m0, m1
mova m4, m5
mova m5, m6
mova m6, m7
%if (%1 > 8 && (%2 == 8))
mova m8, m9
mova m9, m10
mova m10, m11
%endif
add dstq, dststrideq ; dst += dststride
add srcq, srcstrideq ; src += srcstride
dec heightd ; cmp height
jnz .loop ; height loop
RET
cglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 9, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src
%assign %%stride ((%2 + 7)/8)
@@ -1093,34 +881,6 @@ cglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 9, 16, dst, dststride, src, srcstride,
; ******************************
%macro HEVC_PUT_HEVC_QPEL 2
cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 16, dst, src, srcstride, height, mx, rfilter
QPEL_FILTER %2, mx
.loop:
QPEL_H_LOAD %2, srcq, %1, 10
QPEL_COMPUTE %1, %2, 1
%if %2 > 8
packssdw m0, m1
%endif
PEL_10STORE%1 dstq, m0, m1
LOOP_END dst, src, srcstride
RET
cglobal hevc_put_hevc_uni_qpel_h%1_%2, 6, 7, 16 , dst, dststride, src, srcstride, height, mx, rfilter
mova m9, [pw_%2]
QPEL_FILTER %2, mx
.loop:
QPEL_H_LOAD %2, srcq, %1, 10
QPEL_COMPUTE %1, %2
%if %2 > 8
packssdw m0, m1
%endif
UNI_COMPUTE %1, %2, m0, m1, m9
PEL_%2STORE%1 dstq, m0, m1
add dstq, dststrideq ; dst += dststride
add srcq, srcstrideq ; src += srcstride
dec heightd ; cmp height
jnz .loop ; height loop
RET
cglobal hevc_put_hevc_bi_qpel_h%1_%2, 7, 8, 16 , dst, dststride, src, srcstride, src2, height, mx, rfilter
movdqa m9, [pw_bi_%2]
@@ -1148,38 +908,6 @@ cglobal hevc_put_hevc_bi_qpel_h%1_%2, 7, 8, 16 , dst, dststride, src, srcstride,
; int height, int mx, int my, int width)
; ******************************
cglobal hevc_put_hevc_qpel_v%1_%2, 4, 8, 16, dst, src, srcstride, height, r3src, my, rfilter
movifnidn myd, mym
lea r3srcq, [srcstrideq*3]
QPEL_FILTER %2, my
.loop:
QPEL_V_LOAD %2, srcq, srcstride, %1, r7
QPEL_COMPUTE %1, %2, 1
%if %2 > 8
packssdw m0, m1
%endif
PEL_10STORE%1 dstq, m0, m1
LOOP_END dst, src, srcstride
RET
cglobal hevc_put_hevc_uni_qpel_v%1_%2, 5, 9, 16, dst, dststride, src, srcstride, height, r3src, my, rfilter
movifnidn myd, mym
movdqa m9, [pw_%2]
lea r3srcq, [srcstrideq*3]
QPEL_FILTER %2, my
.loop:
QPEL_V_LOAD %2, srcq, srcstride, %1, r8
QPEL_COMPUTE %1, %2
%if %2 > 8
packssdw m0, m1
%endif
UNI_COMPUTE %1, %2, m0, m1, m9
PEL_%2STORE%1 dstq, m0, m1
add dstq, dststrideq ; dst += dststride
add srcq, srcstrideq ; src += srcstride
dec heightd ; cmp height
jnz .loop ; height loop
RET
cglobal hevc_put_hevc_bi_qpel_v%1_%2, 6, 10, 16, dst, dststride, src, srcstride, src2, height, r3src, my, rfilter
movifnidn myd, mym
@@ -1210,162 +938,6 @@ cglobal hevc_put_hevc_bi_qpel_v%1_%2, 6, 10, 16, dst, dststride, src, srcstride,
; int height, int mx, int my)
; ******************************
%macro HEVC_PUT_HEVC_QPEL_HV 2
cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 8, 16, dst, src, srcstride, height, mx, my, r3src, rfilter
%if cpuflag(avx2)
%assign %%shift 4
%else
%assign %%shift 3
%endif
sub mxq, 1
sub myq, 1
shl mxq, %%shift ; multiply by 32
shl myq, %%shift ; multiply by 32
lea r3srcq, [srcstrideq*3]
sub srcq, r3srcq
QPEL_H_LOAD %2, srcq, %1, 15
QPEL_HV_COMPUTE %1, %2, mx, ackssdw
SWAP m8, m0
add srcq, srcstrideq
QPEL_H_LOAD %2, srcq, %1, 15
QPEL_HV_COMPUTE %1, %2, mx, ackssdw
SWAP m9, m0
add srcq, srcstrideq
QPEL_H_LOAD %2, srcq, %1, 15
QPEL_HV_COMPUTE %1, %2, mx, ackssdw
SWAP m10, m0
add srcq, srcstrideq
QPEL_H_LOAD %2, srcq, %1, 15
QPEL_HV_COMPUTE %1, %2, mx, ackssdw
SWAP m11, m0
add srcq, srcstrideq
QPEL_H_LOAD %2, srcq, %1, 15
QPEL_HV_COMPUTE %1, %2, mx, ackssdw
SWAP m12, m0
add srcq, srcstrideq
QPEL_H_LOAD %2, srcq, %1, 15
QPEL_HV_COMPUTE %1, %2, mx, ackssdw
SWAP m13, m0
add srcq, srcstrideq
QPEL_H_LOAD %2, srcq, %1, 15
QPEL_HV_COMPUTE %1, %2, mx, ackssdw
SWAP m14, m0
add srcq, srcstrideq
.loop:
QPEL_H_LOAD %2, srcq, %1, 15
QPEL_HV_COMPUTE %1, %2, mx, ackssdw
SWAP m15, m0
punpcklwd m0, m8, m9
punpcklwd m2, m10, m11
punpcklwd m4, m12, m13
punpcklwd m6, m14, m15
%if %1 > 4
punpckhwd m1, m8, m9
punpckhwd m3, m10, m11
punpckhwd m5, m12, m13
punpckhwd m7, m14, m15
%endif
QPEL_HV_COMPUTE %1, 14, my, ackssdw
PEL_10STORE%1 dstq, m0, m1
%if %1 <= 4
movq m8, m9
movq m9, m10
movq m10, m11
movq m11, m12
movq m12, m13
movq m13, m14
movq m14, m15
%else
movdqa m8, m9
movdqa m9, m10
movdqa m10, m11
movdqa m11, m12
movdqa m12, m13
movdqa m13, m14
movdqa m14, m15
%endif
LOOP_END dst, src, srcstride
RET
cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 16 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
%if cpuflag(avx2)
%assign %%shift 4
%else
%assign %%shift 3
%endif
sub mxq, 1
sub myq, 1
shl mxq, %%shift ; multiply by 32
shl myq, %%shift ; multiply by 32
lea r3srcq, [srcstrideq*3]
sub srcq, r3srcq
QPEL_H_LOAD %2, srcq, %1, 15
QPEL_HV_COMPUTE %1, %2, mx, ackssdw
SWAP m8, m0
add srcq, srcstrideq
QPEL_H_LOAD %2, srcq, %1, 15
QPEL_HV_COMPUTE %1, %2, mx, ackssdw
SWAP m9, m0
add srcq, srcstrideq
QPEL_H_LOAD %2, srcq, %1, 15
QPEL_HV_COMPUTE %1, %2, mx, ackssdw
SWAP m10, m0
add srcq, srcstrideq
QPEL_H_LOAD %2, srcq, %1, 15
QPEL_HV_COMPUTE %1, %2, mx, ackssdw
SWAP m11, m0
add srcq, srcstrideq
QPEL_H_LOAD %2, srcq, %1, 15
QPEL_HV_COMPUTE %1, %2, mx, ackssdw
SWAP m12, m0
add srcq, srcstrideq
QPEL_H_LOAD %2, srcq, %1, 15
QPEL_HV_COMPUTE %1, %2, mx, ackssdw
SWAP m13, m0
add srcq, srcstrideq
QPEL_H_LOAD %2, srcq, %1, 15
QPEL_HV_COMPUTE %1, %2, mx, ackssdw
SWAP m14, m0
add srcq, srcstrideq
.loop:
QPEL_H_LOAD %2, srcq, %1, 15
QPEL_HV_COMPUTE %1, %2, mx, ackssdw
SWAP m15, m0
punpcklwd m0, m8, m9
punpcklwd m2, m10, m11
punpcklwd m4, m12, m13
punpcklwd m6, m14, m15
%if %1 > 4
punpckhwd m1, m8, m9
punpckhwd m3, m10, m11
punpckhwd m5, m12, m13
punpckhwd m7, m14, m15
%endif
QPEL_HV_COMPUTE %1, 14, my, ackusdw
UNI_COMPUTE %1, %2, m0, m1, [pw_%2]
PEL_%2STORE%1 dstq, m0, m1
%if %1 <= 4
movq m8, m9
movq m9, m10
movq m10, m11
movq m11, m12
movq m12, m13
movq m13, m14
movq m14, m15
%else
mova m8, m9
mova m9, m10
mova m10, m11
mova m11, m12
mova m12, m13
mova m13, m14
mova m14, m15
%endif
add dstq, dststrideq ; dst += dststride
add srcq, srcstrideq ; src += srcstride
dec heightd ; cmp height
jnz .loop ; height loop
RET
cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter
%if cpuflag(avx2)
@@ -1613,22 +1185,22 @@ WEIGHTING_FUNCS 4, 12
WEIGHTING_FUNCS 6, 12
WEIGHTING_FUNCS 8, 12
HEVC_PUT_HEVC_PEL_PIXELS 2, 8
HEVC_PUT_HEVC_PEL_PIXELS 4, 8
HEVC_PUT_HEVC_PEL_PIXELS 6, 8
HEVC_PUT_HEVC_PEL_PIXELS 8, 8
HEVC_PUT_HEVC_PEL_PIXELS 12, 8
HEVC_PUT_HEVC_PEL_PIXELS 16, 8
HEVC_BI_PEL_PIXELS 2, 8
HEVC_BI_PEL_PIXELS 4, 8
HEVC_BI_PEL_PIXELS 6, 8
HEVC_BI_PEL_PIXELS 8, 8
HEVC_BI_PEL_PIXELS 12, 8
HEVC_BI_PEL_PIXELS 16, 8
HEVC_PUT_HEVC_PEL_PIXELS 2, 10
HEVC_PUT_HEVC_PEL_PIXELS 4, 10
HEVC_PUT_HEVC_PEL_PIXELS 6, 10
HEVC_PUT_HEVC_PEL_PIXELS 8, 10
HEVC_BI_PEL_PIXELS 2, 10
HEVC_BI_PEL_PIXELS 4, 10
HEVC_BI_PEL_PIXELS 6, 10
HEVC_BI_PEL_PIXELS 8, 10
HEVC_PUT_HEVC_PEL_PIXELS 2, 12
HEVC_PUT_HEVC_PEL_PIXELS 4, 12
HEVC_PUT_HEVC_PEL_PIXELS 6, 12
HEVC_PUT_HEVC_PEL_PIXELS 8, 12
HEVC_BI_PEL_PIXELS 2, 12
HEVC_BI_PEL_PIXELS 4, 12
HEVC_BI_PEL_PIXELS 6, 12
HEVC_BI_PEL_PIXELS 8, 12
HEVC_PUT_HEVC_EPEL 2, 8
HEVC_PUT_HEVC_EPEL 4, 8
@@ -1693,8 +1265,8 @@ HEVC_PUT_HEVC_QPEL_HV 8, 12
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2 ; adds ff_ and _avx2 to function name & enables 256b registers : m0 for 256b, xm0 for 128b. cpuflag(avx2) = 1 / notcpuflag(avx) = 0
HEVC_PUT_HEVC_PEL_PIXELS 32, 8
HEVC_PUT_HEVC_PEL_PIXELS 16, 10
HEVC_BI_PEL_PIXELS 32, 8
HEVC_BI_PEL_PIXELS 16, 10
HEVC_PUT_HEVC_EPEL 32, 8
HEVC_PUT_HEVC_EPEL 16, 10

libavcodec/x86/hevcdsp_init.c

@@ -1,6 +1,7 @@
/*
* Copyright (c) 2013 Seppo Tomperi
* Copyright (c) 2013 - 2014 Pierre-Edouard Lepere
* Copyright (c) 2013-2014 Pierre-Edouard Lepere
* Copyright (c) 2023-2024 Wu Jianhua
*
* This file is part of FFmpeg.
*
@@ -27,6 +28,7 @@
#include "libavutil/x86/cpu.h"
#include "libavcodec/hevcdsp.h"
#include "libavcodec/x86/hevcdsp.h"
#include "libavcodec/x86/h26x/h2656dsp.h"
#define LFC_FUNC(DIR, DEPTH, OPT) \
void ff_hevc_ ## DIR ## _loop_filter_chroma_ ## DEPTH ## _ ## OPT(uint8_t *pix, ptrdiff_t stride, const int *tc, const uint8_t *no_p, const uint8_t *no_q);
@@ -83,6 +85,110 @@ void ff_hevc_idct_32x32_10_ ## opt(int16_t *coeffs, int col_limit);
IDCT_FUNCS(sse2)
IDCT_FUNCS(avx)
#define ff_hevc_pel_filters ff_hevc_qpel_filters
#define DECL_HV_FILTER(f) \
const uint8_t *hf = ff_hevc_ ## f ## _filters[mx - 1]; \
const uint8_t *vf = ff_hevc_ ## f ## _filters[my - 1];
#define FW_PUT(p, a, b, depth, opt) \
void ff_hevc_put_hevc_ ## a ## _ ## depth ## _##opt(int16_t *dst, const uint8_t *src, ptrdiff_t srcstride, \
int height, intptr_t mx, intptr_t my,int width) \
{ \
DECL_HV_FILTER(p) \
ff_h2656_put_ ## b ## _ ## depth ## _##opt(dst, src, srcstride, height, hf, vf, width); \
}
#define FW_PUT_UNI(p, a, b, depth, opt) \
void ff_hevc_put_hevc_uni_ ## a ## _ ## depth ## _##opt(uint8_t *dst, ptrdiff_t dststride, \
const uint8_t *src, ptrdiff_t srcstride, \
int height, intptr_t mx, intptr_t my, int width) \
{ \
DECL_HV_FILTER(p) \
ff_h2656_put_uni_ ## b ## _ ## depth ## _##opt(dst, dststride, src, srcstride, height, hf, vf, width); \
}
#if ARCH_X86_64 && HAVE_SSE4_EXTERNAL
#define FW_PUT_FUNCS(p, a, b, depth, opt) \
FW_PUT(p, a, b, depth, opt) \
FW_PUT_UNI(p, a, b, depth, opt)
#define FW_PEL(w, depth, opt) FW_PUT_FUNCS(pel, pel_pixels##w, pixels##w, depth, opt)
#define FW_DIR(npel, n, w, depth, opt) \
FW_PUT_FUNCS(npel, npel ## _h##w, n ## tap_h##w, depth, opt) \
FW_PUT_FUNCS(npel, npel ## _v##w, n ## tap_v##w, depth, opt)
#define FW_DIR_HV(npel, n, w, depth, opt) \
FW_PUT_FUNCS(npel, npel ## _hv##w, n ## tap_hv##w, depth, opt)
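/*
 * Each FW_* line below instantiates forwarding wrappers from the old HEVC
 * entry points to the shared h2656 kernels. As an illustration,
 * FW_QPEL(4, 8, sse4) expands to roughly:
 *
 *     void ff_hevc_put_hevc_qpel_h4_8_sse4(int16_t *dst, const uint8_t *src,
 *                                          ptrdiff_t srcstride, int height,
 *                                          intptr_t mx, intptr_t my, int width)
 *     {
 *         const uint8_t *hf = ff_hevc_qpel_filters[mx - 1];
 *         const uint8_t *vf = ff_hevc_qpel_filters[my - 1];
 *         ff_h2656_put_8tap_h4_8_sse4(dst, src, srcstride, height, hf, vf, width);
 *     }
 *
 * plus the matching vertical (_v) and uni_ variants.
 */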
FW_PEL(4, 8, sse4);
FW_PEL(6, 8, sse4);
FW_PEL(8, 8, sse4);
FW_PEL(12, 8, sse4);
FW_PEL(16, 8, sse4);
FW_PEL(4, 10, sse4);
FW_PEL(6, 10, sse4);
FW_PEL(8, 10, sse4);
FW_PEL(4, 12, sse4);
FW_PEL(6, 12, sse4);
FW_PEL(8, 12, sse4);
#define FW_EPEL(w, depth, opt) FW_DIR(epel, 4, w, depth, opt)
#define FW_EPEL_HV(w, depth, opt) FW_DIR_HV(epel, 4, w, depth, opt)
#define FW_EPEL_FUNCS(w, depth, opt) \
FW_EPEL(w, depth, opt) \
FW_EPEL_HV(w, depth, opt)
FW_EPEL(12, 8, sse4);
FW_EPEL_FUNCS(4, 8, sse4);
FW_EPEL_FUNCS(6, 8, sse4);
FW_EPEL_FUNCS(8, 8, sse4);
FW_EPEL_FUNCS(16, 8, sse4);
FW_EPEL_FUNCS(4, 10, sse4);
FW_EPEL_FUNCS(6, 10, sse4);
FW_EPEL_FUNCS(8, 10, sse4);
FW_EPEL_FUNCS(4, 12, sse4);
FW_EPEL_FUNCS(6, 12, sse4);
FW_EPEL_FUNCS(8, 12, sse4);
#define FW_QPEL(w, depth, opt) FW_DIR(qpel, 8, w, depth, opt)
#define FW_QPEL_HV(w, depth, opt) FW_DIR_HV(qpel, 8, w, depth, opt)
#define FW_QPEL_FUNCS(w, depth, opt) \
FW_QPEL(w, depth, opt) \
FW_QPEL_HV(w, depth, opt)
FW_QPEL(12, 8, sse4);
FW_QPEL(16, 8, sse4);
FW_QPEL_FUNCS(4, 8, sse4);
FW_QPEL_FUNCS(8, 8, sse4);
FW_QPEL_FUNCS(4, 10, sse4);
FW_QPEL_FUNCS(8, 10, sse4);
FW_QPEL_FUNCS(4, 12, sse4);
FW_QPEL_FUNCS(8, 12, sse4);
#if HAVE_AVX2_EXTERNAL
FW_PEL(32, 8, avx2);
FW_PUT(pel, pel_pixels16, pixels16, 10, avx2);
FW_EPEL(32, 8, avx2);
FW_EPEL(16, 10, avx2);
FW_EPEL_HV(32, 8, avx2);
FW_EPEL_HV(16, 10, avx2);
FW_QPEL(32, 8, avx2);
FW_QPEL(16, 10, avx2);
FW_QPEL_HV(16, 10, avx2);
#endif
#endif
#define mc_rep_func(name, bitd, step, W, opt) \
void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *_dst, \
const uint8_t *_src, ptrdiff_t _srcstride, int height, \