avcodec/utvideodec : add SIMD (SSSE3 and AVX2) for gradient_pred
This commit is contained in:
parent
4353c35067
commit
630967ef63
@ -98,6 +98,16 @@ static int add_left_pred_int16_c(uint16_t *dst, const uint16_t *src, unsigned ma
|
|||||||
return acc;
|
return acc;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Undo gradient prediction on one row of bytes, in place.
 *
 * Every byte becomes (src[i] + A - B + C) & 0xFF, where
 *   A = src[i - stride]        (same column, one stride back),
 *   B = src[i - (stride + 1)]  (one byte before A),
 *   C = src[i - 1]             (previous byte of this row, already rewritten
 *                               by the preceding iteration for i > 0).
 *
 * The caller must keep src[-1] and src[-stride - 1 .. -stride + width - 1]
 * readable.
 *
 * @param src    first byte to reconstruct; read and overwritten
 * @param stride byte distance between the referenced row and this row
 * @param width  number of bytes to process; nothing happens when it is <= 0
 */
static void add_gradient_pred_c(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width){
    /* Index has the same type as width so a very wide row cannot overflow
     * a narrower signed counter. */
    ptrdiff_t i;

    for (i = 0; i < width; i++) {
        const int A = src[i - stride];
        const int B = src[i - (stride + 1)];
        const int C = src[i - 1];

        /* Arithmetic is done in int; the mask wraps the result to a byte. */
        src[i] = (A - B + C + src[i]) & 0xFF;
    }
}
|
||||||
|
|
||||||
void ff_llviddsp_init(LLVidDSPContext *c)
|
void ff_llviddsp_init(LLVidDSPContext *c)
|
||||||
{
|
{
|
||||||
@ -106,6 +116,7 @@ void ff_llviddsp_init(LLVidDSPContext *c)
|
|||||||
c->add_left_pred = add_left_pred_c;
|
c->add_left_pred = add_left_pred_c;
|
||||||
|
|
||||||
c->add_left_pred_int16 = add_left_pred_int16_c;
|
c->add_left_pred_int16 = add_left_pred_int16_c;
|
||||||
|
c->add_gradient_pred = add_gradient_pred_c;
|
||||||
|
|
||||||
if (ARCH_PPC)
|
if (ARCH_PPC)
|
||||||
ff_llviddsp_init_ppc(c);
|
ff_llviddsp_init_ppc(c);
|
||||||
|
@ -39,6 +39,7 @@ typedef struct LLVidDSPContext {
|
|||||||
|
|
||||||
int (*add_left_pred_int16)(uint16_t *dst, const uint16_t *src,
|
int (*add_left_pred_int16)(uint16_t *dst, const uint16_t *src,
|
||||||
unsigned mask, ptrdiff_t w, unsigned left);
|
unsigned mask, ptrdiff_t w, unsigned left);
|
||||||
|
void (*add_gradient_pred)(uint8_t *src /* align 32 */, const ptrdiff_t stride, const ptrdiff_t width);
|
||||||
} LLVidDSPContext;
|
} LLVidDSPContext;
|
||||||
|
|
||||||
void ff_llviddsp_init(LLVidDSPContext *llviddsp);
|
void ff_llviddsp_init(LLVidDSPContext *llviddsp);
|
||||||
|
@ -460,6 +460,7 @@ static void restore_gradient_planar(UtvideoContext *c, uint8_t *src, ptrdiff_t s
|
|||||||
uint8_t *bsrc;
|
uint8_t *bsrc;
|
||||||
int slice_start, slice_height;
|
int slice_start, slice_height;
|
||||||
const int cmask = ~rmode;
|
const int cmask = ~rmode;
|
||||||
|
int min_width = FFMIN(width, 32);
|
||||||
|
|
||||||
for (slice = 0; slice < slices; slice++) {
|
for (slice = 0; slice < slices; slice++) {
|
||||||
slice_start = ((slice * height) / slices) & cmask;
|
slice_start = ((slice * height) / slices) & cmask;
|
||||||
@ -479,12 +480,14 @@ static void restore_gradient_planar(UtvideoContext *c, uint8_t *src, ptrdiff_t s
|
|||||||
for (j = 1; j < slice_height; j++) {
|
for (j = 1; j < slice_height; j++) {
|
||||||
// second line - first element has top prediction, the rest uses gradient
|
// second line - first element has top prediction, the rest uses gradient
|
||||||
bsrc[0] = (bsrc[0] + bsrc[-stride]) & 0xFF;
|
bsrc[0] = (bsrc[0] + bsrc[-stride]) & 0xFF;
|
||||||
for (i = 1; i < width; i++) {
|
for (i = 1; i < min_width; i++) { /* dsp need align 32 */
|
||||||
A = bsrc[i - stride];
|
A = bsrc[i - stride];
|
||||||
B = bsrc[i - (stride + 1)];
|
B = bsrc[i - (stride + 1)];
|
||||||
C = bsrc[i - 1];
|
C = bsrc[i - 1];
|
||||||
bsrc[i] = (A - B + C + bsrc[i]) & 0xFF;
|
bsrc[i] = (A - B + C + bsrc[i]) & 0xFF;
|
||||||
}
|
}
|
||||||
|
if (width > 32)
|
||||||
|
c->llviddsp.add_gradient_pred(bsrc + 32, stride, width - 32);
|
||||||
bsrc += stride;
|
bsrc += stride;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -2,6 +2,7 @@
|
|||||||
;* SIMD lossless video DSP utils
|
;* SIMD lossless video DSP utils
|
||||||
;* Copyright (c) 2008 Loren Merritt
|
;* Copyright (c) 2008 Loren Merritt
|
||||||
;* Copyright (c) 2014 Michael Niedermayer
|
;* Copyright (c) 2014 Michael Niedermayer
|
||||||
|
;* Copyright (c) 2017 Jokyo Images
|
||||||
;*
|
;*
|
||||||
;* This file is part of FFmpeg.
|
;* This file is part of FFmpeg.
|
||||||
;*
|
;*
|
||||||
@ -325,3 +326,82 @@ cglobal add_left_pred_int16, 4,4,8, dst, src, mask, w, left
|
|||||||
ADD_HFYU_LEFT_LOOP_INT16 u, a
|
ADD_HFYU_LEFT_LOOP_INT16 u, a
|
||||||
.src_unaligned:
|
.src_unaligned:
|
||||||
ADD_HFYU_LEFT_LOOP_INT16 u, u
|
ADD_HFYU_LEFT_LOOP_INT16 u, u
|
||||||
|
|
||||||
|
|
||||||
|
;---------------------------------------------------------------------------------------------
|
||||||
|
; void add_gradient_pred(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width)
|
||||||
|
;---------------------------------------------------------------------------------------------
|
||||||
|
; ADD_GRADIENT_PRED: emits add_gradient_pred for the currently selected
; instruction set (xmm or ymm registers, depending on mmsize).
; The routine rewrites a row in place: each byte becomes
; src[x] + A - B + C (mod 256), with A = src[x - stride],
; B = src[x - (stride + 1)] and C = the already rewritten src[x - 1].
; Because C depends on the previous output, the row is computed as
; src[-1] plus a running (prefix) sum of (A - B + src[x]).
; NOTE(review): the loop always handles whole mmsize-byte vectors, so when
; width is not a multiple of mmsize the last iteration reads and writes past
; src + width - presumably the caller's buffers are padded; confirm.
%macro ADD_GRADIENT_PRED 0
|
||||||
|
; x86inc prologue: per x86inc convention 3 arguments, 4 general-purpose
; registers (tmp is the extra one) and 5 vector registers (m0..m4).
cglobal add_gradient_pred, 3,4,5, src, stride, width, tmp
|
||||||
|
; xm0 = pshufb control used below to replicate the last byte of a 16-byte
; vector (pb_15 is defined outside this view - presumably sixteen bytes of 15).
mova xm0, [pb_15]
|
||||||
|
|
||||||
|
; xm1 = C: load src[-1] and broadcast it to every byte of xm1
|
||||||
|
movd xm1, [srcq-1]
|
||||||
|
%if cpuflag(avx2)
|
||||||
|
vpbroadcastb xm1, xm1
|
||||||
|
%else
|
||||||
|
; no byte broadcast instruction here: pshufb with an all-zero control
; copies byte 0 into every position
pxor xm2, xm2
|
||||||
|
pshufb xm1, xm2
|
||||||
|
%endif
|
||||||
|
|
||||||
|
; srcq now points at the end of the row and widthq becomes a negative
; offset that counts up towards zero, so one register serves as both
; index and loop counter.
add srcq, widthq
|
||||||
|
neg widthq
|
||||||
|
; negated so that srcq + strideq addresses src - stride
neg strideq
|
||||||
|
|
||||||
|
.loop:
|
||||||
|
; tmpq = end of the row that is one stride back
lea tmpq, [srcq + strideq]
|
||||||
|
; mova requires the address to be aligned to the vector size; only B,
; which sits one byte earlier, uses the unaligned load.
mova m2, [tmpq + widthq] ; A = src[x-stride]
|
||||||
|
movu m3, [tmpq + widthq - 1] ; B = src[x - (stride + 1)]
|
||||||
|
mova m4, [srcq + widthq] ; current val (src[x])
|
||||||
|
|
||||||
|
psubb m2, m3; A - B
|
||||||
|
|
||||||
|
; prefix sum of A-B: shift-and-add by 1, 2, 4 and 8 bytes.
; pslldq shifts inside each 128-bit lane, so with ymm registers each
; 16-byte half gets its own independent running sum.
|
||||||
|
pslldq m3, m2, 1
|
||||||
|
paddb m2, m3
|
||||||
|
pslldq m3, m2, 2
|
||||||
|
paddb m2, m3
|
||||||
|
pslldq m3, m2, 4
|
||||||
|
paddb m2, m3
|
||||||
|
pslldq m3, m2, 8
|
||||||
|
paddb m2, m3
|
||||||
|
|
||||||
|
; prefix sum of the current values, same shift-and-add scheme
|
||||||
|
pslldq m3, m4, 1
|
||||||
|
paddb m4, m3
|
||||||
|
pslldq m3, m4, 2
|
||||||
|
paddb m4, m3
|
||||||
|
pslldq m3, m4, 4
|
||||||
|
paddb m4, m3
|
||||||
|
pslldq m3, m4, 8
|
||||||
|
paddb m4, m3
|
||||||
|
|
||||||
|
; combine both running sums
|
||||||
|
paddb m2, m4 ; current + (A - B)
|
||||||
|
|
||||||
|
; low 16 bytes: add the carried-in left value, giving the final output
paddb xm1, xm2 ; += C
|
||||||
|
mova [srcq + widthq], xm1 ; store
|
||||||
|
|
||||||
|
pshufb xm1, xm0 ; replicate the last output byte: it is C for the next 16 bytes
|
||||||
|
|
||||||
|
; ymm build only: chain the upper 16-byte half onto the lower one
%if mmsize == 32
|
||||||
|
vextracti128 xm2, m2, 1 ; get second lane of the ymm
|
||||||
|
paddb xm1, xm2; += C
|
||||||
|
|
||||||
|
mova [srcq + widthq + 16], xm1 ; store
|
||||||
|
pshufb xm1, xm0 ; replicate the last output byte into all of xm1
|
||||||
|
%endif
|
||||||
|
|
||||||
|
; advance by one vector; keep looping while the offset is still negative
add widthq, mmsize
|
||||||
|
jl .loop
|
||||||
|
RET
|
||||||
|
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
; Instantiate add_gradient_pred once with xmm registers for SSSE3 ...
INIT_XMM ssse3
|
||||||
|
ADD_GRADIENT_PRED
|
||||||
|
|
||||||
|
; ... and once with ymm registers for AVX2, only when HAVE_AVX2_EXTERNAL
; is set for this build.
%if HAVE_AVX2_EXTERNAL
|
||||||
|
INIT_YMM avx2
|
||||||
|
ADD_GRADIENT_PRED
|
||||||
|
%endif
|
||||||
|
@ -44,6 +44,9 @@ int ff_add_left_pred_unaligned_avx2(uint8_t *dst, const uint8_t *src,
|
|||||||
int ff_add_left_pred_int16_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);
|
int ff_add_left_pred_int16_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);
|
||||||
int ff_add_left_pred_int16_sse4(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);
|
int ff_add_left_pred_int16_sse4(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);
|
||||||
|
|
||||||
|
/* SIMD versions of add_gradient_pred; ff_llviddsp_init_x86() installs them
 * into LLVidDSPContext.add_gradient_pred according to the CPU flags.
 * Presumably these are the symbols produced by the ADD_GRADIENT_PRED
 * assembly macro - confirm against the .asm file. */
void ff_add_gradient_pred_ssse3(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width);
|
||||||
|
void ff_add_gradient_pred_avx2(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width);
|
||||||
|
|
||||||
#if HAVE_INLINE_ASM && HAVE_7REGS && ARCH_X86_32
|
#if HAVE_INLINE_ASM && HAVE_7REGS && ARCH_X86_32
|
||||||
static void add_median_pred_cmov(uint8_t *dst, const uint8_t *top,
|
static void add_median_pred_cmov(uint8_t *dst, const uint8_t *top,
|
||||||
const uint8_t *diff, ptrdiff_t w,
|
const uint8_t *diff, ptrdiff_t w,
|
||||||
@ -109,6 +112,7 @@ void ff_llviddsp_init_x86(LLVidDSPContext *c)
|
|||||||
if (EXTERNAL_SSSE3(cpu_flags)) {
|
if (EXTERNAL_SSSE3(cpu_flags)) {
|
||||||
c->add_left_pred = ff_add_left_pred_ssse3;
|
c->add_left_pred = ff_add_left_pred_ssse3;
|
||||||
c->add_left_pred_int16 = ff_add_left_pred_int16_ssse3;
|
c->add_left_pred_int16 = ff_add_left_pred_int16_ssse3;
|
||||||
|
c->add_gradient_pred = ff_add_gradient_pred_ssse3;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (EXTERNAL_SSSE3_FAST(cpu_flags)) {
|
if (EXTERNAL_SSSE3_FAST(cpu_flags)) {
|
||||||
@ -121,5 +125,6 @@ void ff_llviddsp_init_x86(LLVidDSPContext *c)
|
|||||||
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
|
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
|
||||||
c->add_bytes = ff_add_bytes_avx2;
|
c->add_bytes = ff_add_bytes_avx2;
|
||||||
c->add_left_pred = ff_add_left_pred_unaligned_avx2;
|
c->add_left_pred = ff_add_left_pred_unaligned_avx2;
|
||||||
|
c->add_gradient_pred = ff_add_gradient_pred_avx2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user