ff_add_hfyu_median_prediction_mmx2
overall ffvhuff decoding speedup: 28% on core2, 25% on k8. Originally committed as revision 17059 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
parent
6166516d1f
commit
3daa434a40
@ -3542,6 +3542,23 @@ static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
|
|||||||
dst[i+0] = src1[i+0]-src2[i+0];
|
dst[i+0] = src1[i+0]-src2[i+0];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Reconstruct one row of median-predicted samples (plain C version).
 *
 * For every position the predictor is the median of the sample to the left,
 * the sample above (src1[i]) and the byte-wrapped gradient left + above -
 * above-left; the residual diff[i] is added on top and the result is stored
 * in dst[i] with 8-bit wrap-around.
 *
 * @param dst      output row, w bytes are written
 * @param src1     row above the output row, w bytes are read
 * @param diff     residuals, w bytes are read
 * @param w        number of samples to process
 * @param left     in: sample left of dst[0]; out: last sample written
 * @param left_top in: sample left of src1[0]; out: last sample read from src1
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *diff, int w, int *left, int *left_top){
    uint8_t cur       = *left;
    uint8_t above_left = *left_top;
    int x = 0;

    while (x < w) {
        const uint8_t above    = src1[x];
        const int     gradient = (cur + above - above_left) & 0xFF;

        /* assignment to an uint8_t keeps only the low 8 bits of the sum */
        cur        = mid_pred(cur, above, gradient) + diff[x];
        above_left = above;
        dst[x]     = cur;
        x++;
    }

    *left     = cur;
    *left_top = above_left;
}
|
||||||
|
|
||||||
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
|
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
|
||||||
int i;
|
int i;
|
||||||
uint8_t l, lt;
|
uint8_t l, lt;
|
||||||
@ -4554,6 +4571,7 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
|
|||||||
c->add_bytes= add_bytes_c;
|
c->add_bytes= add_bytes_c;
|
||||||
c->add_bytes_l2= add_bytes_l2_c;
|
c->add_bytes_l2= add_bytes_l2_c;
|
||||||
c->diff_bytes= diff_bytes_c;
|
c->diff_bytes= diff_bytes_c;
|
||||||
|
c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
|
||||||
c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
|
c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
|
||||||
c->bswap_buf= bswap_buf;
|
c->bswap_buf= bswap_buf;
|
||||||
#if CONFIG_PNG_DECODER
|
#if CONFIG_PNG_DECODER
|
||||||
|
@ -345,6 +345,7 @@ typedef struct DSPContext {
|
|||||||
* note, this might read from src1[-1], src2[-1]
|
* note, this might read from src1[-1], src2[-1]
|
||||||
*/
|
*/
|
||||||
void (*sub_hfyu_median_prediction)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top);
|
void (*sub_hfyu_median_prediction)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top);
|
||||||
|
void (*add_hfyu_median_prediction)(uint8_t *dst, uint8_t *top, uint8_t *diff, int w, int *left, int *left_top);
|
||||||
/* this might write to dst[w] */
|
/* this might write to dst[w] */
|
||||||
void (*add_png_paeth_prediction)(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
|
void (*add_png_paeth_prediction)(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
|
||||||
void (*bswap_buf)(uint32_t *dst, const uint32_t *src, int w);
|
void (*bswap_buf)(uint32_t *dst, const uint32_t *src, int w);
|
||||||
|
@ -31,7 +31,6 @@
|
|||||||
#include "avcodec.h"
|
#include "avcodec.h"
|
||||||
#include "bitstream.h"
|
#include "bitstream.h"
|
||||||
#include "dsputil.h"
|
#include "dsputil.h"
|
||||||
#include "mathops.h"
|
|
||||||
|
|
||||||
#define VLC_BITS 11
|
#define VLC_BITS 11
|
||||||
|
|
||||||
@ -148,23 +147,6 @@ static inline int add_left_prediction(uint8_t *dst, uint8_t *src, int w, int acc
|
|||||||
return acc;
|
return acc;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Undo median prediction for one row of samples.
 *
 * Each output byte is mid_pred(left, top, (left + top - top_left) & 0xFF)
 * plus the residual from diff, truncated to 8 bits. The running left and
 * top-left samples are read from and written back through left / left_top.
 */
static inline void add_median_prediction(uint8_t *dst, uint8_t *src1, uint8_t *diff, int w, int *left, int *left_top){
    uint8_t prev    = *left;      /* sample to the left of the current one */
    uint8_t corner  = *left_top;  /* sample above-left of the current one  */
    int pos;

    for (pos = 0; pos < w; pos++) {
        const uint8_t up = src1[pos];
        const int grad   = (prev + up - corner) & 0xFF;

        prev     = mid_pred(prev, up, grad) + diff[pos];
        corner   = up;
        dst[pos] = prev;
    }

    *left     = prev;
    *left_top = corner;
}
|
|
||||||
|
|
||||||
static inline void add_left_prediction_bgr32(uint8_t *dst, uint8_t *src, int w, int *red, int *green, int *blue){
|
static inline void add_left_prediction_bgr32(uint8_t *dst, uint8_t *src, int w, int *red, int *green, int *blue){
|
||||||
int i;
|
int i;
|
||||||
int r,g,b;
|
int r,g,b;
|
||||||
@ -1106,12 +1088,12 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, const
|
|||||||
/* next line except the first 4 pixels is median predicted */
|
/* next line except the first 4 pixels is median predicted */
|
||||||
lefttopy= p->data[0][3];
|
lefttopy= p->data[0][3];
|
||||||
decode_422_bitstream(s, width-4);
|
decode_422_bitstream(s, width-4);
|
||||||
add_median_prediction(p->data[0] + fake_ystride+4, p->data[0]+4, s->temp[0], width-4, &lefty, &lefttopy);
|
s->dsp.add_hfyu_median_prediction(p->data[0] + fake_ystride+4, p->data[0]+4, s->temp[0], width-4, &lefty, &lefttopy);
|
||||||
if(!(s->flags&CODEC_FLAG_GRAY)){
|
if(!(s->flags&CODEC_FLAG_GRAY)){
|
||||||
lefttopu= p->data[1][1];
|
lefttopu= p->data[1][1];
|
||||||
lefttopv= p->data[2][1];
|
lefttopv= p->data[2][1];
|
||||||
add_median_prediction(p->data[1] + fake_ustride+2, p->data[1]+2, s->temp[1], width2-2, &leftu, &lefttopu);
|
s->dsp.add_hfyu_median_prediction(p->data[1] + fake_ustride+2, p->data[1]+2, s->temp[1], width2-2, &leftu, &lefttopu);
|
||||||
add_median_prediction(p->data[2] + fake_vstride+2, p->data[2]+2, s->temp[2], width2-2, &leftv, &lefttopv);
|
s->dsp.add_hfyu_median_prediction(p->data[2] + fake_vstride+2, p->data[2]+2, s->temp[2], width2-2, &leftv, &lefttopv);
|
||||||
}
|
}
|
||||||
y++; cy++;
|
y++; cy++;
|
||||||
|
|
||||||
@ -1122,7 +1104,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, const
|
|||||||
while(2*cy > y){
|
while(2*cy > y){
|
||||||
decode_gray_bitstream(s, width);
|
decode_gray_bitstream(s, width);
|
||||||
ydst= p->data[0] + p->linesize[0]*y;
|
ydst= p->data[0] + p->linesize[0]*y;
|
||||||
add_median_prediction(ydst, ydst - fake_ystride, s->temp[0], width, &lefty, &lefttopy);
|
s->dsp.add_hfyu_median_prediction(ydst, ydst - fake_ystride, s->temp[0], width, &lefty, &lefttopy);
|
||||||
y++;
|
y++;
|
||||||
}
|
}
|
||||||
if(y>=height) break;
|
if(y>=height) break;
|
||||||
@ -1135,10 +1117,10 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, const
|
|||||||
udst= p->data[1] + p->linesize[1]*cy;
|
udst= p->data[1] + p->linesize[1]*cy;
|
||||||
vdst= p->data[2] + p->linesize[2]*cy;
|
vdst= p->data[2] + p->linesize[2]*cy;
|
||||||
|
|
||||||
add_median_prediction(ydst, ydst - fake_ystride, s->temp[0], width, &lefty, &lefttopy);
|
s->dsp.add_hfyu_median_prediction(ydst, ydst - fake_ystride, s->temp[0], width, &lefty, &lefttopy);
|
||||||
if(!(s->flags&CODEC_FLAG_GRAY)){
|
if(!(s->flags&CODEC_FLAG_GRAY)){
|
||||||
add_median_prediction(udst, udst - fake_ustride, s->temp[1], width2, &leftu, &lefttopu);
|
s->dsp.add_hfyu_median_prediction(udst, udst - fake_ustride, s->temp[1], width2, &leftu, &lefttopu);
|
||||||
add_median_prediction(vdst, vdst - fake_vstride, s->temp[2], width2, &leftv, &lefttopv);
|
s->dsp.add_hfyu_median_prediction(vdst, vdst - fake_vstride, s->temp[2], width2, &leftv, &lefttopv);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -548,6 +548,41 @@ static void add_bytes_l2_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
|
|||||||
dst[i] = src1[i] + src2[i];
|
dst[i] = src1[i] + src2[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if HAVE_7REGS
/**
 * Reconstruct one row of median-predicted samples using scalar x86 code
 * built around conditional moves.
 *
 * Per sample: t = top[i]; dst[i] = median(l, t, (l + t - tl) & 0xff) + diff[i]
 * (8-bit wrap-around), then tl = t and l = dst[i].
 *
 * @param dst      output row, w bytes are written
 * @param top      row above the output row, w bytes are read
 * @param diff     residuals, w bytes are read
 * @param w        number of samples; nothing is read or written if w <= 0
 * @param left     in: sample left of dst[0]; out: last reconstructed sample
 * @param left_top in: sample left of top[0]; out: last sample read from top
 *
 * Asm operands: %0 = l, %1 = tl, %2 = t, %3 = x (scratch), %4 = w2
 * (negative index counting up to 0), %5 = dst+w, %6 = diff+w, %7 = top+w.
 */
static void add_hfyu_median_prediction_cmov(uint8_t *dst, uint8_t *top, uint8_t *diff, int w, int *left, int *left_top) {
    x86_reg w2 = -w;
    x86_reg x;
    int l = *left & 0xff;
    int tl = *left_top & 0xff;
    int t;

    /* The loop below tests its condition at the bottom, so it must not be
     * entered for an empty row: it would still load top[w] and store dst[w]. */
    if (w > 0) {
        __asm__ volatile(
            "mov %7, %3 \n"             /* x = top + w                        */
            "1: \n"
            "movzbl (%3,%4), %2 \n"     /* t = top[i], zero-extended byte     */
            "mov %2, %k3 \n"            /* x = t                              */
            "sub %b1, %b3 \n"           /* x = (t - tl) & 0xff                */
            "add %b0, %b3 \n"           /* x = (t - tl + l) & 0xff            */
            "mov %2, %1 \n"             /* tl = t, ready for the next sample  */
            "cmp %0, %2 \n"
            "cmovg %0, %2 \n"           /* if (t > l) t = l   -> t = min(l,t) */
            "cmovg %1, %0 \n"           /* if (t > l) l = tl  -> l = max(l,t) */
            "cmp %k3, %0 \n"
            "cmovg %k3, %0 \n"          /* l = min(l, x)                      */
            "mov %7, %3 \n"             /* restore x = top + w                */
            "cmp %2, %0 \n"
            "cmovl %2, %0 \n"           /* l = max(l, t) -> the median        */
            "add (%6,%4), %b0 \n"       /* l = (l + diff[i]) & 0xff           */
            "mov %b0, (%5,%4) \n"       /* dst[i] = l                         */
            "inc %4 \n"
            "jl 1b \n"                  /* loop while the index is negative   */
            :"+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
            :"r"(dst+w), "r"(diff+w), "rm"(top+w)
            /* top/diff are read and dst is written through pointers that are
             * not memory operands, so the compiler has to be told. */
            :"memory"
        );
    }
    *left = l;
    *left_top = tl;
}
#endif
|
||||||
|
|
||||||
#define H263_LOOP_FILTER \
|
#define H263_LOOP_FILTER \
|
||||||
"pxor %%mm7, %%mm7 \n\t"\
|
"pxor %%mm7, %%mm7 \n\t"\
|
||||||
"movq %0, %%mm0 \n\t"\
|
"movq %0, %%mm0 \n\t"\
|
||||||
@ -2328,6 +2363,7 @@ static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
|
|||||||
void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
|
void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
|
||||||
void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
|
void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
|
||||||
void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
|
void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
|
||||||
|
void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *top, uint8_t *diff, int w, int *left, int *left_top);
|
||||||
void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
|
void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
|
||||||
void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
|
void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
|
||||||
void ff_x264_deblock_v8_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta);
|
void ff_x264_deblock_v8_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta);
|
||||||
@ -2760,6 +2796,14 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
|
|||||||
c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
|
c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
|
||||||
c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
|
c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
|
||||||
|
|
||||||
|
#if HAVE_YASM
|
||||||
|
c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2;
|
||||||
|
#endif
|
||||||
|
#if HAVE_7REGS
|
||||||
|
if( mm_flags&FF_MM_3DNOW )
|
||||||
|
c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
|
||||||
|
#endif
|
||||||
|
|
||||||
if (CONFIG_CAVS_DECODER)
|
if (CONFIG_CAVS_DECODER)
|
||||||
ff_cavsdsp_init_mmx2(c, avctx);
|
ff_cavsdsp_init_mmx2(c, avctx);
|
||||||
|
|
||||||
|
@ -90,3 +90,63 @@ FLOAT_TO_INT16_INTERLEAVE6 3dnow
|
|||||||
FLOAT_TO_INT16_INTERLEAVE6 3dn2
|
FLOAT_TO_INT16_INTERLEAVE6 3dn2
|
||||||
%undef cvtps2pi
|
%undef cvtps2pi
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
; Reconstruct one row of median-predicted bytes, 8 output bytes per loop pass.
; For each byte: l = median(l, t, l + t - tl) + diff, with byte wrap-around,
; where t is the byte from the top row, tl the top byte one position to the
; left, and l the previously reconstructed byte.
; Data is loaded and stored in whole 8-byte groups and at least one group is
; always processed, so when w is not a multiple of 8 the last group reads
; top/diff and writes dst beyond w bytes.
; On return *left = dst[w-1] and *left_top = top[w-1].
; NOTE(review): there is no emms before RET; presumably MMX state is cleared
; by the caller - confirm.
; void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *top, uint8_t *diff, int w, int *left, int *left_top)
|
||||||
|
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
|
||||||
|
; First group: mm0 = mm1 = t (8 top bytes). The tl vector is t shifted up by
; one byte with the low byte of *left_top inserted at the bottom.
; NOTE(review): movd loads all 32 bits of *left_top and ORs them in, so this
; assumes the stored value fits in one byte - confirm against callers.
movq mm0, [topq]
|
||||||
|
movq mm2, mm0
|
||||||
|
movd mm4, [left_topq]
|
||||||
|
psllq mm2, 8
|
||||||
|
movq mm1, mm0
|
||||||
|
por mm4, mm2
|
||||||
|
; mm3 carries l in its lowest byte for the whole function
movd mm3, [leftq]
|
||||||
|
psubb mm0, mm4 ; t-tl
|
||||||
|
; Point dst/top/diff at their row ends and index them with a negative
; offset that counts up towards zero.
add dstq, wq
|
||||||
|
add topq, wq
|
||||||
|
add diffq, wq
|
||||||
|
neg wq
|
||||||
|
; t and t-tl for the first group are already set up
jmp .skip
|
||||||
|
.loop:
|
||||||
|
; Later groups: after the seven shifts of the previous pass mm1 holds that
; group's last top byte in its lowest byte, which becomes tl for byte 0 here.
movq mm4, [topq+wq]
|
||||||
|
movq mm0, mm4
|
||||||
|
psllq mm4, 8
|
||||||
|
por mm4, mm1
|
||||||
|
movq mm1, mm0 ; t
|
||||||
|
psubb mm0, mm4 ; t-tl
|
||||||
|
.skip:
|
||||||
|
; mm2 = 8 residual bytes for this group
movq mm2, [diffq+wq]
|
||||||
|
; The 8 bytes are resolved one after another because each depends on the
; previous result; only the lowest byte of mm0..mm5 is meaningful per step.
%assign i 0
|
||||||
|
%rep 8
|
||||||
|
movq mm4, mm0
|
||||||
|
paddb mm4, mm3 ; t-tl+l
|
||||||
|
; median(l, t, g) computed as max(min(max(l, t), g), min(l, t))
movq mm5, mm3
|
||||||
|
pmaxub mm3, mm1
|
||||||
|
pminub mm5, mm1
|
||||||
|
pminub mm3, mm4
|
||||||
|
pmaxub mm3, mm5 ; median
|
||||||
|
paddb mm3, mm2 ; +residual
|
||||||
|
; Collect the lowest byte of mm3 in mm7: each result enters at the top byte
; and earlier ones are shifted down, so after 8 steps mm7 is in memory order.
%if i==0
|
||||||
|
movq mm7, mm3
|
||||||
|
psllq mm7, 56
|
||||||
|
%else
|
||||||
|
movq mm6, mm3
|
||||||
|
psrlq mm7, 8
|
||||||
|
psllq mm6, 56
|
||||||
|
por mm7, mm6
|
||||||
|
%endif
|
||||||
|
; Bring the next t-tl, t and residual byte into the lowest position
%if i<7
|
||||||
|
psrlq mm0, 8
|
||||||
|
psrlq mm1, 8
|
||||||
|
psrlq mm2, 8
|
||||||
|
%endif
|
||||||
|
%assign i i+1
|
||||||
|
%endrep
|
||||||
|
; Store the 8 reconstructed bytes and advance; loop while the offset is < 0
movq [dstq+wq], mm7
|
||||||
|
add wq, 8
|
||||||
|
jl .loop
|
||||||
|
; Write back the running state from the last byte of each row (index w-1).
; r2d is used as scratch; presumably r2 is the diff argument register, which
; is no longer needed at this point - confirm against x86inc.asm.
movzx r2d, byte [dstq-1]
|
||||||
|
mov [leftq], r2d
|
||||||
|
movzx r2d, byte [topq-1]
|
||||||
|
mov [left_topq], r2d
|
||||||
|
RET
|
||||||
|
Loading…
x
Reference in New Issue
Block a user