vp8: fix PPC assembly and bilinear C code to work if src_stride != dst_stride.
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
		
							parent
							
								
									3f3867ca27
								
							
						
					
					
						commit
						34b429d5ba
					
				@ -241,15 +241,15 @@ void put_vp8_epel ## WIDTH ## _v ## TAPS ## _altivec(uint8_t *dst, ptrdiff_t dst
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#define EPEL_HV(WIDTH, HTAPS, VTAPS) \
 | 
			
		||||
static void put_vp8_epel ## WIDTH ## _h ## HTAPS ## v ## VTAPS ## _altivec(uint8_t *dst, ptrdiff_t stride, uint8_t *src, ptrdiff_t s, int h, int mx, int my) \
 | 
			
		||||
static void put_vp8_epel ## WIDTH ## _h ## HTAPS ## v ## VTAPS ## _altivec(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my) \
 | 
			
		||||
{ \
 | 
			
		||||
    DECLARE_ALIGNED(16, uint8_t, tmp)[(2*WIDTH+5)*16]; \
 | 
			
		||||
    if (VTAPS == 6) { \
 | 
			
		||||
        put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16,     src-2*stride, stride, h+5, mx, my); \
 | 
			
		||||
        put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, stride, tmp+2*16,     16,     h,   mx, my); \
 | 
			
		||||
        put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16,      src-2*sstride, sstride, h+5, mx, my); \
 | 
			
		||||
        put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, dstride, tmp+2*16,      16,      h,   mx, my); \
 | 
			
		||||
    } else { \
 | 
			
		||||
        put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16,     src-stride, stride, h+4, mx, my); \
 | 
			
		||||
        put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, stride, tmp+16,     16,     h,   mx, my); \
 | 
			
		||||
        put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16,      src-sstride, sstride, h+4, mx, my); \
 | 
			
		||||
        put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, dstride, tmp+16,      16,      h,   mx, my); \
 | 
			
		||||
    } \
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@ -269,9 +269,44 @@ EPEL_HV(4,  4,6)
 | 
			
		||||
EPEL_HV(4,  6,4)
 | 
			
		||||
EPEL_HV(4,  4,4)
 | 
			
		||||
 | 
			
		||||
static void put_vp8_pixels16_altivec(uint8_t *dst, ptrdiff_t stride, uint8_t *src, ptrdiff_t s, int h, int mx, int my)
 | 
			
		||||
static void put_vp8_pixels16_altivec(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my)
 | 
			
		||||
{
 | 
			
		||||
    ff_put_pixels16_altivec(dst, src, stride, h);
 | 
			
		||||
    register vector unsigned char pixelsv1, pixelsv2;
 | 
			
		||||
    register vector unsigned char pixelsv1B, pixelsv2B;
 | 
			
		||||
    register vector unsigned char pixelsv1C, pixelsv2C;
 | 
			
		||||
    register vector unsigned char pixelsv1D, pixelsv2D;
 | 
			
		||||
 | 
			
		||||
    register vector unsigned char perm = vec_lvsl(0, src);
 | 
			
		||||
    int i;
 | 
			
		||||
    register ptrdiff_t dstride2 = dstride << 1, sstride2 = sstride << 1;
 | 
			
		||||
    register ptrdiff_t dstride3 = dstride2 + dstride, sstride3 = sstride + sstride2;
 | 
			
		||||
    register ptrdiff_t dstride4 = dstride << 2, sstride4 = sstride << 2;
 | 
			
		||||
 | 
			
		||||
// hand-unrolling the loop by 4 gains about 15%
 | 
			
		||||
// minimum execution time goes from 74 to 60 cycles
 | 
			
		||||
// it's faster than -funroll-loops, but using
 | 
			
		||||
// -funroll-loops w/ this is bad - 74 cycles again.
 | 
			
		||||
// all this is on a 7450, tuning for the 7450
 | 
			
		||||
    for (i = 0; i < h; i += 4) {
 | 
			
		||||
        pixelsv1  = vec_ld( 0, src);
 | 
			
		||||
        pixelsv2  = vec_ld(15, src);
 | 
			
		||||
        pixelsv1B = vec_ld(sstride, src);
 | 
			
		||||
        pixelsv2B = vec_ld(15 + sstride, src);
 | 
			
		||||
        pixelsv1C = vec_ld(sstride2, src);
 | 
			
		||||
        pixelsv2C = vec_ld(15 + sstride2, src);
 | 
			
		||||
        pixelsv1D = vec_ld(sstride3, src);
 | 
			
		||||
        pixelsv2D = vec_ld(15 + sstride3, src);
 | 
			
		||||
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
 | 
			
		||||
               0, (unsigned char*)dst);
 | 
			
		||||
        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
 | 
			
		||||
               dstride, (unsigned char*)dst);
 | 
			
		||||
        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
 | 
			
		||||
               dstride2, (unsigned char*)dst);
 | 
			
		||||
        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
 | 
			
		||||
               dstride3, (unsigned char*)dst);
 | 
			
		||||
        src += sstride4;
 | 
			
		||||
        dst += dstride4;
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#endif /* HAVE_ALTIVEC */
 | 
			
		||||
 | 
			
		||||
@ -415,7 +415,7 @@ VP8_EPEL_HV(8,  6, 6)
 | 
			
		||||
VP8_EPEL_HV(4,  6, 6)
 | 
			
		||||
 | 
			
		||||
#define VP8_BILINEAR(SIZE) \
 | 
			
		||||
static void put_vp8_bilinear ## SIZE ## _h_c(uint8_t *dst, ptrdiff_t stride, uint8_t *src, ptrdiff_t s2, int h, int mx, int my) \
 | 
			
		||||
static void put_vp8_bilinear ## SIZE ## _h_c(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my) \
 | 
			
		||||
{ \
 | 
			
		||||
    int a = 8-mx, b = mx; \
 | 
			
		||||
    int x, y; \
 | 
			
		||||
@ -423,24 +423,24 @@ static void put_vp8_bilinear ## SIZE ## _h_c(uint8_t *dst, ptrdiff_t stride, uin
 | 
			
		||||
    for (y = 0; y < h; y++) { \
 | 
			
		||||
        for (x = 0; x < SIZE; x++) \
 | 
			
		||||
            dst[x] = (a*src[x] + b*src[x+1] + 4) >> 3; \
 | 
			
		||||
        dst += stride; \
 | 
			
		||||
        src += stride; \
 | 
			
		||||
        dst += dstride; \
 | 
			
		||||
        src += sstride; \
 | 
			
		||||
    } \
 | 
			
		||||
} \
 | 
			
		||||
static void put_vp8_bilinear ## SIZE ## _v_c(uint8_t *dst, ptrdiff_t stride, uint8_t *src, ptrdiff_t s2, int h, int mx, int my) \
 | 
			
		||||
static void put_vp8_bilinear ## SIZE ## _v_c(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my) \
 | 
			
		||||
{ \
 | 
			
		||||
    int c = 8-my, d = my; \
 | 
			
		||||
    int x, y; \
 | 
			
		||||
\
 | 
			
		||||
    for (y = 0; y < h; y++) { \
 | 
			
		||||
        for (x = 0; x < SIZE; x++) \
 | 
			
		||||
            dst[x] = (c*src[x] + d*src[x+stride] + 4) >> 3; \
 | 
			
		||||
        dst += stride; \
 | 
			
		||||
        src += stride; \
 | 
			
		||||
            dst[x] = (c*src[x] + d*src[x+sstride] + 4) >> 3; \
 | 
			
		||||
        dst += dstride; \
 | 
			
		||||
        src += sstride; \
 | 
			
		||||
    } \
 | 
			
		||||
} \
 | 
			
		||||
\
 | 
			
		||||
static void put_vp8_bilinear ## SIZE ## _hv_c(uint8_t *dst, ptrdiff_t stride, uint8_t *src, ptrdiff_t s2, int h, int mx, int my) \
 | 
			
		||||
static void put_vp8_bilinear ## SIZE ## _hv_c(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my) \
 | 
			
		||||
{ \
 | 
			
		||||
    int a = 8-mx, b = mx; \
 | 
			
		||||
    int c = 8-my, d = my; \
 | 
			
		||||
@ -452,7 +452,7 @@ static void put_vp8_bilinear ## SIZE ## _hv_c(uint8_t *dst, ptrdiff_t stride, ui
 | 
			
		||||
        for (x = 0; x < SIZE; x++) \
 | 
			
		||||
            tmp[x] = (a*src[x] + b*src[x+1] + 4) >> 3; \
 | 
			
		||||
        tmp += SIZE; \
 | 
			
		||||
        src += stride; \
 | 
			
		||||
        src += sstride; \
 | 
			
		||||
    } \
 | 
			
		||||
\
 | 
			
		||||
    tmp = tmp_array; \
 | 
			
		||||
@ -460,7 +460,7 @@ static void put_vp8_bilinear ## SIZE ## _hv_c(uint8_t *dst, ptrdiff_t stride, ui
 | 
			
		||||
    for (y = 0; y < h; y++) { \
 | 
			
		||||
        for (x = 0; x < SIZE; x++) \
 | 
			
		||||
            dst[x] = (c*tmp[x] + d*tmp[x+SIZE] + 4) >> 3; \
 | 
			
		||||
        dst += stride; \
 | 
			
		||||
        dst += dstride; \
 | 
			
		||||
        tmp += SIZE; \
 | 
			
		||||
    } \
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user