fmtconvert: port float_to_int16_interleave() 2-channel x86 inline asm to yasm
This commit is contained in:
		
							parent
							
								
									4e8e262476
								
							
						
					
					
						commit
						aad3429d4e
					
				@ -112,6 +112,58 @@ FLOAT_TO_INT16 3dnow, 0
 | 
			
		||||
%undef cvtps2pi
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
;-------------------------------------------------------------------------------
 | 
			
		||||
; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len);
 | 
			
		||||
;-------------------------------------------------------------------------------
 | 
			
		||||
%macro FLOAT_TO_INT16_INTERLEAVE2 1
; Converts two planar float channels into one interleaved int16 stream.
;   %1 = suffix appended to the exported symbol name (3dnow, sse or sse2);
;        "sse2" selects the single-register path, anything else the
;        four-register path that ends with emms.
;
; Register roles once the setup below has run:
;   dstq  = dst    + 4*len bytes (one past the data written)
;   src0q = src[0] + 4*len bytes (one past the data read)
;   src1q = src[1] + 4*len bytes (one past the data read)
;   lenq  = -4*len, a negative byte offset that climbs towards zero
;
; Every iteration reads 16 bytes from each source and writes 16 bytes.
; The loop body always runs at least once.
; NOTE(review): this presumably relies on callers passing len > 0 and a
; multiple of 4, with buffers aligned as mova requires - confirm at call sites.
cglobal float_to_int16_interleave2_%1, 3,4,2, dst, src0, src1, len
    ; The sample count arrives in r2, the same register that is aliased as
    ; src1, so it must be scaled into lenq before src1q gets overwritten.
    lea      lenq, [r2q*4]
    ; src[1] has to be fetched before src0q stops pointing at the array.
    mov     src1q, [src0q+gprsize]
    mov     src0q, [src0q]
    ; Move all three pointers to their ends; lenq then indexes backwards.
    add     src1q, lenq
    add     src0q, lenq
    add      dstq, lenq
    neg      lenq
.loop:
%ifnidn %1, sse2
    ; Each conversion below yields two int32 values.
    cvtps2pi   m0, [src0q+lenq  ]       ; a0 a1
    cvtps2pi   m2, [src1q+lenq  ]       ; b0 b1
    cvtps2pi   m1, [src0q+lenq+8]       ; a2 a3
    cvtps2pi   m3, [src1q+lenq+8]       ; b2 b3
    packssdw   m0, m1                   ; a0 a1 a2 a3 (saturated to int16)
    packssdw   m2, m3                   ; b0 b1 b2 b3 (saturated to int16)
    mova       m1, m0
    punpckhwd  m1, m2                   ; a2 b2 a3 b3
    punpcklwd  m0, m2                   ; a0 b0 a1 b1
    mova  [dstq+lenq  ], m0
    mova  [dstq+lenq+8], m1
%else
    ; Each conversion below yields four int32 values.
    cvtps2dq   m1, [src1q+lenq]         ; b0 b1 b2 b3
    cvtps2dq   m0, [src0q+lenq]         ; a0 a1 a2 a3
    packssdw   m0, m1                   ; a0 a1 a2 a3 b0 b1 b2 b3 (int16)
    movhlps    m1, m0                   ; low half of m1 = b0 b1 b2 b3
    punpcklwd  m0, m1                   ; a0 b0 a1 b1 a2 b2 a3 b3
    mova  [dstq+lenq], m0
%endif
    ; The sign flag set by this add is what the branch tests.
    add      lenq, 16
    js .loop
%ifnidn %1, sse2
    ; The non-sse2 path issues emms before returning.
    emms
%endif
    REP_RET
%endmacro

; 3dnow build: cvtps2pi in the macro body is replaced by pf2id.
INIT_MMX
%define cvtps2pi pf2id
FLOAT_TO_INT16_INTERLEAVE2 3dnow
%undef cvtps2pi
; sse build: cvtps2pi is left as written.
; NOTE(review): the macro body never spells movdqa itself, so this define may
; have no effect on this instantiation - confirm against the prologue macros.
%define movdqa movaps
FLOAT_TO_INT16_INTERLEAVE2 sse
%undef movdqa
; sse2 build: the only instantiation that takes the cvtps2dq path.
INIT_XMM
FLOAT_TO_INT16_INTERLEAVE2 sse2
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
%macro PSWAPD_SSE 2
    ; %1 = destination, %2 = source.
    ; The immediate holds one 2-bit source-word selector per destination
    ; word, lowest word first: 10, 11, 00, 01. Destination words 0-1 take
    ; source words 2-3 and destination words 2-3 take source words 0-1,
    ; i.e. the two 32-bit halves are exchanged.
    pshufw %1, %2, 01001110b
%endmacro
 | 
			
		||||
 | 
			
		||||
@ -35,13 +35,17 @@ void ff_float_to_int16_3dnow(int16_t *dst, const float *src, long len);
 | 
			
		||||
void ff_float_to_int16_sse  (int16_t *dst, const float *src, long len);
 | 
			
		||||
void ff_float_to_int16_sse2 (int16_t *dst, const float *src, long len);
 | 
			
		||||
 | 
			
		||||
void ff_float_to_int16_interleave2_3dnow(int16_t *dst, const float **src, long len);
 | 
			
		||||
void ff_float_to_int16_interleave2_sse  (int16_t *dst, const float **src, long len);
 | 
			
		||||
void ff_float_to_int16_interleave2_sse2 (int16_t *dst, const float **src, long len);
 | 
			
		||||
 | 
			
		||||
void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
 | 
			
		||||
void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
 | 
			
		||||
void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
 | 
			
		||||
 | 
			
		||||
#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
 | 
			
		||||
 | 
			
		||||
#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
 | 
			
		||||
#define FLOAT_TO_INT16_INTERLEAVE(cpu) \
 | 
			
		||||
/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
 | 
			
		||||
static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
 | 
			
		||||
    DECLARE_ALIGNED(16, int16_t, tmp)[len];\
 | 
			
		||||
@ -57,71 +61,16 @@ static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, lon
 | 
			
		||||
    if(channels==1)\
 | 
			
		||||
        ff_float_to_int16_##cpu(dst, src[0], len);\
 | 
			
		||||
    else if(channels==2){\
 | 
			
		||||
        x86_reg reglen = len; \
 | 
			
		||||
        const float *src0 = src[0];\
 | 
			
		||||
        const float *src1 = src[1];\
 | 
			
		||||
        __asm__ volatile(\
 | 
			
		||||
            "shl $2, %0 \n"\
 | 
			
		||||
            "add %0, %1 \n"\
 | 
			
		||||
            "add %0, %2 \n"\
 | 
			
		||||
            "add %0, %3 \n"\
 | 
			
		||||
            "neg %0 \n"\
 | 
			
		||||
            body\
 | 
			
		||||
            :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
 | 
			
		||||
        );\
 | 
			
		||||
        ff_float_to_int16_interleave2_##cpu(dst, src, len);\
 | 
			
		||||
    }else if(channels==6){\
 | 
			
		||||
        ff_float_to_int16_interleave6_##cpu(dst, src, len);\
 | 
			
		||||
    }else\
 | 
			
		||||
        float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
FLOAT_TO_INT16_INTERLEAVE(3dnow,
 | 
			
		||||
    "1:                         \n"
 | 
			
		||||
    "pf2id     (%2,%0), %%mm0   \n"
 | 
			
		||||
    "pf2id    8(%2,%0), %%mm1   \n"
 | 
			
		||||
    "pf2id     (%3,%0), %%mm2   \n"
 | 
			
		||||
    "pf2id    8(%3,%0), %%mm3   \n"
 | 
			
		||||
    "packssdw    %%mm1, %%mm0   \n"
 | 
			
		||||
    "packssdw    %%mm3, %%mm2   \n"
 | 
			
		||||
    "movq        %%mm0, %%mm1   \n"
 | 
			
		||||
    "punpcklwd   %%mm2, %%mm0   \n"
 | 
			
		||||
    "punpckhwd   %%mm2, %%mm1   \n"
 | 
			
		||||
    "movq        %%mm0,  (%1,%0)\n"
 | 
			
		||||
    "movq        %%mm1, 8(%1,%0)\n"
 | 
			
		||||
    "add $16, %0                \n"
 | 
			
		||||
    "js 1b                      \n"
 | 
			
		||||
    "femms                      \n"
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
FLOAT_TO_INT16_INTERLEAVE(sse,
 | 
			
		||||
    "1:                         \n"
 | 
			
		||||
    "cvtps2pi  (%2,%0), %%mm0   \n"
 | 
			
		||||
    "cvtps2pi 8(%2,%0), %%mm1   \n"
 | 
			
		||||
    "cvtps2pi  (%3,%0), %%mm2   \n"
 | 
			
		||||
    "cvtps2pi 8(%3,%0), %%mm3   \n"
 | 
			
		||||
    "packssdw    %%mm1, %%mm0   \n"
 | 
			
		||||
    "packssdw    %%mm3, %%mm2   \n"
 | 
			
		||||
    "movq        %%mm0, %%mm1   \n"
 | 
			
		||||
    "punpcklwd   %%mm2, %%mm0   \n"
 | 
			
		||||
    "punpckhwd   %%mm2, %%mm1   \n"
 | 
			
		||||
    "movq        %%mm0,  (%1,%0)\n"
 | 
			
		||||
    "movq        %%mm1, 8(%1,%0)\n"
 | 
			
		||||
    "add $16, %0                \n"
 | 
			
		||||
    "js 1b                      \n"
 | 
			
		||||
    "emms                       \n"
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
FLOAT_TO_INT16_INTERLEAVE(sse2,
 | 
			
		||||
    "1:                         \n"
 | 
			
		||||
    "cvtps2dq  (%2,%0), %%xmm0  \n"
 | 
			
		||||
    "cvtps2dq  (%3,%0), %%xmm1  \n"
 | 
			
		||||
    "packssdw   %%xmm1, %%xmm0  \n"
 | 
			
		||||
    "movhlps    %%xmm0, %%xmm1  \n"
 | 
			
		||||
    "punpcklwd  %%xmm1, %%xmm0  \n"
 | 
			
		||||
    "movdqa     %%xmm0, (%1,%0) \n"
 | 
			
		||||
    "add $16, %0                \n"
 | 
			
		||||
    "js 1b                      \n"
 | 
			
		||||
)
 | 
			
		||||
FLOAT_TO_INT16_INTERLEAVE(3dnow)
 | 
			
		||||
FLOAT_TO_INT16_INTERLEAVE(sse)
 | 
			
		||||
FLOAT_TO_INT16_INTERLEAVE(sse2)
 | 
			
		||||
 | 
			
		||||
static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
 | 
			
		||||
    if(channels==6)
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user