ac3enc: Add x86-optimized function to speed up log2_tab().
AC3DSPContext.ac3_max_msb_abs_int16() returns a value that has the same MSB as the largest absolute value in an array of int16_t.

Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
This commit is contained in:
		
							parent
							
								
									1a973feb45
								
							
						
					
					
						commit
						fbb6b49dab
					
				@ -42,9 +42,18 @@ static void ac3_exponent_min_c(uint8_t *exp, int num_reuse_blocks, int nb_coefs)
 | 
				
			|||||||
    }
 | 
					    }
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/**
					 * Portable C version of the ac3_max_msb_abs_int16 DSP routine.
					 *
					 * ORs together the absolute values of the first @p len elements of
					 * @p src. The result is not the maximum itself, but its highest set
					 * bit is the same as that of the largest absolute value.
					 *
					 * @param src input samples
					 * @param len number of samples to examine
					 * @return    bitwise OR of abs(src[k]) for all k in [0, len);
					 *            0 if the loop never runs
					 */
					static int ac3_max_msb_abs_int16_c(const int16_t *src, int len)
					{
					    int acc = 0;
					    int idx = 0;

					    while (idx < len) {
					        /* src[idx] is promoted to int before abs() is applied. */
					        acc |= abs(src[idx]);
					        idx++;
					    }

					    return acc;
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
av_cold void ff_ac3dsp_init(AC3DSPContext *c)
 | 
					av_cold void ff_ac3dsp_init(AC3DSPContext *c)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
    c->ac3_exponent_min = ac3_exponent_min_c;
 | 
					    c->ac3_exponent_min = ac3_exponent_min_c;
 | 
				
			||||||
 | 
					    c->ac3_max_msb_abs_int16 = ac3_max_msb_abs_int16_c;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if (HAVE_MMX)
 | 
					    if (HAVE_MMX)
 | 
				
			||||||
        ff_ac3dsp_init_x86(c);
 | 
					        ff_ac3dsp_init_x86(c);
 | 
				
			||||||
 | 
				
			|||||||
@ -35,6 +35,17 @@ typedef struct AC3DSPContext {
 | 
				
			|||||||
     * @param nb_coefs  number of frequency coefficients.
 | 
					     * @param nb_coefs  number of frequency coefficients.
 | 
				
			||||||
     */
 | 
					     */
 | 
				
			||||||
    void (*ac3_exponent_min)(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
 | 
					    void (*ac3_exponent_min)(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    /**
 | 
				
			||||||
 | 
					     * Calculate the maximum MSB of the absolute value of each element in an
 | 
				
			||||||
 | 
					     * array of int16_t.
 | 
				
			||||||
 | 
					     * @param src input array
 | 
				
			||||||
 | 
					     *            constraints: align 16. values must be in range [-32767,32767]
 | 
				
			||||||
 | 
					     * @param len number of values in the array
 | 
				
			||||||
 | 
					     *            constraints: multiple of 16 greater than 0
 | 
				
			||||||
 | 
					     * @return    a value with the same MSB as max(abs(src[]))
 | 
				
			||||||
 | 
					     */
 | 
				
			||||||
 | 
					    int (*ac3_max_msb_abs_int16)(const int16_t *src, int len);
 | 
				
			||||||
} AC3DSPContext;
 | 
					} AC3DSPContext;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void ff_ac3dsp_init    (AC3DSPContext *c);
 | 
					void ff_ac3dsp_init    (AC3DSPContext *c);
 | 
				
			||||||
 | 
				
			|||||||
@ -270,14 +270,9 @@ static void apply_window(DSPContext *dsp, int16_t *output, const int16_t *input,
 | 
				
			|||||||
 * @param n   number of values in the array
 | 
					 * @param n   number of values in the array
 | 
				
			||||||
 * @return    log2(max(abs(tab[])))
 | 
					 * @return    log2(max(abs(tab[])))
 | 
				
			||||||
 */
 | 
					 */
 | 
				
			||||||
static int log2_tab(int16_t *tab, int n)
 | 
					static int log2_tab(AC3EncodeContext *s, int16_t *src, int len)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
    int i, v;
 | 
					    int v = s->ac3dsp.ac3_max_msb_abs_int16(src, len);
 | 
				
			||||||
 | 
					 | 
				
			||||||
    v = 0;
 | 
					 | 
				
			||||||
    for (i = 0; i < n; i++)
 | 
					 | 
				
			||||||
        v |= abs(tab[i]);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    return av_log2(v);
 | 
					    return av_log2(v);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -308,7 +303,7 @@ static void lshift_tab(int16_t *tab, int n, unsigned int lshift)
 | 
				
			|||||||
 */
 | 
					 */
 | 
				
			||||||
static int normalize_samples(AC3EncodeContext *s)
 | 
					static int normalize_samples(AC3EncodeContext *s)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
    int v = 14 - log2_tab(s->windowed_samples, AC3_WINDOW_SIZE);
 | 
					    int v = 14 - log2_tab(s, s->windowed_samples, AC3_WINDOW_SIZE);
 | 
				
			||||||
    lshift_tab(s->windowed_samples, AC3_WINDOW_SIZE, v);
 | 
					    lshift_tab(s->windowed_samples, AC3_WINDOW_SIZE, v);
 | 
				
			||||||
    return v - 9;
 | 
					    return v - 9;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
				
			|||||||
@ -65,3 +65,72 @@ AC3_EXPONENT_MIN sse2
 | 
				
			|||||||
%endif
 | 
					%endif
 | 
				
			||||||
%undef PMINUB
 | 
					%undef PMINUB
 | 
				
			||||||
%undef LOOP_ALIGN
 | 
					%undef LOOP_ALIGN
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					;-----------------------------------------------------------------------------
					; int ff_ac3_max_msb_abs_int16(const int16_t *src, int len)
					;
					; Returns, in the low 16 bits of eax, a value with the same most-significant
					; set bit as the largest absolute value in src[].
					;
					; Macro arguments:
					;   %1  suffix of the generated function name (mmx, mmxext, sse2, ssse3)
					;   %2  reduction method, or_abs or min_max
					;
					; This function uses 2 different methods to calculate a valid result.
					; 1) or_abs: logical 'or' of abs of each element
					;        This is used for ssse3 because of the pabsw instruction.
					;        It is also used for mmx because of the lack of min/max instructions.
					; 2) min_max: calculate min/max for the array, then or(abs(min),abs(max))
					;        This is used for mmxext and sse2 because they have pminsw/pmaxsw.
					;
					; Each loop iteration reads 2*mmsize bytes (= mmsize int16 values) and
					; subtracts mmsize from len. The loop body runs before len is tested, so
					; len must be a positive multiple of mmsize.
					; NOTE(review): the loads presumably require src to be mmsize-aligned;
					; confirm against the definition of mova in the shared macro headers.
					;-----------------------------------------------------------------------------

					%macro AC3_MAX_MSB_ABS_INT16 2
					cglobal ac3_max_msb_abs_int16_%1, 2,2,5, src, len
					    ; m2/m3 are the accumulators. Starting both at zero keeps
					    ; min <= 0 <= max in the min_max method.
					    pxor        m2, m2
					    pxor        m3, m3
					.loop:
					%ifidn %2, min_max
					    ; m2 = running per-lane minimum, m3 = running per-lane maximum
					    mova        m0, [srcq]
					    mova        m1, [srcq+mmsize]
					    pminsw      m2, m0
					    pminsw      m2, m1
					    pmaxsw      m3, m0
					    pmaxsw      m3, m1
					%else ; or_abs
					%ifidn %1, mmx
					    ; m3 and m4 are only scratch registers for ABS2 here
					    mova        m0, [srcq]
					    mova        m1, [srcq+mmsize]
					    ABS2        m0, m1, m3, m4
					%else ; ssse3
					    ; using memory args is faster for ssse3
					    pabsw       m0, [srcq]
					    pabsw       m1, [srcq+mmsize]
					%endif
					    ; m2 = running per-lane OR of the absolute values
					    por         m2, m0
					    por         m2, m1
					%endif
					    add       srcq, mmsize*2
					    sub       lend, mmsize
					    ja .loop                    ; continue while len is still > 0
					%ifidn %2, min_max
					    ; combine the two extremes: m2 = abs(min) | abs(max) per lane
					    ABS2        m2, m3, m0, m1
					    por         m2, m3
					%endif
					%ifidn mmsize, 16
					    ; fold the high 64 bits of the xmm register onto the low 64 bits
					    mova        m0, m2
					    punpckhqdq  m0, m0
					    por         m2, m0
					%endif
					    ; fold the 4 remaining words into word 0
					%ifidn %1, mmx
					    ; pshufw is not part of the base MMX instruction set (it needs
					    ; SSE/MMXEXT), so the plain mmx variant folds with 64-bit shifts.
					    mova        m0, m2
					    psrlq       m0, 32          ; words 2,3 -> words 0,1
					    por         m2, m0
					    mova        m0, m2
					    psrlq       m0, 16          ; word 1 -> word 0
					    por         m2, m0
					%else
					    PSHUFLW     m0, m2, 0xe     ; words 2,3 -> words 0,1
					    por         m2, m0
					    PSHUFLW     m0, m2, 0x1     ; word 1 -> word 0
					    por         m2, m0
					%endif
					    movd       eax, m2
					    and        eax, 0xFFFF      ; only word 0 holds the result
					    RET
					%endmacro

					INIT_MMX
					%define ABS2 ABS2_MMX
					; PSHUFLW is only expanded by the mmxext variant in MMX register mode
					%define PSHUFLW pshufw
					AC3_MAX_MSB_ABS_INT16 mmx, or_abs
					%define ABS2 ABS2_MMX2
					AC3_MAX_MSB_ABS_INT16 mmxext, min_max
					INIT_XMM
					%define PSHUFLW pshuflw
					AC3_MAX_MSB_ABS_INT16 sse2, min_max
					%define ABS2 ABS2_SSSE3
					AC3_MAX_MSB_ABS_INT16 ssse3, or_abs
 | 
				
			||||||
 | 
				
			|||||||
@ -27,6 +27,11 @@ extern void ff_ac3_exponent_min_mmx   (uint8_t *exp, int num_reuse_blocks, int n
 | 
				
			|||||||
extern void ff_ac3_exponent_min_mmxext(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
 | 
					extern void ff_ac3_exponent_min_mmxext(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
 | 
				
			||||||
extern void ff_ac3_exponent_min_sse2  (uint8_t *exp, int num_reuse_blocks, int nb_coefs);
 | 
					extern void ff_ac3_exponent_min_sse2  (uint8_t *exp, int num_reuse_blocks, int nb_coefs);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					extern int ff_ac3_max_msb_abs_int16_mmx   (const int16_t *src, int len);
 | 
				
			||||||
 | 
					extern int ff_ac3_max_msb_abs_int16_mmxext(const int16_t *src, int len);
 | 
				
			||||||
 | 
					extern int ff_ac3_max_msb_abs_int16_sse2  (const int16_t *src, int len);
 | 
				
			||||||
 | 
					extern int ff_ac3_max_msb_abs_int16_ssse3 (const int16_t *src, int len);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c)
 | 
					av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
    int mm_flags = av_get_cpu_flags();
 | 
					    int mm_flags = av_get_cpu_flags();
 | 
				
			||||||
@ -34,12 +39,18 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c)
 | 
				
			|||||||
#if HAVE_YASM
 | 
					#if HAVE_YASM
 | 
				
			||||||
    if (mm_flags & AV_CPU_FLAG_MMX) {
 | 
					    if (mm_flags & AV_CPU_FLAG_MMX) {
 | 
				
			||||||
        c->ac3_exponent_min = ff_ac3_exponent_min_mmx;
 | 
					        c->ac3_exponent_min = ff_ac3_exponent_min_mmx;
 | 
				
			||||||
 | 
					        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmx;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    if (mm_flags & AV_CPU_FLAG_MMX2 && HAVE_MMX2) {
 | 
					    if (mm_flags & AV_CPU_FLAG_MMX2 && HAVE_MMX2) {
 | 
				
			||||||
        c->ac3_exponent_min = ff_ac3_exponent_min_mmxext;
 | 
					        c->ac3_exponent_min = ff_ac3_exponent_min_mmxext;
 | 
				
			||||||
 | 
					        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmxext;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    if (mm_flags & AV_CPU_FLAG_SSE2 && HAVE_SSE) {
 | 
					    if (mm_flags & AV_CPU_FLAG_SSE2 && HAVE_SSE) {
 | 
				
			||||||
        c->ac3_exponent_min = ff_ac3_exponent_min_sse2;
 | 
					        c->ac3_exponent_min = ff_ac3_exponent_min_sse2;
 | 
				
			||||||
 | 
					        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    if (mm_flags & AV_CPU_FLAG_SSSE3 && HAVE_SSSE3) {
 | 
				
			||||||
 | 
					        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_ssse3;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user