Up to ~4 times faster on x86_64, ~8 times on x86_32 if compiling using x87 fp math. Reviewed-by: Ronald S. Bultje <rsbultje@gmail.com> Signed-off-by: James Almer <jamrial@gmail.com>
		
			
				
	
	
		
			417 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			417 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
/*
 | 
						|
 * Copyright (C) 2016 foo86
 | 
						|
 *
 | 
						|
 * This file is part of FFmpeg.
 | 
						|
 *
 | 
						|
 * FFmpeg is free software; you can redistribute it and/or
 | 
						|
 * modify it under the terms of the GNU Lesser General Public
 | 
						|
 * License as published by the Free Software Foundation; either
 | 
						|
 * version 2.1 of the License, or (at your option) any later version.
 | 
						|
 *
 | 
						|
 * FFmpeg is distributed in the hope that it will be useful,
 | 
						|
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
						|
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | 
						|
 * Lesser General Public License for more details.
 | 
						|
 *
 | 
						|
 * You should have received a copy of the GNU Lesser General Public
 | 
						|
 * License along with FFmpeg; if not, write to the Free Software
 | 
						|
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 | 
						|
 */
 | 
						|
 | 
						|
#include "libavutil/mem.h"
 | 
						|
 | 
						|
#include "dcadsp.h"
 | 
						|
#include "dcamath.h"
 | 
						|
 | 
						|
/**
 * Fill high-frequency subbands from a vector-quantization codebook.
 *
 * For each subband in [sb_start, sb_end) the codebook row hf_vq[vq_index[band]]
 * is multiplied by scale_factors[band][0], rounded, shifted right by 4 bits and
 * passed through clip23() before being stored at dst[band][ofs .. ofs + len - 1].
 *
 * @param dst            per-subband output sample arrays
 * @param vq_index       codebook row index for each subband
 * @param hf_vq          codebook, 1024 rows of 32 signed 8-bit entries
 * @param scale_factors  per-subband scale factors; only element [band][0] is read
 * @param sb_start       first subband to process
 * @param sb_end         one past the last subband to process
 * @param ofs            offset of the first output sample inside each subband
 * @param len            number of samples written per subband
 */
static void decode_hf_c(int32_t **dst,
                        const int32_t *vq_index,
                        const int8_t hf_vq[1024][32],
                        int32_t scale_factors[32][2],
                        ptrdiff_t sb_start, ptrdiff_t sb_end,
                        ptrdiff_t ofs, ptrdiff_t len)
{
    int band, n;

    for (band = sb_start; band < sb_end; band++) {
        const int8_t *codebook_row = hf_vq[vq_index[band]];
        const int32_t gain         = scale_factors[band][0];

        for (n = 0; n < len; n++) {
            // Add half of the discarded range (1 << 3) before dropping 4 bits
            const int32_t scaled = (codebook_row[n] * gain + (1 << 3)) >> 4;

            dst[band][n + ofs] = clip23(scaled);
        }
    }
}
 | 
						|
 | 
						|
/**
 * Derive joint-coded subband samples by scaling samples of a source channel.
 *
 * For each subband in [sb_start, sb_end), every sample src[band][ofs + n] is
 * combined with scale_factors[band] through mul17(), passed through clip23()
 * and written to the same position in dst[band].
 *
 * @param dst            per-subband output sample arrays
 * @param src            per-subband input sample arrays
 * @param scale_factors  one scale factor per subband
 * @param sb_start       first subband to process
 * @param sb_end         one past the last subband to process
 * @param ofs            offset of the first sample inside each subband
 * @param len            number of samples processed per subband
 */
static void decode_joint_c(int32_t **dst, int32_t **src,
                           const int32_t *scale_factors,
                           ptrdiff_t sb_start, ptrdiff_t sb_end,
                           ptrdiff_t ofs, ptrdiff_t len)
{
    int band, n;

    for (band = sb_start; band < sb_end; band++) {
        const int32_t gain = scale_factors[band];

        for (n = 0; n < len; n++) {
            const ptrdiff_t pos = n + ofs;

            dst[band][pos] = clip23(mul17(src[band][pos], gain));
        }
    }
}
 | 
						|
 | 
						|
/**
 * Interpolate decimated LFE samples with a floating-point FIR filter.
 *
 * Each input sample produces (64 << dec_select) output samples using
 * (8 >> dec_select) filter taps per output. The first half of every output
 * group walks the coefficient table forwards from index 0, the second half
 * walks it backwards from index 255.
 *
 * @param pcm_samples   output buffer, (64 << dec_select) floats per LFE sample
 * @param lfe_samples   current LFE sample; the (8 >> dec_select) - 1 samples
 *                      stored before it are read as filter history
 * @param filter_coeff  coefficient table, indexed from 0 to 255
 * @param npcmblocks    number of PCM blocks; npcmblocks >> (dec_select + 1)
 *                      LFE samples are consumed
 * @param dec_select    decimation selector (the wrappers in this file pass 0 or 1)
 */
static void lfe_fir_float_c(float *pcm_samples, int32_t *lfe_samples,
                            const float *filter_coeff, ptrdiff_t npcmblocks,
                            int dec_select)
{
    const int factor      = 64 << dec_select;
    const int half        = factor / 2;
    const int ntaps       = 8 >> dec_select;
    const int nlfesamples = npcmblocks >> (dec_select + 1);
    int blk, phase, tap;

    for (blk = 0; blk < nlfesamples; blk++) {
        const int32_t *cur = lfe_samples + blk;
        float         *out = pcm_samples + blk * factor;

        // One decimated sample generates 64 or 128 interpolated ones
        for (phase = 0; phase < half; phase++) {
            float fwd = 0;
            float rev = 0;

            for (tap = 0; tap < ntaps; tap++) {
                fwd += filter_coeff[      phase * ntaps + tap] * cur[-tap];
                rev += filter_coeff[255 - phase * ntaps - tap] * cur[-tap];
            }

            out[phase]        = fwd;
            out[half + phase] = rev;
        }
    }
}
 | 
						|
 | 
						|
/**
 * LFE interpolation entry point for decimation selector 0.
 *
 * Forwards to lfe_fir_float_c() with dec_select fixed to 0, i.e. 64 output
 * samples per LFE sample and 8 filter taps per output.
 */
static void lfe_fir0_float_c(float *pcm_samples, int32_t *lfe_samples,
                             const float *filter_coeff, ptrdiff_t npcmblocks)
{
    enum { DEC_SELECT = 0 };

    lfe_fir_float_c(pcm_samples, lfe_samples, filter_coeff, npcmblocks,
                    DEC_SELECT);
}
 | 
						|
 | 
						|
/**
 * LFE interpolation entry point for decimation selector 1.
 *
 * Forwards to lfe_fir_float_c() with dec_select fixed to 1, i.e. 128 output
 * samples per LFE sample and 4 filter taps per output.
 */
static void lfe_fir1_float_c(float *pcm_samples, int32_t *lfe_samples,
                             const float *filter_coeff, ptrdiff_t npcmblocks)
{
    enum { DEC_SELECT = 1 };

    lfe_fir_float_c(pcm_samples, lfe_samples, filter_coeff, npcmblocks,
                    DEC_SELECT);
}
 | 
						|
 | 
						|
/**
 * Double the sample count of a float signal by linear interpolation.
 *
 * Every input sample yields two outputs that blend it with the previous
 * input sample using weights 0.25/0.75 and then 0.75/0.25.
 *
 * @param dst   output buffer, receives 2 * len samples
 * @param src   input samples
 * @param hist  in: sample preceding src[0]; out: last input sample consumed
 *              (left unchanged when len is 0)
 * @param len   number of input samples
 */
static void lfe_x96_float_c(float *dst, const float *src,
                            float *hist, ptrdiff_t len)
{
    float last = *hist;
    int n;

    for (n = 0; n < len; n++) {
        // Cache the input first so both outputs use the same value
        const float cur    = src[n];
        const float first  = 0.25f * cur + 0.75f * last;
        const float second = 0.75f * cur + 0.25f * last;

        last           = cur;
        dst[2 * n]     = first;
        dst[2 * n + 1] = second;
    }

    *hist = last;
}
 | 
						|
 | 
						|
static void sub_qmf32_float_c(SynthFilterContext *synth,
 | 
						|
                              FFTContext *imdct,
 | 
						|
                              float *pcm_samples,
 | 
						|
                              int32_t **subband_samples_lo,
 | 
						|
                              int32_t **subband_samples_hi,
 | 
						|
                              float *hist1, int *offset, float *hist2,
 | 
						|
                              const float *filter_coeff, ptrdiff_t npcmblocks,
 | 
						|
                              float scale)
 | 
						|
{
 | 
						|
    LOCAL_ALIGNED(32, float, input, [32]);
 | 
						|
    int i, j;
 | 
						|
 | 
						|
    for (j = 0; j < npcmblocks; j++) {
 | 
						|
        // Load in one sample from each subband
 | 
						|
        for (i = 0; i < 32; i++) {
 | 
						|
            if ((i - 1) & 2)
 | 
						|
                input[i] = -subband_samples_lo[i][j];
 | 
						|
            else
 | 
						|
                input[i] =  subband_samples_lo[i][j];
 | 
						|
        }
 | 
						|
 | 
						|
        // One subband sample generates 32 interpolated ones
 | 
						|
        synth->synth_filter_float(imdct, hist1, offset,
 | 
						|
                                  hist2, filter_coeff,
 | 
						|
                                  pcm_samples, input, scale);
 | 
						|
        pcm_samples += 32;
 | 
						|
    }
 | 
						|
}
 | 
						|
 | 
						|
static void sub_qmf64_float_c(SynthFilterContext *synth,
 | 
						|
                              FFTContext *imdct,
 | 
						|
                              float *pcm_samples,
 | 
						|
                              int32_t **subband_samples_lo,
 | 
						|
                              int32_t **subband_samples_hi,
 | 
						|
                              float *hist1, int *offset, float *hist2,
 | 
						|
                              const float *filter_coeff, ptrdiff_t npcmblocks,
 | 
						|
                              float scale)
 | 
						|
{
 | 
						|
    LOCAL_ALIGNED(32, float, input, [64]);
 | 
						|
    int i, j;
 | 
						|
 | 
						|
    if (!subband_samples_hi)
 | 
						|
        memset(&input[32], 0, sizeof(input[0]) * 32);
 | 
						|
 | 
						|
    for (j = 0; j < npcmblocks; j++) {
 | 
						|
        // Load in one sample from each subband
 | 
						|
        if (subband_samples_hi) {
 | 
						|
            // Full 64 subbands, first 32 are residual coded
 | 
						|
            for (i =  0; i < 32; i++) {
 | 
						|
                if ((i - 1) & 2)
 | 
						|
                    input[i] = -subband_samples_lo[i][j] - subband_samples_hi[i][j];
 | 
						|
                else
 | 
						|
                    input[i] =  subband_samples_lo[i][j] + subband_samples_hi[i][j];
 | 
						|
            }
 | 
						|
            for (i = 32; i < 64; i++) {
 | 
						|
                if ((i - 1) & 2)
 | 
						|
                    input[i] = -subband_samples_hi[i][j];
 | 
						|
                else
 | 
						|
                    input[i] =  subband_samples_hi[i][j];
 | 
						|
            }
 | 
						|
        } else {
 | 
						|
            // Only first 32 subbands
 | 
						|
            for (i =  0; i < 32; i++) {
 | 
						|
                if ((i - 1) & 2)
 | 
						|
                    input[i] = -subband_samples_lo[i][j];
 | 
						|
                else
 | 
						|
                    input[i] =  subband_samples_lo[i][j];
 | 
						|
            }
 | 
						|
        }
 | 
						|
 | 
						|
        // One subband sample generates 64 interpolated ones
 | 
						|
        synth->synth_filter_float_64(imdct, hist1, offset,
 | 
						|
                                     hist2, filter_coeff,
 | 
						|
                                     pcm_samples, input, scale);
 | 
						|
        pcm_samples += 64;
 | 
						|
    }
 | 
						|
}
 | 
						|
 | 
						|
/**
 * Interpolate decimated LFE samples with a fixed-point FIR filter.
 *
 * Each input sample produces 64 output samples using 8 filter taps per
 * output. The first 32 outputs walk the coefficient table forwards from
 * index 0, the last 32 walk it backwards from index 255. Sums are
 * accumulated in 64 bits, then passed through norm23() and clip23().
 *
 * @param pcm_samples   output buffer, 64 samples per LFE sample
 * @param lfe_samples   current LFE sample; the 7 samples stored before it
 *                      are read as filter history
 * @param filter_coeff  coefficient table, indexed from 0 to 255
 * @param npcmblocks    number of PCM blocks; npcmblocks >> 1 LFE samples
 *                      are consumed
 */
static void lfe_fir_fixed_c(int32_t *pcm_samples, int32_t *lfe_samples,
                            const int32_t *filter_coeff, ptrdiff_t npcmblocks)
{
    const int nlfesamples = npcmblocks >> 1;
    int blk, phase, tap;

    for (blk = 0; blk < nlfesamples; blk++) {
        const int32_t *cur = lfe_samples + blk;
        int32_t       *out = pcm_samples + blk * 64;

        // One decimated sample generates 64 interpolated ones
        for (phase = 0; phase < 32; phase++) {
            int64_t fwd = 0;
            int64_t rev = 0;

            for (tap = 0; tap < 8; tap++) {
                fwd += (int64_t)filter_coeff[      phase * 8 + tap] * cur[-tap];
                rev += (int64_t)filter_coeff[255 - phase * 8 - tap] * cur[-tap];
            }

            out[phase]      = clip23(norm23(fwd));
            out[32 + phase] = clip23(norm23(rev));
        }
    }
}
 | 
						|
 | 
						|
/**
 * Double the sample count of a fixed-point signal by linear interpolation.
 *
 * Every input sample yields two outputs that blend it with the previous
 * input sample. The two integer weights, 2097471 and 6291137, sum to
 * 1 << 23; the 64-bit result goes through norm23() and clip23().
 *
 * @param dst   output buffer, receives 2 * len samples
 * @param src   input samples
 * @param hist  in: sample preceding src[0]; out: last input sample consumed
 *              (left unchanged when len is 0)
 * @param len   number of input samples
 */
static void lfe_x96_fixed_c(int32_t *dst, const int32_t *src,
                            int32_t *hist, ptrdiff_t len)
{
    int32_t last = *hist;
    int n;

    for (n = 0; n < len; n++) {
        // Cache the input first so both outputs use the same value
        const int32_t cur    = src[n];
        const int64_t first  = INT64_C(2097471) * cur + INT64_C(6291137) * last;
        const int64_t second = INT64_C(6291137) * cur + INT64_C(2097471) * last;

        last           = cur;
        dst[2 * n]     = clip23(norm23(first));
        dst[2 * n + 1] = clip23(norm23(second));
    }

    *hist = last;
}
 | 
						|
 | 
						|
static void sub_qmf32_fixed_c(SynthFilterContext *synth,
 | 
						|
                              DCADCTContext *imdct,
 | 
						|
                              int32_t *pcm_samples,
 | 
						|
                              int32_t **subband_samples_lo,
 | 
						|
                              int32_t **subband_samples_hi,
 | 
						|
                              int32_t *hist1, int *offset, int32_t *hist2,
 | 
						|
                              const int32_t *filter_coeff, ptrdiff_t npcmblocks)
 | 
						|
{
 | 
						|
    LOCAL_ALIGNED(32, int32_t, input, [32]);
 | 
						|
    int i, j;
 | 
						|
 | 
						|
    for (j = 0; j < npcmblocks; j++) {
 | 
						|
        // Load in one sample from each subband
 | 
						|
        for (i = 0; i < 32; i++)
 | 
						|
            input[i] = subband_samples_lo[i][j];
 | 
						|
 | 
						|
        // One subband sample generates 32 interpolated ones
 | 
						|
        synth->synth_filter_fixed(imdct, hist1, offset,
 | 
						|
                                  hist2, filter_coeff,
 | 
						|
                                  pcm_samples, input);
 | 
						|
        pcm_samples += 32;
 | 
						|
    }
 | 
						|
}
 | 
						|
 | 
						|
static void sub_qmf64_fixed_c(SynthFilterContext *synth,
 | 
						|
                              DCADCTContext *imdct,
 | 
						|
                              int32_t *pcm_samples,
 | 
						|
                              int32_t **subband_samples_lo,
 | 
						|
                              int32_t **subband_samples_hi,
 | 
						|
                              int32_t *hist1, int *offset, int32_t *hist2,
 | 
						|
                              const int32_t *filter_coeff, ptrdiff_t npcmblocks)
 | 
						|
{
 | 
						|
    LOCAL_ALIGNED(32, int32_t, input, [64]);
 | 
						|
    int i, j;
 | 
						|
 | 
						|
    if (!subband_samples_hi)
 | 
						|
        memset(&input[32], 0, sizeof(input[0]) * 32);
 | 
						|
 | 
						|
    for (j = 0; j < npcmblocks; j++) {
 | 
						|
        // Load in one sample from each subband
 | 
						|
        if (subband_samples_hi) {
 | 
						|
            // Full 64 subbands, first 32 are residual coded
 | 
						|
            for (i =  0; i < 32; i++)
 | 
						|
                input[i] = subband_samples_lo[i][j] + subband_samples_hi[i][j];
 | 
						|
            for (i = 32; i < 64; i++)
 | 
						|
                input[i] = subband_samples_hi[i][j];
 | 
						|
        } else {
 | 
						|
            // Only first 32 subbands
 | 
						|
            for (i =  0; i < 32; i++)
 | 
						|
                input[i] = subband_samples_lo[i][j];
 | 
						|
        }
 | 
						|
 | 
						|
        // One subband sample generates 64 interpolated ones
 | 
						|
        synth->synth_filter_fixed_64(imdct, hist1, offset,
 | 
						|
                                     hist2, filter_coeff,
 | 
						|
                                     pcm_samples, input);
 | 
						|
        pcm_samples += 64;
 | 
						|
    }
 | 
						|
}
 | 
						|
 | 
						|
/**
 * Add a scaled and rounded copy of src[] onto dst[].
 *
 * For each sample: dst[i] += (src[i] * coeff + 4) >> 3.
 *
 * The multiply, the rounding add and the accumulation are carried out in
 * unsigned 32-bit arithmetic so that out-of-range input wraps modulo 2^32
 * instead of triggering signed-overflow undefined behaviour. Results for
 * inputs that do not overflow are unchanged.
 *
 * @param dst    samples updated in place
 * @param src    samples to scale and add
 * @param coeff  multiplier applied to src before the 3-bit shift
 * @param len    number of samples
 */
static void decor_c(int32_t *dst, const int32_t *src, int coeff, ptrdiff_t len)
{
    ptrdiff_t i;

    for (i = 0; i < len; i++) {
        // Product plus rounding term (1 << 2), wrapping on overflow
        const uint32_t prod = (uint32_t)src[i] * (uint32_t)coeff + (1u << 2);
        // Shift the signed reinterpretation so negative values keep their sign
        const int32_t  term = (int32_t)prod >> 3;

        dst[i] = (int32_t)((uint32_t)dst[i] + (uint32_t)term);
    }
}
 | 
						|
 | 
						|
/**
 * Subtract a scaled copy of src[] from two destination channels.
 *
 * Each source sample is multiplied by 5931520 via mul23() and the result is
 * subtracted from both dst1[i] and dst2[i].
 *
 * The subtractions are performed in unsigned 32-bit arithmetic so that an
 * out-of-range difference wraps modulo 2^32 instead of triggering
 * signed-overflow undefined behaviour. In-range results are unchanged.
 *
 * @param dst1  first channel, updated in place
 * @param dst2  second channel, updated in place
 * @param src   channel to remove from both destinations
 * @param len   number of samples
 */
static void dmix_sub_xch_c(int32_t *dst1, int32_t *dst2,
                           const int32_t *src, ptrdiff_t len)
{
    ptrdiff_t i;

    for (i = 0; i < len; i++) {
        const uint32_t cs = (uint32_t)mul23(src[i], 5931520 /* M_SQRT1_2 * (1 << 23) */);

        dst1[i] = (int32_t)((uint32_t)dst1[i] - cs);
        dst2[i] = (int32_t)((uint32_t)dst2[i] - cs);
    }
}
 | 
						|
 | 
						|
/**
 * Subtract src[] scaled by coeff from dst[].
 *
 * For each sample: dst[i] -= mul15(src[i], coeff).
 *
 * The subtraction is performed in unsigned 32-bit arithmetic so that an
 * out-of-range difference wraps modulo 2^32 instead of triggering
 * signed-overflow undefined behaviour. In-range results are unchanged.
 *
 * @param dst    samples updated in place
 * @param src    samples to scale and subtract
 * @param coeff  scale passed to mul15()
 * @param len    number of samples
 */
static void dmix_sub_c(int32_t *dst, const int32_t *src, int coeff, ptrdiff_t len)
{
    ptrdiff_t i;

    for (i = 0; i < len; i++)
        dst[i] = (int32_t)((uint32_t)dst[i] - (uint32_t)mul15(src[i], coeff));
}
 | 
						|
 | 
						|
/**
 * Add src[] scaled by coeff onto dst[].
 *
 * For each sample: dst[i] += mul15(src[i], coeff).
 *
 * The addition is performed in unsigned 32-bit arithmetic so that an
 * out-of-range sum wraps modulo 2^32 instead of triggering signed-overflow
 * undefined behaviour. In-range results are unchanged.
 *
 * @param dst    samples updated in place
 * @param src    samples to scale and add
 * @param coeff  scale passed to mul15()
 * @param len    number of samples
 */
static void dmix_add_c(int32_t *dst, const int32_t *src, int coeff, ptrdiff_t len)
{
    ptrdiff_t i;

    for (i = 0; i < len; i++)
        dst[i] = (int32_t)((uint32_t)dst[i] + (uint32_t)mul15(src[i], coeff));
}
 | 
						|
 | 
						|
/**
 * Scale every sample of dst[] in place with mul15().
 *
 * @param dst    samples updated in place
 * @param scale  scale passed to mul15()
 * @param len    number of samples
 */
static void dmix_scale_c(int32_t *dst, int scale, ptrdiff_t len)
{
    int n = 0;

    while (n < len) {
        dst[n] = mul15(dst[n], scale);
        n++;
    }
}
 | 
						|
 | 
						|
/**
 * Scale every sample of dst[] in place with mul16().
 *
 * @param dst        samples updated in place
 * @param scale_inv  scale passed to mul16()
 * @param len        number of samples
 */
static void dmix_scale_inv_c(int32_t *dst, int scale_inv, ptrdiff_t len)
{
    int n = 0;

    while (n < len) {
        dst[n] = mul16(dst[n], scale_inv);
        n++;
    }
}
 | 
						|
 | 
						|
/**
 * Subtract src[] scaled by coeff via mul22() from dst[].
 *
 * For each sample: dst[i] -= mul22(src[i], coeff).
 *
 * The subtraction is performed in unsigned 32-bit arithmetic so that an
 * out-of-range difference wraps modulo 2^32 instead of triggering
 * signed-overflow undefined behaviour. In-range results are unchanged.
 *
 * @param dst    samples updated in place
 * @param src    samples to scale and subtract
 * @param coeff  scale passed to mul22()
 * @param len    number of samples
 */
static void filter0(int32_t *dst, const int32_t *src, int32_t coeff, ptrdiff_t len)
{
    ptrdiff_t i;

    for (i = 0; i < len; i++)
        dst[i] = (int32_t)((uint32_t)dst[i] - (uint32_t)mul22(src[i], coeff));
}
 | 
						|
 | 
						|
/**
 * Subtract src[] scaled by coeff via mul23() from dst[].
 *
 * For each sample: dst[i] -= mul23(src[i], coeff).
 *
 * The subtraction is performed in unsigned 32-bit arithmetic so that an
 * out-of-range difference wraps modulo 2^32 instead of triggering
 * signed-overflow undefined behaviour. In-range results are unchanged.
 *
 * @param dst    samples updated in place
 * @param src    samples to scale and subtract
 * @param coeff  scale passed to mul23()
 * @param len    number of samples
 */
static void filter1(int32_t *dst, const int32_t *src, int32_t coeff, ptrdiff_t len)
{
    ptrdiff_t i;

    for (i = 0; i < len; i++)
        dst[i] = (int32_t)((uint32_t)dst[i] - (uint32_t)mul23(src[i], coeff));
}
 | 
						|
 | 
						|
/**
 * Combine two bands into one interleaved output of 2 * len samples.
 *
 * Both inputs are modified in place. First, four alternating filter0()
 * passes use coeff[0..3]. Then eight rounds of three filter1() passes are
 * applied, where round r operates on src0 shifted back by r samples and uses
 * coeff[r + 4], coeff[r + 12] and coeff[r + 4] again. Finally src1[n] and
 * src0[n - 7] are interleaved into dst.
 *
 * @param dst    output buffer, receives 2 * len samples
 * @param src0   first band; samples src0[-7] .. src0[len - 1] are accessed
 * @param src1   second band, len samples
 * @param coeff  20 filter coefficients
 * @param len    number of samples per band
 */
static void assemble_freq_bands_c(int32_t *dst, int32_t *src0, int32_t *src1,
                                  const int32_t *coeff, ptrdiff_t len)
{
    int32_t *delayed;
    int round, n;

    filter0(src0, src1, coeff[0], len);
    filter0(src1, src0, coeff[1], len);
    filter0(src0, src1, coeff[2], len);
    filter0(src1, src0, coeff[3], len);

    for (round = 0; round < 8; round++) {
        // Each round looks one sample further back into src0
        int32_t *shifted = src0 - round;

        filter1(shifted, src1, coeff[round +  4], len);
        filter1(src1, shifted, coeff[round + 12], len);
        filter1(shifted, src1, coeff[round +  4], len);
    }

    // Interleave src1 with src0 delayed by 7 samples
    delayed = src0 - 7;
    for (n = 0; n < len; n++) {
        dst[2 * n]     = src1[n];
        dst[2 * n + 1] = delayed[n];
    }
}
 | 
						|
 | 
						|
/**
 * Populate a DCADSPContext with the C reference implementations from this
 * file, then let ff_dcadsp_init_x86() adjust it when ARCH_X86 is non-zero.
 *
 * @param s  context whose function pointers are filled in
 */
av_cold void ff_dcadsp_init(DCADSPContext *s)
{
    // Subband decoding helpers
    s->decode_hf    = decode_hf_c;
    s->decode_joint = decode_joint_c;
    s->decor        = decor_c;

    // Floating-point output path
    s->lfe_fir_float[0] = lfe_fir0_float_c;
    s->lfe_fir_float[1] = lfe_fir1_float_c;
    s->lfe_x96_float    = lfe_x96_float_c;
    s->sub_qmf_float[0] = sub_qmf32_float_c;
    s->sub_qmf_float[1] = sub_qmf64_float_c;

    // Fixed-point output path
    s->lfe_fir_fixed    = lfe_fir_fixed_c;
    s->lfe_x96_fixed    = lfe_x96_fixed_c;
    s->sub_qmf_fixed[0] = sub_qmf32_fixed_c;
    s->sub_qmf_fixed[1] = sub_qmf64_fixed_c;

    // Downmix helpers
    s->dmix_sub_xch   = dmix_sub_xch_c;
    s->dmix_sub       = dmix_sub_c;
    s->dmix_add       = dmix_add_c;
    s->dmix_scale     = dmix_scale_c;
    s->dmix_scale_inv = dmix_scale_inv_c;

    // Band assembly
    s->assemble_freq_bands = assemble_freq_bands_c;

    // Must run last so it can replace any of the pointers set above
    if (ARCH_X86) {
        ff_dcadsp_init_x86(s);
    }
}
 |