3DNow!2 (Extended 3DNow!) implementation of the IMDCT.
Makes Vorbis and WMA decoding about 6% faster. Originally committed as revision 5954 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
		
							parent
							
								
									2c5ad5fd74
								
							
						
					
					
						commit
						bcfa3e58ee
					
				| @ -594,6 +594,8 @@ void get_psnr(uint8_t *orig_image[3], uint8_t *coded_image[3], | |||||||
|    FFTSample type */ |    FFTSample type */ | ||||||
| typedef float FFTSample; | typedef float FFTSample; | ||||||
| 
 | 
 | ||||||
|  | struct MDCTContext; | ||||||
|  | 
 | ||||||
| typedef struct FFTComplex { | typedef struct FFTComplex { | ||||||
|     FFTSample re, im; |     FFTSample re, im; | ||||||
| } FFTComplex; | } FFTComplex; | ||||||
| @ -605,6 +607,8 @@ typedef struct FFTContext { | |||||||
|     FFTComplex *exptab; |     FFTComplex *exptab; | ||||||
|     FFTComplex *exptab1; /* only used by SSE code */ |     FFTComplex *exptab1; /* only used by SSE code */ | ||||||
|     void (*fft_calc)(struct FFTContext *s, FFTComplex *z); |     void (*fft_calc)(struct FFTContext *s, FFTComplex *z); | ||||||
|  |     void (*imdct_calc)(struct MDCTContext *s, FFTSample *output, | ||||||
|  |                        const FFTSample *input, FFTSample *tmp); | ||||||
| } FFTContext; | } FFTContext; | ||||||
| 
 | 
 | ||||||
| int ff_fft_init(FFTContext *s, int nbits, int inverse); | int ff_fft_init(FFTContext *s, int nbits, int inverse); | ||||||
| @ -635,6 +639,8 @@ typedef struct MDCTContext { | |||||||
| int ff_mdct_init(MDCTContext *s, int nbits, int inverse); | int ff_mdct_init(MDCTContext *s, int nbits, int inverse); | ||||||
| void ff_imdct_calc(MDCTContext *s, FFTSample *output, | void ff_imdct_calc(MDCTContext *s, FFTSample *output, | ||||||
|                 const FFTSample *input, FFTSample *tmp); |                 const FFTSample *input, FFTSample *tmp); | ||||||
|  | void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output, | ||||||
|  |                         const FFTSample *input, FFTSample *tmp); | ||||||
| void ff_mdct_calc(MDCTContext *s, FFTSample *out, | void ff_mdct_calc(MDCTContext *s, FFTSample *out, | ||||||
|                const FFTSample *input, FFTSample *tmp); |                const FFTSample *input, FFTSample *tmp); | ||||||
| void ff_mdct_end(MDCTContext *s); | void ff_mdct_end(MDCTContext *s); | ||||||
|  | |||||||
| @ -54,6 +54,7 @@ int ff_fft_init(FFTContext *s, int nbits, int inverse) | |||||||
|         s->exptab[i].im = s1; |         s->exptab[i].im = s1; | ||||||
|     } |     } | ||||||
|     s->fft_calc = ff_fft_calc_c; |     s->fft_calc = ff_fft_calc_c; | ||||||
|  |     s->imdct_calc = ff_imdct_calc; | ||||||
|     s->exptab1 = NULL; |     s->exptab1 = NULL; | ||||||
| 
 | 
 | ||||||
|     /* compute constant table for HAVE_SSE version */ |     /* compute constant table for HAVE_SSE version */ | ||||||
| @ -62,11 +63,7 @@ int ff_fft_init(FFTContext *s, int nbits, int inverse) | |||||||
|         int has_vectors = 0; |         int has_vectors = 0; | ||||||
| 
 | 
 | ||||||
| #if defined(HAVE_MMX) | #if defined(HAVE_MMX) | ||||||
| #ifdef HAVE_MM3DNOW |  | ||||||
|         has_vectors = mm_support() & (MM_3DNOW | MM_3DNOWEXT | MM_SSE | MM_SSE2); |         has_vectors = mm_support() & (MM_3DNOW | MM_3DNOWEXT | MM_SSE | MM_SSE2); | ||||||
| #else |  | ||||||
|         has_vectors = mm_support() & (MM_SSE | MM_SSE2); |  | ||||||
| #endif |  | ||||||
| #endif | #endif | ||||||
| #if defined(HAVE_ALTIVEC) && !defined(ALTIVEC_USE_REFERENCE_C_CODE) | #if defined(HAVE_ALTIVEC) && !defined(ALTIVEC_USE_REFERENCE_C_CODE) | ||||||
|         has_vectors = mm_support() & MM_ALTIVEC; |         has_vectors = mm_support() & MM_ALTIVEC; | ||||||
| @ -98,6 +95,8 @@ int ff_fft_init(FFTContext *s, int nbits, int inverse) | |||||||
|             } while (nblocks != 0); |             } while (nblocks != 0); | ||||||
|             av_freep(&s->exptab); |             av_freep(&s->exptab); | ||||||
| #if defined(HAVE_MMX) | #if defined(HAVE_MMX) | ||||||
|  |             if (has_vectors & MM_3DNOWEXT) | ||||||
|  |                 s->imdct_calc = ff_imdct_calc_3dn2; | ||||||
| #ifdef HAVE_MM3DNOW | #ifdef HAVE_MM3DNOW | ||||||
|             if (has_vectors & MM_3DNOWEXT) |             if (has_vectors & MM_3DNOWEXT) | ||||||
|                 /* 3DNowEx for Athlon(XP) */ |                 /* 3DNowEx for Athlon(XP) */ | ||||||
|  | |||||||
| @ -1,6 +1,6 @@ | |||||||
| /*
 | /*
 | ||||||
|  * FFT/MDCT transform with Extended 3DNow! optimizations |  * FFT/MDCT transform with Extended 3DNow! optimizations | ||||||
|  * Copyright (c) 2006 Zuxy MENG Jie. |  * Copyright (c) 2006 Zuxy MENG Jie, Loren Merritt | ||||||
|  * Based on fft_sse.c copyright (c) 2002 Fabrice Bellard. |  * Based on fft_sse.c copyright (c) 2002 Fabrice Bellard. | ||||||
|  * |  * | ||||||
|  * This library is free software; you can redistribute it and/or |  * This library is free software; you can redistribute it and/or | ||||||
| @ -134,3 +134,84 @@ void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z) | |||||||
| } | } | ||||||
| 
 | 
 | ||||||
| #endif | #endif | ||||||
|  | 
 | ||||||
|  | void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output, | ||||||
|  |                         const FFTSample *input, FFTSample *tmp) | ||||||
|  | { | ||||||
|  |     int k, n8, n4, n2, n; | ||||||
|  |     const uint16_t *revtab = s->fft.revtab; | ||||||
|  |     const FFTSample *tcos = s->tcos; | ||||||
|  |     const FFTSample *tsin = s->tsin; | ||||||
|  |     const FFTSample *in1, *in2; | ||||||
|  |     FFTComplex *z = (FFTComplex *)tmp; | ||||||
|  | 
 | ||||||
|  |     n = 1 << s->nbits; | ||||||
|  |     n2 = n >> 1; | ||||||
|  |     n4 = n >> 2; | ||||||
|  |     n8 = n >> 3; | ||||||
|  | 
 | ||||||
|  |     /* pre rotation */ | ||||||
|  |     in1 = input; | ||||||
|  |     in2 = input + n2 - 1; | ||||||
|  |     for(k = 0; k < n4; k++) { | ||||||
|  |         asm volatile( | ||||||
|  |             "movd       %1, %%mm0 \n\t" | ||||||
|  |             "movd       %3, %%mm1 \n\t" | ||||||
|  |             "punpckldq  %2, %%mm0 \n\t" | ||||||
|  |             "punpckldq  %4, %%mm1 \n\t" | ||||||
|  |             "movq    %%mm0, %%mm2 \n\t" | ||||||
|  |             "pfmul   %%mm1, %%mm0 \n\t" | ||||||
|  |             "pswapd  %%mm1, %%mm1 \n\t" | ||||||
|  |             "pfmul   %%mm1, %%mm2 \n\t" | ||||||
|  |             "pfpnacc %%mm2, %%mm0 \n\t" | ||||||
|  |             "movq    %%mm0, %0    \n\t" | ||||||
|  |             :"=m"(z[revtab[k]]) | ||||||
|  |             :"m"(in2[-2*k]), "m"(in1[2*k]), | ||||||
|  |              "m"(tcos[k]), "m"(tsin[k]) | ||||||
|  |         ); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     ff_fft_calc(&s->fft, z); | ||||||
|  | 
 | ||||||
|  |     /* post rotation + reordering */ | ||||||
|  |     for(k = 0; k < n4; k++) { | ||||||
|  |         asm volatile( | ||||||
|  |             "movq       %0, %%mm0 \n\t" | ||||||
|  |             "movd       %1, %%mm1 \n\t" | ||||||
|  |             "punpckldq  %2, %%mm1 \n\t" | ||||||
|  |             "movq    %%mm0, %%mm2 \n\t" | ||||||
|  |             "pfmul   %%mm1, %%mm0 \n\t" | ||||||
|  |             "pswapd  %%mm1, %%mm1 \n\t" | ||||||
|  |             "pfmul   %%mm1, %%mm2 \n\t" | ||||||
|  |             "pfpnacc %%mm2, %%mm0 \n\t" | ||||||
|  |             "movq    %%mm0, %0    \n\t" | ||||||
|  |             :"+m"(z[k]) | ||||||
|  |             :"m"(tcos[k]), "m"(tsin[k]) | ||||||
|  |         ); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     asm volatile("movd %0, %%mm7" ::"r"(1<<31)); | ||||||
|  |     for(k = 0; k < n8; k++) { | ||||||
|  |         asm volatile( | ||||||
|  |             "movq         %4, %%mm0 \n\t" | ||||||
|  |             "pswapd       %5, %%mm1 \n\t" | ||||||
|  |             "movq      %%mm0, %%mm2 \n\t" | ||||||
|  |             "pxor      %%mm7, %%mm2 \n\t" | ||||||
|  |             "punpckldq %%mm1, %%mm2 \n\t" | ||||||
|  |             "pswapd    %%mm2, %%mm3 \n\t" | ||||||
|  |             "punpckhdq %%mm1, %%mm0 \n\t" | ||||||
|  |             "pswapd    %%mm0, %%mm4 \n\t" | ||||||
|  |             "pxor      %%mm7, %%mm0 \n\t" | ||||||
|  |             "pxor      %%mm7, %%mm4 \n\t" | ||||||
|  |             "movq      %%mm0, %0    \n\t" // { -z[n8+k].im, z[n8-1-k].re }
 | ||||||
|  |             "movq      %%mm4, %1    \n\t" // { -z[n8-1-k].re, z[n8+k].im }
 | ||||||
|  |             "movq      %%mm2, %2    \n\t" // { -z[n8+k].re, z[n8-1-k].im }
 | ||||||
|  |             "movq      %%mm3, %3    \n\t" // { z[n8-1-k].im, -z[n8+k].re }
 | ||||||
|  |             :"=m"(output[2*k]), "=m"(output[n2-2-2*k]), | ||||||
|  |              "=m"(output[n2+2*k]), "=m"(output[n-2-2*k]) | ||||||
|  |             :"m"(z[n8+k]), "m"(z[n8-1-k]) | ||||||
|  |             :"memory" | ||||||
|  |         ); | ||||||
|  |     } | ||||||
|  |     asm volatile("emms"); | ||||||
|  | } | ||||||
|  | |||||||
| @ -1598,7 +1598,7 @@ static int vorbis_parse_audio_packet(vorbis_context *vc) { | |||||||
| 
 | 
 | ||||||
|         saved_start=vc->saved_start; |         saved_start=vc->saved_start; | ||||||
| 
 | 
 | ||||||
|         ff_imdct_calc(vc->modes[mode_number].blockflag ? &vc->mdct1 : &vc->mdct0, buf, ch_floor_ptr, buf_tmp); |         vc->mdct0.fft.imdct_calc(vc->modes[mode_number].blockflag ? &vc->mdct1 : &vc->mdct0, buf, ch_floor_ptr, buf_tmp); | ||||||
| 
 | 
 | ||||||
|         if (vc->modes[mode_number].blockflag) { |         if (vc->modes[mode_number].blockflag) { | ||||||
|             // -- overlap/add
 |             // -- overlap/add
 | ||||||
|  | |||||||
| @ -1113,7 +1113,7 @@ static int wma_decode_block(WMADecodeContext *s) | |||||||
| 
 | 
 | ||||||
|             n = s->block_len; |             n = s->block_len; | ||||||
|             n4 = s->block_len / 2; |             n4 = s->block_len / 2; | ||||||
|             ff_imdct_calc(&s->mdct_ctx[bsize], |             s->mdct_ctx[bsize].fft.imdct_calc(&s->mdct_ctx[bsize], | ||||||
|                           output, s->coefs[ch], s->mdct_tmp); |                           output, s->coefs[ch], s->mdct_tmp); | ||||||
| 
 | 
 | ||||||
|             /* XXX: optimize all that by build the window and
 |             /* XXX: optimize all that by build the window and
 | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user