We currently mostly do not empty the MMX state in our MMX
DSP functions; instead we only do so before code that might
use the x87 FPU. This is a violation of the System V i386 ABI
(and maybe of other ABIs, too):
"The CPU shall be in x87 mode upon entry to a function. Therefore,
every function that uses the MMX registers is required to issue an
emms or femms instruction after using MMX registers, before returning
or calling another function." (See 2.2.1 in [1])
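For illustration only (this example is not part of the patch and the
function is hypothetical), C code using MMX via intrinsics would satisfy
this requirement by calling _mm_empty(), which emits emms, before
returning:
    #include <mmintrin.h>

    /* hypothetical example, not FFmpeg code */
    static void add_vec_mmx(__m64 *dst, const __m64 *src)
    {
        *dst = _mm_adds_pu8(*dst, *src); /* saturated unsigned byte add (MMX) */
        _mm_empty();                     /* emms: leave the CPU in x87 mode */
    }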
This patch does not intend to change all these functions to abide
by the ABI; it only does so for ff_simple_idct_mmx, as this
function can be called by external users, because it is exported
via the avdct API. Without this, the assertion in the following
fragment fails (on x86-32; ff_simple_idct_mmx is not used on x86-64):
    int16_t __attribute__ ((aligned (16))) block[64];
    AVDCT *dct = avcodec_dct_alloc();
    dct->idct_algo = FF_IDCT_AUTO;
    avcodec_dct_init(dct);
    dct->idct(block);
    av_assert0_fpu();
[1]: https://raw.githubusercontent.com/wiki/hjl-tools/x86-psABI/intel386-psABI-1.1.pdf
Reviewed-by: Kieran Kunhya <kierank@obe.tv>
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>

;
; Simple IDCT MMX
;
; Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
;
; Conversion from gcc syntax to x264asm syntax with minimal modifications
; by James Darnley <jdarnley@obe.tv>.
;
; This file is part of FFmpeg.
;
; FFmpeg is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public
; License as published by the Free Software Foundation; either
; version 2.1 of the License, or (at your option) any later version.
;
; FFmpeg is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public
; License along with FFmpeg; if not, write to the Free Software
; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

%if ARCH_X86_32
cextern pb_80

wm1010: dw 0, 0xffff, 0, 0xffff
d40000: dd 4 << 16, 0

; 23170.475006
; 22725.260826
; 21406.727617
; 19265.545870
; 16384.000000
; 12872.826198
; 8866.956905
; 4520.335430

%define C0 23170 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
%define C1 22725 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
%define C2 21407 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
%define C3 19266 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
%define C4 16383 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5
%define C5 12873 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
%define C6 8867  ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
%define C7 4520  ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5

%define ROW_SHIFT 11
%define COL_SHIFT 20 ; 6

coeffs:
    dw 1 << (ROW_SHIFT - 1), 0
    dw 1 << (ROW_SHIFT - 1), 0
    dw 1 << (ROW_SHIFT - 1), 1
    dw 1 << (ROW_SHIFT - 1), 0

    dw C4,  C4,  C4,  C4
    dw C4, -C4,  C4, -C4

    dw C2,  C6,  C2,  C6
    dw C6, -C2,  C6, -C2

    dw C1,  C3,  C1,  C3
    dw C5,  C7,  C5,  C7

    dw C3, -C7,  C3, -C7
    dw -C1, -C5, -C1, -C5

    dw C5, -C1,  C5, -C1
    dw C7,  C3,  C7,  C3

    dw C7, -C5,  C7, -C5
    dw C3, -C1,  C3, -C1

SECTION .text

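; Row pass helpers: DC_COND_IDCT transforms a pair of rows, taking a shortcut
; when the pair contains nothing but its DC coefficients, while Z_COND_IDCT
; jumps to the label passed as %8 when the pair is entirely zero so that a
; reduced column pass can be used. Both store their intermediate results in
; the stack buffer passed as %5.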
%macro DC_COND_IDCT 7
    movq            mm0, [blockq + %1]  ; R4     R0      r4      r0
    movq            mm1, [blockq + %2]  ; R6     R2      r6      r2
    movq            mm2, [blockq + %3]  ; R3     R1      r3      r1
    movq            mm3, [blockq + %4]  ; R7     R5      r7      r5
    movq            mm4, [wm1010]
    pand            mm4, mm0
    por             mm4, mm1
    por             mm4, mm2
    por             mm4, mm3
    packssdw        mm4, mm4
    movd            t0d, mm4
    or              t0d, t0d
    jz              %%1
    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
    movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
    pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
    movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
    pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
    movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
    pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
    paddd           mm4, [coeffs + 8]
    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
    paddd           mm4, mm5            ; A0             a0
    psubd           mm6, mm5            ; A3             a3
    movq            mm5, [coeffs + 56]  ; C7     C5      C7      C5
    pmaddwd         mm5, mm3            ; C7R7+C5R5      C7r7+C5r5
    paddd           mm0, [coeffs + 8]
    paddd           mm1, mm0            ; A1             a1
    paddd           mm0, mm0
    psubd           mm0, mm1            ; A2             a2
    pmaddwd         mm2, [coeffs + 64]  ; -C7R3+C3R1     -C7r3+C3r1
    paddd           mm7, mm5            ; B0             b0
    movq            mm5, [coeffs + 72]  ; -C5    -C1     -C5     -C1
    pmaddwd         mm5, mm3            ; -C5R7-C1R5     -C5r7-C1r5
    paddd           mm7, mm4            ; A0+B0          a0+b0
    paddd           mm4, mm4            ; 2A0            2a0
    psubd           mm4, mm7            ; A0-B0          a0-b0
    paddd           mm5, mm2            ; B1             b1
    psrad           mm7, %7
    psrad           mm4, %7
    movq            mm2, mm1            ; A1             a1
    paddd           mm1, mm5            ; A1+B1          a1+b1
    psubd           mm2, mm5            ; A1-B1          a1-b1
    psrad           mm1, %7
    psrad           mm2, %7
    packssdw        mm7, mm1            ; A1+B1  a1+b1   A0+B0   a0+b0
    packssdw        mm2, mm4            ; A0-B0  a0-b0   A1-B1   a1-b1
    movq           [%5], mm7
    movq            mm1, [blockq + %3]  ; R3     R1      r3      r1
    movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
    movq      [24 + %5], mm2
    pmaddwd         mm4, mm1            ; -C1R3+C5R1     -C1r3+C5r1
    movq            mm7, [coeffs + 88]  ; C3     C7      C3      C7
    pmaddwd         mm1, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
    pmaddwd         mm7, mm3            ; C3R7+C7R5      C3r7+C7r5
    movq            mm2, mm0            ; A2             a2
    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
    paddd           mm4, mm7            ; B2             b2
    paddd           mm2, mm4            ; A2+B2          a2+b2
    psubd           mm0, mm4            ; a2-B2          a2-b2
    psrad           mm2, %7
    psrad           mm0, %7
    movq            mm4, mm6            ; A3             a3
    paddd           mm3, mm1            ; B3             b3
    paddd           mm6, mm3            ; A3+B3          a3+b3
    psubd           mm4, mm3            ; a3-B3          a3-b3
    psrad           mm6, %7
    packssdw        mm2, mm6            ; A3+B3  a3+b3   A2+B2   a2+b2
    movq       [8 + %5], mm2
    psrad           mm4, %7
    packssdw        mm4, mm0            ; A2-B2  a2-b2   A3-B3   a3-b3
    movq      [16 + %5], mm4
    jmp             %%2
%%1:
    pslld           mm0, 16
    paddd           mm0, [d40000]
    psrad           mm0, 13
    packssdw        mm0, mm0
    movq           [%5], mm0
    movq       [8 + %5], mm0
    movq      [16 + %5], mm0
    movq      [24 + %5], mm0
%%2:
%endmacro

%macro Z_COND_IDCT 8
    movq            mm0, [blockq + %1]  ; R4     R0      r4      r0
    movq            mm1, [blockq + %2]  ; R6     R2      r6      r2
    movq            mm2, [blockq + %3]  ; R3     R1      r3      r1
    movq            mm3, [blockq + %4]  ; R7     R5      r7      r5
    movq            mm4, mm0
    por             mm4, mm1
    por             mm4, mm2
    por             mm4, mm3
    packssdw        mm4, mm4
    movd            t0d, mm4
    or              t0d, t0d
    jz               %8
    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
    movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
    pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
    movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
    pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
    movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
    pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
    paddd           mm4, [coeffs]
    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
    paddd           mm4, mm5            ; A0             a0
    psubd           mm6, mm5            ; A3             a3
    movq            mm5, [coeffs + 56]  ; C7     C5      C7      C5
    pmaddwd         mm5, mm3            ; C7R7+C5R5      C7r7+C5r5
    paddd           mm0, [coeffs]
    paddd           mm1, mm0            ; A1             a1
    paddd           mm0, mm0
    psubd           mm0, mm1            ; A2             a2
    pmaddwd         mm2, [coeffs + 64]  ; -C7R3+C3R1     -C7r3+C3r1
    paddd           mm7, mm5            ; B0             b0
    movq            mm5, [coeffs + 72]  ; -C5    -C1     -C5     -C1
    pmaddwd         mm5, mm3            ; -C5R7-C1R5     -C5r7-C1r5
    paddd           mm7, mm4            ; A0+B0          a0+b0
    paddd           mm4, mm4            ; 2A0            2a0
    psubd           mm4, mm7            ; A0-B0          a0-b0
    paddd           mm5, mm2            ; B1             b1
    psrad           mm7, %7
    psrad           mm4, %7
    movq            mm2, mm1            ; A1             a1
    paddd           mm1, mm5            ; A1+B1          a1+b1
    psubd           mm2, mm5            ; A1-B1          a1-b1
    psrad           mm1, %7
    psrad           mm2, %7
    packssdw        mm7, mm1            ; A1+B1  a1+b1   A0+B0   a0+b0
    packssdw        mm2, mm4            ; A0-B0  a0-b0   A1-B1   a1-b1
    movq           [%5], mm7
    movq            mm1, [blockq + %3]  ; R3     R1      r3      r1
    movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
    movq      [24 + %5], mm2
    pmaddwd         mm4, mm1            ; -C1R3+C5R1     -C1r3+C5r1
    movq            mm7, [coeffs + 88]  ; C3     C7      C3      C7
    pmaddwd         mm1, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
    pmaddwd         mm7, mm3            ; C3R7+C7R5      C3r7+C7r5
    movq            mm2, mm0            ; A2             a2
    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
    paddd           mm4, mm7            ; B2             b2
    paddd           mm2, mm4            ; A2+B2          a2+b2
    psubd           mm0, mm4            ; a2-B2          a2-b2
    psrad           mm2, %7
    psrad           mm0, %7
    movq            mm4, mm6            ; A3             a3
    paddd           mm3, mm1            ; B3             b3
    paddd           mm6, mm3            ; A3+B3          a3+b3
    psubd           mm4, mm3            ; a3-B3          a3-b3
    psrad           mm6, %7
    packssdw        mm2, mm6            ; A3+B3  a3+b3   A2+B2   a2+b2
    movq       [8 + %5], mm2
    psrad           mm4, %7
    packssdw        mm4, mm0            ; A2-B2  a2-b2   A3-B3   a3-b3
    movq      [16 + %5], mm4
%endmacro

%macro IDCT1 6
    movq            mm0, %1             ; R4     R0      r4      r0
    movq            mm1, %2             ; R6     R2      r6      r2
    movq            mm2, %3             ; R3     R1      r3      r1
    movq            mm3, %4             ; R7     R5      r7      r5
    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
    movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
    pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
    movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
    pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
    movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
    pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
    paddd           mm4, mm5            ; A0             a0
    psubd           mm6, mm5            ; A3             a3
    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
    paddd           mm0, mm1            ; A1             a1
    psubd           mm5, mm1            ; A2             a2
    movq            mm1, [coeffs + 56]  ; C7     C5      C7      C5
    pmaddwd         mm1, mm3            ; C7R7+C5R5      C7r7+C5r5
    pmaddwd         mm2, [coeffs + 64]  ; -C7R3+C3R1     -C7r3+C3r1
    paddd           mm7, mm1            ; B0             b0
    movq            mm1, [coeffs + 72]  ; -C5    -C1     -C5     -C1
    pmaddwd         mm1, mm3            ; -C5R7-C1R5     -C5r7-C1r5
    paddd           mm7, mm4            ; A0+B0          a0+b0
    paddd           mm4, mm4            ; 2A0            2a0
    psubd           mm4, mm7            ; A0-B0          a0-b0
    paddd           mm1, mm2            ; B1             b1
    psrad           mm7, %6
    psrad           mm4, %6
    movq            mm2, mm0            ; A1             a1
    paddd           mm0, mm1            ; A1+B1          a1+b1
    psubd           mm2, mm1            ; A1-B1          a1-b1
    psrad           mm0, %6
    psrad           mm2, %6
    packssdw        mm7, mm7            ; A0+B0  a0+b0
    movd           [%5], mm7
    packssdw        mm0, mm0            ; A1+B1  a1+b1
    movd      [16 + %5], mm0
    packssdw        mm2, mm2            ; A1-B1  a1-b1
    movd      [96 + %5], mm2
    packssdw        mm4, mm4            ; A0-B0  a0-b0
    movd     [112 + %5], mm4
    movq            mm0, %3             ; R3     R1      r3      r1
    movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
    pmaddwd         mm4, mm0            ; -C1R3+C5R1     -C1r3+C5r1
    movq            mm7, [coeffs + 88]  ; C3     C7      C3      C7
    pmaddwd         mm0, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
    pmaddwd         mm7, mm3            ; C3R7+C7R5      C3r7+C7r5
    movq            mm2, mm5            ; A2             a2
    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
    paddd           mm4, mm7            ; B2             b2
    paddd           mm2, mm4            ; A2+B2          a2+b2
    psubd           mm5, mm4            ; a2-B2          a2-b2
    psrad           mm2, %6
    psrad           mm5, %6
    movq            mm4, mm6            ; A3             a3
    paddd           mm3, mm0            ; B3             b3
    paddd           mm6, mm3            ; A3+B3          a3+b3
    psubd           mm4, mm3            ; a3-B3          a3-b3
    psrad           mm6, %6
    psrad           mm4, %6
    packssdw        mm2, mm2            ; A2+B2  a2+b2
    packssdw        mm6, mm6            ; A3+B3  a3+b3
    movd      [32 + %5], mm2
    packssdw        mm4, mm4            ; A3-B3  a3-b3
    packssdw        mm5, mm5            ; A2-B2  a2-b2
    movd      [48 + %5], mm6
    movd      [64 + %5], mm4
    movd      [80 + %5], mm5
%endmacro

%macro IDCT2 6
    movq            mm0, %1             ; R4     R0      r4      r0
    movq            mm1, %2             ; R6     R2      r6      r2
    movq            mm3, %4             ; R7     R5      r7      r5
    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
    movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
    pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
    movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
    pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
    paddd           mm4, mm5            ; A0             a0
    psubd           mm6, mm5            ; A3             a3
    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
    paddd           mm0, mm1            ; A1             a1
    psubd           mm5, mm1            ; A2             a2
    movq            mm1, [coeffs + 56]  ; C7     C5      C7      C5
    pmaddwd         mm1, mm3            ; C7R7+C5R5      C7r7+C5r5
    movq            mm7, [coeffs + 72]  ; -C5    -C1     -C5     -C1
    pmaddwd         mm7, mm3            ; -C5R7-C1R5     -C5r7-C1r5
    paddd           mm1, mm4            ; A0+B0          a0+b0
    paddd           mm4, mm4            ; 2A0            2a0
    psubd           mm4, mm1            ; A0-B0          a0-b0
    psrad           mm1, %6
    psrad           mm4, %6
    movq            mm2, mm0            ; A1             a1
    paddd           mm0, mm7            ; A1+B1          a1+b1
    psubd           mm2, mm7            ; A1-B1          a1-b1
    psrad           mm0, %6
    psrad           mm2, %6
    packssdw        mm1, mm1            ; A0+B0  a0+b0
    movd           [%5], mm1
    packssdw        mm0, mm0            ; A1+B1  a1+b1
    movd      [16 + %5], mm0
    packssdw        mm2, mm2            ; A1-B1  a1-b1
    movd      [96 + %5], mm2
    packssdw        mm4, mm4            ; A0-B0  a0-b0
    movd     [112 + %5], mm4
    movq            mm1, [coeffs + 88]  ; C3     C7      C3      C7
    pmaddwd         mm1, mm3            ; C3R7+C7R5      C3r7+C7r5
    movq            mm2, mm5            ; A2             a2
    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
    paddd           mm2, mm1            ; A2+B2          a2+b2
    psubd           mm5, mm1            ; a2-B2          a2-b2
    psrad           mm2, %6
    psrad           mm5, %6
    movq            mm1, mm6            ; A3             a3
    paddd           mm6, mm3            ; A3+B3          a3+b3
    psubd           mm1, mm3            ; a3-B3          a3-b3
    psrad           mm6, %6
    psrad           mm1, %6
    packssdw        mm2, mm2            ; A2+B2  a2+b2
    packssdw        mm6, mm6            ; A3+B3  a3+b3
    movd      [32 + %5], mm2
    packssdw        mm1, mm1            ; A3-B3  a3-b3
    packssdw        mm5, mm5            ; A2-B2  a2-b2
    movd      [48 + %5], mm6
    movd      [64 + %5], mm1
    movd      [80 + %5], mm5
%endmacro

%macro IDCT3 6
    movq            mm0, %1             ; R4     R0      r4      r0
    movq            mm3, %4             ; R7     R5      r7      r5
    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
    movq            mm1, [coeffs + 56]  ; C7     C5      C7      C5
    pmaddwd         mm1, mm3            ; C7R7+C5R5      C7r7+C5r5
    movq            mm7, [coeffs + 72]  ; -C5    -C1     -C5     -C1
    pmaddwd         mm7, mm3            ; -C5R7-C1R5     -C5r7-C1r5
    paddd           mm1, mm4            ; A0+B0          a0+b0
    paddd           mm4, mm4            ; 2A0            2a0
    psubd           mm4, mm1            ; A0-B0          a0-b0
    psrad           mm1, %6
    psrad           mm4, %6
    movq            mm2, mm0            ; A1             a1
    paddd           mm0, mm7            ; A1+B1          a1+b1
    psubd           mm2, mm7            ; A1-B1          a1-b1
    psrad           mm0, %6
    psrad           mm2, %6
    packssdw        mm1, mm1            ; A0+B0  a0+b0
    movd           [%5], mm1
    packssdw        mm0, mm0            ; A1+B1  a1+b1
    movd      [16 + %5], mm0
    packssdw        mm2, mm2            ; A1-B1  a1-b1
    movd      [96 + %5], mm2
    packssdw        mm4, mm4            ; A0-B0  a0-b0
    movd     [112 + %5], mm4
    movq            mm1, [coeffs + 88]  ; C3     C7      C3      C7
    pmaddwd         mm1, mm3            ; C3R7+C7R5      C3r7+C7r5
    movq            mm2, mm5            ; A2             a2
    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
    paddd           mm2, mm1            ; A2+B2          a2+b2
    psubd           mm5, mm1            ; a2-B2          a2-b2
    psrad           mm2, %6
    psrad           mm5, %6
    movq            mm1, mm6            ; A3             a3
    paddd           mm6, mm3            ; A3+B3          a3+b3
    psubd           mm1, mm3            ; a3-B3          a3-b3
    psrad           mm6, %6
    psrad           mm1, %6
    packssdw        mm2, mm2            ; A2+B2  a2+b2
    packssdw        mm6, mm6            ; A3+B3  a3+b3
    movd      [32 + %5], mm2
    packssdw        mm1, mm1            ; A3-B3  a3-b3
    packssdw        mm5, mm5            ; A2-B2  a2-b2
    movd      [48 + %5], mm6
    movd      [64 + %5], mm1
    movd      [80 + %5], mm5
%endmacro

%macro IDCT4 6
    movq            mm0, %1             ; R4     R0      r4      r0
    movq            mm2, %3             ; R3     R1      r3      r1
    movq            mm3, %4             ; R7     R5      r7      r5
    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
    movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
    pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
    movq            mm1, [coeffs + 56]  ; C7     C5      C7      C5
    pmaddwd         mm1, mm3            ; C7R7+C5R5      C7r7+C5r5
    pmaddwd         mm2, [coeffs + 64]  ; -C7R3+C3R1     -C7r3+C3r1
    paddd           mm7, mm1            ; B0             b0
    movq            mm1, [coeffs + 72]  ; -C5    -C1     -C5     -C1
    pmaddwd         mm1, mm3            ; -C5R7-C1R5     -C5r7-C1r5
    paddd           mm7, mm4            ; A0+B0          a0+b0
    paddd           mm4, mm4            ; 2A0            2a0
    psubd           mm4, mm7            ; A0-B0          a0-b0
    paddd           mm1, mm2            ; B1             b1
    psrad           mm7, %6
    psrad           mm4, %6
    movq            mm2, mm0            ; A1             a1
    paddd           mm0, mm1            ; A1+B1          a1+b1
    psubd           mm2, mm1            ; A1-B1          a1-b1
    psrad           mm0, %6
    psrad           mm2, %6
    packssdw        mm7, mm7            ; A0+B0  a0+b0
    movd           [%5], mm7
    packssdw        mm0, mm0            ; A1+B1  a1+b1
    movd      [16 + %5], mm0
    packssdw        mm2, mm2            ; A1-B1  a1-b1
    movd      [96 + %5], mm2
    packssdw        mm4, mm4            ; A0-B0  a0-b0
    movd     [112 + %5], mm4
    movq            mm0, %3             ; R3     R1      r3      r1
    movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
    pmaddwd         mm4, mm0            ; -C1R3+C5R1     -C1r3+C5r1
    movq            mm7, [coeffs + 88]  ; C3     C7      C3      C7
    pmaddwd         mm0, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
    pmaddwd         mm7, mm3            ; C3R7+C7R5      C3r7+C7r5
    movq            mm2, mm5            ; A2             a2
    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
    paddd           mm4, mm7            ; B2             b2
    paddd           mm2, mm4            ; A2+B2          a2+b2
    psubd           mm5, mm4            ; a2-B2          a2-b2
    psrad           mm2, %6
    psrad           mm5, %6
    movq            mm4, mm6            ; A3             a3
    paddd           mm3, mm0            ; B3             b3
    paddd           mm6, mm3            ; A3+B3          a3+b3
    psubd           mm4, mm3            ; a3-B3          a3-b3
    psrad           mm6, %6
    psrad           mm4, %6
    packssdw        mm2, mm2            ; A2+B2  a2+b2
    packssdw        mm6, mm6            ; A3+B3  a3+b3
    movd      [32 + %5], mm2
    packssdw        mm4, mm4            ; A3-B3  a3-b3
    packssdw        mm5, mm5            ; A2-B2  a2-b2
    movd      [48 + %5], mm6
    movd      [64 + %5], mm4
    movd      [80 + %5], mm5
%endmacro

%macro IDCT5 6
    movq            mm0, %1             ; R4     R0      r4      r0
    movq            mm2, %3             ; R3     R1      r3      r1
    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
    movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
    pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
    movq            mm3, [coeffs + 64]
    pmaddwd         mm3, mm2            ; -C7R3+C3R1     -C7r3+C3r1
    paddd           mm7, mm4            ; A0+B0          a0+b0
    paddd           mm4, mm4            ; 2A0            2a0
    psubd           mm4, mm7            ; A0-B0          a0-b0
    psrad           mm7, %6
    psrad           mm4, %6
    movq            mm1, mm0            ; A1             a1
    paddd           mm0, mm3            ; A1+B1          a1+b1
    psubd           mm1, mm3            ; A1-B1          a1-b1
    psrad           mm0, %6
    psrad           mm1, %6
    packssdw        mm7, mm7            ; A0+B0  a0+b0
    movd           [%5], mm7
    packssdw        mm0, mm0            ; A1+B1  a1+b1
    movd      [16 + %5], mm0
    packssdw        mm1, mm1            ; A1-B1  a1-b1
    movd      [96 + %5], mm1
    packssdw        mm4, mm4            ; A0-B0  a0-b0
    movd     [112 + %5], mm4
    movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
    pmaddwd         mm4, mm2            ; -C1R3+C5R1     -C1r3+C5r1
    pmaddwd         mm2, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
    movq            mm1, mm5            ; A2             a2
    paddd           mm1, mm4            ; A2+B2          a2+b2
    psubd           mm5, mm4            ; a2-B2          a2-b2
    psrad           mm1, %6
    psrad           mm5, %6
    movq            mm4, mm6            ; A3             a3
    paddd           mm6, mm2            ; A3+B3          a3+b3
    psubd           mm4, mm2            ; a3-B3          a3-b3
    psrad           mm6, %6
    psrad           mm4, %6
    packssdw        mm1, mm1            ; A2+B2  a2+b2
    packssdw        mm6, mm6            ; A3+B3  a3+b3
    movd      [32 + %5], mm1
    packssdw        mm4, mm4            ; A3-B3  a3-b3
    packssdw        mm5, mm5            ; A2-B2  a2-b2
    movd      [48 + %5], mm6
    movd      [64 + %5], mm4
    movd      [80 + %5], mm5
%endmacro

%macro IDCT6 6
    movq            mm0, [%1]           ; R4     R0      r4      r0
    movq            mm1, [%2]           ; R6     R2      r6      r2
    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
    movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
    pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
    movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
    pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
    paddd           mm4, mm5            ; A0             a0
    psubd           mm6, mm5            ; A3             a3
    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
    paddd           mm0, mm1            ; A1             a1
    psubd           mm5, mm1            ; A2             a2
    movq            mm2, [8 + %1]       ; R4     R0      r4      r0
    movq            mm3, [8 + %2]       ; R6     R2      r6      r2
    movq            mm1, [coeffs + 16]  ; C4     C4      C4      C4
    pmaddwd         mm1, mm2            ; C4R4+C4R0      C4r4+C4r0
    movq            mm7, [coeffs + 24]  ; -C4    C4      -C4     C4
    pmaddwd         mm2, mm7            ; -C4R4+C4R0     -C4r4+C4r0
    movq            mm7, [coeffs + 32]  ; C6     C2      C6      C2
    pmaddwd         mm7, mm3            ; C6R6+C2R2      C6r6+C2r2
    pmaddwd         mm3, [coeffs + 40]  ; -C2R6+C6R2     -C2r6+C6r2
    paddd           mm7, mm1            ; A0             a0
    paddd           mm1, mm1            ; 2C0            2c0
    psubd           mm1, mm7            ; A3             a3
    paddd           mm3, mm2            ; A1             a1
    paddd           mm2, mm2            ; 2C1            2c1
    psubd           mm2, mm3            ; A2             a2
    psrad           mm4, %6
    psrad           mm7, %6
    psrad           mm3, %6
    packssdw        mm4, mm7            ; A0     a0
    movq           [%5], mm4
    psrad           mm0, %6
    packssdw        mm0, mm3            ; A1     a1
    movq      [16 + %5], mm0
    movq      [96 + %5], mm0
    movq     [112 + %5], mm4
    psrad           mm5, %6
    psrad           mm6, %6
    psrad           mm2, %6
    packssdw        mm5, mm2            ; A2-B2  a2-b2
    movq      [32 + %5], mm5
    psrad           mm1, %6
    packssdw        mm6, mm1            ; A3+B3  a3+b3
    movq      [48 + %5], mm6
    movq      [64 + %5], mm6
    movq      [80 + %5], mm5
%endmacro

%macro IDCT7 6
    movq            mm0, %1             ; R4     R0      r4      r0
    movq            mm1, %2             ; R6     R2      r6      r2
    movq            mm2, %3             ; R3     R1      r3      r1
    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
    movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
    pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
    movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
    pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
    movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
    pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
    paddd           mm4, mm5            ; A0             a0
    psubd           mm6, mm5            ; A3             a3
    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
    paddd           mm0, mm1            ; A1             a1
    psubd           mm5, mm1            ; A2             a2
    movq            mm1, [coeffs + 64]
    pmaddwd         mm1, mm2            ; -C7R3+C3R1     -C7r3+C3r1
    paddd           mm7, mm4            ; A0+B0          a0+b0
    paddd           mm4, mm4            ; 2A0            2a0
    psubd           mm4, mm7            ; A0-B0          a0-b0
    psrad           mm7, %6
    psrad           mm4, %6
    movq            mm3, mm0            ; A1             a1
    paddd           mm0, mm1            ; A1+B1          a1+b1
    psubd           mm3, mm1            ; A1-B1          a1-b1
    psrad           mm0, %6
    psrad           mm3, %6
    packssdw        mm7, mm7            ; A0+B0  a0+b0
    movd           [%5], mm7
    packssdw        mm0, mm0            ; A1+B1  a1+b1
    movd      [16 + %5], mm0
    packssdw        mm3, mm3            ; A1-B1  a1-b1
    movd      [96 + %5], mm3
    packssdw        mm4, mm4            ; A0-B0  a0-b0
    movd     [112 + %5], mm4
    movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
    pmaddwd         mm4, mm2            ; -C1R3+C5R1     -C1r3+C5r1
    pmaddwd         mm2, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
    movq            mm3, mm5            ; A2             a2
    paddd           mm3, mm4            ; A2+B2          a2+b2
    psubd           mm5, mm4            ; a2-B2          a2-b2
    psrad           mm3, %6
    psrad           mm5, %6
    movq            mm4, mm6            ; A3             a3
    paddd           mm6, mm2            ; A3+B3          a3+b3
    psubd           mm4, mm2            ; a3-B3          a3-b3
    psrad           mm6, %6
    packssdw        mm3, mm3            ; A2+B2  a2+b2
    movd      [32 + %5], mm3
    psrad           mm4, %6
    packssdw        mm6, mm6            ; A3+B3  a3+b3
    movd      [48 + %5], mm6
    packssdw        mm4, mm4            ; A3-B3  a3-b3
    packssdw        mm5, mm5            ; A2-B2  a2-b2
    movd      [64 + %5], mm4
    movd      [80 + %5], mm5
%endmacro

%macro IDCT8 6
    movq            mm0, [%1]           ; R4     R0      r4      r0
    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
    psrad           mm4, %6
    psrad           mm0, %6
    movq            mm2, [8 + %1]       ; R4     R0      r4      r0
    movq            mm1, [coeffs + 16]  ; C4     C4      C4      C4
    pmaddwd         mm1, mm2            ; C4R4+C4R0      C4r4+C4r0
    movq            mm7, [coeffs + 24]  ; -C4    C4      -C4     C4
    pmaddwd         mm2, mm7            ; -C4R4+C4R0     -C4r4+C4r0
    movq            mm7, [coeffs + 32]  ; C6     C2      C6      C2
    psrad           mm1, %6
    packssdw        mm4, mm1            ; A0     a0
    movq           [%5], mm4
    psrad           mm2, %6
    packssdw        mm0, mm2            ; A1     a1
    movq      [16 + %5], mm0
    movq      [96 + %5], mm0
    movq     [112 + %5], mm4
    movq      [32 + %5], mm0
    movq      [48 + %5], mm4
    movq      [64 + %5], mm4
    movq      [80 + %5], mm0
%endmacro

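; Full 8x8 transform: the row pairs are transformed first (with the zero
; shortcuts above); depending on which pairs turned out to be zero, one of
; the IDCT1-IDCT8 column-pass variants is selected to finish the transform
; and write the result back into the block.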
%macro IDCT 0
    DC_COND_IDCT  0,   8,  16,  24, rsp +  0, null, 11
    Z_COND_IDCT  32,  40,  48,  56, rsp + 32, null, 11, %%4
    Z_COND_IDCT  64,  72,  80,  88, rsp + 64, null, 11, %%2
    Z_COND_IDCT  96, 104, 112, 120, rsp + 96, null, 11, %%1

    IDCT1 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
    IDCT1 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
    IDCT1 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
    IDCT1 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
    jmp %%9

    ALIGN 16
    %%4:
    Z_COND_IDCT 64,  72,  80,  88, rsp + 64, null, 11, %%6
    Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%5

    IDCT2 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
    IDCT2 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
    IDCT2 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
    IDCT2 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
    jmp %%9

    ALIGN 16
    %%6:
    Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%7

    IDCT3 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
    IDCT3 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
    IDCT3 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
    IDCT3 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
    jmp %%9

    ALIGN 16
    %%2:
    Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%3

    IDCT4 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
    IDCT4 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
    IDCT4 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
    IDCT4 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
    jmp %%9

    ALIGN 16
    %%3:

    IDCT5 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
    IDCT5 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
    IDCT5 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
    IDCT5 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
    jmp %%9

    ALIGN 16
    %%5:

    IDCT6 rsp +  0, rsp + 64, rsp + 32, rsp +  96, blockq +  0, 20
    IDCT6 rsp + 16, rsp + 80, rsp + 48, rsp + 112, blockq +  8, 20
    jmp %%9

    ALIGN 16
    %%1:

    IDCT7 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
    IDCT7 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
    IDCT7 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
    IDCT7 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
    jmp %%9

    ALIGN 16
    %%7:

    IDCT8 rsp +  0, rsp + 64, rsp + 32, rsp +  96, blockq +  0, 20
    IDCT8 rsp + 16, rsp + 80, rsp + 48, rsp + 112, blockq +  8, 20

    %%9:
%endmacro

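; Output helpers: PUT_PIXELS_CLAMPED_HALF clamps four rows of the transformed
; block to 0..255 and writes them to the destination picture (lsize is the
; line size); ADD_PIXELS_CLAMPED adds two rows of the transformed block to
; the existing pixels and clamps the result to 0..255 (m4 must be zero).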
%macro PUT_PIXELS_CLAMPED_HALF 1
    mova     m0, [blockq+mmsize*0+%1]
    mova     m1, [blockq+mmsize*2+%1]
%if mmsize == 8
    mova     m2, [blockq+mmsize*4+%1]
    mova     m3, [blockq+mmsize*6+%1]
%endif
    packuswb m0, [blockq+mmsize*1+%1]
    packuswb m1, [blockq+mmsize*3+%1]
%if mmsize == 8
    packuswb m2, [blockq+mmsize*5+%1]
    packuswb m3, [blockq+mmsize*7+%1]
    movq           [pixelsq], m0
    movq    [lsizeq+pixelsq], m1
    movq  [2*lsizeq+pixelsq], m2
    movq   [lsize3q+pixelsq], m3
%else
    movq           [pixelsq], m0
    movhps  [lsizeq+pixelsq], m0
    movq  [2*lsizeq+pixelsq], m1
    movhps [lsize3q+pixelsq], m1
%endif
%endmacro

%macro ADD_PIXELS_CLAMPED 1
    mova       m0, [blockq+mmsize*0+%1]
    mova       m1, [blockq+mmsize*1+%1]
%if mmsize == 8
    mova       m5, [blockq+mmsize*2+%1]
    mova       m6, [blockq+mmsize*3+%1]
%endif
    movq       m2, [pixelsq]
    movq       m3, [pixelsq+lsizeq]
%if mmsize == 8
    mova       m7, m2
    punpcklbw  m2, m4
    punpckhbw  m7, m4
    paddsw     m0, m2
    paddsw     m1, m7
    mova       m7, m3
    punpcklbw  m3, m4
    punpckhbw  m7, m4
    paddsw     m5, m3
    paddsw     m6, m7
%else
    punpcklbw  m2, m4
    punpcklbw  m3, m4
    paddsw     m0, m2
    paddsw     m1, m3
%endif
    packuswb   m0, m1
%if mmsize == 8
    packuswb   m5, m6
    movq       [pixelsq], m0
    movq       [pixelsq+lsizeq], m5
%else
    movq       [pixelsq], m0
    movhps     [pixelsq+lsizeq], m0
%endif
%endmacro

INIT_MMX mmx

cglobal simple_idct, 1, 2, 8, 128, block, t0
    IDCT
    emms
RET

INIT_XMM sse2

cglobal simple_idct_put, 3, 5, 8, 128, pixels, lsize, block, lsize3, t0
    IDCT
    lea lsize3q, [lsizeq*3]
    PUT_PIXELS_CLAMPED_HALF 0
    lea pixelsq, [pixelsq+lsizeq*4]
    PUT_PIXELS_CLAMPED_HALF 64
RET

cglobal simple_idct_add, 3, 4, 8, 128, pixels, lsize, block, t0
    IDCT
    pxor       m4, m4
    ADD_PIXELS_CLAMPED 0
    lea        pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 32
    lea        pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 64
    lea        pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 96
RET
%endif