rv40dsp x86: use a single register for both the increment and the loop counter
Around 10 cycles faster for luma. Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
This commit is contained in:
		
							parent
							
								
									272b252c01
								
							
						
					
					
						commit
						2130bd8f5b
					
				@ -32,13 +32,14 @@ SECTION .text
 | 
			
		||||
 | 
			
		||||
; %1=5bits weights?, %2=dst, %3=src1, %4=src2, %5=stride if sse2
 | 
			
		||||
%macro RV40_WCORE  4-5
 | 
			
		||||
    movh       m4, [%3 + 0]
 | 
			
		||||
    movh       m5, [%4 + 0]
 | 
			
		||||
    movh       m4, [%3 + r6 + 0]
 | 
			
		||||
    movh       m5, [%4 + r6 + 0]
 | 
			
		||||
%if %0 == 4
 | 
			
		||||
%define OFFSET mmsize / 2
 | 
			
		||||
%define OFFSET r6 + mmsize / 2
 | 
			
		||||
%else
 | 
			
		||||
    ; 8x8 block and sse2, stride was provided
 | 
			
		||||
%define OFFSET %5
 | 
			
		||||
%define OFFSET r6
 | 
			
		||||
    add        r6, r5
 | 
			
		||||
%endif
 | 
			
		||||
    movh       m6, [%3 + OFFSET]
 | 
			
		||||
    movh       m7, [%4 + OFFSET]
 | 
			
		||||
@ -99,10 +100,12 @@ SECTION .text
 | 
			
		||||
    packuswb   m4, m6
 | 
			
		||||
%if %0 == 5
 | 
			
		||||
    ; Only called for 8x8 blocks and sse2
 | 
			
		||||
    movh       [%2 +  0], m4
 | 
			
		||||
    movhps     [%2 + %5], m4
 | 
			
		||||
    sub        r6, r5
 | 
			
		||||
    movh       [%2 + r6], m4
 | 
			
		||||
    add        r6, r5
 | 
			
		||||
    movhps     [%2 + r6], m4
 | 
			
		||||
%else
 | 
			
		||||
    mova       [%2], m4
 | 
			
		||||
    mova       [%2 + r6], m4
 | 
			
		||||
%endif
 | 
			
		||||
%endmacro
 | 
			
		||||
 | 
			
		||||
@ -115,26 +118,19 @@ SECTION .text
 | 
			
		||||
%endif
 | 
			
		||||
 | 
			
		||||
    ; Prepare for next loop
 | 
			
		||||
    add        r0, r5
 | 
			
		||||
    add        r1, r5
 | 
			
		||||
    add        r2, r5
 | 
			
		||||
    add        r6, r5
 | 
			
		||||
%else
 | 
			
		||||
%ifidn %1, 8
 | 
			
		||||
    RV40_WCORE %2, r0, r1, r2, r5
 | 
			
		||||
    ; Prepare 2 next lines
 | 
			
		||||
    lea        r0, [r0 + 2 * r5]
 | 
			
		||||
    lea        r1, [r1 + 2 * r5]
 | 
			
		||||
    lea        r2, [r2 + 2 * r5]
 | 
			
		||||
    add        r6, r5
 | 
			
		||||
%else
 | 
			
		||||
    RV40_WCORE %2, r0, r1, r2
 | 
			
		||||
    ; Prepare single next line
 | 
			
		||||
    add        r0, r5
 | 
			
		||||
    add        r1, r5
 | 
			
		||||
    add        r2, r5
 | 
			
		||||
    add        r6, r5
 | 
			
		||||
%endif
 | 
			
		||||
%endif
 | 
			
		||||
 | 
			
		||||
    dec        r6
 | 
			
		||||
%endmacro
 | 
			
		||||
 | 
			
		||||
; rv40_weight_func_%1_%2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride)
 | 
			
		||||
@ -145,7 +141,7 @@ SECTION .text
 | 
			
		||||
; Therefore, we check here whether they are multiples of 2^9 for
 | 
			
		||||
; those simplifications to occur.
 | 
			
		||||
%macro RV40_WEIGHT  3
 | 
			
		||||
cglobal rv40_weight_func_%1_%2, 6, 7, %3
 | 
			
		||||
cglobal rv40_weight_func_%1_%2, 6, 7, 8
 | 
			
		||||
%if cpuflag(ssse3)
 | 
			
		||||
    mova       m1, [shift_round]
 | 
			
		||||
%else
 | 
			
		||||
@ -153,11 +149,12 @@ cglobal rv40_weight_func_%1_%2, 6, 7, %3
 | 
			
		||||
%endif
 | 
			
		||||
    pxor       m0, m0
 | 
			
		||||
    ; Set loop counter and increments
 | 
			
		||||
%if mmsize == 8
 | 
			
		||||
    mov        r6, %2
 | 
			
		||||
%else
 | 
			
		||||
    mov        r6, (%2 * %2) / mmsize
 | 
			
		||||
%endif
 | 
			
		||||
    mov        r6, r5
 | 
			
		||||
    shl        r6, %3
 | 
			
		||||
    add        r0, r6
 | 
			
		||||
    add        r1, r6
 | 
			
		||||
    add        r2, r6
 | 
			
		||||
    neg        r6
 | 
			
		||||
 | 
			
		||||
    movd       m2, r3
 | 
			
		||||
    movd       m3, r4
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user