pngdsp x86: use unaligned access
Timing results for test images manually generated to contain only the
"up" prediction:
         8380x3032    255x185
before:   138635       1992
after:    139232       1996
When instead jumping to the aligned or unaligned version depending on the alignment:
8380x3032: 138767
A 0.5% speed improvement for gigantic images is not worth the code
duplication.
Fixes ticket #4148
Signed-off-by: Christophe Gisquet <christophe.gisquet@gmail.com>
Tested-by: Benoit Fouet <benoit.fouet@free.fr>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
			
			
This commit is contained in:
		
							parent
							
								
									242f1152bf
								
							
						
					
					
						commit
						9fa056ba75
					
				@@ -25,9 +25,9 @@
 | 
			
		||||
#include <stdint.h>
 | 
			
		||||
 | 
			
		||||
typedef struct PNGDSPContext {
 | 
			
		||||
    void (*add_bytes_l2)(uint8_t *dst  /* align 16 */,
 | 
			
		||||
    void (*add_bytes_l2)(uint8_t *dst,
 | 
			
		||||
                         uint8_t *src1 /* align 16 */,
 | 
			
		||||
                         uint8_t *src2 /* align 16 */, int w);
 | 
			
		||||
                         uint8_t *src2, int w);
 | 
			
		||||
 | 
			
		||||
    /* this might write to dst[w] */
 | 
			
		||||
    void (*add_paeth_prediction)(uint8_t *dst, uint8_t *src,
 | 
			
		||||
 | 
			
		||||
@@ -42,12 +42,12 @@ cglobal add_bytes_l2, 4, 6, %1, dst, src1, src2, wa, w, i
 | 
			
		||||
    and                waq, ~(mmsize*2-1)
 | 
			
		||||
    jmp .end_v
 | 
			
		||||
.loop_v:
 | 
			
		||||
    mova                m0, [src1q+iq]
 | 
			
		||||
    mova                m1, [src1q+iq+mmsize]
 | 
			
		||||
    paddb               m0, [src2q+iq]
 | 
			
		||||
    paddb               m1, [src2q+iq+mmsize]
 | 
			
		||||
    mova  [dstq+iq       ], m0
 | 
			
		||||
    mova  [dstq+iq+mmsize], m1
 | 
			
		||||
    movu                m0, [src2q+iq]
 | 
			
		||||
    movu                m1, [src2q+iq+mmsize]
 | 
			
		||||
    paddb               m0, [src1q+iq]
 | 
			
		||||
    paddb               m1, [src1q+iq+mmsize]
 | 
			
		||||
    movu  [dstq+iq       ], m0
 | 
			
		||||
    movu  [dstq+iq+mmsize], m1
 | 
			
		||||
    add                 iq, mmsize*2
 | 
			
		||||
.end_v:
 | 
			
		||||
    cmp                 iq, waq
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user