x86/vf_blend: Add SSE2 optimization for the screen blend mode
10x faster than the C implementation. Reviewed-by: Paul B Mahol <onemda@gmail.com>
This commit is contained in:
		
							parent
							
								
									c8b1612af0
								
							
						
					
					
						commit
						74f8d9aaef
					
				@ -111,6 +111,13 @@ BLEND_END
 | 
				
			|||||||
    psrlw           %1, 8                ; 00xx00xx  a * b / 255
 | 
					    psrlw           %1, 8                ; 00xx00xx  a * b / 255
 | 
				
			||||||
%endmacro
 | 
					%endmacro
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					%macro SCREEN 4   ; a, b, pw_1, pw_255
 | 
				
			||||||
 | 
					    pxor            %1, %4               ; 00xx00xx  255 - a
 | 
				
			||||||
 | 
					    pxor            %2, %4
 | 
				
			||||||
 | 
					    MULTIPLY        %1, %2, %3
 | 
				
			||||||
 | 
					    pxor            %1, %4               ; 00xx00xx  255 - (255 - a) * (255 - b) / 255
 | 
				
			||||||
 | 
					%endmacro
 | 
				
			||||||
 | 
					
 | 
				
			||||||
BLEND_INIT multiply, 4
 | 
					BLEND_INIT multiply, 4
 | 
				
			||||||
    pxor       m2, m2
 | 
					    pxor       m2, m2
 | 
				
			||||||
    mova       m3, [pw_1]
 | 
					    mova       m3, [pw_1]
 | 
				
			||||||
@ -134,6 +141,28 @@ BLEND_INIT multiply, 4
 | 
				
			|||||||
    jl .loop
 | 
					    jl .loop
 | 
				
			||||||
BLEND_END
 | 
					BLEND_END
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					BLEND_INIT screen, 5
 | 
				
			||||||
 | 
					    pxor       m2, m2
 | 
				
			||||||
 | 
					    mova       m3, [pw_1]
 | 
				
			||||||
 | 
					    mova       m4, [pw_255]
 | 
				
			||||||
 | 
					.nextrow:
 | 
				
			||||||
 | 
					    mov        xq, widthq
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    .loop:
 | 
				
			||||||
 | 
					        movh            m0, [topq + xq]      ; 0000xxxx
 | 
				
			||||||
 | 
					        movh            m1, [bottomq + xq]
 | 
				
			||||||
 | 
					        punpcklbw       m0, m2               ; 00xx00xx
 | 
				
			||||||
 | 
					        punpcklbw       m1, m2
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        SCREEN          m0, m1, m3, m4
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        packuswb        m0, m0               ; 0000xxxx
 | 
				
			||||||
 | 
					        movh   [dstq + xq], m0
 | 
				
			||||||
 | 
					        add             xq, mmsize / 2
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    jl .loop
 | 
				
			||||||
 | 
					BLEND_END
 | 
				
			||||||
 | 
					
 | 
				
			||||||
BLEND_INIT average, 3
 | 
					BLEND_INIT average, 3
 | 
				
			||||||
    pxor       m2, m2
 | 
					    pxor       m2, m2
 | 
				
			||||||
.nextrow:
 | 
					.nextrow:
 | 
				
			||||||
 | 
				
			|||||||
@ -37,6 +37,7 @@ BLEND_FUNC(and, sse2)
 | 
				
			|||||||
BLEND_FUNC(darken, sse2)
 | 
					BLEND_FUNC(darken, sse2)
 | 
				
			||||||
BLEND_FUNC(difference128, sse2)
 | 
					BLEND_FUNC(difference128, sse2)
 | 
				
			||||||
BLEND_FUNC(multiply, sse2)
 | 
					BLEND_FUNC(multiply, sse2)
 | 
				
			||||||
 | 
					BLEND_FUNC(screen, sse2)
 | 
				
			||||||
BLEND_FUNC(hardmix, sse2)
 | 
					BLEND_FUNC(hardmix, sse2)
 | 
				
			||||||
BLEND_FUNC(lighten, sse2)
 | 
					BLEND_FUNC(lighten, sse2)
 | 
				
			||||||
BLEND_FUNC(or, sse2)
 | 
					BLEND_FUNC(or, sse2)
 | 
				
			||||||
@ -65,6 +66,7 @@ av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit)
 | 
				
			|||||||
        case BLEND_MULTIPLY: param->blend = ff_blend_multiply_sse2; break;
 | 
					        case BLEND_MULTIPLY: param->blend = ff_blend_multiply_sse2; break;
 | 
				
			||||||
        case BLEND_OR:       param->blend = ff_blend_or_sse2;       break;
 | 
					        case BLEND_OR:       param->blend = ff_blend_or_sse2;       break;
 | 
				
			||||||
        case BLEND_PHOENIX:  param->blend = ff_blend_phoenix_sse2;  break;
 | 
					        case BLEND_PHOENIX:  param->blend = ff_blend_phoenix_sse2;  break;
 | 
				
			||||||
 | 
					        case BLEND_SCREEN:   param->blend = ff_blend_screen_sse2; break;
 | 
				
			||||||
        case BLEND_SUBTRACT: param->blend = ff_blend_subtract_sse2; break;
 | 
					        case BLEND_SUBTRACT: param->blend = ff_blend_subtract_sse2; break;
 | 
				
			||||||
        case BLEND_XOR:      param->blend = ff_blend_xor_sse2;      break;
 | 
					        case BLEND_XOR:      param->blend = ff_blend_xor_sse2;      break;
 | 
				
			||||||
        case BLEND_DIFFERENCE: param->blend = ff_blend_difference_sse2; break;
 | 
					        case BLEND_DIFFERENCE: param->blend = ff_blend_difference_sse2; break;
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user