The .text section is already 16-byte aligned by default on all supported platforms, so `SECTION_TEXT` isn't any different from `SECTION .text`.
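
For reference, the macro being removed was essentially a thin wrapper around the plain directive. A sketch from memory of x86inc.asm (not the verbatim definition):

    %macro SECTION_TEXT 0-1 16
        SECTION .text align=%1
    %endmacro

With .text already 16-byte aligned everywhere we build, the explicit align request was redundant, so the plain `SECTION .text` used below behaves identically.
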
; /*
; * Provide SSE luma and chroma mc functions for HEVC decoding
; * Copyright (c) 2013 Pierre-Edouard LEPERE
; *
; * This file is part of FFmpeg.
; *
; * FFmpeg is free software; you can redistribute it and/or
; * modify it under the terms of the GNU Lesser General Public
; * License as published by the Free Software Foundation; either
; * version 2.1 of the License, or (at your option) any later version.
; *
; * FFmpeg is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
; * Lesser General Public License for more details.
; *
; * You should have received a copy of the GNU Lesser General Public
; * License along with FFmpeg; if not, write to the Free Software
; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
; */
%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32
cextern pw_255
cextern pw_512
cextern pw_2048
cextern pw_8192
cextern pw_1023
cextern pw_1024
cextern pw_4096
%define pw_8 pw_512
%define pw_10 pw_2048
%define pw_12 pw_8192
%define pw_bi_10 pw_1024
%define pw_bi_12 pw_4096
%define max_pixels_8 pw_255
%define max_pixels_10 pw_1023
pw_bi_8:                times 16 dw  (1 <<  8)
max_pixels_12:          times 16 dw ((1 << 12)-1)
cextern pd_1
cextern pb_0

%macro EPEL_TABLE 4
hevc_epel_filters_%4_%1 times %2 d%3 -2, 58
                        times %2 d%3 10, -2
                        times %2 d%3 -4, 54
                        times %2 d%3 16, -2
                        times %2 d%3 -6, 46
                        times %2 d%3 28, -4
                        times %2 d%3 -4, 36
                        times %2 d%3 36, -4
                        times %2 d%3 -4, 28
                        times %2 d%3 46, -6
                        times %2 d%3 -2, 16
                        times %2 d%3 54, -4
                        times %2 d%3 -2, 10
                        times %2 d%3 58, -2
%endmacro
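
; Each EPEL table holds the seven fractional-phase variants of the HEVC 4-tap
; chroma interpolation filter, stored as replicated tap pairs (c1,c2) then
; (c3,c4) so pmaddubsw/pmaddwd can apply two taps per multiply-accumulate.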


EPEL_TABLE  8,16, b, avx2
EPEL_TABLE 10, 8, w, avx2

EPEL_TABLE  8, 8, b, sse4
EPEL_TABLE 10, 4, w, sse4
EPEL_TABLE 12, 4, w, sse4

%macro QPEL_TABLE 4
hevc_qpel_filters_%4_%1 times %2 d%3  -1,  4
                        times %2 d%3 -10, 58
                        times %2 d%3  17, -5
                        times %2 d%3   1,  0
                        times %2 d%3  -1,  4
                        times %2 d%3 -11, 40
                        times %2 d%3  40,-11
                        times %2 d%3   4, -1
                        times %2 d%3   0,  1
                        times %2 d%3  -5, 17
                        times %2 d%3  58,-10
                        times %2 d%3   4, -1
%endmacro
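
; Likewise, each QPEL table holds the three fractional-phase variants of the
; HEVC 8-tap luma filter as four replicated tap pairs per phase.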

QPEL_TABLE  8, 8, b, sse4
QPEL_TABLE 10, 4, w, sse4
QPEL_TABLE 12, 4, w, sse4

QPEL_TABLE  8,16, b, avx2
QPEL_TABLE 10, 8, w, avx2

SECTION .text

%define MAX_PB_SIZE  64
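
; The plain "put" functions below write 14-bit intermediate values to an
; int16_t destination with a fixed row stride of MAX_PB_SIZE elements, which
; is why destination pointers advance by 2*MAX_PB_SIZE bytes per row.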

%define hevc_qpel_filters_sse4_14 hevc_qpel_filters_sse4_10

%define hevc_qpel_filters_avx2_14 hevc_qpel_filters_avx2_10

%if ARCH_X86_64

%macro SIMPLE_BILOAD 4   ;width, tab, r1, r2
%if %1 <= 4
    movq              %3, [%2]                                              ; load data from source2
%elif %1 <= 8
    movdqa            %3, [%2]                                              ; load data from source2
%elif %1 <= 12
%if cpuflag(avx2)
    mova              %3, [%2]
%else
    movdqa            %3, [%2]                                              ; load data from source2
    movq              %4, [%2+16]                                           ; load data from source2
%endif ; avx2
%elif %1 <= 16
%if cpuflag(avx2)
    mova              %3, [%2]
%else
    movdqa            %3, [%2]                                              ; load data from source2
    movdqa            %4, [%2+16]                                           ; load data from source2
%endif ; avx2
%else ; %1 = 32
    mova              %3, [%2]
    mova              %4, [%2+32]
%endif
%endmacro
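
; SIMPLE_BILOAD above fetches the second, 14-bit intermediate source used by
; the bi-prediction functions; SIMPLE_LOAD below fetches the first source,
; picking the narrowest load that covers the given width and bit depth.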

%macro SIMPLE_LOAD 4    ;width, bitd, tab, r1
%if %1 == 2 || (%2 == 8 && %1 <= 4)
    movd              %4, [%3]                                               ; load data from source
%elif %1 == 4 || (%2 == 8 && %1 <= 8)
    movq              %4, [%3]                                               ; load data from source
%elif notcpuflag(avx)
    movu              %4, [%3]                                               ; load data from source
%elif %1 <= 8 || (%2 == 8 && %1 <= 16)
    movdqu            %4, [%3]
%else
    movu              %4, [%3]
%endif
%endmacro


%macro EPEL_FILTER 5 ; bit depth, filter index, xmma, xmmb, gprtmp
%if cpuflag(avx2)
%assign %%offset 32
%ifdef PIC
    lea              %5q, [hevc_epel_filters_avx2_%1]
    %define FILTER %5q
%else
    %define FILTER hevc_epel_filters_avx2_%1
%endif
%else
%assign %%offset 16
%ifdef PIC
    lea              %5q, [hevc_epel_filters_sse4_%1]
    %define FILTER %5q
%else
    %define FILTER hevc_epel_filters_sse4_%1
%endif
%endif ;cpuflag(avx2)
    sub              %2q, 1
%if cpuflag(avx2)
    shl              %2q, 6                      ; multiply by 64
%else
    shl              %2q, 5                      ; multiply by 32
%endif
    mova           %3, [FILTER + %2q]            ; load the first 2 filter taps
    mova           %4, [FILTER + %2q+%%offset]   ; load the last 2 filter taps
%endmacro

%macro EPEL_HV_FILTER 1
%if cpuflag(avx2)
%assign %%offset 32
%assign %%shift  6
%define %%table  hevc_epel_filters_avx2_%1
%else
%assign %%offset 16
%assign %%shift  5
%define %%table  hevc_epel_filters_sse4_%1
%endif

%ifdef PIC
    lea           r3srcq, [%%table]
    %define FILTER r3srcq
%else
    %define FILTER %%table
%endif
    sub              mxq, 1
    sub              myq, 1
    shl              mxq, %%shift                ; multiply by 32 (sse4) or 64 (avx2)
    shl              myq, %%shift                ; multiply by 32 (sse4) or 64 (avx2)
    mova             m14, [FILTER + mxq]              ; load the first 2 filter taps
    mova             m15, [FILTER + mxq+%%offset]     ; load the last 2 filter taps

%if cpuflag(avx2)
%define %%table  hevc_epel_filters_avx2_10
%else
%define %%table  hevc_epel_filters_sse4_10
%endif
%ifdef PIC
    lea           r3srcq, [%%table]
    %define FILTER r3srcq
%else
    %define FILTER %%table
%endif
    mova             m12, [FILTER + myq]              ; load the first 2 filter taps
    mova             m13, [FILTER + myq+%%offset]     ; load the last 2 filter taps
    lea           r3srcq, [srcstrideq*3]
%endmacro

%macro QPEL_FILTER 2

%if cpuflag(avx2)
%assign %%offset 32
%assign %%shift  7
%define %%table  hevc_qpel_filters_avx2_%1
%else
%assign %%offset 16
%assign %%shift  6
%define %%table  hevc_qpel_filters_sse4_%1
%endif

%ifdef PIC
    lea         rfilterq, [%%table]
%else
    %define rfilterq %%table
%endif
    sub              %2q, 1
    shl              %2q, %%shift                        ; multiply by 64 (sse4) or 128 (avx2)
    mova             m12, [rfilterq + %2q]               ; load filter taps 1-2
    mova             m13, [rfilterq + %2q +   %%offset]  ; load filter taps 3-4
    mova             m14, [rfilterq + %2q + 2*%%offset]  ; load filter taps 5-6
    mova             m15, [rfilterq + %2q + 3*%%offset]  ; load filter taps 7-8
%endmacro

%macro EPEL_LOAD 4
%if (%1 == 8 && %4 <= 4)
%define %%load movd
%elif (%1 == 8 && %4 <= 8) || (%1 > 8 && %4 <= 4)
%define %%load movq
%else
%define %%load movdqu
%endif

    %%load            m0, [%2q ]
%ifnum %3
    %%load            m1, [%2q+  %3]
    %%load            m2, [%2q+2*%3]
    %%load            m3, [%2q+3*%3]
%else
    %%load            m1, [%2q+  %3q]
    %%load            m2, [%2q+2*%3q]
    %%load            m3, [%2q+r3srcq]
%endif
%if %1 == 8
%if %4 > 8
    SBUTTERFLY        bw, 0, 1, 7
    SBUTTERFLY        bw, 2, 3, 7
%else
    punpcklbw         m0, m1
    punpcklbw         m2, m3
%endif
%else
%if %4 > 4
    SBUTTERFLY        wd, 0, 1, 7
    SBUTTERFLY        wd, 2, 3, 7
%else
    punpcklwd         m0, m1
    punpcklwd         m2, m3
%endif
%endif
%endmacro
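
; EPEL_LOAD gathers the four taps' worth of input (four rows vertically, four
; shifted loads horizontally) and interleaves them pairwise with
; punpck*/SBUTTERFLY so EPEL_COMPUTE can multiply-accumulate two taps at a time.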


%macro QPEL_H_LOAD 4 ; bitdepth, src, width, tmp
%assign %%stride (%1+7)/8
%if %1 == 8
%if %3 <= 4
%define %%load movd
%elif %3 == 8
%define %%load movq
%else
%define %%load movu
%endif
%else
%if %3 == 2
%define %%load movd
%elif %3 == 4
%define %%load movq
%else
%define %%load movu
%endif
%endif
    %%load            m0, [%2-3*%%stride]        ;load data from source
    %%load            m1, [%2-2*%%stride]
    %%load            m2, [%2-%%stride  ]
    %%load            m3, [%2           ]
    %%load            m4, [%2+%%stride  ]
    %%load            m5, [%2+2*%%stride]
    %%load            m6, [%2+3*%%stride]
    %%load            m7, [%2+4*%%stride]

%if %1 == 8
%if %3 > 8
    SBUTTERFLY        wd, 0, 1, %4
    SBUTTERFLY        wd, 2, 3, %4
    SBUTTERFLY        wd, 4, 5, %4
    SBUTTERFLY        wd, 6, 7, %4
%else
    punpcklbw         m0, m1
    punpcklbw         m2, m3
    punpcklbw         m4, m5
    punpcklbw         m6, m7
%endif
%else
%if %3 > 4
    SBUTTERFLY        dq, 0, 1, %4
    SBUTTERFLY        dq, 2, 3, %4
    SBUTTERFLY        dq, 4, 5, %4
    SBUTTERFLY        dq, 6, 7, %4
%else
    punpcklwd         m0, m1
    punpcklwd         m2, m3
    punpcklwd         m4, m5
    punpcklwd         m6, m7
%endif
%endif
%endmacro

%macro QPEL_V_LOAD 5 ; bitdepth, src, stride, width, tmp gpr
    lea              %5q, [%2]
    sub              %5q, r3srcq
    movu              m0, [%5q            ]      ;load x- 3*srcstride
    movu              m1, [%5q+   %3q     ]      ;load x- 2*srcstride
    movu              m2, [%5q+ 2*%3q     ]      ;load x-srcstride
    movu              m3, [%2       ]      ;load x
    movu              m4, [%2+   %3q]      ;load x+stride
    movu              m5, [%2+ 2*%3q]      ;load x+2*stride
    movu              m6, [%2+r3srcq]      ;load x+3*stride
    movu              m7, [%2+ 4*%3q]      ;load x+4*stride
%if %1 == 8
%if %4 > 8
    SBUTTERFLY        bw, 0, 1, 8
    SBUTTERFLY        bw, 2, 3, 8
    SBUTTERFLY        bw, 4, 5, 8
    SBUTTERFLY        bw, 6, 7, 8
%else
    punpcklbw         m0, m1
    punpcklbw         m2, m3
    punpcklbw         m4, m5
    punpcklbw         m6, m7
%endif
%else
%if %4 > 4
    SBUTTERFLY        wd, 0, 1, 8
    SBUTTERFLY        wd, 2, 3, 8
    SBUTTERFLY        wd, 4, 5, 8
    SBUTTERFLY        wd, 6, 7, 8
%else
    punpcklwd         m0, m1
    punpcklwd         m2, m3
    punpcklwd         m4, m5
    punpcklwd         m6, m7
%endif
%endif
%endmacro

%macro PEL_12STORE2 3
    movd           [%1], %2
%endmacro
%macro PEL_12STORE4 3
    movq           [%1], %2
%endmacro
%macro PEL_12STORE6 3
    movq           [%1], %2
    psrldq            %2, 8
    movd         [%1+8], %2
%endmacro
%macro PEL_12STORE8 3
    movdqa         [%1], %2
%endmacro
%macro PEL_12STORE12 3
    movdqa         [%1], %2
    movq        [%1+16], %3
%endmacro
%macro PEL_12STORE16 3
    PEL_12STORE8      %1, %2, %3
    movdqa       [%1+16], %3
%endmacro

%macro PEL_10STORE2 3
    movd           [%1], %2
%endmacro
%macro PEL_10STORE4 3
    movq           [%1], %2
%endmacro
%macro PEL_10STORE6 3
    movq           [%1], %2
    psrldq            %2, 8
    movd         [%1+8], %2
%endmacro
%macro PEL_10STORE8 3
    movdqa         [%1], %2
%endmacro
%macro PEL_10STORE12 3
    movdqa         [%1], %2
    movq        [%1+16], %3
%endmacro
%macro PEL_10STORE16 3
%if cpuflag(avx2)
    movu            [%1], %2
%else
    PEL_10STORE8      %1, %2, %3
    movdqa       [%1+16], %3
%endif
%endmacro

%macro PEL_10STORE32 3
    PEL_10STORE16     %1, %2, %3
    movu         [%1+32], %3
%endmacro

%macro PEL_8STORE2 3
    pextrw          [%1], %2, 0
%endmacro
%macro PEL_8STORE4 3
    movd            [%1], %2
%endmacro
%macro PEL_8STORE6 3
    movd            [%1], %2
    pextrw        [%1+4], %2, 2
%endmacro
%macro PEL_8STORE8 3
    movq            [%1], %2
%endmacro
%macro PEL_8STORE12 3
    movq            [%1], %2
    psrldq            %2, 8
    movd          [%1+8], %2
%endmacro
%macro PEL_8STORE16 3
%if cpuflag(avx2)
    movdqu        [%1], %2
%else
    mova          [%1], %2
%endif ; avx2
%endmacro
%macro PEL_8STORE32 3
    movu          [%1], %2
%endmacro

%macro LOOP_END 3
    add              %1q, 2*MAX_PB_SIZE          ; dst += dststride
    add              %2q, %3q                    ; src += srcstride
    dec          heightd                         ; cmp height
    jnz               .loop                      ; height loop
%endmacro


%macro MC_PIXEL_COMPUTE 2-3 ; width, bitdepth, [avx2]
%if %2 == 8
%if cpuflag(avx2) && %0 == 3
%if %1 > 16
    vextracti128 xm1, m0, 1
    pmovzxbw      m1, xm1
    psllw         m1, 14-%2
%endif
    pmovzxbw      m0, xm0
%else ; not avx2
%if %1 > 8
    punpckhbw     m1, m0, m2
    psllw         m1, 14-%2
%endif
    punpcklbw     m0, m2
%endif
%endif ; avx2
    psllw         m0, 14-%2
%endmacro
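
; MC_PIXEL_COMPUTE widens the source pixels and shifts them up to the common
; 14-bit intermediate precision (psllw by 14-bitdepth) shared by all the put
; functions.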

%macro EPEL_COMPUTE 4-8 ; bitdepth, width, filter1, filter2, HV/m0, m2, m1, m3
%if %0 == 8
%define %%reg0 %5
%define %%reg2 %6
%define %%reg1 %7
%define %%reg3 %8
%else
%define %%reg0 m0
%define %%reg2 m2
%define %%reg1 m1
%define %%reg3 m3
%endif
%if %1 == 8
%if cpuflag(avx2) && (%0 == 5)
%if %2 > 16
    vperm2i128    m10, m0, m1, q0301
%endif
    vinserti128    m0, m0, xm1, 1
    mova           m1, m10
%if %2 > 16
    vperm2i128    m10, m2, m3, q0301
%endif
    vinserti128    m2, m2, xm3, 1
    mova           m3, m10
%endif
    pmaddubsw      %%reg0, %3   ;x1*c1+x2*c2
    pmaddubsw      %%reg2, %4   ;x3*c3+x4*c4
    paddw          %%reg0, %%reg2
%if %2 > 8
    pmaddubsw      %%reg1, %3
    pmaddubsw      %%reg3, %4
    paddw          %%reg1, %%reg3
%endif
%else
    pmaddwd        %%reg0, %3
    pmaddwd        %%reg2, %4
    paddd          %%reg0, %%reg2
%if %2 > 4
    pmaddwd        %%reg1, %3
    pmaddwd        %%reg3, %4
    paddd          %%reg1, %%reg3
%if %1 != 8
    psrad          %%reg1, %1-8
%endif
%endif
%if %1 != 8
    psrad          %%reg0, %1-8
%endif
    packssdw       %%reg0, %%reg1
%endif
%endmacro

%macro QPEL_HV_COMPUTE 4     ; width, bitdepth, filter idx, pack suffix (expands to p%4)

%if cpuflag(avx2)
%assign %%offset 32
%define %%table  hevc_qpel_filters_avx2_%2
%else
%assign %%offset 16
%define %%table  hevc_qpel_filters_sse4_%2
%endif

%ifdef PIC
    lea         rfilterq, [%%table]
%else
    %define rfilterq %%table
%endif

%if %2 == 8
    pmaddubsw         m0, [rfilterq + %3q*8   ]   ;x1*c1+x2*c2
    pmaddubsw         m2, [rfilterq + %3q*8+%%offset]   ;x3*c3+x4*c4
    pmaddubsw         m4, [rfilterq + %3q*8+2*%%offset]   ;x5*c5+x6*c6
    pmaddubsw         m6, [rfilterq + %3q*8+3*%%offset]   ;x7*c7+x8*c8
    paddw             m0, m2
    paddw             m4, m6
    paddw             m0, m4
%else
    pmaddwd           m0, [rfilterq + %3q*8   ]
    pmaddwd           m2, [rfilterq + %3q*8+%%offset]
    pmaddwd           m4, [rfilterq + %3q*8+2*%%offset]
    pmaddwd           m6, [rfilterq + %3q*8+3*%%offset]
    paddd             m0, m2
    paddd             m4, m6
    paddd             m0, m4
%if %2 != 8
    psrad             m0, %2-8
%endif
%if %1 > 4
    pmaddwd           m1, [rfilterq + %3q*8   ]
    pmaddwd           m3, [rfilterq + %3q*8+%%offset]
    pmaddwd           m5, [rfilterq + %3q*8+2*%%offset]
    pmaddwd           m7, [rfilterq + %3q*8+3*%%offset]
    paddd             m1, m3
    paddd             m5, m7
    paddd             m1, m5
%if %2 != 8
    psrad             m1, %2-8
%endif
%endif
    p%4               m0, m1
%endif
%endmacro

%macro QPEL_COMPUTE 2-3     ; width, bitdepth, [avx2]
%if %2 == 8
%if cpuflag(avx2) && (%0 == 3)

    vperm2i128 m10, m0,  m1, q0301
    vinserti128 m0, m0, xm1, 1
    SWAP 1, 10

    vperm2i128 m10, m2,  m3, q0301
    vinserti128 m2, m2, xm3, 1
    SWAP 3, 10

    vperm2i128 m10, m4,  m5, q0301
    vinserti128 m4, m4, xm5, 1
    SWAP 5, 10

    vperm2i128 m10, m6,  m7, q0301
    vinserti128 m6, m6, xm7, 1
    SWAP 7, 10
%endif

    pmaddubsw         m0, m12   ;x1*c1+x2*c2
    pmaddubsw         m2, m13   ;x3*c3+x4*c4
    pmaddubsw         m4, m14   ;x5*c5+x6*c6
    pmaddubsw         m6, m15   ;x7*c7+x8*c8
    paddw             m0, m2
    paddw             m4, m6
    paddw             m0, m4
%if %1 > 8
    pmaddubsw         m1, m12
    pmaddubsw         m3, m13
    pmaddubsw         m5, m14
    pmaddubsw         m7, m15
    paddw             m1, m3
    paddw             m5, m7
    paddw             m1, m5
%endif
%else
    pmaddwd           m0, m12
    pmaddwd           m2, m13
    pmaddwd           m4, m14
    pmaddwd           m6, m15
    paddd             m0, m2
    paddd             m4, m6
    paddd             m0, m4
%if %2 != 8
    psrad             m0, %2-8
%endif
%if %1 > 4
    pmaddwd           m1, m12
    pmaddwd           m3, m13
    pmaddwd           m5, m14
    pmaddwd           m7, m15
    paddd             m1, m3
    paddd             m5, m7
    paddd             m1, m5
%if %2 != 8
    psrad             m1, %2-8
%endif
%endif
%endif
%endmacro

%macro BI_COMPUTE 7-8     ; width, bitd, src1l, src1h, src2l, src2h, pw, [avx2]
    paddsw            %3, %5
%if %1 > 8
    paddsw            %4, %6
%endif
    UNI_COMPUTE       %1, %2, %3, %4, %7
%if %0 == 8 && cpuflag(avx2) && (%2 == 8)
    vpermq            %3, %3, 216
    vpermq            %4, %4, 216
%endif
%endmacro
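
; BI_COMPUTE adds the two 14-bit predictions with signed saturation before
; handing them to UNI_COMPUTE; the pw_bi_* constants are half the pw_* ones,
; so the final rounding shift also averages the two sources.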

%macro UNI_COMPUTE 5 ; width, bitd, srcl, srch, pw
    pmulhrsw          %3, %5
%if %1 > 8 || (%2 > 8 && %1 > 4)
    pmulhrsw          %4, %5
%endif
%if %2 == 8
    packuswb          %3, %4
%else
    CLIPW             %3, [pb_0], [max_pixels_%2]
%if (%1 > 8 && notcpuflag(avx)) || %1 > 16
    CLIPW             %4, [pb_0], [max_pixels_%2]
%endif
%endif
%endmacro
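
; In UNI_COMPUTE, pmulhrsw by pw_%2 = 1 << (bitd+1) is a rounded right shift
; by 14-bitd, taking the 14-bit intermediate back to pixel range; the result
; is then packed (8-bit) or clipped against max_pixels_%2 (10/12-bit).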


; ******************************
; void put_hevc_mc_pixels(int16_t *dst, ptrdiff_t dststride,
;                         uint8_t *_src, ptrdiff_t _srcstride,
;                         int height, int mx, int my)
; ******************************

%macro HEVC_PUT_HEVC_PEL_PIXELS 2
HEVC_PEL_PIXELS     %1, %2
HEVC_UNI_PEL_PIXELS %1, %2
HEVC_BI_PEL_PIXELS  %1, %2
%endmacro

%macro HEVC_PEL_PIXELS 2
cglobal hevc_put_hevc_pel_pixels%1_%2, 4, 4, 3, dst, src, srcstride, height
    pxor               m2, m2
.loop:
    SIMPLE_LOAD       %1, %2, srcq, m0
    MC_PIXEL_COMPUTE  %1, %2, 1
    PEL_10STORE%1     dstq, m0, m1
    LOOP_END         dst, src, srcstride
    RET
%endmacro

%macro HEVC_UNI_PEL_PIXELS 2
cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 2, dst, dststride, src, srcstride, height
.loop:
    SIMPLE_LOAD       %1, %2, srcq, m0
    PEL_%2STORE%1   dstq, m0, m1
    add             dstq, dststrideq             ; dst += dststride
    add             srcq, srcstrideq             ; src += srcstride
    dec          heightd                         ; cmp height
    jnz               .loop                      ; height loop
    RET
%endmacro

%macro HEVC_BI_PEL_PIXELS 2
cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstride, src2, height
    pxor              m2, m2
    movdqa            m5, [pw_bi_%2]
.loop:
    SIMPLE_LOAD       %1, %2, srcq, m0
    SIMPLE_BILOAD     %1, src2q, m3, m4
    MC_PIXEL_COMPUTE  %1, %2, 1
    BI_COMPUTE        %1, %2, m0, m1, m3, m4, m5, 1
    PEL_%2STORE%1   dstq, m0, m1
    add             dstq, dststrideq             ; dst += dststride
    add             srcq, srcstrideq             ; src += srcstride
    add            src2q, 2*MAX_PB_SIZE          ; src2 += 2*MAX_PB_SIZE
    dec          heightd                         ; cmp height
    jnz               .loop                      ; height loop
    RET
%endmacro


; ******************************
; void put_hevc_epel_hX(int16_t *dst, ptrdiff_t dststride,
;                       uint8_t *_src, ptrdiff_t _srcstride,
;                       int height, int mx, int my, int width);
; ******************************


%macro HEVC_PUT_HEVC_EPEL 2
%if cpuflag(avx2)
%define XMM_REGS  11
%else
%define XMM_REGS  8
%endif

cglobal hevc_put_hevc_epel_h%1_%2, 5, 6, XMM_REGS, dst, src, srcstride, height, mx, rfilter
%assign %%stride ((%2 + 7)/8)
    EPEL_FILTER       %2, mx, m4, m5, rfilter
.loop:
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m4, m5, 1
    PEL_10STORE%1      dstq, m0, m1
    LOOP_END         dst, src, srcstride
    RET

cglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 7, XMM_REGS, dst, dststride, src, srcstride, height, mx, rfilter
%assign %%stride ((%2 + 7)/8)
    movdqa            m6, [pw_%2]
    EPEL_FILTER       %2, mx, m4, m5, rfilter
.loop:
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m4, m5
    UNI_COMPUTE       %1, %2, m0, m1, m6
    PEL_%2STORE%1   dstq, m0, m1
    add             dstq, dststrideq             ; dst += dststride
    add             srcq, srcstrideq             ; src += srcstride
    dec          heightd                         ; cmp height
    jnz               .loop                      ; height loop
    RET

cglobal hevc_put_hevc_bi_epel_h%1_%2, 7, 8, XMM_REGS, dst, dststride, src, srcstride, src2, height, mx, rfilter
    movdqa            m6, [pw_bi_%2]
    EPEL_FILTER       %2, mx, m4, m5, rfilter
.loop:
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m4, m5, 1
    SIMPLE_BILOAD     %1, src2q, m2, m3
    BI_COMPUTE        %1, %2, m0, m1, m2, m3, m6, 1
    PEL_%2STORE%1   dstq, m0, m1
    add             dstq, dststrideq             ; dst += dststride
    add             srcq, srcstrideq             ; src += srcstride
    add            src2q, 2*MAX_PB_SIZE          ; src2 += 2*MAX_PB_SIZE
    dec          heightd                         ; cmp height
    jnz               .loop                      ; height loop
    RET

; ******************************
; void put_hevc_epel_v(int16_t *dst, ptrdiff_t dststride,
;                      uint8_t *_src, ptrdiff_t _srcstride,
;                      int height, int mx, int my, int width)
; ******************************

cglobal hevc_put_hevc_epel_v%1_%2, 4, 6, XMM_REGS, dst, src, srcstride, height, r3src, my
    movifnidn        myd, mym
    sub             srcq, srcstrideq
    EPEL_FILTER       %2, my, m4, m5, r3src
    lea           r3srcq, [srcstrideq*3]
.loop:
    EPEL_LOAD         %2, srcq, srcstride, %1
    EPEL_COMPUTE      %2, %1, m4, m5, 1
    PEL_10STORE%1     dstq, m0, m1
    LOOP_END          dst, src, srcstride
    RET

cglobal hevc_put_hevc_uni_epel_v%1_%2, 5, 7, XMM_REGS, dst, dststride, src, srcstride, height, r3src, my
    movifnidn        myd, mym
    movdqa            m6, [pw_%2]
    sub             srcq, srcstrideq
    EPEL_FILTER       %2, my, m4, m5, r3src
    lea           r3srcq, [srcstrideq*3]
.loop:
    EPEL_LOAD         %2, srcq, srcstride, %1
    EPEL_COMPUTE      %2, %1, m4, m5
    UNI_COMPUTE       %1, %2, m0, m1, m6
    PEL_%2STORE%1   dstq, m0, m1
    add             dstq, dststrideq             ; dst += dststride
    add             srcq, srcstrideq             ; src += srcstride
    dec          heightd                         ; cmp height
    jnz               .loop                      ; height loop
    RET


cglobal hevc_put_hevc_bi_epel_v%1_%2, 6, 8, XMM_REGS, dst, dststride, src, srcstride, src2, height, r3src, my
    movifnidn        myd, mym
    movdqa            m6, [pw_bi_%2]
    sub             srcq, srcstrideq
    EPEL_FILTER       %2, my, m4, m5, r3src
    lea           r3srcq, [srcstrideq*3]
.loop:
    EPEL_LOAD         %2, srcq, srcstride, %1
    EPEL_COMPUTE      %2, %1, m4, m5, 1
    SIMPLE_BILOAD     %1, src2q, m2, m3
    BI_COMPUTE        %1, %2, m0, m1, m2, m3, m6, 1
    PEL_%2STORE%1   dstq, m0, m1
    add             dstq, dststrideq             ; dst += dststride
    add             srcq, srcstrideq             ; src += srcstride
    add            src2q, 2*MAX_PB_SIZE          ; src2 += 2*MAX_PB_SIZE
    dec          heightd                         ; cmp height
    jnz               .loop                      ; height loop
    RET
%endmacro


; ******************************
; void put_hevc_epel_hv(int16_t *dst, ptrdiff_t dststride,
;                       uint8_t *_src, ptrdiff_t _srcstride,
;                       int height, int mx, int my, int width)
; ******************************

%macro HEVC_PUT_HEVC_EPEL_HV 2
cglobal hevc_put_hevc_epel_hv%1_%2, 6, 7, 16, dst, src, srcstride, height, mx, my, r3src
%assign %%stride ((%2 + 7)/8)
    sub             srcq, srcstrideq
    EPEL_HV_FILTER    %2
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP              m8, m1
%endif
    SWAP              m4, m0
    add             srcq, srcstrideq
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP              m9, m1
%endif
    SWAP              m5, m0
    add             srcq, srcstrideq
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP             m10, m1
%endif
    SWAP              m6, m0
    add             srcq, srcstrideq
.loop:
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP             m11, m1
%endif
    SWAP              m7, m0
    punpcklwd         m0, m4, m5
    punpcklwd         m2, m6, m7
%if %1 > 4
    punpckhwd         m1, m4, m5
    punpckhwd         m3, m6, m7
%endif
    EPEL_COMPUTE      14, %1, m12, m13
%if (%1 > 8 && (%2 == 8))
    punpcklwd         m4, m8, m9
    punpcklwd         m2, m10, m11
    punpckhwd         m8, m8, m9
    punpckhwd         m3, m10, m11
    EPEL_COMPUTE      14, %1, m12, m13, m4, m2, m8, m3
%if cpuflag(avx2)
    vinserti128       m2, m0, xm4, 1
    vperm2i128        m3, m0, m4, q0301
    PEL_10STORE%1     dstq, m2, m3
%else
    PEL_10STORE%1     dstq, m0, m4
%endif
%else
    PEL_10STORE%1     dstq, m0, m1
%endif
    movdqa            m4, m5
    movdqa            m5, m6
    movdqa            m6, m7
%if (%1 > 8 && (%2 == 8))
    mova              m8, m9
    mova              m9, m10
    mova             m10, m11
%endif
    LOOP_END         dst, src, srcstride
    RET

cglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 8, 16, dst, dststride, src, srcstride, height, mx, my, r3src
%assign %%stride ((%2 + 7)/8)
    sub             srcq, srcstrideq
    EPEL_HV_FILTER    %2
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP              m8, m1
%endif
    SWAP              m4, m0
    add             srcq, srcstrideq
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP              m9, m1
%endif
    SWAP              m5, m0
    add             srcq, srcstrideq
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP             m10, m1
%endif
    SWAP              m6, m0
    add             srcq, srcstrideq
.loop:
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP             m11, m1
%endif
    mova              m7, m0
    punpcklwd         m0, m4, m5
    punpcklwd         m2, m6, m7
%if %1 > 4
    punpckhwd         m1, m4, m5
    punpckhwd         m3, m6, m7
%endif
    EPEL_COMPUTE      14, %1, m12, m13
%if (%1 > 8 && (%2 == 8))
    punpcklwd         m4, m8, m9
    punpcklwd         m2, m10, m11
    punpckhwd         m8, m8, m9
    punpckhwd         m3, m10, m11
    EPEL_COMPUTE      14, %1, m12, m13, m4, m2, m8, m3
    UNI_COMPUTE       %1, %2, m0, m4, [pw_%2]
%else
    UNI_COMPUTE       %1, %2, m0, m1, [pw_%2]
%endif
    PEL_%2STORE%1   dstq, m0, m1
    mova              m4, m5
    mova              m5, m6
    mova              m6, m7
%if (%1 > 8 && (%2 == 8))
    mova              m8, m9
    mova              m9, m10
    mova             m10, m11
%endif
    add             dstq, dststrideq             ; dst += dststride
    add             srcq, srcstrideq             ; src += srcstride
    dec          heightd                         ; cmp height
    jnz               .loop                      ; height loop
    RET

cglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 9, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src
%assign %%stride ((%2 + 7)/8)
    sub             srcq, srcstrideq
    EPEL_HV_FILTER    %2
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP              m8, m1
%endif
    SWAP              m4, m0
    add             srcq, srcstrideq
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP              m9, m1
%endif
    SWAP              m5, m0
    add             srcq, srcstrideq
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP             m10, m1
%endif
    SWAP              m6, m0
    add             srcq, srcstrideq
.loop:
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP             m11, m1
%endif
    SWAP              m7, m0
    punpcklwd         m0, m4, m5
    punpcklwd         m2, m6, m7
%if %1 > 4
    punpckhwd         m1, m4, m5
    punpckhwd         m3, m6, m7
%endif
    EPEL_COMPUTE      14, %1, m12, m13
%if (%1 > 8 && (%2 == 8))
    punpcklwd         m4, m8, m9
    punpcklwd         m2, m10, m11
    punpckhwd         m8, m8, m9
    punpckhwd         m3, m10, m11
    EPEL_COMPUTE      14, %1, m12, m13, m4, m2, m8, m3
    SIMPLE_BILOAD     %1, src2q, m8, m3
%if cpuflag(avx2)
    vinserti128       m1, m8, xm3, 1
    vperm2i128        m2, m8, m3, q0301
    BI_COMPUTE        %1, %2, m0, m4, m1, m2, [pw_bi_%2]
%else
    BI_COMPUTE        %1, %2, m0, m4, m8, m3, [pw_bi_%2]
%endif
%else
    SIMPLE_BILOAD     %1, src2q, m8, m9
    BI_COMPUTE        %1, %2, m0, m1, m8, m9, [pw_bi_%2]
%endif
    PEL_%2STORE%1   dstq, m0, m4
    mova              m4, m5
    mova              m5, m6
    mova              m6, m7
%if (%1 > 8 && (%2 == 8))
    mova              m8, m9
    mova              m9, m10
    mova             m10, m11
%endif
    add             dstq, dststrideq             ; dst += dststride
    add             srcq, srcstrideq             ; src += srcstride
    add            src2q, 2*MAX_PB_SIZE          ; src2 += 2*MAX_PB_SIZE
    dec          heightd                         ; cmp height
    jnz               .loop                      ; height loop
    RET
%endmacro

; ******************************
; void put_hevc_qpel_hX_X_X(int16_t *dst, ptrdiff_t dststride,
;                       uint8_t *_src, ptrdiff_t _srcstride,
;                       int height, int mx, int my, int width)
; ******************************

%macro HEVC_PUT_HEVC_QPEL 2
cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 16, dst, src, srcstride, height, mx, rfilter
    QPEL_FILTER       %2, mx
.loop:
    QPEL_H_LOAD       %2, srcq, %1, 10
    QPEL_COMPUTE      %1, %2, 1
%if %2 > 8
    packssdw          m0, m1
%endif
    PEL_10STORE%1     dstq, m0, m1
    LOOP_END          dst, src, srcstride
    RET

cglobal hevc_put_hevc_uni_qpel_h%1_%2, 6, 7, 16, dst, dststride, src, srcstride, height, mx, rfilter
    mova              m9, [pw_%2]
    QPEL_FILTER       %2, mx
.loop:
    QPEL_H_LOAD       %2, srcq, %1, 10
    QPEL_COMPUTE      %1, %2
%if %2 > 8
    packssdw          m0, m1
%endif
    UNI_COMPUTE       %1, %2, m0, m1, m9
    PEL_%2STORE%1   dstq, m0, m1
    add             dstq, dststrideq             ; dst += dststride
    add             srcq, srcstrideq             ; src += srcstride
    dec          heightd                         ; cmp height
    jnz               .loop                      ; height loop
    RET

cglobal hevc_put_hevc_bi_qpel_h%1_%2, 7, 8, 16, dst, dststride, src, srcstride, src2, height, mx, rfilter
    movdqa            m9, [pw_bi_%2]
    QPEL_FILTER       %2, mx
.loop:
    QPEL_H_LOAD       %2, srcq, %1, 10
    QPEL_COMPUTE      %1, %2, 1
%if %2 > 8
    packssdw          m0, m1
%endif
    SIMPLE_BILOAD     %1, src2q, m10, m11
    BI_COMPUTE        %1, %2, m0, m1, m10, m11, m9, 1
    PEL_%2STORE%1   dstq, m0, m1
    add             dstq, dststrideq             ; dst += dststride
    add             srcq, srcstrideq             ; src += srcstride
    add            src2q, 2*MAX_PB_SIZE          ; src2 += 2*MAX_PB_SIZE
    dec          heightd                         ; cmp height
    jnz               .loop                      ; height loop
    RET


; ******************************
; void put_hevc_qpel_vX_X_X(int16_t *dst, ptrdiff_t dststride,
;                       uint8_t *_src, ptrdiff_t _srcstride,
;                       int height, int mx, int my, int width)
; ******************************

cglobal hevc_put_hevc_qpel_v%1_%2, 4, 8, 16, dst, src, srcstride, height, r3src, my, rfilter
    movifnidn        myd, mym
    lea           r3srcq, [srcstrideq*3]
    QPEL_FILTER       %2, my
.loop:
    QPEL_V_LOAD       %2, srcq, srcstride, %1, r7
    QPEL_COMPUTE      %1, %2, 1
%if %2 > 8
    packssdw          m0, m1
%endif
    PEL_10STORE%1     dstq, m0, m1
    LOOP_END         dst, src, srcstride
    RET

cglobal hevc_put_hevc_uni_qpel_v%1_%2, 5, 9, 16, dst, dststride, src, srcstride, height, r3src, my, rfilter
    movifnidn        myd, mym
    movdqa            m9, [pw_%2]
    lea           r3srcq, [srcstrideq*3]
    QPEL_FILTER       %2, my
.loop:
    QPEL_V_LOAD       %2, srcq, srcstride, %1, r8
    QPEL_COMPUTE      %1, %2
%if %2 > 8
    packssdw          m0, m1
%endif
    UNI_COMPUTE       %1, %2, m0, m1, m9
    PEL_%2STORE%1   dstq, m0, m1
    add             dstq, dststrideq             ; dst += dststride
    add             srcq, srcstrideq             ; src += srcstride
    dec          heightd                         ; cmp height
    jnz               .loop                      ; height loop
    RET

cglobal hevc_put_hevc_bi_qpel_v%1_%2, 6, 10, 16, dst, dststride, src, srcstride, src2, height, r3src, my, rfilter
    movifnidn        myd, mym
    movdqa            m9, [pw_bi_%2]
    lea           r3srcq, [srcstrideq*3]
    QPEL_FILTER       %2, my
.loop:
    QPEL_V_LOAD       %2, srcq, srcstride, %1, r9
    QPEL_COMPUTE      %1, %2, 1
%if %2 > 8
    packssdw          m0, m1
%endif
    SIMPLE_BILOAD     %1, src2q, m10, m11
    BI_COMPUTE        %1, %2, m0, m1, m10, m11, m9, 1
    PEL_%2STORE%1   dstq, m0, m1
    add             dstq, dststrideq             ; dst += dststride
    add             srcq, srcstrideq             ; src += srcstride
    add            src2q, 2*MAX_PB_SIZE          ; src2 += 2*MAX_PB_SIZE
    dec          heightd                         ; cmp height
    jnz               .loop                      ; height loop
    RET
%endmacro


; ******************************
; void put_hevc_qpel_hvX_X(int16_t *dst, ptrdiff_t dststride,
;                       uint8_t *_src, ptrdiff_t _srcstride,
;                       int height, int mx, int my)
; ******************************
%macro HEVC_PUT_HEVC_QPEL_HV 2
cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 8, 16, dst, src, srcstride, height, mx, my, r3src, rfilter
%if cpuflag(avx2)
%assign %%shift  4
%else
%assign %%shift  3
%endif
    sub              mxq, 1
    sub              myq, 1
    shl              mxq, %%shift                ; multiply by 8 (sse4) or 16 (avx2)
    shl              myq, %%shift                ; multiply by 8 (sse4) or 16 (avx2)
    lea           r3srcq, [srcstrideq*3]
    sub             srcq, r3srcq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP              m8, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP              m9, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m10, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m11, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m12, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m13, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m14, m0
    add             srcq, srcstrideq
.loop:
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m15, m0
    punpcklwd         m0, m8, m9
    punpcklwd         m2, m10, m11
    punpcklwd         m4, m12, m13
    punpcklwd         m6, m14, m15
%if %1 > 4
    punpckhwd         m1, m8, m9
    punpckhwd         m3, m10, m11
    punpckhwd         m5, m12, m13
    punpckhwd         m7, m14, m15
%endif
    QPEL_HV_COMPUTE   %1, 14, my, ackssdw
    PEL_10STORE%1     dstq, m0, m1
%if %1 <= 4
    movq              m8, m9
    movq              m9, m10
    movq             m10, m11
    movq             m11, m12
    movq             m12, m13
    movq             m13, m14
    movq             m14, m15
%else
    movdqa            m8, m9
    movdqa            m9, m10
    movdqa           m10, m11
    movdqa           m11, m12
    movdqa           m12, m13
    movdqa           m13, m14
    movdqa           m14, m15
%endif
    LOOP_END         dst, src, srcstride
    RET

cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 16, dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
%if cpuflag(avx2)
%assign %%shift  4
%else
%assign %%shift  3
%endif
    sub              mxq, 1
    sub              myq, 1
    shl              mxq, %%shift                ; multiply by 8 (sse4) or 16 (avx2)
    shl              myq, %%shift                ; multiply by 8 (sse4) or 16 (avx2)
    lea           r3srcq, [srcstrideq*3]
    sub             srcq, r3srcq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP              m8, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP              m9, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m10, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m11, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m12, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m13, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m14, m0
    add             srcq, srcstrideq
.loop:
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m15, m0
    punpcklwd         m0, m8, m9
    punpcklwd         m2, m10, m11
    punpcklwd         m4, m12, m13
    punpcklwd         m6, m14, m15
%if %1 > 4
    punpckhwd         m1, m8, m9
    punpckhwd         m3, m10, m11
    punpckhwd         m5, m12, m13
    punpckhwd         m7, m14, m15
%endif
    QPEL_HV_COMPUTE   %1, 14, my, ackusdw
    UNI_COMPUTE       %1, %2, m0, m1, [pw_%2]
    PEL_%2STORE%1   dstq, m0, m1

%if %1 <= 4
    movq              m8, m9
    movq              m9, m10
    movq             m10, m11
    movq             m11, m12
    movq             m12, m13
    movq             m13, m14
    movq             m14, m15
%else
    mova            m8, m9
    mova            m9, m10
    mova           m10, m11
    mova           m11, m12
    mova           m12, m13
    mova           m13, m14
    mova           m14, m15
%endif
    add             dstq, dststrideq             ; dst += dststride
    add             srcq, srcstrideq             ; src += srcstride
    dec          heightd                         ; cmp height
    jnz               .loop                      ; height loop
    RET

cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter
%if cpuflag(avx2)
%assign %%shift  4
%else
%assign %%shift  3
%endif
    sub              mxq, 1
    sub              myq, 1
    shl              mxq, %%shift                ; multiply by 8 (sse4) or 16 (avx2)
    shl              myq, %%shift                ; multiply by 8 (sse4) or 16 (avx2)
    lea           r3srcq, [srcstrideq*3]
    sub             srcq, r3srcq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP              m8, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP              m9, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m10, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m11, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m12, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m13, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m14, m0
    add             srcq, srcstrideq
.loop:
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m15, m0
    punpcklwd         m0, m8, m9
    punpcklwd         m2, m10, m11
    punpcklwd         m4, m12, m13
    punpcklwd         m6, m14, m15
%if %1 > 4
    punpckhwd         m1, m8, m9
    punpckhwd         m3, m10, m11
    punpckhwd         m5, m12, m13
    punpckhwd         m7, m14, m15
%endif
    QPEL_HV_COMPUTE   %1, 14, my, ackssdw
    SIMPLE_BILOAD     %1, src2q, m8, m9 ;m9 not used in this case
    BI_COMPUTE        %1, %2, m0, m1, m8, m9, [pw_bi_%2]
    PEL_%2STORE%1   dstq, m0, m1

%if %1 <= 4
    movq              m8, m9
    movq              m9, m10
    movq             m10, m11
    movq             m11, m12
    movq             m12, m13
    movq             m13, m14
    movq             m14, m15
%else
    movdqa            m8, m9
    movdqa            m9, m10
    movdqa           m10, m11
    movdqa           m11, m12
    movdqa           m12, m13
    movdqa           m13, m14
    movdqa           m14, m15
%endif
    add             dstq, dststrideq             ; dst += dststride
    add             srcq, srcstrideq             ; src += srcstride
    add            src2q, 2*MAX_PB_SIZE          ; src2 += 2*MAX_PB_SIZE
    dec          heightd                         ; cmp height
    jnz               .loop                      ; height loop
    RET
%endmacro

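; ******************************
; Weighted prediction. With shift = denom + 14 - bitd:
;   uni: dst = clip(((src*wx + (1 << (shift-1))) >> shift) + (ox << (bitd-8)))
;   bi:  dst = clip((src0*wx0 + src1*wx1 +
;                    ((((ox0+ox1) << (bitd-8)) + 1) << shift)) >> (shift+1))
; ******************************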
%macro WEIGHTING_FUNCS 2
%if WIN64 || ARCH_X86_32
cglobal hevc_put_hevc_uni_w%1_%2, 4, 5, 7, dst, dststride, src, height, denom, wx, ox
    mov             r4d, denomm
%define SHIFT  r4d
%else
cglobal hevc_put_hevc_uni_w%1_%2, 6, 6, 7, dst, dststride, src, height, denom, wx, ox
%define SHIFT  denomd
%endif
    lea           SHIFT, [SHIFT+14-%2]          ; shift = 14 - bitd + denom
%if %1 <= 4
    pxor             m1, m1
%endif
    movd             m2, wxm        ; WX
    movd             m4, SHIFT      ; shift
%if %1 <= 4
    punpcklwd        m2, m1
%else
    punpcklwd        m2, m2
%endif
    dec           SHIFT
    movdqu           m5, [pd_1]
    movd             m6, SHIFT
    pshufd           m2, m2, 0
    mov           SHIFT, oxm
    pslld            m5, m6
%if %2 != 8
    shl           SHIFT, %2-8       ; ox << (bitd - 8)
%endif
    movd             m3, SHIFT      ; OX
    pshufd           m3, m3, 0
%if WIN64 || ARCH_X86_32
    mov           SHIFT, heightm
%endif
.loop:
    SIMPLE_LOAD       %1, 10, srcq, m0
%if %1 <= 4
    punpcklwd         m0, m1
    pmaddwd           m0, m2
    paddd             m0, m5
    psrad             m0, m4
    paddd             m0, m3
%else
    pmulhw            m6, m0, m2
    pmullw            m0, m2
    punpckhwd         m1, m0, m6
    punpcklwd         m0, m6
    paddd             m0, m5
    paddd             m1, m5
    psrad             m0, m4
    psrad             m1, m4
    paddd             m0, m3
    paddd             m1, m3
%endif
    packssdw          m0, m1
%if %2 == 8
    packuswb          m0, m0
%else
    CLIPW             m0, [pb_0], [max_pixels_%2]
%endif
    PEL_%2STORE%1   dstq, m0, m1
    add             dstq, dststrideq             ; dst += dststride
    add             srcq, 2*MAX_PB_SIZE          ; src += srcstride
    dec          heightd                         ; cmp height
    jnz               .loop                      ; height loop
    RET

cglobal hevc_put_hevc_bi_w%1_%2, 4, 6, 10, dst, dststride, src, src2, height, denom, wx0, wx1, ox0, ox1
    movifnidn        r5d, denomm
%if %1 <= 4
    pxor              m1, m1
%endif
    movd              m2, wx0m         ; WX0
    lea              r5d, [r5d+14-%2]  ; shift = 14 - bitd + denom
    movd              m3, wx1m         ; WX1
    movd              m0, r5d          ; shift
%if %1 <= 4
    punpcklwd         m2, m1
    punpcklwd         m3, m1
%else
    punpcklwd         m2, m2
    punpcklwd         m3, m3
%endif
    inc              r5d
    movd              m5, r5d          ; shift+1
    pshufd            m2, m2, 0
    mov              r5d, ox0m
    pshufd            m3, m3, 0
    add              r5d, ox1m
%if %2 != 8
    shl              r5d, %2-8         ; ox << (bitd - 8)
%endif
    inc              r5d
    movd              m4, r5d          ; offset
    pshufd            m4, m4, 0
%if UNIX64
%define h heightd
%else
    mov              r5d, heightm
%define h r5d
%endif
    pslld             m4, m0

.loop:
    SIMPLE_LOAD       %1, 10, srcq,  m0
    SIMPLE_LOAD       %1, 10, src2q, m8
%if %1 <= 4
    punpcklwd         m0, m1
    punpcklwd         m8, m1
    pmaddwd           m0, m3
    pmaddwd           m8, m2
    paddd             m0, m4
    paddd             m0, m8
    psrad             m0, m5
%else
    pmulhw            m6, m0, m3
    pmullw            m0, m3
    pmulhw            m7, m8, m2
    pmullw            m8, m2
    punpckhwd         m1, m0, m6
    punpcklwd         m0, m6
    punpckhwd         m9, m8, m7
    punpcklwd         m8, m7
    paddd             m0, m8
    paddd             m1, m9
    paddd             m0, m4
    paddd             m1, m4
    psrad             m0, m5
    psrad             m1, m5
%endif
    packssdw          m0, m1
%if %2 == 8
    packuswb          m0, m0
%else
    CLIPW             m0, [pb_0], [max_pixels_%2]
%endif
    PEL_%2STORE%1   dstq, m0, m1
    add             dstq, dststrideq             ; dst += dststride
    add             srcq, 2*MAX_PB_SIZE          ; src += srcstride
    add            src2q, 2*MAX_PB_SIZE          ; src2 += srcstride
    dec                h                         ; cmp height
    jnz               .loop                      ; height loop
    RET
%endmacro

INIT_XMM sse4                                    ; adds ff_ and _sse4 to function name

WEIGHTING_FUNCS 2, 8
WEIGHTING_FUNCS 4, 8
WEIGHTING_FUNCS 6, 8
WEIGHTING_FUNCS 8, 8

WEIGHTING_FUNCS 2, 10
WEIGHTING_FUNCS 4, 10
WEIGHTING_FUNCS 6, 10
WEIGHTING_FUNCS 8, 10

WEIGHTING_FUNCS 2, 12
WEIGHTING_FUNCS 4, 12
WEIGHTING_FUNCS 6, 12
WEIGHTING_FUNCS 8, 12

HEVC_PUT_HEVC_PEL_PIXELS  2, 8
HEVC_PUT_HEVC_PEL_PIXELS  4, 8
HEVC_PUT_HEVC_PEL_PIXELS  6, 8
HEVC_PUT_HEVC_PEL_PIXELS  8, 8
HEVC_PUT_HEVC_PEL_PIXELS 12, 8
HEVC_PUT_HEVC_PEL_PIXELS 16, 8

HEVC_PUT_HEVC_PEL_PIXELS 2, 10
HEVC_PUT_HEVC_PEL_PIXELS 4, 10
HEVC_PUT_HEVC_PEL_PIXELS 6, 10
HEVC_PUT_HEVC_PEL_PIXELS 8, 10

HEVC_PUT_HEVC_PEL_PIXELS 2, 12
HEVC_PUT_HEVC_PEL_PIXELS 4, 12
HEVC_PUT_HEVC_PEL_PIXELS 6, 12
HEVC_PUT_HEVC_PEL_PIXELS 8, 12

HEVC_PUT_HEVC_EPEL 2,  8
HEVC_PUT_HEVC_EPEL 4,  8
HEVC_PUT_HEVC_EPEL 6,  8
HEVC_PUT_HEVC_EPEL 8,  8
HEVC_PUT_HEVC_EPEL 12, 8
HEVC_PUT_HEVC_EPEL 16, 8


HEVC_PUT_HEVC_EPEL 2, 10
HEVC_PUT_HEVC_EPEL 4, 10
HEVC_PUT_HEVC_EPEL 6, 10
HEVC_PUT_HEVC_EPEL 8, 10

HEVC_PUT_HEVC_EPEL 2, 12
HEVC_PUT_HEVC_EPEL 4, 12
HEVC_PUT_HEVC_EPEL 6, 12
HEVC_PUT_HEVC_EPEL 8, 12

HEVC_PUT_HEVC_EPEL_HV 2,  8
HEVC_PUT_HEVC_EPEL_HV 4,  8
HEVC_PUT_HEVC_EPEL_HV 6,  8
HEVC_PUT_HEVC_EPEL_HV 8,  8
HEVC_PUT_HEVC_EPEL_HV 16, 8

HEVC_PUT_HEVC_EPEL_HV 2, 10
HEVC_PUT_HEVC_EPEL_HV 4, 10
HEVC_PUT_HEVC_EPEL_HV 6, 10
HEVC_PUT_HEVC_EPEL_HV 8, 10

HEVC_PUT_HEVC_EPEL_HV 2, 12
HEVC_PUT_HEVC_EPEL_HV 4, 12
HEVC_PUT_HEVC_EPEL_HV 6, 12
HEVC_PUT_HEVC_EPEL_HV 8, 12

HEVC_PUT_HEVC_QPEL 4,  8
HEVC_PUT_HEVC_QPEL 8,  8
HEVC_PUT_HEVC_QPEL 12, 8
HEVC_PUT_HEVC_QPEL 16, 8

HEVC_PUT_HEVC_QPEL 4, 10
HEVC_PUT_HEVC_QPEL 8, 10

HEVC_PUT_HEVC_QPEL 4, 12
HEVC_PUT_HEVC_QPEL 8, 12

HEVC_PUT_HEVC_QPEL_HV 2, 8
HEVC_PUT_HEVC_QPEL_HV 4, 8
HEVC_PUT_HEVC_QPEL_HV 6, 8
HEVC_PUT_HEVC_QPEL_HV 8, 8

HEVC_PUT_HEVC_QPEL_HV 2, 10
HEVC_PUT_HEVC_QPEL_HV 4, 10
HEVC_PUT_HEVC_QPEL_HV 6, 10
HEVC_PUT_HEVC_QPEL_HV 8, 10

HEVC_PUT_HEVC_QPEL_HV 2, 12
HEVC_PUT_HEVC_QPEL_HV 4, 12
HEVC_PUT_HEVC_QPEL_HV 6, 12
HEVC_PUT_HEVC_QPEL_HV 8, 12

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2  ; adds ff_ and _avx2 to function name & enables 256b registers: m0 for 256b, xm0 for 128b. cpuflag(avx2) = 1 / notcpuflag(avx) = 0

HEVC_PUT_HEVC_PEL_PIXELS 32, 8
HEVC_PUT_HEVC_PEL_PIXELS 16, 10

HEVC_PUT_HEVC_EPEL 32, 8
HEVC_PUT_HEVC_EPEL 16, 10

HEVC_PUT_HEVC_EPEL_HV 16, 10
HEVC_PUT_HEVC_EPEL_HV 32, 8

HEVC_PUT_HEVC_QPEL 32, 8

HEVC_PUT_HEVC_QPEL 16, 10

HEVC_PUT_HEVC_QPEL_HV 16, 10

%endif ;AVX2
%endif ; ARCH_X86_64