x86/tx_float: fully support 128bit regs in LOAD64_LUT
The gather path didn't support 128bit registers. It's not faster on Zen 3, but it's here for completeness.
This commit is contained in:
parent
af42bb3d61
commit
892548e6a1
@ -107,19 +107,19 @@ SECTION .text
|
||||
; %7 - temporary register (for avx only; when passed and AVX2 is available, enables the vgatherdpd path)
|
||||
%macro LOAD64_LUT 5-7
|
||||
%if %0 > 6 && cpuflag(avx2)
|
||||
pcmpeqd %6, %6 ; pcmpeqq has a 0.5 throughput on Zen 3, this has 0.25
|
||||
movapd xmm%7, [%3 + %4] ; float mov since vgatherdpd is a float instruction
|
||||
vgatherdpd %1, [%2 + xmm%7*8], %6 ; must use separate registers for args
|
||||
pcmpeqd %7, %7 ; pcmpeqq has a 0.5 throughput on Zen 3, this has 0.25
|
||||
movupd xmm%6, [%3 + %4] ; float mov since vgatherdpd is a float instruction
|
||||
vgatherdpd %1, [%2 + xmm%6*8], %7 ; must use separate registers for args
|
||||
%else
|
||||
mov %5d, [%3 + %4 + 0]
|
||||
movsd xmm%1, [%2 + %5q*8]
|
||||
%if mmsize == 32
|
||||
%if sizeof%1 > 16 && %0 > 5
|
||||
mov %5d, [%3 + %4 + 8]
|
||||
movsd xmm%6, [%2 + %5q*8]
|
||||
%endif
|
||||
mov %5d, [%3 + %4 + 4]
|
||||
movhps xmm%1, [%2 + %5q*8]
|
||||
%if mmsize == 32
|
||||
%if sizeof%1 > 16 && %0 > 5
|
||||
mov %5d, [%3 + %4 + 12]
|
||||
movhps xmm%6, [%2 + %5q*8]
|
||||
vinsertf128 %1, %1, xmm%6, 1
|
||||
|
Loading…
x
Reference in New Issue
Block a user