x86/tx_float: add permute-free FFT versions
These are used in the PFA transforms and MDCTs.
commit 28bff6ae54
parent 350142560b
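What "permute-free" means here (an editor's summary of the diff below, not text from the commit): the existing float FFT codelets dereference AVTXContext.map on every call and gather their input into split-radix order via LOAD64_LUT. The new ns_float ("no shuffle") variants are declared with FF_TX_PRESHUFFLE and assume the caller has already applied that permutation, so they load their input contiguously with movaps; a PFA transform or MDCT can then fold the reordering into its own index map once at init time. A minimal C sketch of that calling contract, using hypothetical names (fft8_ns_fn, fft8_through_ns, map) and AVComplexFloat from libavutil/tx.h:

#include "libavutil/tx.h"

/* Hypothetical stand-in for a pre-shuffled codelet such as
 * ff_tx_fft8_ns_float_sse3(); it expects its input already permuted. */
typedef void (*fft8_ns_fn)(AVComplexFloat *out, const AVComplexFloat *in);

static void fft8_through_ns(fft8_ns_fn fft8_ns, AVComplexFloat *out,
                            const AVComplexFloat *in, const int *map)
{
    AVComplexFloat tmp[8];

    /* The caller applies the map once, up front (or bakes it into its
     * own tables)... */
    for (int i = 0; i < 8; i++)
        tmp[i] = in[map[i]];

    /* ...so the codelet needs no per-call LUT gather and can take the
     * contiguous-load (movaps) path shown in the diff below. */
    fft8_ns(out, tmp);
}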
libavutil/x86/tx_float.asm
@@ -707,13 +707,21 @@ cglobal fft4_ %+ %1 %+ _float, 4, 4, 3, ctx, out, in, stride
 FFT4 fwd, 0
 FFT4 inv, 1
 
+%macro FFT8_FN 2
 INIT_XMM sse3
-cglobal fft8_float, 4, 4, 6, ctx, out, in, tmp
+cglobal fft8_ %+ %1, 4, 4, 6, ctx, out, in, tmp
+%if %2
     mov ctxq, [ctxq + AVTXContext.map]
     LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq
     LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq
     LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq
     LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq
+%else
+    movaps m0, [inq + 0*mmsize]
+    movaps m1, [inq + 1*mmsize]
+    movaps m2, [inq + 2*mmsize]
+    movaps m3, [inq + 3*mmsize]
+%endif
 
     FFT8 m0, m1, m2, m3, m4, m5
 
@@ -728,12 +736,22 @@ cglobal fft8_float, 4, 4, 6, ctx, out, in, tmp
     movups [outq + 3*mmsize], m1
 
     RET
+%endmacro
 
+FFT8_FN float, 1
+FFT8_FN ns_float, 0
+
+%macro FFT16_FN 2
 INIT_YMM avx
-cglobal fft8_float, 4, 4, 4, ctx, out, in, tmp
+cglobal fft8_ %+ %1, 4, 4, 4, ctx, out, in, tmp
+%if %2
     mov ctxq, [ctxq + AVTXContext.map]
     LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m2
     LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m3
+%else
+    movaps m0, [inq + 0*mmsize]
+    movaps m1, [inq + 1*mmsize]
+%endif
 
     FFT8_AVX m0, m1, m2, m3
 
@@ -747,16 +765,26 @@ cglobal fft8_float, 4, 4, 4, ctx, out, in, tmp
     vextractf128 [outq + 16*3], m0, 1
 
     RET
+%endmacro
 
-%macro FFT16_FN 1
+FFT16_FN float, 1
+FFT16_FN ns_float, 0
+
+%macro FFT16_FN 3
 INIT_YMM %1
-cglobal fft16_float, 4, 4, 8, ctx, out, in, tmp
+cglobal fft16_ %+ %2, 4, 4, 8, ctx, out, in, tmp
+%if %3
+    movaps m0, [inq + 0*mmsize]
+    movaps m1, [inq + 1*mmsize]
+    movaps m2, [inq + 2*mmsize]
+    movaps m3, [inq + 3*mmsize]
+%else
     mov ctxq, [ctxq + AVTXContext.map]
-
     LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m4
     LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m5
     LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq, m6
     LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq, m7
+%endif
 
     FFT16 m0, m1, m2, m3, m4, m5, m6, m7
 
@@ -777,25 +805,40 @@ cglobal fft16_float, 4, 4, 8, ctx, out, in, tmp
     RET
 %endmacro
 
-FFT16_FN avx
-FFT16_FN fma3
+FFT16_FN avx, float, 0
+FFT16_FN avx, ns_float, 1
+FFT16_FN fma3, float, 0
+FFT16_FN fma3, ns_float, 1
 
-%macro FFT32_FN 1
+%macro FFT32_FN 3
 INIT_YMM %1
-cglobal fft32_float, 4, 4, 16, ctx, out, in, tmp
+cglobal fft32_ %+ %2, 4, 4, 16, ctx, out, in, tmp
+%if %3
+    movaps m4, [inq + 4*mmsize]
+    movaps m5, [inq + 5*mmsize]
+    movaps m6, [inq + 6*mmsize]
+    movaps m7, [inq + 7*mmsize]
+%else
     mov ctxq, [ctxq + AVTXContext.map]
-
     LOAD64_LUT m4, inq, ctxq, (mmsize/2)*4, tmpq, m8, m9
     LOAD64_LUT m5, inq, ctxq, (mmsize/2)*5, tmpq, m10, m11
     LOAD64_LUT m6, inq, ctxq, (mmsize/2)*6, tmpq, m12, m13
     LOAD64_LUT m7, inq, ctxq, (mmsize/2)*7, tmpq, m14, m15
+%endif
 
     FFT8 m4, m5, m6, m7, m8, m9
 
+%if %3
+    movaps m0, [inq + 0*mmsize]
+    movaps m1, [inq + 1*mmsize]
+    movaps m2, [inq + 2*mmsize]
+    movaps m3, [inq + 3*mmsize]
+%else
     LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m8, m9
     LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m10, m11
     LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq, m12, m13
     LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq, m14, m15
+%endif
 
     movaps m8, [tab_32_float]
     vperm2f128 m9, m9, [tab_32_float + 4*8 - 4*7], 0x23
@@ -836,8 +879,10 @@ cglobal fft32_float, 4, 4, 16, ctx, out, in, tmp
 %endmacro
 
 %if ARCH_X86_64
-FFT32_FN avx
-FFT32_FN fma3
+FFT32_FN avx, float, 0
+FFT32_FN avx, ns_float, 1
+FFT32_FN fma3, float, 0
+FFT32_FN fma3, ns_float, 1
 %endif
 
 %macro FFT_SPLIT_RADIX_DEF 1-2
@@ -878,9 +923,9 @@ ALIGN 16
 %endif
 %endmacro
 
-%macro FFT_SPLIT_RADIX_FN 1
+%macro FFT_SPLIT_RADIX_FN 3
 INIT_YMM %1
-cglobal fft_sr_float, 4, 8, 16, 272, lut, out, in, len, tmp, itab, rtab, tgt
+cglobal fft_sr_ %+ %2, 4, 8, 16, 272, lut, out, in, len, tmp, itab, rtab, tgt
     movsxd lenq, dword [lutq + AVTXContext.len]
     mov lutq, [lutq + AVTXContext.map]
    mov tgtq, lenq
@@ -888,17 +933,31 @@ cglobal fft_sr_float, 4, 8, 16, 272, lut, out, in, len, tmp, itab, rtab, tgt
 ; Bottom-most/32-point transform ===============================================
 ALIGN 16
 .32pt:
+%if %3
+    movaps m4, [inq + 4*mmsize]
+    movaps m5, [inq + 5*mmsize]
+    movaps m6, [inq + 6*mmsize]
+    movaps m7, [inq + 7*mmsize]
+%else
     LOAD64_LUT m4, inq, lutq, (mmsize/2)*4, tmpq, m8, m9
     LOAD64_LUT m5, inq, lutq, (mmsize/2)*5, tmpq, m10, m11
     LOAD64_LUT m6, inq, lutq, (mmsize/2)*6, tmpq, m12, m13
     LOAD64_LUT m7, inq, lutq, (mmsize/2)*7, tmpq, m14, m15
+%endif
 
     FFT8 m4, m5, m6, m7, m8, m9
 
+%if %3
+    movaps m0, [inq + 0*mmsize]
+    movaps m1, [inq + 1*mmsize]
+    movaps m2, [inq + 2*mmsize]
+    movaps m3, [inq + 3*mmsize]
+%else
     LOAD64_LUT m0, inq, lutq, (mmsize/2)*0, tmpq, m8, m9
     LOAD64_LUT m1, inq, lutq, (mmsize/2)*1, tmpq, m10, m11
     LOAD64_LUT m2, inq, lutq, (mmsize/2)*2, tmpq, m12, m13
     LOAD64_LUT m3, inq, lutq, (mmsize/2)*3, tmpq, m14, m15
+%endif
 
     movaps m8, [tab_32_float]
     vperm2f128 m9, m9, [tab_32_float + 32 - 4*7], 0x23
@@ -913,7 +972,11 @@ ALIGN 16
     movaps [outq + 5*mmsize], m5
     movaps [outq + 7*mmsize], m7
 
+%if %3
+    add inq, 8*mmsize
+%else
     add lutq, (mmsize/2)*8
+%endif
     cmp lenq, 32
     jg .64pt
 
@@ -944,24 +1007,42 @@ ALIGN 16
     SWAP m4, m1
     SWAP m6, m3
 
+%if %3
+    movaps tx1_e0, [inq + 0*mmsize]
+    movaps tx1_e1, [inq + 1*mmsize]
+    movaps tx1_o0, [inq + 2*mmsize]
+    movaps tx1_o1, [inq + 3*mmsize]
+%else
     LOAD64_LUT tx1_e0, inq, lutq, (mmsize/2)*0, tmpq, tw_e, tw_o
     LOAD64_LUT tx1_e1, inq, lutq, (mmsize/2)*1, tmpq, tmp1, tmp2
     LOAD64_LUT tx1_o0, inq, lutq, (mmsize/2)*2, tmpq, tw_e, tw_o
     LOAD64_LUT tx1_o1, inq, lutq, (mmsize/2)*3, tmpq, tmp1, tmp2
+%endif
 
     FFT16 tx1_e0, tx1_e1, tx1_o0, tx1_o1, tw_e, tw_o, tx2_o0, tx2_o1
 
+%if %3
+    movaps tx2_e0, [inq + 4*mmsize]
+    movaps tx2_e1, [inq + 5*mmsize]
+    movaps tx2_o0, [inq + 6*mmsize]
+    movaps tx2_o1, [inq + 7*mmsize]
+%else
     LOAD64_LUT tx2_e0, inq, lutq, (mmsize/2)*4, tmpq, tmp1, tmp2
     LOAD64_LUT tx2_e1, inq, lutq, (mmsize/2)*5, tmpq, tw_e, tw_o
     LOAD64_LUT tx2_o0, inq, lutq, (mmsize/2)*6, tmpq, tmp1, tmp2
     LOAD64_LUT tx2_o1, inq, lutq, (mmsize/2)*7, tmpq, tw_e, tw_o
+%endif
 
     FFT16 tx2_e0, tx2_e1, tx2_o0, tx2_o1, tmp1, tmp2, tw_e, tw_o
 
     movaps tw_e, [tab_64_float]
     vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7], 0x23
 
+%if %3
+    add inq, 8*mmsize
+%else
     add lutq, (mmsize/2)*8
+%endif
     cmp tgtq, 64
     je .deinterleave
 
@@ -1204,8 +1285,10 @@ FFT_SPLIT_RADIX_DEF 131072
 %endmacro
 
 %if ARCH_X86_64
-FFT_SPLIT_RADIX_FN avx
+FFT_SPLIT_RADIX_FN avx, float, 0
+FFT_SPLIT_RADIX_FN avx, ns_float, 1
 %if HAVE_AVX2_EXTERNAL
-FFT_SPLIT_RADIX_FN avx2
+FFT_SPLIT_RADIX_FN avx2, float, 0
+FFT_SPLIT_RADIX_FN avx2, ns_float, 1
 %endif
 %endif
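Before the init-table half of the diff, one more editor's note on the asm half above: every %if %3 (or %if %2) block chooses between the LUT gather and a contiguous load, and the pointer that advances between octets changes to match. Reduced to C, with hypothetical names (cf, load_octet):

typedef struct { float re, im; } cf;   /* one complex float, as in the asm */

/* What each toggle in the diff boils down to: the pre-shuffled (ns) build
 * reads its input linearly and advances the input pointer; the regular
 * build gathers through the LUT and advances the LUT pointer instead. */
static void load_octet(cf *dst, const cf **in, const int **lut, int ns)
{
    if (ns) {
        for (int i = 0; i < 8; i++)
            dst[i] = (*in)[i];
        *in += 8;                  /* asm: add inq, 8*mmsize */
    } else {
        for (int i = 0; i < 8; i++)
            dst[i] = (*in)[(*lut)[i]];
        *lut += 8;                 /* asm: add lutq, (mmsize/2)*8 */
    }
}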
libavutil/x86/tx_float_init.c
@@ -23,6 +23,10 @@
 
 #include "config.h"
 
+/* These versions already do what we need them to do. */
+#define ff_tx_fft2_ns_float_sse3 ff_tx_fft2_float_sse3
+#define ff_tx_fft4_ns_float_sse2 ff_tx_fft4_fwd_float_sse2
+
 #define DECL_INIT_FN(basis, interleave) \
 static av_cold int \
 ff_tx_fft_sr_codelet_init_b ##basis## _i ##interleave## _x86 \
@@ -35,6 +39,9 @@ static av_cold int \
 { \
     const int inv_lookup = opts ? opts->invert_lookup : 1; \
     ff_tx_init_tabs_float(len); \
+    if (cd->max_len == 2) \
+        return ff_tx_gen_ptwo_revtab(s, inv_lookup); \
+    else \
     return ff_tx_gen_split_radix_parity_revtab(s, inv_lookup, \
                                                basis, interleave); \
 }
@@ -43,82 +50,91 @@ static av_cold int \
 DECL_INIT_FN(8, 0)
 DECL_INIT_FN(8, 2)
 
-#define DECL_SR_CD_DEF(fn_name, len, init_fn, fn_prio, cpu, fn_flags) \
-void ff_tx_ ##fn_name(AVTXContext *s, void *out, void *in, ptrdiff_t stride); \
-static const FFTXCodelet ff_tx_ ##fn_name## _def = { \
-    .name = #fn_name, \
-    .function = ff_tx_ ##fn_name, \
-    .type = TX_TYPE(FFT), \
-    .flags = FF_TX_OUT_OF_PLACE | FF_TX_ALIGNED | fn_flags, \
-    .factors[0] = 2, \
-    .min_len = len, \
-    .max_len = len, \
-    .init = ff_tx_fft_sr_codelet_init_ ##init_fn## _x86, \
-    .cpu_flags = AV_CPU_FLAG_ ##cpu, \
-    .prio = fn_prio, \
+#define DECL_CD_DEF(fn, t, min, max, f1, f2, i, p, c, f) \
+void ff_tx_ ##fn(AVTXContext *s, void *out, void *in, ptrdiff_t stride); \
+static const FFTXCodelet ff_tx_ ##fn## _def = { \
+    .name = #fn, \
+    .function = ff_tx_ ##fn, \
+    .type = TX_TYPE(t), \
+    .flags = FF_TX_ALIGNED | f, \
+    .factors = { f1, f2 }, \
+    .min_len = min, \
+    .max_len = max, \
+    .init = ff_tx_ ##i## _x86, \
+    .cpu_flags = c, \
+    .prio = p, \
 };
 
+#define DECL_SR_CD_DEF(fn_name, len, init_fn, fn_prio, cpu, fn_flags) \
+    DECL_CD_DEF(fn_name, FFT, len, len, 2, 0, \
+                fft_sr_codelet_init_ ##init_fn, fn_prio, \
+                AV_CPU_FLAG_ ##cpu, fn_flags) \
+
 DECL_SR_CD_DEF(fft2_float_sse3, 2, b0_i0, 128, SSE3, AV_TX_INPLACE)
+DECL_SR_CD_DEF(fft2_ns_float_sse3, 2, b8_i0, 192, SSE3, AV_TX_INPLACE | FF_TX_PRESHUFFLE)
 DECL_SR_CD_DEF(fft4_fwd_float_sse2, 4, b0_i0, 128, SSE2, AV_TX_INPLACE | FF_TX_FORWARD_ONLY)
 DECL_SR_CD_DEF(fft4_inv_float_sse2, 4, b0_i0, 128, SSE2, AV_TX_INPLACE | FF_TX_INVERSE_ONLY)
+DECL_SR_CD_DEF(fft4_ns_float_sse2, 4, b8_i0, 192, SSE2, AV_TX_INPLACE | FF_TX_PRESHUFFLE)
 DECL_SR_CD_DEF(fft8_float_sse3, 8, b8_i0, 128, SSE3, AV_TX_INPLACE)
+DECL_SR_CD_DEF(fft8_ns_float_sse3, 8, b8_i0, 192, SSE3, AV_TX_INPLACE | FF_TX_PRESHUFFLE)
 DECL_SR_CD_DEF(fft8_float_avx, 8, b8_i0, 256, AVX, AV_TX_INPLACE)
+DECL_SR_CD_DEF(fft8_ns_float_avx, 8, b8_i0, 320, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE)
 DECL_SR_CD_DEF(fft16_float_avx, 16, b8_i2, 256, AVX, AV_TX_INPLACE)
+DECL_SR_CD_DEF(fft16_ns_float_avx, 16, b8_i2, 320, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE)
 DECL_SR_CD_DEF(fft16_float_fma3, 16, b8_i2, 288, FMA3, AV_TX_INPLACE)
+DECL_SR_CD_DEF(fft16_ns_float_fma3, 16, b8_i2, 352, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE)
 
 #if ARCH_X86_64
 DECL_SR_CD_DEF(fft32_float_avx, 32, b8_i2, 256, AVX, AV_TX_INPLACE)
+DECL_SR_CD_DEF(fft32_ns_float_avx, 32, b8_i2, 320, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE)
 DECL_SR_CD_DEF(fft32_float_fma3, 32, b8_i2, 288, FMA3, AV_TX_INPLACE)
+DECL_SR_CD_DEF(fft32_ns_float_fma3, 32, b8_i2, 352, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE)
 
-void ff_tx_fft_sr_float_avx(AVTXContext *s, void *out, void *in, ptrdiff_t stride);
-const FFTXCodelet ff_tx_fft_sr_float_avx_def = {
-    .name = "fft_sr_float_avx",
-    .function = ff_tx_fft_sr_float_avx,
-    .type = TX_TYPE(FFT),
-    .flags = FF_TX_ALIGNED | FF_TX_OUT_OF_PLACE,
-    .factors[0] = 2,
-    .min_len = 64,
-    .max_len = 131072,
-    .init = ff_tx_fft_sr_codelet_init_b8_i2_x86,
-    .cpu_flags = AV_CPU_FLAG_AVX,
-    .prio = 256,
-};
+DECL_CD_DEF(fft_sr_float_avx, FFT, 64, 131072, 2, 0, fft_sr_codelet_init_b8_i2,
+            256, AV_CPU_FLAG_AVX,
+            FF_TX_OUT_OF_PLACE)
+
+DECL_CD_DEF(fft_sr_ns_float_avx, FFT, 64, 131072, 2, 0, fft_sr_codelet_init_b8_i2,
+            320, AV_CPU_FLAG_AVX,
+            FF_TX_OUT_OF_PLACE | AV_TX_INPLACE | FF_TX_PRESHUFFLE)
 
 #if HAVE_AVX2_EXTERNAL
-void ff_tx_fft_sr_float_avx2(AVTXContext *s, void *out, void *in, ptrdiff_t stride);
-const FFTXCodelet ff_tx_fft_sr_float_avx2_def = {
-    .name = "fft_sr_float_avx2",
-    .function = ff_tx_fft_sr_float_avx2,
-    .type = TX_TYPE(FFT),
-    .flags = FF_TX_ALIGNED | FF_TX_OUT_OF_PLACE,
-    .factors[0] = 2,
-    .min_len = 64,
-    .max_len = 131072,
-    .init = ff_tx_fft_sr_codelet_init_b8_i2_x86,
-    .cpu_flags = AV_CPU_FLAG_AVX2,
-    .prio = 288,
-};
+DECL_CD_DEF(fft_sr_float_avx2, FFT, 64, 131072, 2, 0, fft_sr_codelet_init_b8_i2,
+            288, AV_CPU_FLAG_AVX2,
+            FF_TX_OUT_OF_PLACE)
+
+DECL_CD_DEF(fft_sr_ns_float_avx2, FFT, 64, 131072, 2, 0, fft_sr_codelet_init_b8_i2,
+            352, AV_CPU_FLAG_AVX2,
+            FF_TX_OUT_OF_PLACE | AV_TX_INPLACE | FF_TX_PRESHUFFLE)
 #endif
 #endif
 
 const FFTXCodelet * const ff_tx_codelet_list_float_x86[] = {
-    /* Split-Radix codelets */
     &ff_tx_fft2_float_sse3_def,
+    &ff_tx_fft2_ns_float_sse3_def,
     &ff_tx_fft4_fwd_float_sse2_def,
     &ff_tx_fft4_inv_float_sse2_def,
+    &ff_tx_fft4_ns_float_sse2_def,
     &ff_tx_fft8_float_sse3_def,
+    &ff_tx_fft8_ns_float_sse3_def,
     &ff_tx_fft8_float_avx_def,
+    &ff_tx_fft8_ns_float_avx_def,
     &ff_tx_fft16_float_avx_def,
+    &ff_tx_fft16_ns_float_avx_def,
     &ff_tx_fft16_float_fma3_def,
+    &ff_tx_fft16_ns_float_fma3_def,
 
 #if ARCH_X86_64
     &ff_tx_fft32_float_avx_def,
+    &ff_tx_fft32_ns_float_avx_def,
     &ff_tx_fft32_float_fma3_def,
+    &ff_tx_fft32_ns_float_fma3_def,
 
-    /* Standalone transforms */
     &ff_tx_fft_sr_float_avx_def,
+    &ff_tx_fft_sr_ns_float_avx_def,
 #if HAVE_AVX2_EXTERNAL
     &ff_tx_fft_sr_float_avx2_def,
+    &ff_tx_fft_sr_ns_float_avx2_def,
 #endif
 #endif
 
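To see what the new declarations expand to, here is one DECL_SR_CD_DEF invocation pushed through the DECL_CD_DEF macro above by hand (reconstructed by the editor from the macro bodies in this diff, not a line of the commit):

/* DECL_SR_CD_DEF(fft8_ns_float_sse3, 8, b8_i0, 192, SSE3,
 *                AV_TX_INPLACE | FF_TX_PRESHUFFLE) becomes: */
void ff_tx_fft8_ns_float_sse3(AVTXContext *s, void *out, void *in, ptrdiff_t stride);
static const FFTXCodelet ff_tx_fft8_ns_float_sse3_def = {
    .name = "fft8_ns_float_sse3",
    .function = ff_tx_fft8_ns_float_sse3,
    .type = TX_TYPE(FFT),
    .flags = FF_TX_ALIGNED | AV_TX_INPLACE | FF_TX_PRESHUFFLE,
    .factors = { 2, 0 },
    .min_len = 8,
    .max_len = 8,
    .init = ff_tx_fft_sr_codelet_init_b8_i0_x86,
    .cpu_flags = AV_CPU_FLAG_SSE3,
    .prio = 192,
};

Note that each ns variant registers with a higher prio than its map-based counterpart (192 vs. 128, 320 vs. 256, and so on), favoring it whenever its stricter input contract can be met.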