I spotted an interesting pattern that I hadn't seen before, which makes the implementation faster. The bit-shifting table I was using before is no longer needed, so I was able to remove quite a few lines. I also added use of FMA in the AVX2 version.

f32 1920x1080 1 thread with prelut
c impl
1434012700 UNITS in lut3d->interp, 1 runs, 0 skips
1434035335 UNITS in lut3d->interp, 2 runs, 0 skips
1423615347 UNITS in lut3d->interp, 4 runs, 0 skips
1426268863 UNITS in lut3d->interp, 8 runs, 0 skips
sse2
905484420 UNITS in lut3d->interp, 1 runs, 0 skips
905659010 UNITS in lut3d->interp, 2 runs, 0 skips
915167140 UNITS in lut3d->interp, 4 runs, 0 skips
915834222 UNITS in lut3d->interp, 8 runs, 0 skips
avx
574794860 UNITS in lut3d->interp, 1 runs, 0 skips
581035090 UNITS in lut3d->interp, 2 runs, 0 skips
584116720 UNITS in lut3d->interp, 4 runs, 0 skips
581460290 UNITS in lut3d->interp, 8 runs, 0 skips
avx2
301698880 UNITS in lut3d->interp, 1 runs, 0 skips
301982880 UNITS in lut3d->interp, 2 runs, 0 skips
306962430 UNITS in lut3d->interp, 4 runs, 0 skips
305472025 UNITS in lut3d->interp, 8 runs, 0 skips

gbrap16 1920x1080 1 thread with prelut
c impl
1480894840 UNITS in lut3d->interp, 1 runs, 0 skips
1502922990 UNITS in lut3d->interp, 2 runs, 0 skips
1496114307 UNITS in lut3d->interp, 4 runs, 0 skips
1492554551 UNITS in lut3d->interp, 8 runs, 0 skips
sse2
980777180 UNITS in lut3d->interp, 1 runs, 0 skips
986121520 UNITS in lut3d->interp, 2 runs, 0 skips
986489840 UNITS in lut3d->interp, 4 runs, 0 skips
998832248 UNITS in lut3d->interp, 8 runs, 0 skips
avx
622212360 UNITS in lut3d->interp, 1 runs, 0 skips
622981160 UNITS in lut3d->interp, 2 runs, 0 skips
645396315 UNITS in lut3d->interp, 4 runs, 0 skips
641057075 UNITS in lut3d->interp, 8 runs, 0 skips
avx2
321336400 UNITS in lut3d->interp, 1 runs, 0 skips
321268920 UNITS in lut3d->interp, 2 runs, 0 skips
323459895 UNITS in lut3d->interp, 4 runs, 0 skips
324949967 UNITS in lut3d->interp, 8 runs, 0 skips
		
			
				
	
	
		
			84 lines
		
	
	
		
			2.1 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			84 lines
		
	
	
		
			2.1 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /*
 | |
|  * Copyright (c) 2013 Clément Bœsch
 | |
|  * Copyright (c) 2018 Paul B Mahol
 | |
|  *
 | |
|  * This file is part of FFmpeg.
 | |
|  *
 | |
|  * FFmpeg is free software; you can redistribute it and/or
 | |
|  * modify it under the terms of the GNU Lesser General Public
 | |
|  * License as published by the Free Software Foundation; either
 | |
|  * version 2.1 of the License, or (at your option) any later version.
 | |
|  *
 | |
|  * FFmpeg is distributed in the hope that it will be useful,
 | |
|  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | |
|  * Lesser General Public License for more details.
 | |
|  *
 | |
|  * You should have received a copy of the GNU Lesser General Public
 | |
|  * License along with FFmpeg; if not, write to the Free Software
 | |
|  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 | |
|  */
 | |
| #ifndef AVFILTER_LUT3D_H
 | |
| #define AVFILTER_LUT3D_H
 | |
| 
 | |
| #include "libavutil/pixdesc.h"
 | |
| #include "framesync.h"
 | |
| #include "avfilter.h"
 | |
| 
 | |
/**
 * Interpolation modes; LUT3DContext.interpolation holds one of these values.
 *
 * The enumerators rely on implicit numbering starting at 0, so
 * NB_INTERP_MODE (always kept last) equals the number of real modes.
 */
enum interp_mode {
    INTERPOLATE_NEAREST,
    INTERPOLATE_TRILINEAR,
    INTERPOLATE_TETRAHEDRAL,
    INTERPOLATE_PYRAMID,
    INTERPOLATE_PRISM,
    NB_INTERP_MODE              /* count of modes, not a mode itself */
};
 | |
| 
 | |
/**
 * Triplet of single-precision values named r, g and b.
 *
 * Used in this header both as the element type of LUT3DContext.lut and as
 * the type of LUT3DContext.scale.
 */
struct rgbvec {
    float r, g, b;
};
 | |
| 
 | |
/* 3D LUTs don't often go up to level 32, but it is common to have a Hald CLUT
 * of 512x512 (64x64x64).
 *
 * NOTE(review): MAX_LEVEL is presumably the upper bound on the LUT edge
 * length (LUT3DContext.lutsize) and PRELUT_SIZE the upper bound on
 * Lut3DPreLut.size -- confirm against the users of these macros. */
#define MAX_LEVEL 256
#define PRELUT_SIZE 65536
 | |
| 
 | |
/**
 * Pre-LUT data embedded in LUT3DContext as the `prelut` member.
 *
 * Every array member has three elements.
 * NOTE(review): presumably one element per colour channel -- confirm the
 * channel order against the code that fills this structure.
 */
typedef struct Lut3DPreLut {
    int size;           /* NOTE(review): presumably entries per lut[i] table; 0 likely means "no prelut" -- confirm */
    float min[3];
    float max[3];
    float scale[3];
    float *lut[3];      /* three pointers to float tables; ownership is not visible from this header */
} Lut3DPreLut;
 | |
| 
 | |
| typedef struct LUT3DContext {
 | |
|     const AVClass *class;
 | |
|     struct rgbvec *lut;
 | |
|     int lutsize;
 | |
|     int lutsize2;
 | |
|     struct rgbvec scale;
 | |
|     int interpolation;          ///<interp_mode
 | |
|     char *file;
 | |
|     uint8_t rgba_map[4];
 | |
|     int step;
 | |
|     avfilter_action_func *interp;
 | |
|     Lut3DPreLut prelut;
 | |
| #if CONFIG_HALDCLUT_FILTER
 | |
|     uint8_t clut_rgba_map[4];
 | |
|     int clut_step;
 | |
|     int clut_bits;
 | |
|     int clut_planar;
 | |
|     int clut_float;
 | |
|     int clut_width;
 | |
|     FFFrameSync fs;
 | |
| #endif
 | |
| } LUT3DContext;
 | |
| 
 | |
| typedef struct ThreadData {
 | |
|     AVFrame *in, *out;
 | |
| } ThreadData;
 | |
| 
 | |
| void ff_lut3d_init_x86(LUT3DContext *s, const AVPixFmtDescriptor *desc);
 | |
| 
 | |
| #endif /* AVFILTER_LUT3D_H */
 |