avfilter/vf_transpose: rewrite for x86 SIMD
Transpose first in chunks of 8x8 blocks. 15% faster overall.
This commit is contained in:
parent
cbd8e07056
commit
7b19e76aeb
@ -58,6 +58,12 @@ typedef struct TransContext {
|
|||||||
|
|
||||||
int passthrough; ///< PassthroughType, landscape passthrough mode enabled
|
int passthrough; ///< PassthroughType, landscape passthrough mode enabled
|
||||||
int dir; ///< TransposeDir
|
int dir; ///< TransposeDir
|
||||||
|
|
||||||
|
void (*transpose_8x8)(uint8_t *src, ptrdiff_t src_linesize,
|
||||||
|
uint8_t *dst, ptrdiff_t dst_linesize);
|
||||||
|
void (*transpose_block)(uint8_t *src, ptrdiff_t src_linesize,
|
||||||
|
uint8_t *dst, ptrdiff_t dst_linesize,
|
||||||
|
int w, int h);
|
||||||
} TransContext;
|
} TransContext;
|
||||||
|
|
||||||
static int query_formats(AVFilterContext *ctx)
|
static int query_formats(AVFilterContext *ctx)
|
||||||
@ -79,6 +85,109 @@ static int query_formats(AVFilterContext *ctx)
|
|||||||
return ff_set_common_formats(ctx, pix_fmts);
|
return ff_set_common_formats(ctx, pix_fmts);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline void transpose_block_8_c(uint8_t *src, ptrdiff_t src_linesize,
|
||||||
|
uint8_t *dst, ptrdiff_t dst_linesize,
|
||||||
|
int w, int h)
|
||||||
|
{
|
||||||
|
int x, y;
|
||||||
|
for (y = 0; y < h; y++, dst += dst_linesize, src++)
|
||||||
|
for (x = 0; x < w; x++)
|
||||||
|
dst[x] = src[x*src_linesize];
|
||||||
|
}
|
||||||
|
|
||||||
|
static void transpose_8x8_8_c(uint8_t *src, ptrdiff_t src_linesize,
|
||||||
|
uint8_t *dst, ptrdiff_t dst_linesize)
|
||||||
|
{
|
||||||
|
transpose_block_8_c(src, src_linesize, dst, dst_linesize, 8, 8);
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void transpose_block_16_c(uint8_t *src, ptrdiff_t src_linesize,
|
||||||
|
uint8_t *dst, ptrdiff_t dst_linesize,
|
||||||
|
int w, int h)
|
||||||
|
{
|
||||||
|
int x, y;
|
||||||
|
for (y = 0; y < h; y++, dst += dst_linesize, src += 2)
|
||||||
|
for (x = 0; x < w; x++)
|
||||||
|
*((uint16_t *)(dst + 2*x)) = *((uint16_t *)(src + x*src_linesize));
|
||||||
|
}
|
||||||
|
|
||||||
|
static void transpose_8x8_16_c(uint8_t *src, ptrdiff_t src_linesize,
|
||||||
|
uint8_t *dst, ptrdiff_t dst_linesize)
|
||||||
|
{
|
||||||
|
transpose_block_16_c(src, src_linesize, dst, dst_linesize, 8, 8);
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void transpose_block_24_c(uint8_t *src, ptrdiff_t src_linesize,
|
||||||
|
uint8_t *dst, ptrdiff_t dst_linesize,
|
||||||
|
int w, int h)
|
||||||
|
{
|
||||||
|
int x, y;
|
||||||
|
for (y = 0; y < h; y++, dst += dst_linesize) {
|
||||||
|
for (x = 0; x < w; x++) {
|
||||||
|
int32_t v = AV_RB24(src + x*src_linesize + y*3);
|
||||||
|
AV_WB24(dst + 3*x, v);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void transpose_8x8_24_c(uint8_t *src, ptrdiff_t src_linesize,
|
||||||
|
uint8_t *dst, ptrdiff_t dst_linesize)
|
||||||
|
{
|
||||||
|
transpose_block_24_c(src, src_linesize, dst, dst_linesize, 8, 8);
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void transpose_block_32_c(uint8_t *src, ptrdiff_t src_linesize,
|
||||||
|
uint8_t *dst, ptrdiff_t dst_linesize,
|
||||||
|
int w, int h)
|
||||||
|
{
|
||||||
|
int x, y;
|
||||||
|
for (y = 0; y < h; y++, dst += dst_linesize, src += 4) {
|
||||||
|
for (x = 0; x < w; x++)
|
||||||
|
*((uint32_t *)(dst + 4*x)) = *((uint32_t *)(src + x*src_linesize));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void transpose_8x8_32_c(uint8_t *src, ptrdiff_t src_linesize,
|
||||||
|
uint8_t *dst, ptrdiff_t dst_linesize)
|
||||||
|
{
|
||||||
|
transpose_block_32_c(src, src_linesize, dst, dst_linesize, 8, 8);
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void transpose_block_48_c(uint8_t *src, ptrdiff_t src_linesize,
|
||||||
|
uint8_t *dst, ptrdiff_t dst_linesize,
|
||||||
|
int w, int h)
|
||||||
|
{
|
||||||
|
int x, y;
|
||||||
|
for (y = 0; y < h; y++, dst += dst_linesize, src += 6) {
|
||||||
|
for (x = 0; x < w; x++) {
|
||||||
|
int64_t v = AV_RB48(src + x*src_linesize);
|
||||||
|
AV_WB48(dst + 6*x, v);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void transpose_8x8_48_c(uint8_t *src, ptrdiff_t src_linesize,
|
||||||
|
uint8_t *dst, ptrdiff_t dst_linesize)
|
||||||
|
{
|
||||||
|
transpose_block_48_c(src, src_linesize, dst, dst_linesize, 8, 8);
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void transpose_block_64_c(uint8_t *src, ptrdiff_t src_linesize,
|
||||||
|
uint8_t *dst, ptrdiff_t dst_linesize,
|
||||||
|
int w, int h)
|
||||||
|
{
|
||||||
|
int x, y;
|
||||||
|
for (y = 0; y < h; y++, dst += dst_linesize, src += 8)
|
||||||
|
for (x = 0; x < w; x++)
|
||||||
|
*((uint64_t *)(dst + 8*x)) = *((uint64_t *)(src + x*src_linesize));
|
||||||
|
}
|
||||||
|
|
||||||
|
static void transpose_8x8_64_c(uint8_t *src, ptrdiff_t src_linesize,
|
||||||
|
uint8_t *dst, ptrdiff_t dst_linesize)
|
||||||
|
{
|
||||||
|
transpose_block_64_c(src, src_linesize, dst, dst_linesize, 8, 8);
|
||||||
|
}
|
||||||
|
|
||||||
static int config_props_output(AVFilterLink *outlink)
|
static int config_props_output(AVFilterLink *outlink)
|
||||||
{
|
{
|
||||||
AVFilterContext *ctx = outlink->src;
|
AVFilterContext *ctx = outlink->src;
|
||||||
@ -118,6 +227,21 @@ static int config_props_output(AVFilterLink *outlink)
|
|||||||
else
|
else
|
||||||
outlink->sample_aspect_ratio = inlink->sample_aspect_ratio;
|
outlink->sample_aspect_ratio = inlink->sample_aspect_ratio;
|
||||||
|
|
||||||
|
switch (s->pixsteps[0]) {
|
||||||
|
case 1: s->transpose_block = transpose_block_8_c;
|
||||||
|
s->transpose_8x8 = transpose_8x8_8_c; break;
|
||||||
|
case 2: s->transpose_block = transpose_block_16_c;
|
||||||
|
s->transpose_8x8 = transpose_8x8_16_c; break;
|
||||||
|
case 3: s->transpose_block = transpose_block_24_c;
|
||||||
|
s->transpose_8x8 = transpose_8x8_24_c; break;
|
||||||
|
case 4: s->transpose_block = transpose_block_32_c;
|
||||||
|
s->transpose_8x8 = transpose_8x8_32_c; break;
|
||||||
|
case 6: s->transpose_block = transpose_block_48_c;
|
||||||
|
s->transpose_8x8 = transpose_8x8_48_c; break;
|
||||||
|
case 8: s->transpose_block = transpose_block_64_c;
|
||||||
|
s->transpose_8x8 = transpose_8x8_64_c; break;
|
||||||
|
}
|
||||||
|
|
||||||
av_log(ctx, AV_LOG_VERBOSE,
|
av_log(ctx, AV_LOG_VERBOSE,
|
||||||
"w:%d h:%d dir:%d -> w:%d h:%d rotation:%s vflip:%d\n",
|
"w:%d h:%d dir:%d -> w:%d h:%d rotation:%s vflip:%d\n",
|
||||||
inlink->w, inlink->h, s->dir, outlink->w, outlink->h,
|
inlink->w, inlink->h, s->dir, outlink->w, outlink->h,
|
||||||
@ -176,49 +300,25 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr,
|
|||||||
dstlinesize *= -1;
|
dstlinesize *= -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
switch (pixstep) {
|
for (y = start; y < end - 7; y += 8) {
|
||||||
case 1:
|
for (x = 0; x < outw - 7; x += 8) {
|
||||||
for (y = start; y < end; y++, dst += dstlinesize)
|
s->transpose_8x8(src + x * srclinesize + y * pixstep,
|
||||||
for (x = 0; x < outw; x++)
|
srclinesize,
|
||||||
dst[x] = src[x * srclinesize + y];
|
dst + (y - start) * dstlinesize + x * pixstep,
|
||||||
break;
|
dstlinesize);
|
||||||
case 2:
|
|
||||||
for (y = start; y < end; y++, dst += dstlinesize) {
|
|
||||||
for (x = 0; x < outw; x++)
|
|
||||||
*((uint16_t *)(dst + 2 * x)) =
|
|
||||||
*((uint16_t *)(src + x * srclinesize + y * 2));
|
|
||||||
}
|
}
|
||||||
break;
|
if (outw - x > 0 && end - y > 0)
|
||||||
case 3:
|
s->transpose_block(src + x * srclinesize + y * pixstep,
|
||||||
for (y = start; y < end; y++, dst += dstlinesize) {
|
srclinesize,
|
||||||
for (x = 0; x < outw; x++) {
|
dst + (y - start) * dstlinesize + x * pixstep,
|
||||||
int32_t v = AV_RB24(src + x * srclinesize + y * 3);
|
dstlinesize, outw - x, end - y);
|
||||||
AV_WB24(dst + 3 * x, v);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
case 4:
|
|
||||||
for (y = start; y < end; y++, dst += dstlinesize) {
|
|
||||||
for (x = 0; x < outw; x++)
|
|
||||||
*((uint32_t *)(dst + 4 * x)) =
|
|
||||||
*((uint32_t *)(src + x * srclinesize + y * 4));
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
case 6:
|
|
||||||
for (y = start; y < end; y++, dst += dstlinesize) {
|
|
||||||
for (x = 0; x < outw; x++) {
|
|
||||||
int64_t v = AV_RB48(src + x * srclinesize + y*6);
|
|
||||||
AV_WB48(dst + 6*x, v);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
case 8:
|
|
||||||
for (y = start; y < end; y++, dst += dstlinesize) {
|
|
||||||
for (x = 0; x < outw; x++)
|
|
||||||
*((uint64_t *)(dst + 8*x)) = *((uint64_t *)(src + x * srclinesize + y*8));
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (end - y > 0)
|
||||||
|
s->transpose_block(src + 0 * srclinesize + y * pixstep,
|
||||||
|
srclinesize,
|
||||||
|
dst + (y - start) * dstlinesize + 0 * pixstep,
|
||||||
|
dstlinesize, outw, end - y);
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user