From c2b8dea1828f35c808adcf12615893d5c740bc0a Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Fri, 2 Mar 2012 16:10:00 -0500
Subject: [PATCH 01/27] wmaenc: limit block_align to MAX_CODED_SUPERFRAME_SIZE

This is near the theoretical limit for wma frame size and is the most that
our decoder can handle. Allowing higher bit rates will just end up padding
each frame with empty bytes.

Fixes invalid writes for avconv when using very high bit rates.

CC:libav-stable@libav.org
---
 libavcodec/wmaenc.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/libavcodec/wmaenc.c b/libavcodec/wmaenc.c
index c762a723b9..e24a3f4205 100644
--- a/libavcodec/wmaenc.c
+++ b/libavcodec/wmaenc.c
@@ -71,8 +71,12 @@ static int encode_init(AVCodecContext * avctx){
     for(i = 0; i < s->nb_block_sizes; i++)
         ff_mdct_init(&s->mdct_ctx[i], s->frame_len_bits - i + 1, 0, 1.0);
 
-    avctx->block_align=
-    s->block_align= avctx->bit_rate*(int64_t)s->frame_len / (avctx->sample_rate*8);
+    s->block_align     = avctx->bit_rate * (int64_t)s->frame_len /
+                         (avctx->sample_rate * 8);
+    s->block_align     = FFMIN(s->block_align, MAX_CODED_SUPERFRAME_SIZE);
+    avctx->block_align = s->block_align;
+    avctx->bit_rate    = avctx->block_align * 8LL * avctx->sample_rate /
+                         s->frame_len;
 //av_log(NULL, AV_LOG_ERROR, "%d %d %d %d\n", s->block_align, avctx->bit_rate, s->frame_len, avctx->sample_rate);
     avctx->frame_size= s->frame_len;
 

From 1ec075cfecac01f9a289965db06f76365b0b1737 Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Fri, 2 Mar 2012 16:27:57 -0500
Subject: [PATCH 02/27] wmaenc: limit allowed sample rate to 48kHz

ff_wma_init() allows up to 50kHz, but this generates an exponent band
size table that requires 65 bands. The code assumes 25 bands in many
places, and using sample rates higher than 48kHz will lead to buffer
overwrites.

CC:libav-stable@libav.org
---
 libavcodec/wmaenc.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/libavcodec/wmaenc.c b/libavcodec/wmaenc.c
index e24a3f4205..99544c0c8a 100644
--- a/libavcodec/wmaenc.c
+++ b/libavcodec/wmaenc.c
@@ -39,6 +39,12 @@ static int encode_init(AVCodecContext * avctx){
         return AVERROR(EINVAL);
     }
 
+    if (avctx->sample_rate > 48000) {
+        av_log(avctx, AV_LOG_ERROR, "sample rate is too high: %d > 48kHz\n",
+               avctx->sample_rate);
+        return AVERROR(EINVAL);
+    }
+
     if(avctx->bit_rate < 24*1000) {
         av_log(avctx, AV_LOG_ERROR, "bitrate too low: got %i, need 24000 or higher\n",
                avctx->bit_rate);

From dfc4fdedf8cfc56a505579b1f2c1c5efbce4b97e Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Fri, 2 Mar 2012 16:33:33 -0500
Subject: [PATCH 03/27] wmaenc: require a large enough output buffer to prevent
 overwrites

The maximum theoretical frame size is around 17000 bytes. Although in
practice it will generally be much smaller, we require a larger buffer
just to be safe.

CC: libav-stable@libav.org
---
 libavcodec/wmaenc.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/libavcodec/wmaenc.c b/libavcodec/wmaenc.c
index 99544c0c8a..5135b982aa 100644
--- a/libavcodec/wmaenc.c
+++ b/libavcodec/wmaenc.c
@@ -365,6 +365,11 @@ static int encode_superframe(AVCodecContext *avctx,
         }
     }
 
+    if (buf_size < 2 * MAX_CODED_SUPERFRAME_SIZE) {
+        av_log(avctx, AV_LOG_ERROR, "output buffer size is too small\n");
+        return AVERROR(EINVAL);
+    }
+
 #if 1
     total_gain= 128;
     for(i=64; i; i>>=1){

From 5d652e063bd3a180f9de8915e5137aa4f938846d Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Fri, 2 Mar 2012 16:42:21 -0500
Subject: [PATCH 04/27] wmaenc: check final frame size against output packet
 size

Currently we have an assert() that prevents the frame from being too large,
but it is more user-friendly to give an error message instead of aborting on
assert(). This condition is quite unlikely due to the minimum bit rate check
in encode_init(), but it is still worth having.
---
 libavcodec/wmaenc.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/libavcodec/wmaenc.c b/libavcodec/wmaenc.c
index 5135b982aa..c00f13623d 100644
--- a/libavcodec/wmaenc.c
+++ b/libavcodec/wmaenc.c
@@ -394,11 +394,13 @@ static int encode_superframe(AVCodecContext *avctx,
     }
 #endif
 
-    encode_frame(s, s->coefs, buf, buf_size, total_gain);
+    if ((i = encode_frame(s, s->coefs, buf, buf_size, total_gain)) >= 0) {
+        av_log(avctx, AV_LOG_ERROR, "required frame size too large. please "
+               "use a higher bit rate.\n");
+        return AVERROR(EINVAL);
+    }
     assert((put_bits_count(&s->pb) & 7) == 0);
-    i= s->block_align - (put_bits_count(&s->pb)+7)/8;
-    assert(i>=0);
-    while(i--)
+    while (i++)
         put_bits(&s->pb, 8, 'N');
 
     flush_put_bits(&s->pb);

From 8ed7488ea39a4ab60045e679ad5efa6a7b81ed98 Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Fri, 2 Mar 2012 16:55:45 -0500
Subject: [PATCH 05/27] wmaenc: return s->block_align instead of recalculating
 it

---
 libavcodec/wmaenc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavcodec/wmaenc.c b/libavcodec/wmaenc.c
index c00f13623d..789f112aee 100644
--- a/libavcodec/wmaenc.c
+++ b/libavcodec/wmaenc.c
@@ -404,7 +404,7 @@ static int encode_superframe(AVCodecContext *avctx,
         put_bits(&s->pb, 8, 'N');
 
     flush_put_bits(&s->pb);
-    return put_bits_ptr(&s->pb) - s->pb.buf;
+    return s->block_align;
 }
 
 AVCodec ff_wmav1_encoder = {

From 51ddf35c9017018e58c15275ff5b129647a0c94d Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Fri, 2 Mar 2012 17:11:25 -0500
Subject: [PATCH 06/27] wmaenc: fix m/s stereo encoding for the first frame

We need to set ms_stereo in encode_init() in order to avoid incorrectly
encoding the first frame as non-m/s while flagging it as m/s. Fixes an
uncomfortable pop in the left channel at the start of playback.

CC:libav-stable@libav.org
---
 libavcodec/wmaenc.c    | 4 +++-
 tests/ref/acodec/wmav1 | 6 +++---
 tests/ref/acodec/wmav2 | 6 +++---
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/libavcodec/wmaenc.c b/libavcodec/wmaenc.c
index 789f112aee..6fd3494016 100644
--- a/libavcodec/wmaenc.c
+++ b/libavcodec/wmaenc.c
@@ -70,6 +70,8 @@ static int encode_init(AVCodecContext * avctx){
     s->use_exp_vlc = flags2 & 0x0001;
     s->use_bit_reservoir = flags2 & 0x0002;
     s->use_variable_block_len = flags2 & 0x0004;
+    if (avctx->channels == 2)
+        s->ms_stereo = 1;
 
     ff_wma_init(avctx, flags2);
 
@@ -191,7 +193,7 @@ static int encode_block(WMACodecContext *s, float (*src_coefs)[BLOCK_MAX_SIZE],
     }
 
     if (s->nb_channels == 2) {
-        put_bits(&s->pb, 1, s->ms_stereo= 1);
+        put_bits(&s->pb, 1, !!s->ms_stereo);
     }
 
     for(ch = 0; ch < s->nb_channels; ch++) {
diff --git a/tests/ref/acodec/wmav1 b/tests/ref/acodec/wmav1
index 916e4a8ab6..117aa12a8c 100644
--- a/tests/ref/acodec/wmav1
+++ b/tests/ref/acodec/wmav1
@@ -1,4 +1,4 @@
-26a7f6b0f0b7181df8df3fa589f6bf81 *./tests/data/acodec/wmav1.asf
+0260385b8a54df11ad349f9ba8240fd8 *./tests/data/acodec/wmav1.asf
 106004 ./tests/data/acodec/wmav1.asf
-stddev:12245.52 PSNR: 14.57 MAXDIFF:65521 bytes:  1064960/  1058400
-stddev: 2095.89 PSNR: 29.90 MAXDIFF:27658 bytes:  1056768/  1058400
+stddev:12241.90 PSNR: 14.57 MAXDIFF:65521 bytes:  1064960/  1058400
+stddev: 2074.79 PSNR: 29.99 MAXDIFF:27658 bytes:  1056768/  1058400
diff --git a/tests/ref/acodec/wmav2 b/tests/ref/acodec/wmav2
index 622b6fcc36..43b19b7530 100644
--- a/tests/ref/acodec/wmav2
+++ b/tests/ref/acodec/wmav2
@@ -1,4 +1,4 @@
-7c6c0cb692af01b312ae345723674b5f *./tests/data/acodec/wmav2.asf
+bdb4c312fb109f990be83a70f8ec9bdc *./tests/data/acodec/wmav2.asf
 106044 ./tests/data/acodec/wmav2.asf
-stddev:12249.93 PSNR: 14.57 MAXDIFF:65521 bytes:  1064960/  1058400
-stddev: 2089.21 PSNR: 29.93 MAXDIFF:27650 bytes:  1056768/  1058400
+stddev:12246.35 PSNR: 14.57 MAXDIFF:65521 bytes:  1064960/  1058400
+stddev: 2068.08 PSNR: 30.02 MAXDIFF:27650 bytes:  1056768/  1058400

From 6aa6e3e814ce1023844f916309f6091f231346fd Mon Sep 17 00:00:00 2001
From: Derek Buitenhuis <derek.buitenhuis@gmail.com>
Date: Thu, 1 Mar 2012 19:48:26 -0500
Subject: [PATCH 07/27] fate: Add sunrast regression test

Signed-off-by: Derek Buitenhuis <derek.buitenhuis@gmail.com>
Signed-off-by: Justin Ruggles <justin.ruggles@gmail.com>
---
 tests/lavf-regression.sh | 4 ++++
 tests/ref/lavf/sunrast   | 3 +++
 2 files changed, 7 insertions(+)
 create mode 100644 tests/ref/lavf/sunrast

diff --git a/tests/lavf-regression.sh b/tests/lavf-regression.sh
index 0041840d89..49d2dfc9cd 100755
--- a/tests/lavf-regression.sh
+++ b/tests/lavf-regression.sh
@@ -183,6 +183,10 @@ if [ -n "$do_dpx" ] ; then
 do_image_formats dpx
 fi
 
+if [ -n "$do_sunrast" ] ; then
+do_image_formats sun
+fi
+
 # audio only
 
 if [ -n "$do_wav" ] ; then
diff --git a/tests/ref/lavf/sunrast b/tests/ref/lavf/sunrast
new file mode 100644
index 0000000000..4db0505140
--- /dev/null
+++ b/tests/ref/lavf/sunrast
@@ -0,0 +1,3 @@
+07518bcb0841bc677ce6aea8464ea240 *./tests/data/images/sun/02.sun
+./tests/data/images/sun/%02d.sun CRC=0xe6c71946
+304123 ./tests/data/images/sun/02.sun

From 0f53d0cf4ba17c1ebf1f976e4dde1a4c32c37e9d Mon Sep 17 00:00:00 2001
From: Loren Merritt <pengvado@akuvian.org>
Date: Sun, 26 Feb 2012 09:05:29 -0800
Subject: [PATCH 08/27] x86inc: don't "bake" stack_offset in named arguments.

Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
---
 libavutil/x86/x86inc.asm | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index b20bb9a3a0..76a7b29b86 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -251,6 +251,8 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9
         %endrep
     %endif
 
+    %xdefine %%stack_offset stack_offset
+    %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
     %assign %%i 0
     %rep %0
         %xdefine %1q r %+ %%i %+ q
@@ -262,7 +264,8 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9
         %assign %%i %%i+1
         %rotate 1
     %endrep
-    %assign n_arg_names %%i
+    %xdefine stack_offset %%stack_offset
+    %assign n_arg_names %0
 %endmacro
 
 %if WIN64 ; Windows x64 ;=================================================

From 8249a23fc1f33fa8b3c67b5cdb0050cc6d0013d6 Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Thu, 1 Mar 2012 20:24:58 -0800
Subject: [PATCH 09/27] swscale: remove now unnecessary hack.

---
 libswscale/x86/output.asm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libswscale/x86/output.asm b/libswscale/x86/output.asm
index c4daa8230c..4f15ec98cd 100644
--- a/libswscale/x86/output.asm
+++ b/libswscale/x86/output.asm
@@ -146,7 +146,7 @@ cglobal yuv2planeX_%1, %3, 7, %2, filter, fltsize, src, dst, w, dither, offset
     mova            m1, [yuv2yuvX_%1_start]
     mova            m2,  m1
 %endif ; %1 == 8/9/10/16
-    movsx     cntr_reg,  r1m ; FIXME should be fltsizem, but the assembler does the wrong thing b/c of SUB above
+    movsx     cntr_reg,  fltsizem
 .filterloop_ %+ %%i:
     ; input pixels
     mov             r6, [srcq+gprsize*cntr_reg-2*gprsize]

From e25be4715463da3abdb99acf735bb2148c3bd5c8 Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Thu, 1 Mar 2012 21:35:22 -0800
Subject: [PATCH 10/27] vp8: convert idct/mc x86 assembly to use cpuflags().

---
 libavcodec/x86/vp8dsp-init.c | 106 ++++++++++++++---------------
 libavcodec/x86/vp8dsp.asm    | 128 +++++++++++++++++++----------------
 2 files changed, 121 insertions(+), 113 deletions(-)

diff --git a/libavcodec/x86/vp8dsp-init.c b/libavcodec/x86/vp8dsp-init.c
index 3e05bb2fb9..d3f1456b71 100644
--- a/libavcodec/x86/vp8dsp-init.c
+++ b/libavcodec/x86/vp8dsp-init.c
@@ -29,16 +29,16 @@
 /*
  * MC functions
  */
-extern void ff_put_vp8_epel4_h4_mmxext(uint8_t *dst, ptrdiff_t dststride,
+extern void ff_put_vp8_epel4_h4_mmx2  (uint8_t *dst, ptrdiff_t dststride,
                                        uint8_t *src, ptrdiff_t srcstride,
                                        int height, int mx, int my);
-extern void ff_put_vp8_epel4_h6_mmxext(uint8_t *dst, ptrdiff_t dststride,
+extern void ff_put_vp8_epel4_h6_mmx2  (uint8_t *dst, ptrdiff_t dststride,
                                        uint8_t *src, ptrdiff_t srcstride,
                                        int height, int mx, int my);
-extern void ff_put_vp8_epel4_v4_mmxext(uint8_t *dst, ptrdiff_t dststride,
+extern void ff_put_vp8_epel4_v4_mmx2  (uint8_t *dst, ptrdiff_t dststride,
                                        uint8_t *src, ptrdiff_t srcstride,
                                        int height, int mx, int my);
-extern void ff_put_vp8_epel4_v6_mmxext(uint8_t *dst, ptrdiff_t dststride,
+extern void ff_put_vp8_epel4_v6_mmx2  (uint8_t *dst, ptrdiff_t dststride,
                                        uint8_t *src, ptrdiff_t srcstride,
                                        int height, int mx, int my);
 
@@ -80,7 +80,7 @@ extern void ff_put_vp8_epel8_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                        uint8_t *src, ptrdiff_t srcstride,
                                        int height, int mx, int my);
 
-extern void ff_put_vp8_bilinear4_h_mmxext(uint8_t *dst, ptrdiff_t dststride,
+extern void ff_put_vp8_bilinear4_h_mmx2  (uint8_t *dst, ptrdiff_t dststride,
                                           uint8_t *src, ptrdiff_t srcstride,
                                           int height, int mx, int my);
 extern void ff_put_vp8_bilinear8_h_sse2  (uint8_t *dst, ptrdiff_t dststride,
@@ -93,7 +93,7 @@ extern void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                           uint8_t *src, ptrdiff_t srcstride,
                                           int height, int mx, int my);
 
-extern void ff_put_vp8_bilinear4_v_mmxext(uint8_t *dst, ptrdiff_t dststride,
+extern void ff_put_vp8_bilinear4_v_mmx2  (uint8_t *dst, ptrdiff_t dststride,
                                           uint8_t *src, ptrdiff_t srcstride,
                                           int height, int mx, int my);
 extern void ff_put_vp8_bilinear8_v_sse2  (uint8_t *dst, ptrdiff_t dststride,
@@ -139,27 +139,27 @@ static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
 }
 
 #if ARCH_X86_32
-TAP_W8 (mmxext, epel, h4)
-TAP_W8 (mmxext, epel, h6)
-TAP_W16(mmxext, epel, h6)
-TAP_W8 (mmxext, epel, v4)
-TAP_W8 (mmxext, epel, v6)
-TAP_W16(mmxext, epel, v6)
-TAP_W8 (mmxext, bilinear, h)
-TAP_W16(mmxext, bilinear, h)
-TAP_W8 (mmxext, bilinear, v)
-TAP_W16(mmxext, bilinear, v)
+TAP_W8 (mmx2,  epel, h4)
+TAP_W8 (mmx2,  epel, h6)
+TAP_W16(mmx2,  epel, h6)
+TAP_W8 (mmx2,  epel, v4)
+TAP_W8 (mmx2,  epel, v6)
+TAP_W16(mmx2,  epel, v6)
+TAP_W8 (mmx2,  bilinear, h)
+TAP_W16(mmx2,  bilinear, h)
+TAP_W8 (mmx2,  bilinear, v)
+TAP_W16(mmx2,  bilinear, v)
 #endif
 
-TAP_W16(sse2,   epel, h6)
-TAP_W16(sse2,   epel, v6)
-TAP_W16(sse2,   bilinear, h)
-TAP_W16(sse2,   bilinear, v)
+TAP_W16(sse2,  epel, h6)
+TAP_W16(sse2,  epel, v6)
+TAP_W16(sse2,  bilinear, h)
+TAP_W16(sse2,  bilinear, v)
 
-TAP_W16(ssse3,  epel, h6)
-TAP_W16(ssse3,  epel, v6)
-TAP_W16(ssse3,  bilinear, h)
-TAP_W16(ssse3,  bilinear, v)
+TAP_W16(ssse3, epel, h6)
+TAP_W16(ssse3, epel, v6)
+TAP_W16(ssse3, bilinear, h)
+TAP_W16(ssse3, bilinear, v)
 
 #define HVTAP(OPT, ALIGN, TAPNUMX, TAPNUMY, SIZE, MAXHEIGHT) \
 static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT( \
@@ -177,13 +177,13 @@ static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT
 
 #if ARCH_X86_32
 #define HVTAPMMX(x, y) \
-HVTAP(mmxext, 8, x, y,  4,  8) \
-HVTAP(mmxext, 8, x, y,  8, 16)
+HVTAP(mmx2, 8, x, y,  4,  8) \
+HVTAP(mmx2, 8, x, y,  8, 16)
 
-HVTAP(mmxext, 8, 6, 6, 16, 16)
+HVTAP(mmx2, 8, 6, 6, 16, 16)
 #else
 #define HVTAPMMX(x, y) \
-HVTAP(mmxext, 8, x, y,  4,  8)
+HVTAP(mmx2, 8, x, y,  4,  8)
 #endif
 
 HVTAPMMX(4, 4)
@@ -218,16 +218,16 @@ static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \
         dst, dststride, tmp, SIZE,      height,     mx, my); \
 }
 
-HVBILIN(mmxext, 8,  4,  8)
+HVBILIN(mmx2,  8,  4,  8)
 #if ARCH_X86_32
-HVBILIN(mmxext, 8,  8, 16)
-HVBILIN(mmxext, 8, 16, 16)
+HVBILIN(mmx2,  8,  8, 16)
+HVBILIN(mmx2,  8, 16, 16)
 #endif
-HVBILIN(sse2,   8,  8, 16)
-HVBILIN(sse2,   8, 16, 16)
-HVBILIN(ssse3,  8,  4,  8)
-HVBILIN(ssse3,  8,  8, 16)
-HVBILIN(ssse3,  8, 16, 16)
+HVBILIN(sse2,  8,  8, 16)
+HVBILIN(sse2,  8, 16, 16)
+HVBILIN(ssse3, 8,  4,  8)
+HVBILIN(ssse3, 8,  8, 16)
+HVBILIN(ssse3, 8, 16, 16)
 
 extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16],
                                    ptrdiff_t stride);
@@ -283,7 +283,7 @@ extern void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \
                                                     int e, int i, int hvt);
 
 DECLARE_LOOP_FILTER(mmx)
-DECLARE_LOOP_FILTER(mmxext)
+DECLARE_LOOP_FILTER(mmx2)
 DECLARE_LOOP_FILTER(sse2)
 DECLARE_LOOP_FILTER(ssse3)
 DECLARE_LOOP_FILTER(sse4)
@@ -351,26 +351,26 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
     /* note that 4-tap width=16 functions are missing because w=16
      * is only used for luma, and luma is always a copy or sixtap. */
     if (mm_flags & AV_CPU_FLAG_MMX2) {
-        VP8_MC_FUNC(2, 4, mmxext);
-        VP8_BILINEAR_MC_FUNC(2, 4, mmxext);
+        VP8_MC_FUNC(2, 4, mmx2);
+        VP8_BILINEAR_MC_FUNC(2, 4, mmx2);
 #if ARCH_X86_32
-        VP8_LUMA_MC_FUNC(0, 16, mmxext);
-        VP8_MC_FUNC(1, 8, mmxext);
-        VP8_BILINEAR_MC_FUNC(0, 16, mmxext);
-        VP8_BILINEAR_MC_FUNC(1, 8, mmxext);
+        VP8_LUMA_MC_FUNC(0, 16, mmx2);
+        VP8_MC_FUNC(1, 8, mmx2);
+        VP8_BILINEAR_MC_FUNC(0, 16, mmx2);
+        VP8_BILINEAR_MC_FUNC(1, 8, mmx2);
 
-        c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmxext;
-        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmxext;
+        c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx2;
+        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx2;
 
-        c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmxext;
-        c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmxext;
-        c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmxext;
-        c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmxext;
+        c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmx2;
+        c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmx2;
+        c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmx2;
+        c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmx2;
 
-        c->vp8_v_loop_filter16y       = ff_vp8_v_loop_filter16y_mbedge_mmxext;
-        c->vp8_h_loop_filter16y       = ff_vp8_h_loop_filter16y_mbedge_mmxext;
-        c->vp8_v_loop_filter8uv       = ff_vp8_v_loop_filter8uv_mbedge_mmxext;
-        c->vp8_h_loop_filter8uv       = ff_vp8_h_loop_filter8uv_mbedge_mmxext;
+        c->vp8_v_loop_filter16y       = ff_vp8_v_loop_filter16y_mbedge_mmx2;
+        c->vp8_h_loop_filter16y       = ff_vp8_h_loop_filter16y_mbedge_mmx2;
+        c->vp8_v_loop_filter8uv       = ff_vp8_v_loop_filter8uv_mbedge_mmx2;
+        c->vp8_h_loop_filter8uv       = ff_vp8_h_loop_filter8uv_mbedge_mmx2;
 #endif
     }
 
diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index a7b83797ea..f21045d405 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -173,8 +173,8 @@ SECTION .text
 ;                                              int height,   int mx, int my);
 ;-----------------------------------------------------------------------------
 
-%macro FILTER_SSSE3 3
-cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2
+%macro FILTER_SSSE3 1
+cglobal put_vp8_epel%1_h6, 6, 6, 8
     lea      r5d, [r5*3]
     mova      m3, [filter_h6_shuf2]
     mova      m4, [filter_h6_shuf3]
@@ -189,7 +189,7 @@ cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2
     movu      m0, [r2-2]
     mova      m1, m0
     mova      m2, m0
-%ifidn %1, 4
+%if mmsize == 8
 ; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
 ; shuffle with a memory operand
     punpcklbw m0, [r2+3]
@@ -215,7 +215,7 @@ cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2
     jg .nextrow
     REP_RET
 
-cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3
+cglobal put_vp8_epel%1_h4, 6, 6, 7
     shl      r5d, 4
     mova      m2, [pw_64]
     mova      m3, [filter_h2_shuf]
@@ -246,7 +246,7 @@ cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3
     jg .nextrow
     REP_RET
 
-cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2
+cglobal put_vp8_epel%1_v4, 7, 7, 8
     shl      r6d, 4
 %ifdef PIC
     lea      r11, [fourtap_filter_hb_m]
@@ -285,7 +285,7 @@ cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2
     jg .nextrow
     REP_RET
 
-cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2
+cglobal put_vp8_epel%1_v6, 7, 7, 8
     lea      r6d, [r6*3]
 %ifdef PIC
     lea      r11, [sixtap_filter_hb_m]
@@ -333,13 +333,14 @@ cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2
     REP_RET
 %endmacro
 
-INIT_MMX
-FILTER_SSSE3 4, 0, 0
-INIT_XMM
-FILTER_SSSE3 8, 8, 7
+INIT_MMX ssse3
+FILTER_SSSE3 4
+INIT_XMM ssse3
+FILTER_SSSE3 8
 
 ; 4x4 block, H-only 4-tap filter
-cglobal put_vp8_epel4_h4_mmxext, 6, 6
+INIT_MMX mmx2
+cglobal put_vp8_epel4_h4, 6, 6
     shl       r5d, 4
 %ifdef PIC
     lea       r11, [fourtap_filter_hw_m]
@@ -386,7 +387,8 @@ cglobal put_vp8_epel4_h4_mmxext, 6, 6
     REP_RET
 
 ; 4x4 block, H-only 6-tap filter
-cglobal put_vp8_epel4_h6_mmxext, 6, 6
+INIT_MMX mmx2
+cglobal put_vp8_epel4_h6, 6, 6
     lea       r5d, [r5*3]
 %ifdef PIC
     lea       r11, [sixtap_filter_hw_m]
@@ -442,8 +444,8 @@ cglobal put_vp8_epel4_h6_mmxext, 6, 6
     jg .nextrow
     REP_RET
 
-INIT_XMM
-cglobal put_vp8_epel8_h4_sse2, 6, 6, 10
+INIT_XMM sse2
+cglobal put_vp8_epel8_h4, 6, 6, 10
     shl      r5d, 5
 %ifdef PIC
     lea      r11, [fourtap_filter_v_m]
@@ -490,7 +492,8 @@ cglobal put_vp8_epel8_h4_sse2, 6, 6, 10
     jg .nextrow
     REP_RET
 
-cglobal put_vp8_epel8_h6_sse2, 6, 6, 14
+INIT_XMM sse2
+cglobal put_vp8_epel8_h6, 6, 6, 14
     lea      r5d, [r5*3]
     shl      r5d, 4
 %ifdef PIC
@@ -552,9 +555,9 @@ cglobal put_vp8_epel8_h6_sse2, 6, 6, 14
     jg .nextrow
     REP_RET
 
-%macro FILTER_V 3
+%macro FILTER_V 1
 ; 4x4 block, V-only 4-tap filter
-cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
+cglobal put_vp8_epel%1_v4, 7, 7, 8
     shl      r6d, 5
 %ifdef PIC
     lea      r11, [fourtap_filter_v_m]
@@ -607,7 +610,7 @@ cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
 
 
 ; 4x4 block, V-only 6-tap filter
-cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
+cglobal put_vp8_epel%1_v6, 7, 7, 8
     shl      r6d, 4
     lea       r6, [r6*3]
 %ifdef PIC
@@ -671,13 +674,13 @@ cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
     REP_RET
 %endmacro
 
-INIT_MMX
-FILTER_V mmxext, 4, 0
-INIT_XMM
-FILTER_V sse2,   8, 8
+INIT_MMX mmx2
+FILTER_V 4
+INIT_XMM sse2
+FILTER_V 8
 
-%macro FILTER_BILINEAR 3
-cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
+%macro FILTER_BILINEAR 1
+cglobal put_vp8_bilinear%1_v, 7, 7, 7
     mov      r5d, 8*16
     shl      r6d, 4
     sub      r5d, r6d
@@ -705,7 +708,7 @@ cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
     psraw     m2, 2
     pavgw     m0, m6
     pavgw     m2, m6
-%ifidn %1, mmxext
+%if mmsize == 8
     packuswb  m0, m0
     packuswb  m2, m2
     movh [r0+r1*0], m0
@@ -722,7 +725,7 @@ cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
     jg .nextrow
     REP_RET
 
-cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
+cglobal put_vp8_bilinear%1_h, 7, 7, 7
     mov      r6d, 8*16
     shl      r5d, 4
     sub      r6d, r5d
@@ -751,7 +754,7 @@ cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
     psraw     m2, 2
     pavgw     m0, m6
     pavgw     m2, m6
-%ifidn %1, mmxext
+%if mmsize == 8
     packuswb  m0, m0
     packuswb  m2, m2
     movh [r0+r1*0], m0
@@ -769,13 +772,13 @@ cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
     REP_RET
 %endmacro
 
-INIT_MMX
-FILTER_BILINEAR mmxext, 4, 0
-INIT_XMM
-FILTER_BILINEAR   sse2, 8, 7
+INIT_MMX mmx2
+FILTER_BILINEAR 4
+INIT_XMM sse2
+FILTER_BILINEAR 8
 
 %macro FILTER_BILINEAR_SSSE3 1
-cglobal put_vp8_bilinear%1_v_ssse3, 7,7
+cglobal put_vp8_bilinear%1_v, 7, 7, 5
     shl      r6d, 4
 %ifdef PIC
     lea      r11, [bilinear_filter_vb_m]
@@ -811,7 +814,7 @@ cglobal put_vp8_bilinear%1_v_ssse3, 7,7
     jg .nextrow
     REP_RET
 
-cglobal put_vp8_bilinear%1_h_ssse3, 7,7
+cglobal put_vp8_bilinear%1_h, 7, 7, 5
     shl      r5d, 4
 %ifdef PIC
     lea      r11, [bilinear_filter_vb_m]
@@ -848,12 +851,13 @@ cglobal put_vp8_bilinear%1_h_ssse3, 7,7
     REP_RET
 %endmacro
 
-INIT_MMX
+INIT_MMX ssse3
 FILTER_BILINEAR_SSSE3 4
-INIT_XMM
+INIT_XMM ssse3
 FILTER_BILINEAR_SSSE3 8
 
-cglobal put_vp8_pixels8_mmx, 5,5
+INIT_MMX mmx
+cglobal put_vp8_pixels8, 5,5
 .nextrow:
     movq  mm0, [r2+r3*0]
     movq  mm1, [r2+r3*1]
@@ -866,7 +870,8 @@ cglobal put_vp8_pixels8_mmx, 5,5
     REP_RET
 
 %if ARCH_X86_32
-cglobal put_vp8_pixels16_mmx, 5,5
+INIT_MMX mmx
+cglobal put_vp8_pixels16, 5,5
 .nextrow:
     movq  mm0, [r2+r3*0+0]
     movq  mm1, [r2+r3*0+8]
@@ -883,7 +888,8 @@ cglobal put_vp8_pixels16_mmx, 5,5
     REP_RET
 %endif
 
-cglobal put_vp8_pixels16_sse, 5,5,2
+INIT_XMM sse
+cglobal put_vp8_pixels16, 5,5,2
 .nextrow:
     movups xmm0, [r2+r3*0]
     movups xmm1, [r2+r3*1]
@@ -918,8 +924,8 @@ cglobal put_vp8_pixels16_sse, 5,5,2
     %4 [r1+r2+%3], m5
 %endmacro
 
-INIT_MMX
-cglobal vp8_idct_dc_add_mmx, 3, 3
+INIT_MMX mmx
+cglobal vp8_idct_dc_add, 3, 3
     ; load data
     movd       m0, [r1]
 
@@ -941,8 +947,8 @@ cglobal vp8_idct_dc_add_mmx, 3, 3
     ADD_DC     m0, m1, 0, movh
     RET
 
-INIT_XMM
-cglobal vp8_idct_dc_add_sse4, 3, 3, 6
+INIT_XMM sse4
+cglobal vp8_idct_dc_add, 3, 3, 6
     ; load data
     movd       m0, [r1]
     pxor       m1, m1
@@ -976,8 +982,8 @@ cglobal vp8_idct_dc_add_sse4, 3, 3, 6
 ;-----------------------------------------------------------------------------
 
 %if ARCH_X86_32
-INIT_MMX
-cglobal vp8_idct_dc_add4y_mmx, 3, 3
+INIT_MMX mmx
+cglobal vp8_idct_dc_add4y, 3, 3
     ; load data
     movd      m0, [r1+32*0] ; A
     movd      m1, [r1+32*2] ; C
@@ -1012,8 +1018,8 @@ cglobal vp8_idct_dc_add4y_mmx, 3, 3
     RET
 %endif
 
-INIT_XMM
-cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6
+INIT_XMM sse2
+cglobal vp8_idct_dc_add4y, 3, 3, 6
     ; load data
     movd      m0, [r1+32*0] ; A
     movd      m1, [r1+32*2] ; C
@@ -1046,8 +1052,8 @@ cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6
 ; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
 ;-----------------------------------------------------------------------------
 
-INIT_MMX
-cglobal vp8_idct_dc_add4uv_mmx, 3, 3
+INIT_MMX mmx
+cglobal vp8_idct_dc_add4uv, 3, 3
     ; load data
     movd      m0, [r1+32*0] ; A
     movd      m1, [r1+32*2] ; C
@@ -1118,9 +1124,8 @@ cglobal vp8_idct_dc_add4uv_mmx, 3, 3
     SWAP                 %4,  %3
 %endmacro
 
-INIT_MMX
-%macro VP8_IDCT_ADD 1
-cglobal vp8_idct_add_%1, 3, 3
+%macro VP8_IDCT_ADD 0
+cglobal vp8_idct_add, 3, 3
     ; load block data
     movq         m0, [r1+ 0]
     movq         m1, [r1+ 8]
@@ -1128,7 +1133,7 @@ cglobal vp8_idct_add_%1, 3, 3
     movq         m3, [r1+24]
     movq         m6, [pw_20091]
     movq         m7, [pw_17734]
-%ifidn %1, sse
+%if cpuflag(sse)
     xorps      xmm0, xmm0
     movaps  [r1+ 0], xmm0
     movaps  [r1+16], xmm0
@@ -1157,9 +1162,11 @@ cglobal vp8_idct_add_%1, 3, 3
 %endmacro
 
 %if ARCH_X86_32
-VP8_IDCT_ADD mmx
+INIT_MMX mmx
+VP8_IDCT_ADD
 %endif
-VP8_IDCT_ADD sse
+INIT_MMX sse
+VP8_IDCT_ADD
 
 ;-----------------------------------------------------------------------------
 ; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16])
@@ -1192,13 +1199,13 @@ VP8_IDCT_ADD sse
     SWAP %1, %4, %3
 %endmacro
 
-%macro VP8_DC_WHT 1
-cglobal vp8_luma_dc_wht_%1, 2,3
+%macro VP8_DC_WHT 0
+cglobal vp8_luma_dc_wht, 2, 3
     movq          m0, [r1]
     movq          m1, [r1+8]
     movq          m2, [r1+16]
     movq          m3, [r1+24]
-%ifidn %1, sse
+%if cpuflag(sse)
     xorps      xmm0, xmm0
     movaps  [r1+ 0], xmm0
     movaps  [r1+16], xmm0
@@ -1222,11 +1229,12 @@ cglobal vp8_luma_dc_wht_%1, 2,3
     RET
 %endmacro
 
-INIT_MMX
 %if ARCH_X86_32
-VP8_DC_WHT mmx
+INIT_MMX mmx
+VP8_DC_WHT
 %endif
-VP8_DC_WHT sse
+INIT_MMX sse
+VP8_DC_WHT
 
 ;-----------------------------------------------------------------------------
 ; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);

From 28170f1a39236c5be91ab6df67e477a213c552b4 Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Fri, 2 Mar 2012 20:38:02 -0800
Subject: [PATCH 11/27] vp8: convert loopfilter x86 assembly to use cpuflags().

---
 libavcodec/x86/vp8dsp.asm | 349 +++++++++++++++++---------------------
 1 file changed, 153 insertions(+), 196 deletions(-)

diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index f21045d405..4dba6db3b7 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -1422,7 +1422,17 @@ VP8_DC_WHT
     add              %4, %5
 %endmacro
 
-%macro WRITE_8W_SSE2 5
+%macro WRITE_8W 5
+%if cpuflag(sse4)
+    pextrw    [%3+%4*4], %1, 0
+    pextrw    [%2+%4*4], %1, 1
+    pextrw    [%3+%4*2], %1, 2
+    pextrw    [%3+%4  ], %1, 3
+    pextrw    [%3     ], %1, 4
+    pextrw    [%2     ], %1, 5
+    pextrw    [%2+%5  ], %1, 6
+    pextrw    [%2+%5*2], %1, 7
+%else
     movd            %2d, %1
     psrldq           %1, 4
     mov       [%3+%4*4], %2w
@@ -1448,67 +1458,51 @@ VP8_DC_WHT
     mov       [%3+%5  ], %2w
     shr              %2, 16
     mov       [%3+%5*2], %2w
+%endif
 %endmacro
 
-%macro WRITE_8W_SSE4 5
-    pextrw    [%3+%4*4], %1, 0
-    pextrw    [%2+%4*4], %1, 1
-    pextrw    [%3+%4*2], %1, 2
-    pextrw    [%3+%4  ], %1, 3
-    pextrw    [%3     ], %1, 4
-    pextrw    [%2     ], %1, 5
-    pextrw    [%2+%5  ], %1, 6
-    pextrw    [%2+%5*2], %1, 7
-%endmacro
-
-%macro SPLATB_REG_MMX 2-3
+%macro SPLATB_REG 2-3
+%if cpuflag(ssse3)
     movd           %1, %2d
-    punpcklbw      %1, %1
-    punpcklwd      %1, %1
-    punpckldq      %1, %1
-%endmacro
-
-%macro SPLATB_REG_MMXEXT 2-3
-    movd           %1, %2d
-    punpcklbw      %1, %1
-    pshufw         %1, %1, 0x0
-%endmacro
-
-%macro SPLATB_REG_SSE2 2-3
+    pshufb         %1, %3
+%elif cpuflag(sse2)
     movd           %1, %2d
     punpcklbw      %1, %1
     pshuflw        %1, %1, 0x0
     punpcklqdq     %1, %1
-%endmacro
-
-%macro SPLATB_REG_SSSE3 3
+%elif cpuflag(mmx2)
     movd           %1, %2d
-    pshufb         %1, %3
+    punpcklbw      %1, %1
+    pshufw         %1, %1, 0x0
+%else
+    movd           %1, %2d
+    punpcklbw      %1, %1
+    punpcklwd      %1, %1
+    punpckldq      %1, %1
+%endif
 %endmacro
 
-%macro SIMPLE_LOOPFILTER 4
-cglobal vp8_%2_loop_filter_simple_%1, 3, %3, %4
+%macro SIMPLE_LOOPFILTER 2
+cglobal vp8_%1_loop_filter_simple, 3, %2, 8
 %if mmsize == 8 ; mmx/mmxext
     mov            r3, 2
 %endif
-%ifnidn %1, sse2
-%if mmsize == 16
+%if cpuflag(ssse3)
     pxor           m0, m0
-%endif
 %endif
     SPLATB_REG     m7, r2, m0       ; splat "flim" into register
 
     ; set up indexes to address 4 rows
     mov            r2, r1
     neg            r1
-%ifidn %2, h
+%ifidn %1, h
     lea            r0, [r0+4*r2-2]
 %endif
 
 %if mmsize == 8 ; mmx / mmxext
 .next8px
 %endif
-%ifidn %2, v
+%ifidn %1, v
     ; read 4 half/full rows of pixels
     mova           m0, [r0+r1*2]    ; p1
     mova           m1, [r0+r1]      ; p0
@@ -1589,7 +1583,7 @@ cglobal vp8_%2_loop_filter_simple_%1, 3, %3, %4
     psubusb        m6, m3           ; p0+f2
 
     ; store
-%ifidn %2, v
+%ifidn %1, v
     mova         [r0], m4
     mova      [r0+r1], m6
 %else ; h
@@ -1597,12 +1591,12 @@ cglobal vp8_%2_loop_filter_simple_%1, 3, %3, %4
     SBUTTERFLY    bw, 6, 4, 0
 
 %if mmsize == 16 ; sse2
-%ifidn %1, sse4
+%if cpuflag(sse4)
     inc            r4
 %endif
     WRITE_8W       m6, r4, r0, r1, r2
     lea            r4, [r3+r1+1]
-%ifidn %1, sse4
+%if cpuflag(sse4)
     inc            r3
 %endif
     WRITE_8W       m4, r3, r4, r1, r2
@@ -1613,7 +1607,7 @@ cglobal vp8_%2_loop_filter_simple_%1, 3, %3, %4
 
 %if mmsize == 8 ; mmx/mmxext
     ; next 8 pixels
-%ifidn %2, v
+%ifidn %1, v
     add            r0, 8            ; advance 8 cols = pixels
 %else ; h
     lea            r0, [r0+r2*8-1]  ; advance 8 rows = lines
@@ -1627,41 +1621,38 @@ cglobal vp8_%2_loop_filter_simple_%1, 3, %3, %4
 %endmacro
 
 %if ARCH_X86_32
-INIT_MMX
-%define SPLATB_REG SPLATB_REG_MMX
-SIMPLE_LOOPFILTER mmx,    v, 4, 0
-SIMPLE_LOOPFILTER mmx,    h, 5, 0
-%define SPLATB_REG SPLATB_REG_MMXEXT
-SIMPLE_LOOPFILTER mmxext, v, 4, 0
-SIMPLE_LOOPFILTER mmxext, h, 5, 0
+INIT_MMX mmx
+SIMPLE_LOOPFILTER v, 4
+SIMPLE_LOOPFILTER h, 5
+INIT_MMX mmx2
+SIMPLE_LOOPFILTER v, 4
+SIMPLE_LOOPFILTER h, 5
 %endif
 
-INIT_XMM
-%define SPLATB_REG SPLATB_REG_SSE2
-%define WRITE_8W   WRITE_8W_SSE2
-SIMPLE_LOOPFILTER sse2,   v, 3, 8
-SIMPLE_LOOPFILTER sse2,   h, 5, 8
-%define SPLATB_REG SPLATB_REG_SSSE3
-SIMPLE_LOOPFILTER ssse3,  v, 3, 8
-SIMPLE_LOOPFILTER ssse3,  h, 5, 8
-%define WRITE_8W   WRITE_8W_SSE4
-SIMPLE_LOOPFILTER sse4,   h, 5, 8
+INIT_XMM sse2
+SIMPLE_LOOPFILTER v, 3
+SIMPLE_LOOPFILTER h, 5
+INIT_XMM ssse3
+SIMPLE_LOOPFILTER v, 3
+SIMPLE_LOOPFILTER h, 5
+INIT_XMM sse4
+SIMPLE_LOOPFILTER h, 5
 
 ;-----------------------------------------------------------------------------
 ; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
 ;                                            int flimE, int flimI, int hev_thr);
 ;-----------------------------------------------------------------------------
 
-%macro INNER_LOOPFILTER 5
-%if %4 == 8 ; chroma
-cglobal vp8_%2_loop_filter8uv_inner_%1, 6, %3, %5
+%macro INNER_LOOPFILTER 3
+%if %3 == 8 ; chroma
+cglobal vp8_%1_loop_filter8uv_inner, 6, %2, 13
 %define dst8_reg    r1
 %define mstride_reg r2
 %define E_reg       r3
 %define I_reg       r4
 %define hev_thr_reg r5
 %else ; luma
-cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
+cglobal vp8_%1_loop_filter16y_inner, 5, %2, 13
 %define mstride_reg r1
 %define E_reg       r2
 %define I_reg       r3
@@ -1681,11 +1672,9 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
 %define stack_reg   hev_thr_reg
 %endif
 
-%ifnidn %1, sse2
-%if mmsize == 16
+%if cpuflag(ssse3)
     pxor             m7, m7
 %endif
-%endif
 
 %ifndef m8 ; mmx/mmxext or sse2 on x86-32
     ; splat function arguments
@@ -1696,7 +1685,7 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
     ; align stack
     mov       stack_reg, rsp         ; backup stack pointer
     and             rsp, ~(mmsize-1) ; align stack
-%ifidn %2, v
+%ifidn %1, v
     sub             rsp, mmsize * 4  ; stack layout: [0]=E, [1]=I, [2]=hev_thr
                                      ;               [3]=hev() result
 %else ; h
@@ -1729,14 +1718,14 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
     SPLATB_REG  hev_thr, hev_thr_reg, m7 ; hev_thresh
 %endif
 
-%if mmsize == 8 && %4 == 16 ; mmx/mmxext
+%if mmsize == 8 && %3 == 16 ; mmx/mmxext
     mov         cnt_reg, 2
 %endif
     mov      stride_reg, mstride_reg
     neg     mstride_reg
-%ifidn %2, h
+%ifidn %1, h
     lea         dst_reg, [dst_reg + stride_reg*4-4]
-%if %4 == 8
+%if %3 == 8
     lea        dst8_reg, [dst8_reg+ stride_reg*4-4]
 %endif
 %endif
@@ -1746,8 +1735,8 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
 %endif
     ; read
     lea        dst2_reg, [dst_reg + stride_reg]
-%ifidn %2, v
-%if %4 == 8 && mmsize == 16
+%ifidn %1, v
+%if %3 == 8 && mmsize == 16
 %define movrow movh
 %else
 %define movrow mova
@@ -1758,7 +1747,7 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
     movrow           m5, [dst2_reg]               ; q1
     movrow           m6, [dst2_reg+ stride_reg]   ; q2
     movrow           m7, [dst2_reg+ stride_reg*2] ; q3
-%if mmsize == 16 && %4 == 8
+%if mmsize == 16 && %3 == 8
     movhps           m0, [dst8_reg+mstride_reg*4]
     movhps           m2, [dst8_reg+mstride_reg*2]
     add        dst8_reg, stride_reg
@@ -1795,7 +1784,7 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
     SWAP              6, 3
     SWAP              5, 3
 %else ; sse2 (h)
-%if %4 == 16
+%if %3 == 16
     lea        dst8_reg, [dst_reg + stride_reg*8]
 %endif
 
@@ -1882,7 +1871,7 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
     psubusb          m6, m5          ; q2-q1
     por              m6, m4          ; abs(q2-q1)
 
-%ifidn %1, mmx
+%if notcpuflag(mmx2)
     mova             m4, flim_I
     pxor             m3, m3
     psubusb          m0, m4
@@ -1904,9 +1893,9 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
 
     ; normal_limit and high_edge_variance for p1-p0, q1-q0
     SWAP              7, 3           ; now m7 is zero
-%ifidn %2, v
+%ifidn %1, v
     movrow           m3, [dst_reg +mstride_reg] ; p0
-%if mmsize == 16 && %4 == 8
+%if mmsize == 16 && %3 == 8
     movhps           m3, [dst8_reg+mstride_reg]
 %endif
 %elifdef m12
@@ -1922,7 +1911,7 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
     psubusb          m1, m3          ; p1-p0
     psubusb          m6, m2          ; p0-p1
     por              m1, m6          ; abs(p1-p0)
-%ifidn %1, mmx
+%if notcpuflag(mmx2)
     mova             m6, m1
     psubusb          m1, m4
     psubusb          m6, hev_thr
@@ -1936,9 +1925,9 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
 %endif
 
     SWAP              6, 4           ; now m6 is I
-%ifidn %2, v
+%ifidn %1, v
     movrow           m4, [dst_reg]   ; q0
-%if mmsize == 16 && %4 == 8
+%if mmsize == 16 && %3 == 8
     movhps           m4, [dst8_reg]
 %endif
 %elifdef m8
@@ -1953,7 +1942,7 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
     psubusb          m1, m5          ; q0-q1
     psubusb          m7, m4          ; q1-q0
     por              m1, m7          ; abs(q1-q0)
-%ifidn %1, mmx
+%if notcpuflag(mmx2)
     mova             m7, m1
     psubusb          m1, m6
     psubusb          m7, hev_thr
@@ -2061,14 +2050,14 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
 %else
     mova             m6, mask_res
 %endif
-%ifidn %1, mmx
+%if notcpuflag(mmx2)
     mova             m7, [pb_1]
 %else ; mmxext/sse2
     pxor             m7, m7
 %endif
     pand             m0, m6
     pand             m1, m6
-%ifidn %1, mmx
+%if notcpuflag(mmx2)
     paddusb          m0, m7
     pand             m1, [pb_FE]
     pandn            m7, m0
@@ -2086,12 +2075,12 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
     paddusb          m2, m0          ; p1+a
 
     ; store
-%ifidn %2, v
+%ifidn %1, v
     movrow [dst_reg +mstride_reg*2], m2
     movrow [dst_reg +mstride_reg  ], m3
     movrow    [dst_reg], m4
     movrow [dst_reg + stride_reg  ], m5
-%if mmsize == 16 && %4 == 8
+%if mmsize == 16 && %3 == 8
     movhps [dst8_reg+mstride_reg*2], m2
     movhps [dst8_reg+mstride_reg  ], m3
     movhps   [dst8_reg], m4
@@ -2108,20 +2097,20 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
     WRITE_4x2D        2, 3, 4, 5, dst_reg, dst2_reg, mstride_reg, stride_reg
 %else ; sse2 (h)
     lea        dst8_reg, [dst8_reg+mstride_reg+2]
-    WRITE_4x4D        2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4
+    WRITE_4x4D        2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %3
 %endif
 %endif
 
 %if mmsize == 8
-%if %4 == 8 ; chroma
-%ifidn %2, h
+%if %3 == 8 ; chroma
+%ifidn %1, h
     sub         dst_reg, 2
 %endif
     cmp         dst_reg, dst8_reg
     mov         dst_reg, dst8_reg
     jnz .next8px
 %else
-%ifidn %2, h
+%ifidn %1, h
     lea         dst_reg, [dst_reg + stride_reg*8-2]
 %else ; v
     add         dst_reg, 8
@@ -2138,56 +2127,46 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
 %endmacro
 
 %if ARCH_X86_32
-INIT_MMX
-%define SPLATB_REG SPLATB_REG_MMX
-INNER_LOOPFILTER mmx,    v, 6, 16, 0
-INNER_LOOPFILTER mmx,    h, 6, 16, 0
-INNER_LOOPFILTER mmx,    v, 6,  8, 0
-INNER_LOOPFILTER mmx,    h, 6,  8, 0
+INIT_MMX mmx
+INNER_LOOPFILTER v, 6, 16
+INNER_LOOPFILTER h, 6, 16
+INNER_LOOPFILTER v, 6,  8
+INNER_LOOPFILTER h, 6,  8
 
-%define SPLATB_REG SPLATB_REG_MMXEXT
-INNER_LOOPFILTER mmxext, v, 6, 16, 0
-INNER_LOOPFILTER mmxext, h, 6, 16, 0
-INNER_LOOPFILTER mmxext, v, 6,  8, 0
-INNER_LOOPFILTER mmxext, h, 6,  8, 0
+INIT_MMX mmx2
+INNER_LOOPFILTER v, 6, 16
+INNER_LOOPFILTER h, 6, 16
+INNER_LOOPFILTER v, 6,  8
+INNER_LOOPFILTER h, 6,  8
 %endif
 
-INIT_XMM
-%define SPLATB_REG SPLATB_REG_SSE2
-INNER_LOOPFILTER sse2,   v, 5, 16, 13
-%ifdef m8
-INNER_LOOPFILTER sse2,   h, 5, 16, 13
-%else
-INNER_LOOPFILTER sse2,   h, 6, 16, 13
-%endif
-INNER_LOOPFILTER sse2,   v, 6,  8, 13
-INNER_LOOPFILTER sse2,   h, 6,  8, 13
+INIT_XMM sse2
+INNER_LOOPFILTER v, 5, 16
+INNER_LOOPFILTER h, 5 + ARCH_X86_32, 16
+INNER_LOOPFILTER v, 6,  8
+INNER_LOOPFILTER h, 6,  8
 
-%define SPLATB_REG SPLATB_REG_SSSE3
-INNER_LOOPFILTER ssse3,  v, 5, 16, 13
-%ifdef m8
-INNER_LOOPFILTER ssse3,  h, 5, 16, 13
-%else
-INNER_LOOPFILTER ssse3,  h, 6, 16, 13
-%endif
-INNER_LOOPFILTER ssse3,  v, 6,  8, 13
-INNER_LOOPFILTER ssse3,  h, 6,  8, 13
+INIT_XMM ssse3
+INNER_LOOPFILTER v, 5, 16
+INNER_LOOPFILTER h, 5 + ARCH_X86_32, 16
+INNER_LOOPFILTER v, 6,  8
+INNER_LOOPFILTER h, 6,  8
 
 ;-----------------------------------------------------------------------------
 ; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
 ;                                            int flimE, int flimI, int hev_thr);
 ;-----------------------------------------------------------------------------
 
-%macro MBEDGE_LOOPFILTER 5
-%if %4 == 8 ; chroma
-cglobal vp8_%2_loop_filter8uv_mbedge_%1, 6, %3, %5
+%macro MBEDGE_LOOPFILTER 3
+%if %3 == 8 ; chroma
+cglobal vp8_%1_loop_filter8uv_mbedge, 6, %2, 15
 %define dst8_reg    r1
 %define mstride_reg r2
 %define E_reg       r3
 %define I_reg       r4
 %define hev_thr_reg r5
 %else ; luma
-cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
+cglobal vp8_%1_loop_filter16y_mbedge, 5, %2, 15
 %define mstride_reg r1
 %define E_reg       r2
 %define I_reg       r3
@@ -2207,14 +2186,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
 %define stack_reg   hev_thr_reg
 %endif
 
-%define ssse3_or_higher 0
-%ifnidn %1, sse2
-%if mmsize == 16
-%define ssse3_or_higher 1
-%endif
-%endif
-
-%if ssse3_or_higher
+%if cpuflag(ssse3)
     pxor             m7, m7
 %endif
 
@@ -2275,14 +2247,14 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     SPLATB_REG  hev_thr, hev_thr_reg, m7 ; hev_thresh
 %endif
 
-%if mmsize == 8 && %4 == 16 ; mmx/mmxext
+%if mmsize == 8 && %3 == 16 ; mmx/mmxext
     mov         cnt_reg, 2
 %endif
     mov      stride_reg, mstride_reg
     neg     mstride_reg
-%ifidn %2, h
+%ifidn %1, h
     lea         dst_reg, [dst_reg + stride_reg*4-4]
-%if %4 == 8
+%if %3 == 8
     lea        dst8_reg, [dst8_reg+ stride_reg*4-4]
 %endif
 %endif
@@ -2292,8 +2264,8 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
 %endif
     ; read
     lea        dst2_reg, [dst_reg + stride_reg]
-%ifidn %2, v
-%if %4 == 8 && mmsize == 16
+%ifidn %1, v
+%if %3 == 8 && mmsize == 16
 %define movrow movh
 %else
 %define movrow mova
@@ -2304,7 +2276,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     movrow           m5, [dst2_reg]               ; q1
     movrow           m6, [dst2_reg+ stride_reg]   ; q2
     movrow           m7, [dst2_reg+ stride_reg*2] ; q3
-%if mmsize == 16 && %4 == 8
+%if mmsize == 16 && %3 == 8
     movhps           m0, [dst8_reg+mstride_reg*4]
     movhps           m2, [dst8_reg+mstride_reg*2]
     add        dst8_reg, stride_reg
@@ -2341,7 +2313,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     SWAP              6, 3
     SWAP              5, 3
 %else ; sse2 (h)
-%if %4 == 16
+%if %3 == 16
     lea        dst8_reg, [dst_reg + stride_reg*8]
 %endif
 
@@ -2430,7 +2402,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     psubusb          m6, m5          ; q2-q1
     por              m6, m4          ; abs(q2-q1)
 
-%ifidn %1, mmx
+%if notcpuflag(mmx2)
     mova             m4, flim_I
     pxor             m3, m3
     psubusb          m0, m4
@@ -2452,9 +2424,9 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
 
     ; normal_limit and high_edge_variance for p1-p0, q1-q0
     SWAP              7, 3           ; now m7 is zero
-%ifidn %2, v
+%ifidn %1, v
     movrow           m3, [dst_reg +mstride_reg] ; p0
-%if mmsize == 16 && %4 == 8
+%if mmsize == 16 && %3 == 8
     movhps           m3, [dst8_reg+mstride_reg]
 %endif
 %elifdef m12
@@ -2470,7 +2442,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     psubusb          m1, m3          ; p1-p0
     psubusb          m6, m2          ; p0-p1
     por              m1, m6          ; abs(p1-p0)
-%ifidn %1, mmx
+%if notcpuflag(mmx2)
     mova             m6, m1
     psubusb          m1, m4
     psubusb          m6, hev_thr
@@ -2484,9 +2456,9 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
 %endif
 
     SWAP              6, 4           ; now m6 is I
-%ifidn %2, v
+%ifidn %1, v
     movrow           m4, [dst_reg]   ; q0
-%if mmsize == 16 && %4 == 8
+%if mmsize == 16 && %3 == 8
     movhps           m4, [dst8_reg]
 %endif
 %elifdef m8
@@ -2501,7 +2473,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     psubusb          m1, m5          ; q0-q1
     psubusb          m7, m4          ; q1-q0
     por              m1, m7          ; abs(q1-q0)
-%ifidn %1, mmx
+%if notcpuflag(mmx2)
     mova             m7, m1
     psubusb          m1, m6
     psubusb          m7, hev_thr
@@ -2613,7 +2585,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     paddusb          m4, m1          ; q0-f1
 
     ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w)
-%if ssse3_or_higher
+%if cpuflag(ssse3)
     mova             m7, [pb_1]
 %else
     mova             m7, [pw_63]
@@ -2626,7 +2598,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     pxor             m0, m0
     mova             m6, m1
     pcmpgtb          m0, m1         ; which are negative
-%if ssse3_or_higher
+%if cpuflag(ssse3)
     punpcklbw        m6, m7         ; interleave with "1" for rounding
     punpckhbw        m1, m7
 %else
@@ -2634,7 +2606,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     punpckhbw        m1, m0
 %endif
     mova       lim_sign, m0
-%if ssse3_or_higher
+%if cpuflag(ssse3)
     mova             m7, [pb_27_63]
 %ifndef m8
     mova        lim_res, m1
@@ -2667,7 +2639,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     psubb           m1, m6
     pand            m1, m0          ; -a0
     pandn           m0, m6          ; +a0
-%if ssse3_or_higher
+%if cpuflag(ssse3)
     mova            m6, [pb_18_63]  ; pipelining
 %endif
     psubusb         m3, m1
@@ -2675,7 +2647,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     paddusb         m3, m0          ; p0+a0
     psubusb         m4, m0          ; q0-a0
 
-%if ssse3_or_higher
+%if cpuflag(ssse3)
     SWAP             6, 7
 %ifdef m10
     SWAP             1, 10
@@ -2707,7 +2679,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     psubb           m1, m6
     pand            m1, m0          ; -a1
     pandn           m0, m6          ; +a1
-%if ssse3_or_higher
+%if cpuflag(ssse3)
     mova            m6, [pb_9_63]
 %endif
     psubusb         m2, m1
@@ -2715,7 +2687,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     paddusb         m2, m0          ; p1+a1
     psubusb         m5, m0          ; q1-a1
 
-%if ssse3_or_higher
+%if cpuflag(ssse3)
     SWAP             6, 7
 %ifdef m10
     SWAP             1, 10
@@ -2765,14 +2737,14 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     psubusb         m6, m7          ; q1-a1
 
     ; store
-%ifidn %2, v
+%ifidn %1, v
     movrow [dst2_reg+mstride_reg*4], m1
     movrow [dst_reg +mstride_reg*2], m2
     movrow [dst_reg +mstride_reg  ], m3
     movrow    [dst_reg], m4
     movrow   [dst2_reg], m5
     movrow [dst2_reg+ stride_reg  ], m6
-%if mmsize == 16 && %4 == 8
+%if mmsize == 16 && %3 == 8
     add        dst8_reg, mstride_reg
     movhps [dst8_reg+mstride_reg*2], m1
     movhps [dst8_reg+mstride_reg  ], m2
@@ -2796,14 +2768,14 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     WRITE_2x4W       m5, m6, dst2_reg, dst_reg, mstride_reg, stride_reg
 %else ; sse2 (h)
     lea        dst8_reg, [dst8_reg+mstride_reg+1]
-    WRITE_4x4D        1, 2, 3, 4, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4
+    WRITE_4x4D        1, 2, 3, 4, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %3
     lea         dst_reg, [dst2_reg+mstride_reg+4]
     lea        dst8_reg, [dst8_reg+mstride_reg+4]
-%ifidn %1, sse4
+%if cpuflag(sse4)
     add        dst2_reg, 4
 %endif
     WRITE_8W         m5, dst2_reg, dst_reg,  mstride_reg, stride_reg
-%ifidn %1, sse4
+%if cpuflag(sse4)
     lea        dst2_reg, [dst8_reg+ stride_reg]
 %endif
     WRITE_8W         m6, dst2_reg, dst8_reg, mstride_reg, stride_reg
@@ -2811,15 +2783,15 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
 %endif
 
 %if mmsize == 8
-%if %4 == 8 ; chroma
-%ifidn %2, h
+%if %3 == 8 ; chroma
+%ifidn %1, h
     sub         dst_reg, 5
 %endif
     cmp         dst_reg, dst8_reg
     mov         dst_reg, dst8_reg
     jnz .next8px
 %else
-%ifidn %2, h
+%ifidn %1, h
     lea         dst_reg, [dst_reg + stride_reg*8-5]
 %else ; v
     add         dst_reg, 8
@@ -2836,46 +2808,31 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
 %endmacro
 
 %if ARCH_X86_32
-INIT_MMX
-%define SPLATB_REG SPLATB_REG_MMX
-MBEDGE_LOOPFILTER mmx,    v, 6, 16, 0
-MBEDGE_LOOPFILTER mmx,    h, 6, 16, 0
-MBEDGE_LOOPFILTER mmx,    v, 6,  8, 0
-MBEDGE_LOOPFILTER mmx,    h, 6,  8, 0
+INIT_MMX mmx
+MBEDGE_LOOPFILTER v, 6, 16
+MBEDGE_LOOPFILTER h, 6, 16
+MBEDGE_LOOPFILTER v, 6,  8
+MBEDGE_LOOPFILTER h, 6,  8
 
-%define SPLATB_REG SPLATB_REG_MMXEXT
-MBEDGE_LOOPFILTER mmxext, v, 6, 16, 0
-MBEDGE_LOOPFILTER mmxext, h, 6, 16, 0
-MBEDGE_LOOPFILTER mmxext, v, 6,  8, 0
-MBEDGE_LOOPFILTER mmxext, h, 6,  8, 0
+INIT_MMX mmx2
+MBEDGE_LOOPFILTER v, 6, 16
+MBEDGE_LOOPFILTER h, 6, 16
+MBEDGE_LOOPFILTER v, 6,  8
+MBEDGE_LOOPFILTER h, 6,  8
 %endif
 
-INIT_XMM
-%define SPLATB_REG SPLATB_REG_SSE2
-%define WRITE_8W   WRITE_8W_SSE2
-MBEDGE_LOOPFILTER sse2,   v, 5, 16, 15
-%ifdef m8
-MBEDGE_LOOPFILTER sse2,   h, 5, 16, 15
-%else
-MBEDGE_LOOPFILTER sse2,   h, 6, 16, 15
-%endif
-MBEDGE_LOOPFILTER sse2,   v, 6,  8, 15
-MBEDGE_LOOPFILTER sse2,   h, 6,  8, 15
+INIT_XMM sse2
+MBEDGE_LOOPFILTER v, 5, 16
+MBEDGE_LOOPFILTER h, 5 + ARCH_X86_32, 16
+MBEDGE_LOOPFILTER v, 6,  8
+MBEDGE_LOOPFILTER h, 6,  8
 
-%define SPLATB_REG SPLATB_REG_SSSE3
-MBEDGE_LOOPFILTER ssse3,  v, 5, 16, 15
-%ifdef m8
-MBEDGE_LOOPFILTER ssse3,  h, 5, 16, 15
-%else
-MBEDGE_LOOPFILTER ssse3,  h, 6, 16, 15
-%endif
-MBEDGE_LOOPFILTER ssse3,  v, 6,  8, 15
-MBEDGE_LOOPFILTER ssse3,  h, 6,  8, 15
+INIT_XMM ssse3
+MBEDGE_LOOPFILTER v, 5, 16
+MBEDGE_LOOPFILTER h, 5 + ARCH_X86_32, 16
+MBEDGE_LOOPFILTER v, 6,  8
+MBEDGE_LOOPFILTER h, 6,  8
 
-%define WRITE_8W   WRITE_8W_SSE4
-%ifdef m8
-MBEDGE_LOOPFILTER sse4,   h, 5, 16, 15
-%else
-MBEDGE_LOOPFILTER sse4,   h, 6, 16, 15
-%endif
-MBEDGE_LOOPFILTER sse4,   h, 6,  8, 15
+INIT_XMM sse4
+MBEDGE_LOOPFILTER h, 5 + ARCH_X86_32, 16
+MBEDGE_LOOPFILTER h, 6,  8

From 21ffc78fd75624626be297cee515b1f11a755ccf Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Sat, 3 Mar 2012 06:46:29 -0800
Subject: [PATCH 12/27] vp8: convert mc x86 assembly to use named arguments.

---
 libavcodec/x86/vp8dsp.asm | 544 +++++++++++++++++++-------------------
 1 file changed, 272 insertions(+), 272 deletions(-)

diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 4dba6db3b7..2f45f5a478 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -116,23 +116,25 @@ bilinear_filter_vb_m: times 8 db 7, 1
                       times 8 db 1, 7
 
 %ifdef PIC
-%define fourtap_filter_hw    r11
-%define sixtap_filter_hw     r11
-%define fourtap_filter_hb    r11
-%define sixtap_filter_hb     r11
-%define fourtap_filter_v     r11
-%define sixtap_filter_v      r11
-%define bilinear_filter_vw   r11
-%define bilinear_filter_vb   r11
+%define fourtap_filter_hw  picregq
+%define sixtap_filter_hw   picregq
+%define fourtap_filter_hb  picregq
+%define sixtap_filter_hb   picregq
+%define fourtap_filter_v   picregq
+%define sixtap_filter_v    picregq
+%define bilinear_filter_vw picregq
+%define bilinear_filter_vb picregq
+%define npicregs 1
 %else
-%define fourtap_filter_hw fourtap_filter_hw_m
-%define sixtap_filter_hw  sixtap_filter_hw_m
-%define fourtap_filter_hb fourtap_filter_hb_m
-%define sixtap_filter_hb  sixtap_filter_hb_m
-%define fourtap_filter_v  fourtap_filter_v_m
-%define sixtap_filter_v   sixtap_filter_v_m
+%define fourtap_filter_hw  fourtap_filter_hw_m
+%define sixtap_filter_hw   sixtap_filter_hw_m
+%define fourtap_filter_hb  fourtap_filter_hb_m
+%define sixtap_filter_hb   sixtap_filter_hb_m
+%define fourtap_filter_v   fourtap_filter_v_m
+%define sixtap_filter_v    sixtap_filter_v_m
 %define bilinear_filter_vw bilinear_filter_vw_m
 %define bilinear_filter_vb bilinear_filter_vb_m
+%define npicregs 0
 %endif
 
 filter_h2_shuf:  db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,  6, 6,  7,  7,  8
@@ -174,25 +176,25 @@ SECTION .text
 ;-----------------------------------------------------------------------------
 
 %macro FILTER_SSSE3 1
-cglobal put_vp8_epel%1_h6, 6, 6, 8
-    lea      r5d, [r5*3]
+cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg
+    lea      mxd, [mxq*3]
     mova      m3, [filter_h6_shuf2]
     mova      m4, [filter_h6_shuf3]
 %ifdef PIC
-    lea      r11, [sixtap_filter_hb_m]
+    lea  picregq, [sixtap_filter_hb_m]
 %endif
-    mova      m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes
-    mova      m6, [sixtap_filter_hb+r5*8-32]
-    mova      m7, [sixtap_filter_hb+r5*8-16]
+    mova      m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes
+    mova      m6, [sixtap_filter_hb+mxq*8-32]
+    mova      m7, [sixtap_filter_hb+mxq*8-16]
 
 .nextrow
-    movu      m0, [r2-2]
+    movu      m0, [srcq-2]
     mova      m1, m0
     mova      m2, m0
 %if mmsize == 8
 ; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
 ; shuffle with a memory operand
-    punpcklbw m0, [r2+3]
+    punpcklbw m0, [srcq+3]
 %else
     pshufb    m0, [filter_h6_shuf1]
 %endif
@@ -206,28 +208,28 @@ cglobal put_vp8_epel%1_h6, 6, 6, 8
     paddsw    m0, [pw_64]
     psraw     m0, 7
     packuswb  m0, m0
-    movh    [r0], m0        ; store
+    movh  [dstq], m0        ; store
 
     ; go to next line
-    add       r0, r1
-    add       r2, r3
-    dec      r4d            ; next row
+    add     dstq, dststrideq
+    add     srcq, srcstrideq
+    dec  heightd            ; next row
     jg .nextrow
     REP_RET
 
-cglobal put_vp8_epel%1_h4, 6, 6, 7
-    shl      r5d, 4
+cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
+    shl      mxd, 4
     mova      m2, [pw_64]
     mova      m3, [filter_h2_shuf]
     mova      m4, [filter_h4_shuf]
 %ifdef PIC
-    lea      r11, [fourtap_filter_hb_m]
+    lea  picregq, [fourtap_filter_hb_m]
 %endif
-    mova      m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes
-    mova      m6, [fourtap_filter_hb+r5]
+    mova      m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes
+    mova      m6, [fourtap_filter_hb+mxq]
 
 .nextrow
-    movu      m0, [r2-1]
+    movu      m0, [srcq-1]
     mova      m1, m0
     pshufb    m0, m3
     pshufb    m1, m4
@@ -237,33 +239,33 @@ cglobal put_vp8_epel%1_h4, 6, 6, 7
     paddsw    m0, m1
     psraw     m0, 7
     packuswb  m0, m0
-    movh    [r0], m0        ; store
+    movh  [dstq], m0        ; store
 
     ; go to next line
-    add       r0, r1
-    add       r2, r3
-    dec      r4d            ; next row
+    add     dstq, dststrideq
+    add     srcq, srcstrideq
+    dec  heightd            ; next row
     jg .nextrow
     REP_RET
 
-cglobal put_vp8_epel%1_v4, 7, 7, 8
-    shl      r6d, 4
+cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
+    shl      myd, 4
 %ifdef PIC
-    lea      r11, [fourtap_filter_hb_m]
+    lea  picregq, [fourtap_filter_hb_m]
 %endif
-    mova      m5, [fourtap_filter_hb+r6-16]
-    mova      m6, [fourtap_filter_hb+r6]
+    mova      m5, [fourtap_filter_hb+myq-16]
+    mova      m6, [fourtap_filter_hb+myq]
     mova      m7, [pw_64]
 
     ; read 3 lines
-    sub       r2, r3
-    movh      m0, [r2]
-    movh      m1, [r2+  r3]
-    movh      m2, [r2+2*r3]
-    add       r2, r3
+    sub     srcq, srcstrideq
+    movh      m0, [srcq]
+    movh      m1, [srcq+  srcstrideq]
+    movh      m2, [srcq+2*srcstrideq]
+    add     srcq, srcstrideq
 
 .nextrow
-    movh      m3, [r2+2*r3]                ; read new row
+    movh      m3, [srcq+2*srcstrideq]      ; read new row
     mova      m4, m0
     mova      m0, m1
     punpcklbw m4, m1
@@ -276,44 +278,44 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8
     paddsw    m4, m7
     psraw     m4, 7
     packuswb  m4, m4
-    movh    [r0], m4
+    movh  [dstq], m4
 
     ; go to next line
-    add        r0, r1
-    add        r2, r3
-    dec       r4d                          ; next row
+    add      dstq, dststrideq
+    add      srcq, srcstrideq
+    dec   heightd                          ; next row
     jg .nextrow
     REP_RET
 
-cglobal put_vp8_epel%1_v6, 7, 7, 8
-    lea      r6d, [r6*3]
+cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
+    lea      myd, [myq*3]
 %ifdef PIC
-    lea      r11, [sixtap_filter_hb_m]
+    lea  picregq, [sixtap_filter_hb_m]
 %endif
-    lea       r6, [sixtap_filter_hb+r6*8]
+    lea      myq, [sixtap_filter_hb+myq*8]
 
     ; read 5 lines
-    sub       r2, r3
-    sub       r2, r3
-    movh      m0, [r2]
-    movh      m1, [r2+r3]
-    movh      m2, [r2+r3*2]
-    lea       r2, [r2+r3*2]
-    add       r2, r3
-    movh      m3, [r2]
-    movh      m4, [r2+r3]
+    sub     srcq, srcstrideq
+    sub     srcq, srcstrideq
+    movh      m0, [srcq]
+    movh      m1, [srcq+srcstrideq]
+    movh      m2, [srcq+srcstrideq*2]
+    lea     srcq, [srcq+srcstrideq*2]
+    add     srcq, srcstrideq
+    movh      m3, [srcq]
+    movh      m4, [srcq+srcstrideq]
 
 .nextrow
-    movh      m5, [r2+2*r3]                ; read new row
+    movh      m5, [srcq+2*srcstrideq]      ; read new row
     mova      m6, m0
     punpcklbw m6, m5
     mova      m0, m1
     punpcklbw m1, m2
     mova      m7, m3
     punpcklbw m7, m4
-    pmaddubsw m6, [r6-48]
-    pmaddubsw m1, [r6-32]
-    pmaddubsw m7, [r6-16]
+    pmaddubsw m6, [myq-48]
+    pmaddubsw m1, [myq-32]
+    pmaddubsw m7, [myq-16]
     paddsw    m6, m1
     paddsw    m6, m7
     mova      m1, m2
@@ -323,12 +325,12 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8
     mova      m3, m4
     packuswb  m6, m6
     mova      m4, m5
-    movh    [r0], m6
+    movh  [dstq], m6
 
     ; go to next line
-    add        r0, r1
-    add        r2, r3
-    dec       r4d                          ; next row
+    add      dstq, dststrideq
+    add      srcq, srcstrideq
+    dec   heightd                          ; next row
     jg .nextrow
     REP_RET
 %endmacro
@@ -340,18 +342,18 @@ FILTER_SSSE3 8
 
 ; 4x4 block, H-only 4-tap filter
 INIT_MMX mmx2
-cglobal put_vp8_epel4_h4, 6, 6
-    shl       r5d, 4
+cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
+    shl       mxd, 4
 %ifdef PIC
-    lea       r11, [fourtap_filter_hw_m]
+    lea   picregq, [fourtap_filter_hw_m]
 %endif
-    movq      mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
-    movq      mm5, [fourtap_filter_hw+r5]
+    movq      mm4, [fourtap_filter_hw+mxq-16] ; set up 4tap filter in words
+    movq      mm5, [fourtap_filter_hw+mxq]
     movq      mm7, [pw_64]
     pxor      mm6, mm6
 
 .nextrow
-    movq      mm1, [r2-1]                  ; (ABCDEFGH) load 8 horizontal pixels
+    movq      mm1, [srcq-1]                ; (ABCDEFGH) load 8 horizontal pixels
 
     ; first set of 2 pixels
     movq      mm2, mm1                     ; byte ABCD..
@@ -377,30 +379,30 @@ cglobal put_vp8_epel4_h4, 6, 6
     paddsw    mm3, mm7                     ; rounding
     psraw     mm3, 7
     packuswb  mm3, mm6                     ; clip and word->bytes
-    movd     [r0], mm3                     ; store
+    movd   [dstq], mm3                     ; store
 
     ; go to next line
-    add        r0, r1
-    add        r2, r3
-    dec       r4d                          ; next row
+    add      dstq, dststrideq
+    add      srcq, srcstrideq
+    dec   heightd                          ; next row
     jg .nextrow
     REP_RET
 
 ; 4x4 block, H-only 6-tap filter
 INIT_MMX mmx2
-cglobal put_vp8_epel4_h6, 6, 6
-    lea       r5d, [r5*3]
+cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
+    lea       mxd, [mxq*3]
 %ifdef PIC
-    lea       r11, [sixtap_filter_hw_m]
+    lea   picregq, [sixtap_filter_hw_m]
 %endif
-    movq      mm4, [sixtap_filter_hw+r5*8-48] ; set up 4tap filter in words
-    movq      mm5, [sixtap_filter_hw+r5*8-32]
-    movq      mm6, [sixtap_filter_hw+r5*8-16]
+    movq      mm4, [sixtap_filter_hw+mxq*8-48] ; set up 6tap filter in words
+    movq      mm5, [sixtap_filter_hw+mxq*8-32]
+    movq      mm6, [sixtap_filter_hw+mxq*8-16]
     movq      mm7, [pw_64]
     pxor      mm3, mm3
 
 .nextrow
-    movq      mm1, [r2-2]                  ; (ABCDEFGH) load 8 horizontal pixels
+    movq      mm1, [srcq-2]                ; (ABCDEFGH) load 8 horizontal pixels
 
     ; first set of 2 pixels
     movq      mm2, mm1                     ; byte ABCD..
@@ -420,7 +422,7 @@ cglobal put_vp8_epel4_h6, 6, 6
     paddd     mm1, mm2                     ; finish 1st 2px
 
     ; second set of 2 pixels, use backup of above
-    movd      mm2, [r2+3]                  ; byte FGHI (prevent overreads)
+    movd      mm2, [srcq+3]                ; byte FGHI (prevent overreads)
     pmaddwd   mm0, mm4                     ; multiply 1st backed up 2px with F0/F1
     pmaddwd   mm3, mm5                     ; multiply 2nd backed up 2px with F2/F3
     paddd     mm0, mm3                     ; add to 2nd 2px cache
@@ -435,35 +437,35 @@ cglobal put_vp8_epel4_h6, 6, 6
     paddsw    mm1, mm7                     ; rounding
     psraw     mm1, 7
     packuswb  mm1, mm3                     ; clip and word->bytes
-    movd     [r0], mm1                     ; store
+    movd   [dstq], mm1                     ; store
 
     ; go to next line
-    add        r0, r1
-    add        r2, r3
-    dec       r4d                          ; next row
+    add      dstq, dststrideq
+    add      srcq, srcstrideq
+    dec   heightd                          ; next row
     jg .nextrow
     REP_RET
 
 INIT_XMM sse2
-cglobal put_vp8_epel8_h4, 6, 6, 10
-    shl      r5d, 5
+cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, height, mx, picreg
+    shl      mxd, 5
 %ifdef PIC
-    lea      r11, [fourtap_filter_v_m]
+    lea  picregq, [fourtap_filter_v_m]
 %endif
-    lea       r5, [fourtap_filter_v+r5-32]
+    lea      mxq, [fourtap_filter_v+mxq-32]
     pxor      m7, m7
     mova      m4, [pw_64]
-    mova      m5, [r5+ 0]
-    mova      m6, [r5+16]
+    mova      m5, [mxq+ 0]
+    mova      m6, [mxq+16]
 %ifdef m8
-    mova      m8, [r5+32]
-    mova      m9, [r5+48]
+    mova      m8, [mxq+32]
+    mova      m9, [mxq+48]
 %endif
 .nextrow
-    movq      m0, [r2-1]
-    movq      m1, [r2-0]
-    movq      m2, [r2+1]
-    movq      m3, [r2+2]
+    movq      m0, [srcq-1]
+    movq      m1, [srcq-0]
+    movq      m2, [srcq+1]
+    movq      m3, [srcq+2]
     punpcklbw m0, m7
     punpcklbw m1, m7
     punpcklbw m2, m7
@@ -474,8 +476,8 @@ cglobal put_vp8_epel8_h4, 6, 6, 10
     pmullw    m2, m8
     pmullw    m3, m9
 %else
-    pmullw    m2, [r5+32]
-    pmullw    m3, [r5+48]
+    pmullw    m2, [mxq+32]
+    pmullw    m3, [mxq+48]
 %endif
     paddsw    m0, m1
     paddsw    m2, m3
@@ -483,40 +485,40 @@ cglobal put_vp8_epel8_h4, 6, 6, 10
     paddsw    m0, m4
     psraw     m0, 7
     packuswb  m0, m7
-    movh    [r0], m0        ; store
+    movh  [dstq], m0        ; store
 
     ; go to next line
-    add       r0, r1
-    add       r2, r3
-    dec      r4d            ; next row
+    add     dstq, dststrideq
+    add     srcq, srcstrideq
+    dec  heightd            ; next row
     jg .nextrow
     REP_RET
 
 INIT_XMM sse2
-cglobal put_vp8_epel8_h6, 6, 6, 14
-    lea      r5d, [r5*3]
-    shl      r5d, 4
+cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, height, mx, picreg
+    lea      mxd, [mxq*3]
+    shl      mxd, 4
 %ifdef PIC
-    lea      r11, [sixtap_filter_v_m]
+    lea  picregq, [sixtap_filter_v_m]
 %endif
-    lea       r5, [sixtap_filter_v+r5-96]
+    lea      mxq, [sixtap_filter_v+mxq-96]
     pxor      m7, m7
     mova      m6, [pw_64]
 %ifdef m8
-    mova      m8, [r5+ 0]
-    mova      m9, [r5+16]
-    mova     m10, [r5+32]
-    mova     m11, [r5+48]
-    mova     m12, [r5+64]
-    mova     m13, [r5+80]
+    mova      m8, [mxq+ 0]
+    mova      m9, [mxq+16]
+    mova     m10, [mxq+32]
+    mova     m11, [mxq+48]
+    mova     m12, [mxq+64]
+    mova     m13, [mxq+80]
 %endif
 .nextrow
-    movq      m0, [r2-2]
-    movq      m1, [r2-1]
-    movq      m2, [r2-0]
-    movq      m3, [r2+1]
-    movq      m4, [r2+2]
-    movq      m5, [r2+3]
+    movq      m0, [srcq-2]
+    movq      m1, [srcq-1]
+    movq      m2, [srcq-0]
+    movq      m3, [srcq+1]
+    movq      m4, [srcq+2]
+    movq      m5, [srcq+3]
     punpcklbw m0, m7
     punpcklbw m1, m7
     punpcklbw m2, m7
@@ -531,12 +533,12 @@ cglobal put_vp8_epel8_h6, 6, 6, 14
     pmullw    m4, m12
     pmullw    m5, m13
 %else
-    pmullw    m0, [r5+ 0]
-    pmullw    m1, [r5+16]
-    pmullw    m2, [r5+32]
-    pmullw    m3, [r5+48]
-    pmullw    m4, [r5+64]
-    pmullw    m5, [r5+80]
+    pmullw    m0, [mxq+ 0]
+    pmullw    m1, [mxq+16]
+    pmullw    m2, [mxq+32]
+    pmullw    m3, [mxq+48]
+    pmullw    m4, [mxq+64]
+    pmullw    m5, [mxq+80]
 %endif
     paddsw    m1, m4
     paddsw    m0, m5
@@ -546,52 +548,52 @@ cglobal put_vp8_epel8_h6, 6, 6, 14
     paddsw    m0, m6
     psraw     m0, 7
     packuswb  m0, m7
-    movh    [r0], m0        ; store
+    movh  [dstq], m0        ; store
 
     ; go to next line
-    add       r0, r1
-    add       r2, r3
-    dec      r4d            ; next row
+    add     dstq, dststrideq
+    add     srcq, srcstrideq
+    dec  heightd            ; next row
     jg .nextrow
     REP_RET
 
 %macro FILTER_V 1
 ; 4x4 block, V-only 4-tap filter
-cglobal put_vp8_epel%1_v4, 7, 7, 8
-    shl      r6d, 5
+cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
+    shl      myd, 5
 %ifdef PIC
-    lea      r11, [fourtap_filter_v_m]
+    lea  picregq, [fourtap_filter_v_m]
 %endif
-    lea       r6, [fourtap_filter_v+r6-32]
+    lea      myq, [fourtap_filter_v+myq-32]
     mova      m6, [pw_64]
     pxor      m7, m7
-    mova      m5, [r6+48]
+    mova      m5, [myq+48]
 
     ; read 3 lines
-    sub       r2, r3
-    movh      m0, [r2]
-    movh      m1, [r2+  r3]
-    movh      m2, [r2+2*r3]
-    add       r2, r3
+    sub     srcq, srcstrideq
+    movh      m0, [srcq]
+    movh      m1, [srcq+  srcstrideq]
+    movh      m2, [srcq+2*srcstrideq]
+    add     srcq, srcstrideq
     punpcklbw m0, m7
     punpcklbw m1, m7
     punpcklbw m2, m7
 
 .nextrow
     ; first calculate negative taps (to prevent losing positive overflows)
-    movh      m4, [r2+2*r3]                ; read new row
+    movh      m4, [srcq+2*srcstrideq]      ; read new row
     punpcklbw m4, m7
     mova      m3, m4
-    pmullw    m0, [r6+0]
+    pmullw    m0, [myq+0]
     pmullw    m4, m5
     paddsw    m4, m0
 
     ; then calculate positive taps
     mova      m0, m1
-    pmullw    m1, [r6+16]
+    pmullw    m1, [myq+16]
     paddsw    m4, m1
     mova      m1, m2
-    pmullw    m2, [r6+32]
+    pmullw    m2, [myq+32]
     paddsw    m4, m2
     mova      m2, m3
 
@@ -599,36 +601,36 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8
     paddsw    m4, m6
     psraw     m4, 7
     packuswb  m4, m7
-    movh    [r0], m4
+    movh  [dstq], m4
 
     ; go to next line
-    add       r0, r1
-    add       r2, r3
-    dec      r4d                           ; next row
+    add     dstq, dststrideq
+    add     srcq, srcstrideq
+    dec  heightd                           ; next row
     jg .nextrow
     REP_RET
 
 
 ; 4x4 block, V-only 6-tap filter
-cglobal put_vp8_epel%1_v6, 7, 7, 8
-    shl      r6d, 4
-    lea       r6, [r6*3]
+cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
+    shl      myd, 4
+    lea      myq, [myq*3]
 %ifdef PIC
-    lea      r11, [sixtap_filter_v_m]
+    lea  picregq, [sixtap_filter_v_m]
 %endif
-    lea       r6, [sixtap_filter_v+r6-96]
+    lea      myq, [sixtap_filter_v+myq-96]
     pxor      m7, m7
 
     ; read 5 lines
-    sub       r2, r3
-    sub       r2, r3
-    movh      m0, [r2]
-    movh      m1, [r2+r3]
-    movh      m2, [r2+r3*2]
-    lea       r2, [r2+r3*2]
-    add       r2, r3
-    movh      m3, [r2]
-    movh      m4, [r2+r3]
+    sub     srcq, srcstrideq
+    sub     srcq, srcstrideq
+    movh      m0, [srcq]
+    movh      m1, [srcq+srcstrideq]
+    movh      m2, [srcq+srcstrideq*2]
+    lea     srcq, [srcq+srcstrideq*2]
+    add     srcq, srcstrideq
+    movh      m3, [srcq]
+    movh      m4, [srcq+srcstrideq]
     punpcklbw m0, m7
     punpcklbw m1, m7
     punpcklbw m2, m7
@@ -638,38 +640,38 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8
 .nextrow
     ; first calculate negative taps (to prevent losing positive overflows)
     mova      m5, m1
-    pmullw    m5, [r6+16]
+    pmullw    m5, [myq+16]
     mova      m6, m4
-    pmullw    m6, [r6+64]
+    pmullw    m6, [myq+64]
     paddsw    m6, m5
 
     ; then calculate positive taps
-    movh      m5, [r2+2*r3]                ; read new row
+    movh      m5, [srcq+2*srcstrideq]      ; read new row
     punpcklbw m5, m7
-    pmullw    m0, [r6+0]
+    pmullw    m0, [myq+0]
     paddsw    m6, m0
     mova      m0, m1
     mova      m1, m2
-    pmullw    m2, [r6+32]
+    pmullw    m2, [myq+32]
     paddsw    m6, m2
     mova      m2, m3
-    pmullw    m3, [r6+48]
+    pmullw    m3, [myq+48]
     paddsw    m6, m3
     mova      m3, m4
     mova      m4, m5
-    pmullw    m5, [r6+80]
+    pmullw    m5, [myq+80]
     paddsw    m6, m5
 
     ; round/clip/store
     paddsw    m6, [pw_64]
     psraw     m6, 7
     packuswb  m6, m7
-    movh    [r0], m6
+    movh  [dstq], m6
 
     ; go to next line
-    add       r0, r1
-    add       r2, r3
-    dec      r4d                           ; next row
+    add     dstq, dststrideq
+    add     srcq, srcstrideq
+    dec  heightd                           ; next row
     jg .nextrow
     REP_RET
 %endmacro
@@ -680,20 +682,19 @@ INIT_XMM sse2
 FILTER_V 8
 
 %macro FILTER_BILINEAR 1
-cglobal put_vp8_bilinear%1_v, 7, 7, 7
-    mov      r5d, 8*16
-    shl      r6d, 4
-    sub      r5d, r6d
+cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, picreg, my
+    shl      myd, 4
 %ifdef PIC
-    lea      r11, [bilinear_filter_vw_m]
+    lea  picregq, [bilinear_filter_vw_m]
 %endif
     pxor      m6, m6
-    mova      m4, [bilinear_filter_vw+r5-16]
-    mova      m5, [bilinear_filter_vw+r6-16]
+    mova      m5, [bilinear_filter_vw+myq-1*16]  ; table entry my-1 (my was scaled by 16 above)
+    neg      myq                                 ; -16*my, negated in place instead of using a second register
+    mova      m4, [bilinear_filter_vw+myq+7*16]  ; table entry (8-my)-1, same address as the old 8*16-16*my-16
 .nextrow
-    movh      m0, [r2+r3*0]
-    movh      m1, [r2+r3*1]
-    movh      m3, [r2+r3*2]
+    movh      m0, [srcq+srcstrideq*0]
+    movh      m1, [srcq+srcstrideq*1]
+    movh      m3, [srcq+srcstrideq*2]
     punpcklbw m0, m6
     punpcklbw m1, m6
     punpcklbw m3, m6
@@ -711,35 +712,34 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 7
 %if mmsize == 8
     packuswb  m0, m0
     packuswb  m2, m2
-    movh [r0+r1*0], m0
-    movh [r0+r1*1], m2
+    movh   [dstq+dststrideq*0], m0
+    movh   [dstq+dststrideq*1], m2
 %else
     packuswb  m0, m2
-    movh   [r0+r1*0], m0
-    movhps [r0+r1*1], m0
+    movh   [dstq+dststrideq*0], m0
+    movhps [dstq+dststrideq*1], m0
 %endif
 
-    lea       r0, [r0+r1*2]
-    lea       r2, [r2+r3*2]
-    sub      r4d, 2
+    lea     dstq, [dstq+dststrideq*2]
+    lea     srcq, [srcq+srcstrideq*2]
+    sub  heightd, 2
     jg .nextrow
     REP_RET
 
-cglobal put_vp8_bilinear%1_h, 7, 7, 7
-    mov      r6d, 8*16
-    shl      r5d, 4
-    sub      r6d, r5d
+cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
+    shl      mxd, 4
 %ifdef PIC
-    lea      r11, [bilinear_filter_vw_m]
+    lea  picregq, [bilinear_filter_vw_m]
 %endif
     pxor      m6, m6
-    mova      m4, [bilinear_filter_vw+r6-16]
-    mova      m5, [bilinear_filter_vw+r5-16]
+    mova      m5, [bilinear_filter_vw+mxq-1*16]  ; table entry mx-1 (mx was scaled by 16 above)
+    neg      mxq                                 ; -16*mx, negated in place instead of using a second register
+    mova      m4, [bilinear_filter_vw+mxq+7*16]  ; table entry (8-mx)-1, same address as the old 8*16-16*mx-16
 .nextrow
-    movh      m0, [r2+r3*0+0]
-    movh      m1, [r2+r3*0+1]
-    movh      m2, [r2+r3*1+0]
-    movh      m3, [r2+r3*1+1]
+    movh      m0, [srcq+srcstrideq*0+0]
+    movh      m1, [srcq+srcstrideq*0+1]
+    movh      m2, [srcq+srcstrideq*1+0]
+    movh      m3, [srcq+srcstrideq*1+1]
     punpcklbw m0, m6
     punpcklbw m1, m6
     punpcklbw m2, m6
@@ -757,17 +757,17 @@ cglobal put_vp8_bilinear%1_h, 7, 7, 7
 %if mmsize == 8
     packuswb  m0, m0
     packuswb  m2, m2
-    movh [r0+r1*0], m0
-    movh [r0+r1*1], m2
+    movh   [dstq+dststrideq*0], m0
+    movh   [dstq+dststrideq*1], m2
 %else
     packuswb  m0, m2
-    movh   [r0+r1*0], m0
-    movhps [r0+r1*1], m0
+    movh   [dstq+dststrideq*0], m0
+    movhps [dstq+dststrideq*1], m0
 %endif
 
-    lea       r0, [r0+r1*2]
-    lea       r2, [r2+r3*2]
-    sub      r4d, 2
+    lea     dstq, [dstq+dststrideq*2]
+    lea     srcq, [srcq+srcstrideq*2]
+    sub  heightd, 2
     jg .nextrow
     REP_RET
 %endmacro
@@ -778,17 +778,17 @@ INIT_XMM sse2
 FILTER_BILINEAR 8
 
 %macro FILTER_BILINEAR_SSSE3 1
-cglobal put_vp8_bilinear%1_v, 7, 7, 5
-    shl      r6d, 4
+cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, picreg, my
+    shl      myd, 4
 %ifdef PIC
-    lea      r11, [bilinear_filter_vb_m]
+    lea  picregq, [bilinear_filter_vb_m]
 %endif
     pxor      m4, m4
-    mova      m3, [bilinear_filter_vb+r6-16]
+    mova      m3, [bilinear_filter_vb+myq-16]
 .nextrow
-    movh      m0, [r2+r3*0]
-    movh      m1, [r2+r3*1]
-    movh      m2, [r2+r3*2]
+    movh      m0, [srcq+srcstrideq*0]
+    movh      m1, [srcq+srcstrideq*1]
+    movh      m2, [srcq+srcstrideq*2]
     punpcklbw m0, m1
     punpcklbw m1, m2
     pmaddubsw m0, m3
@@ -800,31 +800,31 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 5
 %if mmsize==8
     packuswb  m0, m0
     packuswb  m1, m1
-    movh [r0+r1*0], m0
-    movh [r0+r1*1], m1
+    movh   [dstq+dststrideq*0], m0
+    movh   [dstq+dststrideq*1], m1
 %else
     packuswb  m0, m1
-    movh   [r0+r1*0], m0
-    movhps [r0+r1*1], m0
+    movh   [dstq+dststrideq*0], m0
+    movhps [dstq+dststrideq*1], m0
 %endif
 
-    lea       r0, [r0+r1*2]
-    lea       r2, [r2+r3*2]
-    sub      r4d, 2
+    lea     dstq, [dstq+dststrideq*2]
+    lea     srcq, [srcq+srcstrideq*2]
+    sub  heightd, 2
     jg .nextrow
     REP_RET
 
-cglobal put_vp8_bilinear%1_h, 7, 7, 5
-    shl      r5d, 4
+cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride, height, mx, picreg
+    shl      mxd, 4
 %ifdef PIC
-    lea      r11, [bilinear_filter_vb_m]
+    lea  picregq, [bilinear_filter_vb_m]
 %endif
     pxor      m4, m4
     mova      m2, [filter_h2_shuf]
-    mova      m3, [bilinear_filter_vb+r5-16]
+    mova      m3, [bilinear_filter_vb+mxq-16]
 .nextrow
-    movu      m0, [r2+r3*0]
-    movu      m1, [r2+r3*1]
+    movu      m0, [srcq+srcstrideq*0]
+    movu      m1, [srcq+srcstrideq*1]
     pshufb    m0, m2
     pshufb    m1, m2
     pmaddubsw m0, m3
@@ -836,17 +836,17 @@ cglobal put_vp8_bilinear%1_h, 7, 7, 5
 %if mmsize==8
     packuswb  m0, m0
     packuswb  m1, m1
-    movh [r0+r1*0], m0
-    movh [r0+r1*1], m1
+    movh   [dstq+dststrideq*0], m0
+    movh   [dstq+dststrideq*1], m1
 %else
     packuswb  m0, m1
-    movh   [r0+r1*0], m0
-    movhps [r0+r1*1], m0
+    movh   [dstq+dststrideq*0], m0
+    movhps [dstq+dststrideq*1], m0
 %endif
 
-    lea       r0, [r0+r1*2]
-    lea       r2, [r2+r3*2]
-    sub      r4d, 2
+    lea     dstq, [dstq+dststrideq*2]
+    lea     srcq, [srcq+srcstrideq*2]
+    sub  heightd, 2
     jg .nextrow
     REP_RET
 %endmacro
@@ -857,47 +857,47 @@ INIT_XMM ssse3
 FILTER_BILINEAR_SSSE3 8
 
 INIT_MMX mmx
-cglobal put_vp8_pixels8, 5,5
+cglobal put_vp8_pixels8, 5, 5, 0, dst, dststride, src, srcstride, height
 .nextrow:
-    movq  mm0, [r2+r3*0]
-    movq  mm1, [r2+r3*1]
-    lea    r2, [r2+r3*2]
-    movq [r0+r1*0], mm0
-    movq [r0+r1*1], mm1
-    lea    r0, [r0+r1*2]
-    sub   r4d, 2
+    movq    mm0, [srcq+srcstrideq*0]
+    movq    mm1, [srcq+srcstrideq*1]
+    lea    srcq, [srcq+srcstrideq*2]
+    movq [dstq+dststrideq*0], mm0
+    movq [dstq+dststrideq*1], mm1
+    lea    dstq, [dstq+dststrideq*2]
+    sub heightd, 2
     jg .nextrow
     REP_RET
 
 %if ARCH_X86_32
 INIT_MMX mmx
-cglobal put_vp8_pixels16, 5,5
+cglobal put_vp8_pixels16, 5, 5, 0, dst, dststride, src, srcstride, height
 .nextrow:
-    movq  mm0, [r2+r3*0+0]
-    movq  mm1, [r2+r3*0+8]
-    movq  mm2, [r2+r3*1+0]
-    movq  mm3, [r2+r3*1+8]
-    lea    r2, [r2+r3*2]
-    movq [r0+r1*0+0], mm0
-    movq [r0+r1*0+8], mm1
-    movq [r0+r1*1+0], mm2
-    movq [r0+r1*1+8], mm3
-    lea    r0, [r0+r1*2]
-    sub   r4d, 2
+    movq    mm0, [srcq+srcstrideq*0+0]
+    movq    mm1, [srcq+srcstrideq*0+8]
+    movq    mm2, [srcq+srcstrideq*1+0]
+    movq    mm3, [srcq+srcstrideq*1+8]
+    lea    srcq, [srcq+srcstrideq*2]
+    movq [dstq+dststrideq*0+0], mm0
+    movq [dstq+dststrideq*0+8], mm1
+    movq [dstq+dststrideq*1+0], mm2
+    movq [dstq+dststrideq*1+8], mm3
+    lea    dstq, [dstq+dststrideq*2]
+    sub heightd, 2
     jg .nextrow
     REP_RET
 %endif
 
 INIT_XMM sse
-cglobal put_vp8_pixels16, 5,5,2
+cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
 .nextrow:
-    movups xmm0, [r2+r3*0]
-    movups xmm1, [r2+r3*1]
-    lea     r2, [r2+r3*2]
-    movaps [r0+r1*0], xmm0
-    movaps [r0+r1*1], xmm1
-    lea     r0, [r0+r1*2]
-    sub    r4d, 2
+    movups xmm0, [srcq+srcstrideq*0]
+    movups xmm1, [srcq+srcstrideq*1]
+    lea    srcq, [srcq+srcstrideq*2]
+    movaps [dstq+dststrideq*0], xmm0
+    movaps [dstq+dststrideq*1], xmm1
+    lea    dstq, [dstq+dststrideq*2]
+    sub heightd, 2
     jg .nextrow
     REP_RET
 

From 8476ca3b4ee5455eceb6ea6e58ea8200d63dff9d Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Sat, 3 Mar 2012 08:04:40 -0800
Subject: [PATCH 13/27] vp8: convert idct x86 assembly to use named arguments.

---
 libavcodec/x86/vp8dsp.asm | 200 ++++++++++++++++++++------------------
 1 file changed, 103 insertions(+), 97 deletions(-)

diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 2f45f5a478..74456c687e 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -906,10 +906,10 @@ cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
 ;-----------------------------------------------------------------------------
 
 %macro ADD_DC 4
-    %4        m2, [r0+%3]
-    %4        m3, [r0+r2+%3]
-    %4        m4, [r1+%3]
-    %4        m5, [r1+r2+%3]
+    %4        m2, [dst1q+%3]
+    %4        m3, [dst1q+strideq+%3]
+    %4        m4, [dst2q+%3]
+    %4        m5, [dst2q+strideq+%3]
     paddusb   m2, %1
     paddusb   m3, %1
     paddusb   m4, %1
@@ -918,22 +918,22 @@ cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
     psubusb   m3, %2
     psubusb   m4, %2
     psubusb   m5, %2
-    %4    [r0+%3], m2
-    %4 [r0+r2+%3], m3
-    %4    [r1+%3], m4
-    %4 [r1+r2+%3], m5
+    %4 [dst1q+%3], m2
+    %4 [dst1q+strideq+%3], m3
+    %4 [dst2q+%3], m4
+    %4 [dst2q+strideq+%3], m5
 %endmacro
 
 INIT_MMX mmx
-cglobal vp8_idct_dc_add, 3, 3
+cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride
     ; load data
-    movd       m0, [r1]
+    movd       m0, [blockq]
 
     ; calculate DC
     paddw      m0, [pw_4]
     pxor       m1, m1
     psraw      m0, 3
-    movd      [r1], m1
+    movd [blockq], m1
     psubw      m1, m0
     packuswb   m0, m0
     packuswb   m1, m1
@@ -943,24 +943,26 @@ cglobal vp8_idct_dc_add, 3, 3
     punpcklwd  m1, m1
 
     ; add DC
-    lea        r1, [r0+r2*2]
+    DEFINE_ARGS dst1, dst2, stride
+    lea     dst2q, [dst1q+strideq*2]     ; dst2 = dst + 2*stride, held in block's old register (r1)
     ADD_DC     m0, m1, 0, movh
     RET
 
 INIT_XMM sse4
-cglobal vp8_idct_dc_add, 3, 3, 6
+cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride
     ; load data
-    movd       m0, [r1]
+    movd       m0, [blockq]
     pxor       m1, m1
 
     ; calculate DC
     paddw      m0, [pw_4]
-    movd     [r1], m1
-    lea        r1, [r0+r2*2]
-    movd       m2, [r0]
-    movd       m3, [r0+r2]
-    movd       m4, [r1]
-    movd       m5, [r1+r2]
+    movd [blockq], m1
+    DEFINE_ARGS dst1, dst2, stride
+    lea     dst2q, [dst1q+strideq*2]     ; dst2 = dst + 2*stride, held in block's old register (r1)
+    movd       m2, [dst1q]
+    movd       m3, [dst1q+strideq]
+    movd       m4, [dst2q]
+    movd       m5, [dst2q+strideq]
     psraw      m0, 3
     pshuflw    m0, m0, 0
     punpcklqdq m0, m0
@@ -971,10 +973,10 @@ cglobal vp8_idct_dc_add, 3, 3, 6
     paddw      m2, m0
     paddw      m4, m0
     packuswb   m2, m4
-    movd      [r0], m2
-    pextrd [r0+r2], m2, 1
-    pextrd    [r1], m2, 2
-    pextrd [r1+r2], m2, 3
+    movd   [dst1q], m2
+    pextrd [dst1q+strideq], m2, 1
+    pextrd [dst2q], m2, 2
+    pextrd [dst2q+strideq], m2, 3
     RET
 
 ;-----------------------------------------------------------------------------
@@ -983,21 +985,21 @@ cglobal vp8_idct_dc_add, 3, 3, 6
 
 %if ARCH_X86_32
 INIT_MMX mmx
-cglobal vp8_idct_dc_add4y, 3, 3
+cglobal vp8_idct_dc_add4y, 3, 3, 0, dst, block, stride
     ; load data
-    movd      m0, [r1+32*0] ; A
-    movd      m1, [r1+32*2] ; C
-    punpcklwd m0, [r1+32*1] ; A B
-    punpcklwd m1, [r1+32*3] ; C D
+    movd      m0, [blockq+32*0] ; A
+    movd      m1, [blockq+32*2] ; C
+    punpcklwd m0, [blockq+32*1] ; A B
+    punpcklwd m1, [blockq+32*3] ; C D
     punpckldq m0, m1        ; A B C D
     pxor      m6, m6
 
     ; calculate DC
     paddw     m0, [pw_4]
-    movd [r1+32*0], m6
-    movd [r1+32*1], m6
-    movd [r1+32*2], m6
-    movd [r1+32*3], m6
+    movd [blockq+32*0], m6
+    movd [blockq+32*1], m6
+    movd [blockq+32*2], m6
+    movd [blockq+32*3], m6
     psraw     m0, 3
     psubw     m6, m0
     packuswb  m0, m0
@@ -1012,28 +1014,29 @@ cglobal vp8_idct_dc_add4y, 3, 3
     punpckhbw m7, m7 ; CCCCDDDD
 
     ; add DC
-    lea       r1, [r0+r2*2]
+    DEFINE_ARGS dst1, dst2, stride
+    lea    dst2q, [dst1q+strideq*2]      ; dst2 = dst + 2*stride, held in block's old register (r1)
     ADD_DC    m0, m6, 0, mova
     ADD_DC    m1, m7, 8, mova
     RET
 %endif
 
 INIT_XMM sse2
-cglobal vp8_idct_dc_add4y, 3, 3, 6
+cglobal vp8_idct_dc_add4y, 3, 3, 6, dst, block, stride
     ; load data
-    movd      m0, [r1+32*0] ; A
-    movd      m1, [r1+32*2] ; C
-    punpcklwd m0, [r1+32*1] ; A B
-    punpcklwd m1, [r1+32*3] ; C D
+    movd      m0, [blockq+32*0] ; A
+    movd      m1, [blockq+32*2] ; C
+    punpcklwd m0, [blockq+32*1] ; A B
+    punpcklwd m1, [blockq+32*3] ; C D
     punpckldq m0, m1        ; A B C D
     pxor      m1, m1
 
     ; calculate DC
     paddw     m0, [pw_4]
-    movd [r1+32*0], m1
-    movd [r1+32*1], m1
-    movd [r1+32*2], m1
-    movd [r1+32*3], m1
+    movd [blockq+32*0], m1
+    movd [blockq+32*1], m1
+    movd [blockq+32*2], m1
+    movd [blockq+32*3], m1
     psraw     m0, 3
     psubw     m1, m0
     packuswb  m0, m0
@@ -1044,7 +1047,8 @@ cglobal vp8_idct_dc_add4y, 3, 3, 6
     punpcklbw m1, m1
 
     ; add DC
-    lea       r1, [r0+r2*2]
+    DEFINE_ARGS dst1, dst2, stride
+    lea    dst2q, [dst1q+strideq*2]      ; dst2 = dst + 2*stride, held in block's old register (r1)
     ADD_DC    m0, m1, 0, mova
     RET
 
@@ -1053,21 +1057,21 @@ cglobal vp8_idct_dc_add4y, 3, 3, 6
 ;-----------------------------------------------------------------------------
 
 INIT_MMX mmx
-cglobal vp8_idct_dc_add4uv, 3, 3
+cglobal vp8_idct_dc_add4uv, 3, 3, 0, dst, block, stride
     ; load data
-    movd      m0, [r1+32*0] ; A
-    movd      m1, [r1+32*2] ; C
-    punpcklwd m0, [r1+32*1] ; A B
-    punpcklwd m1, [r1+32*3] ; C D
+    movd      m0, [blockq+32*0] ; A
+    movd      m1, [blockq+32*2] ; C
+    punpcklwd m0, [blockq+32*1] ; A B
+    punpcklwd m1, [blockq+32*3] ; C D
     punpckldq m0, m1        ; A B C D
     pxor      m6, m6
 
     ; calculate DC
     paddw     m0, [pw_4]
-    movd [r1+32*0], m6
-    movd [r1+32*1], m6
-    movd [r1+32*2], m6
-    movd [r1+32*3], m6
+    movd [blockq+32*0], m6
+    movd [blockq+32*1], m6
+    movd [blockq+32*2], m6
+    movd [blockq+32*3], m6
     psraw     m0, 3
     psubw     m6, m0
     packuswb  m0, m0
@@ -1082,10 +1086,11 @@ cglobal vp8_idct_dc_add4uv, 3, 3
     punpckhbw m7, m7 ; CCCCDDDD
 
     ; add DC
-    lea       r1, [r0+r2*2]
+    DEFINE_ARGS dst1, dst2, stride
+    lea    dst2q, [dst1q+strideq*2]      ; dst2 = dst + 2*stride, held in block's old register (r1)
     ADD_DC    m0, m6, 0, mova
-    lea       r0, [r0+r2*4]
-    lea       r1, [r1+r2*4]
+    lea    dst1q, [dst1q+strideq*4]
+    lea    dst2q, [dst2q+strideq*4]
     ADD_DC    m1, m7, 0, mova
     RET
 
@@ -1125,24 +1130,24 @@ cglobal vp8_idct_dc_add4uv, 3, 3
 %endmacro
 
 %macro VP8_IDCT_ADD 0
-cglobal vp8_idct_add, 3, 3
+cglobal vp8_idct_add, 3, 3, 0, dst, block, stride
     ; load block data
-    movq         m0, [r1+ 0]
-    movq         m1, [r1+ 8]
-    movq         m2, [r1+16]
-    movq         m3, [r1+24]
+    movq         m0, [blockq+ 0]
+    movq         m1, [blockq+ 8]
+    movq         m2, [blockq+16]
+    movq         m3, [blockq+24]
     movq         m6, [pw_20091]
     movq         m7, [pw_17734]
 %if cpuflag(sse)
     xorps      xmm0, xmm0
-    movaps  [r1+ 0], xmm0
-    movaps  [r1+16], xmm0
+    movaps [blockq+ 0], xmm0
+    movaps [blockq+16], xmm0
 %else
     pxor         m4, m4
-    movq    [r1+ 0], m4
-    movq    [r1+ 8], m4
-    movq    [r1+16], m4
-    movq    [r1+24], m4
+    movq [blockq+ 0], m4
+    movq [blockq+ 8], m4
+    movq [blockq+16], m4
+    movq [blockq+24], m4
 %endif
 
     ; actual IDCT
@@ -1154,9 +1159,10 @@ cglobal vp8_idct_add, 3, 3
 
     ; store
     pxor         m4, m4
-    lea          r1, [r0+2*r2]
-    STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2
-    STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2
+    DEFINE_ARGS dst1, dst2, stride
+    lea       dst2q, [dst1q+2*strideq]   ; dst2 = dst + 2*stride, held in block's old register (r1)
+    STORE_DIFFx2 m0, m1, m6, m7, m4, 3, dst1q, strideq
+    STORE_DIFFx2 m2, m3, m6, m7, m4, 3, dst2q, strideq
 
     RET
 %endmacro
@@ -1173,24 +1179,24 @@ VP8_IDCT_ADD
 ;-----------------------------------------------------------------------------
 
 %macro SCATTER_WHT 3
-    movd  r1d, m%1
-    movd  r2d, m%2
-    mov [r0+2*16*(0+%3)], r1w
-    mov [r0+2*16*(1+%3)], r2w
-    shr   r1d, 16
-    shr   r2d, 16
+    movd dc1d, m%1
+    movd dc2d, m%2
+    mov [blockq+2*16*(0+%3)], dc1w
+    mov [blockq+2*16*(1+%3)], dc2w
+    shr  dc1d, 16
+    shr  dc2d, 16
     psrlq m%1, 32
     psrlq m%2, 32
-    mov [r0+2*16*(4+%3)], r1w
-    mov [r0+2*16*(5+%3)], r2w
-    movd  r1d, m%1
-    movd  r2d, m%2
-    mov [r0+2*16*(8+%3)], r1w
-    mov [r0+2*16*(9+%3)], r2w
-    shr   r1d, 16
-    shr   r2d, 16
-    mov [r0+2*16*(12+%3)], r1w
-    mov [r0+2*16*(13+%3)], r2w
+    mov [blockq+2*16*(4+%3)], dc1w
+    mov [blockq+2*16*(5+%3)], dc2w
+    movd dc1d, m%1
+    movd dc2d, m%2
+    mov [blockq+2*16*(8+%3)], dc1w
+    mov [blockq+2*16*(9+%3)], dc2w
+    shr  dc1d, 16
+    shr  dc2d, 16
+    mov [blockq+2*16*(12+%3)], dc1w
+    mov [blockq+2*16*(13+%3)], dc2w
 %endmacro
 
 %macro HADAMARD4_1D 4
@@ -1200,21 +1206,21 @@ VP8_IDCT_ADD
 %endmacro
 
 %macro VP8_DC_WHT 0
-cglobal vp8_luma_dc_wht, 2, 3
-    movq          m0, [r1]
-    movq          m1, [r1+8]
-    movq          m2, [r1+16]
-    movq          m3, [r1+24]
+cglobal vp8_luma_dc_wht, 2, 3, 0, block, dc1, dc2
+    movq          m0, [dc1q]             ; r1 is the source pointer here (read, then zeroed below); SCATTER_WHT reuses the dc1/dc2 names as GPR scratch - presumably invoked further down, confirm
+    movq          m1, [dc1q+8]
+    movq          m2, [dc1q+16]
+    movq          m3, [dc1q+24]
 %if cpuflag(sse)
     xorps      xmm0, xmm0
-    movaps  [r1+ 0], xmm0
-    movaps  [r1+16], xmm0
+    movaps [dc1q+ 0], xmm0
+    movaps [dc1q+16], xmm0
 %else
     pxor         m4, m4
-    movq    [r1+ 0], m4
-    movq    [r1+ 8], m4
-    movq    [r1+16], m4
-    movq    [r1+24], m4
+    movq  [dc1q+ 0], m4
+    movq  [dc1q+ 8], m4
+    movq  [dc1q+16], m4
+    movq  [dc1q+24], m4
 %endif
     HADAMARD4_1D  0, 1, 2, 3
     TRANSPOSE4x4W 0, 1, 2, 3, 4

From b4188f0d4688477d1f72914105a485558d86662b Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Sat, 3 Mar 2012 12:55:34 -0800
Subject: [PATCH 14/27] vp8: convert simple loopfilter x86 assembly to use
 named arguments.

---
 libavcodec/x86/vp8dsp.asm | 55 +++++++++++++++++++++------------------
 1 file changed, 30 insertions(+), 25 deletions(-)

diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 74456c687e..8e13560b9e 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -1489,20 +1489,25 @@ VP8_DC_WHT
 %endmacro
 
 %macro SIMPLE_LOOPFILTER 2
-cglobal vp8_%1_loop_filter_simple, 3, %2, 8
+cglobal vp8_%1_loop_filter_simple, 3, %2, 8, dst, stride, flim, cntr
 %if mmsize == 8 ; mmx/mmxext
-    mov            r3, 2
+    mov         cntrq, 2
 %endif
 %if cpuflag(ssse3)
     pxor           m0, m0
 %endif
-    SPLATB_REG     m7, r2, m0       ; splat "flim" into register
+    SPLATB_REG     m7, flim, m0     ; splat "flim" into register
 
     ; set up indexes to address 4 rows
-    mov            r2, r1
-    neg            r1
+%if mmsize == 8
+    DEFINE_ARGS dst1, mstride, stride, cntr, dst2
+%else
+    DEFINE_ARGS dst1, mstride, stride, dst3, dst2
+%endif
+    mov       strideq, mstrideq          ; r1 still holds +stride here: copy it into r2 (flim was already splatted into m7)
+    neg      mstrideq                    ; from here on r1 really is -stride
 %ifidn %1, h
-    lea            r0, [r0+4*r2-2]
+    lea         dst1q, [dst1q+4*strideq-2]
 %endif
 
 %if mmsize == 8 ; mmx / mmxext
@@ -1510,17 +1515,17 @@ cglobal vp8_%1_loop_filter_simple, 3, %2, 8
 %endif
 %ifidn %1, v
     ; read 4 half/full rows of pixels
-    mova           m0, [r0+r1*2]    ; p1
-    mova           m1, [r0+r1]      ; p0
-    mova           m2, [r0]         ; q0
-    mova           m3, [r0+r2]      ; q1
+    mova           m0, [dst1q+mstrideq*2]    ; p1
+    mova           m1, [dst1q+mstrideq]      ; p0
+    mova           m2, [dst1q]               ; q0
+    mova           m3, [dst1q+ strideq]      ; q1
 %else ; h
-    lea            r4, [r0+r2]
+    lea         dst2q, [dst1q+ strideq]
 
 %if mmsize == 8 ; mmx/mmxext
-    READ_8x4_INTERLEAVED  0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2
+    READ_8x4_INTERLEAVED  0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq
 %else ; sse2
-    READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2, r3
+    READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq, dst3q
 %endif
     TRANSPOSE4x4W         0, 1, 2, 3, 4
 %endif
@@ -1590,35 +1595,35 @@ cglobal vp8_%1_loop_filter_simple, 3, %2, 8
 
     ; store
 %ifidn %1, v
-    mova         [r0], m4
-    mova      [r0+r1], m6
+    mova      [dst1q], m4
+    mova [dst1q+mstrideq], m6
 %else ; h
-    inc           r0
+    inc        dst1q
     SBUTTERFLY    bw, 6, 4, 0
 
 %if mmsize == 16 ; sse2
 %if cpuflag(sse4)
-    inc            r4
+    inc         dst2q
 %endif
-    WRITE_8W       m6, r4, r0, r1, r2
-    lea            r4, [r3+r1+1]
+    WRITE_8W       m6, dst2q, dst1q, mstrideq, strideq
+    lea         dst2q, [dst3q+mstrideq+1]
 %if cpuflag(sse4)
-    inc            r3
+    inc         dst3q
 %endif
-    WRITE_8W       m4, r3, r4, r1, r2
+    WRITE_8W       m4, dst3q, dst2q, mstrideq, strideq
 %else ; mmx/mmxext
-    WRITE_2x4W     m6, m4, r4, r0, r1, r2
+    WRITE_2x4W     m6, m4, dst2q, dst1q, mstrideq, strideq
 %endif
 %endif
 
 %if mmsize == 8 ; mmx/mmxext
     ; next 8 pixels
 %ifidn %1, v
-    add            r0, 8            ; advance 8 cols = pixels
+    add         dst1q, 8            ; advance 8 cols = pixels
 %else ; h
-    lea            r0, [r0+r2*8-1]  ; advance 8 rows = lines
+    lea         dst1q, [dst1q+strideq*8-1]  ; advance 8 rows = lines
 %endif
-    dec            r3
+    dec         cntrq
     jg .next8px
     REP_RET
 %else ; sse2

From dccb2cd3f9d40198e319dd70cbe997b377541a10 Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Sat, 3 Mar 2012 14:15:32 -0800
Subject: [PATCH 15/27] swscale: make %rep unconditional.

Fixes pre-processing with latest versions of nasm.
---
 libswscale/x86/output.asm | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/libswscale/x86/output.asm b/libswscale/x86/output.asm
index 4f15ec98cd..68dbf51b02 100644
--- a/libswscale/x86/output.asm
+++ b/libswscale/x86/output.asm
@@ -131,8 +131,12 @@ cglobal yuv2planeX_%1, %3, 7, %2, filter, fltsize, src, dst, w, dither, offset
     ; pixels per iteration. In order to not have to keep track of where
     ; we are w.r.t. dithering, we unroll the mmx/8bit loop x2.
 %if %1 == 8
-%rep 16/mmsize
-%endif ; %1 == 8
+%assign %%repcnt 16/mmsize
+%else
+%assign %%repcnt 1
+%endif
+
+%rep %%repcnt
 
 %if %1 == 8
 %if ARCH_X86_32
@@ -226,10 +230,9 @@ cglobal yuv2planeX_%1, %3, 7, %2, filter, fltsize, src, dst, w, dither, offset
 
     add             r5,  mmsize/2
     sub             wd,  mmsize/2
-%if %1 == 8
+
 %assign %%i %%i+2
 %endrep
-%endif ; %1 == 8
     jg .pixelloop
 
 %if %1 == 8

From 3e9cd8b4b0b7b5cd5c1c2119da7b3e7d4c1fb86a Mon Sep 17 00:00:00 2001
From: Aneesh Dogra <lionaneesh@gmail.com>
Date: Sun, 4 Mar 2012 09:59:43 +0530
Subject: [PATCH 16/27] qpeg: Use bytestream2 functions to prevent buffer
 overreads.

Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
---
 libavcodec/qpeg.c | 87 ++++++++++++++++++++++++-----------------------
 1 file changed, 44 insertions(+), 43 deletions(-)

diff --git a/libavcodec/qpeg.c b/libavcodec/qpeg.c
index 0f1bcd7ac9..f8cbef37f2 100644
--- a/libavcodec/qpeg.c
+++ b/libavcodec/qpeg.c
@@ -25,16 +25,18 @@
  */
 
 #include "avcodec.h"
+#include "bytestream.h"
 
 typedef struct QpegContext{
     AVCodecContext *avctx;
     AVFrame pic;
     uint8_t *refdata;
     uint32_t pal[256];
+    GetByteContext buffer;      /* input reader state; replaces the raw src/size arguments of the decode helpers */
 } QpegContext;
 
-static void qpeg_decode_intra(const uint8_t *src, uint8_t *dst, int size,
-                            int stride, int width, int height)
+static void qpeg_decode_intra(QpegContext *qctx, uint8_t *dst,
+                              int stride, int width, int height)
 {
     int i;
     int code;
@@ -47,31 +49,26 @@ static void qpeg_decode_intra(const uint8_t *src, uint8_t *dst, int size,
     height--;
     dst = dst + height * stride;
 
-    while((size > 0) && (rows_to_go > 0)) {
-        code = *src++;
-        size--;
+    while ((bytestream2_get_bytes_left(&qctx->buffer) > 0) && (rows_to_go > 0)) {
+        code = bytestream2_get_byte(&qctx->buffer);
         run = copy = 0;
         if(code == 0xFC) /* end-of-picture code */
             break;
         if(code >= 0xF8) { /* very long run */
-            c0 = *src++;
-            c1 = *src++;
-            size -= 2;
+            c0 = bytestream2_get_byte(&qctx->buffer);
+            c1 = bytestream2_get_byte(&qctx->buffer);
             run = ((code & 0x7) << 16) + (c0 << 8) + c1 + 2;
         } else if (code >= 0xF0) { /* long run */
-            c0 = *src++;
-            size--;
+            c0 = bytestream2_get_byte(&qctx->buffer);
             run = ((code & 0xF) << 8) + c0 + 2;
         } else if (code >= 0xE0) { /* short run */
             run = (code & 0x1F) + 2;
         } else if (code >= 0xC0) { /* very long copy */
-            c0 = *src++;
-            c1 = *src++;
-            size -= 2;
+            c0 = bytestream2_get_byte(&qctx->buffer);
+            c1 = bytestream2_get_byte(&qctx->buffer);
             copy = ((code & 0x3F) << 16) + (c0 << 8) + c1 + 1;
         } else if (code >= 0x80) { /* long copy */
-            c0 = *src++;
-            size--;
+            c0 = bytestream2_get_byte(&qctx->buffer);
             copy = ((code & 0x7F) << 8) + c0 + 1;
         } else { /* short copy */
             copy = code + 1;
@@ -81,8 +78,7 @@ static void qpeg_decode_intra(const uint8_t *src, uint8_t *dst, int size,
         if(run) {
             int p;
 
-            p = *src++;
-            size--;
+            p = bytestream2_get_byte(&qctx->buffer);
             for(i = 0; i < run; i++) {
                 dst[filled++] = p;
                 if (filled >= width) {
@@ -94,9 +90,8 @@ static void qpeg_decode_intra(const uint8_t *src, uint8_t *dst, int size,
                 }
             }
         } else {
-            size -= copy;
             for(i = 0; i < copy; i++) {
-                dst[filled++] = *src++;
+                dst[filled++] = bytestream2_get_byte(&qctx->buffer);
                 if (filled >= width) {
                     filled = 0;
                     dst -= stride;
@@ -115,9 +110,10 @@ static const int qpeg_table_w[16] =
  { 0x00, 0x20, 0x18, 0x08, 0x18, 0x10, 0x20, 0x10, 0x08, 0x10, 0x20, 0x20, 0x08, 0x10, 0x18, 0x04};
 
 /* Decodes delta frames */
-static void qpeg_decode_inter(const uint8_t *src, uint8_t *dst, int size,
-                            int stride, int width, int height,
-                            int delta, const uint8_t *ctable, uint8_t *refdata)
+static void qpeg_decode_inter(QpegContext *qctx, uint8_t *dst,
+                              int stride, int width, int height,
+                              int delta, const uint8_t *ctable,
+                              uint8_t *refdata)
 {
     int i, j;
     int code;
@@ -132,9 +128,8 @@ static void qpeg_decode_inter(const uint8_t *src, uint8_t *dst, int size,
     height--;
     dst = dst + height * stride;
 
-    while((size > 0) && (height >= 0)) {
-        code = *src++;
-        size--;
+    while ((bytestream2_get_bytes_left(&qctx->buffer) > 0) && (height >= 0)) {
+        code = bytestream2_get_byte(&qctx->buffer);
 
         if(delta) {
             /* motion compensation */
@@ -151,8 +146,7 @@ static void qpeg_decode_inter(const uint8_t *src, uint8_t *dst, int size,
                     me_h = qpeg_table_h[me_idx];
 
                     /* extract motion vector */
-                    corr = *src++;
-                    size--;
+                    corr = bytestream2_get_byte(&qctx->buffer);
 
                     val = corr >> 4;
                     if(val > 7)
@@ -179,8 +173,7 @@ static void qpeg_decode_inter(const uint8_t *src, uint8_t *dst, int size,
                         }
                     }
                 }
-                code = *src++;
-                size--;
+                code = bytestream2_get_byte(&qctx->buffer);
             }
         }
 
@@ -190,8 +183,7 @@ static void qpeg_decode_inter(const uint8_t *src, uint8_t *dst, int size,
             int p;
 
             code &= 0x1F;
-            p = *src++;
-            size--;
+            p = bytestream2_get_byte(&qctx->buffer);
             for(i = 0; i <= code; i++) {
                 dst[filled++] = p;
                 if(filled >= width) {
@@ -204,14 +196,13 @@ static void qpeg_decode_inter(const uint8_t *src, uint8_t *dst, int size,
             code &= 0x1F;
 
             for(i = 0; i <= code; i++) {
-                dst[filled++] = *src++;
+                dst[filled++] = bytestream2_get_byte(&qctx->buffer);
                 if(filled >= width) {
                     filled = 0;
                     dst -= stride;
                     height--;
                 }
             }
-            size -= code + 1;
         } else if(code >= 0x80) { /* skip code: 0x80..0xBF */
             int skip;
 
@@ -219,9 +210,9 @@ static void qpeg_decode_inter(const uint8_t *src, uint8_t *dst, int size,
             /* codes 0x80 and 0x81 are actually escape codes,
                skip value minus constant is in the next byte */
             if(!code)
-                skip = (*src++) + 64;
+                skip = bytestream2_get_byte(&qctx->buffer) +  64;
             else if(code == 1)
-                skip = (*src++) + 320;
+                skip = bytestream2_get_byte(&qctx->buffer) + 320;
             else
                 skip = code;
             filled += skip;
@@ -234,8 +225,9 @@ static void qpeg_decode_inter(const uint8_t *src, uint8_t *dst, int size,
             }
         } else {
             /* zero code treated as one-pixel skip */
-            if(code)
+            if(code) {
                 dst[filled++] = ctable[code & 0x7F];
+            }
             else
                 filled++;
             if(filled >= width) {
@@ -251,25 +243,34 @@ static int decode_frame(AVCodecContext *avctx,
                         void *data, int *data_size,
                         AVPacket *avpkt)
 {
-    const uint8_t *buf = avpkt->data;
-    int buf_size = avpkt->size;
+    uint8_t ctable[128];
     QpegContext * const a = avctx->priv_data;
     AVFrame * const p = &a->pic;
     uint8_t* outdata;
     int delta;
     const uint8_t *pal = av_packet_get_side_data(avpkt, AV_PKT_DATA_PALETTE, NULL);
 
+    if (avpkt->size < 0x86) {
+        av_log(avctx, AV_LOG_ERROR, "Packet is too small\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    bytestream2_init(&a->buffer, avpkt->data, avpkt->size);
     p->reference = 3;
     if (avctx->reget_buffer(avctx, p) < 0) {
         av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
         return -1;
     }
     outdata = a->pic.data[0];
-    if(buf[0x85] == 0x10) {
-        qpeg_decode_intra(buf+0x86, outdata, buf_size - 0x86, a->pic.linesize[0], avctx->width, avctx->height);
+    bytestream2_skip(&a->buffer, 4);
+    bytestream2_get_buffer(&a->buffer, ctable, 128);
+    bytestream2_skip(&a->buffer, 1);
+
+    delta = bytestream2_get_byte(&a->buffer);
+    if(delta == 0x10) {
+        qpeg_decode_intra(a, outdata, a->pic.linesize[0], avctx->width, avctx->height);
     } else {
-        delta = buf[0x85];
-        qpeg_decode_inter(buf+0x86, outdata, buf_size - 0x86, a->pic.linesize[0], avctx->width, avctx->height, delta, buf + 4, a->refdata);
+        qpeg_decode_inter(a, outdata, a->pic.linesize[0], avctx->width, avctx->height, delta, ctable, a->refdata);
     }
 
     /* make the palette available on the way out */
@@ -282,7 +283,7 @@ static int decode_frame(AVCodecContext *avctx,
     *data_size = sizeof(AVFrame);
     *(AVFrame*)data = a->pic;
 
-    return buf_size;
+    return avpkt->size;
 }
 
 static av_cold int decode_init(AVCodecContext *avctx){

From 6c7a01621ce0633de7a2a2ebbc0a2ccabdda3248 Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Mon, 27 Feb 2012 23:32:23 -0500
Subject: [PATCH 17/27] nellymoserenc: use proper MDCT overlap delay

---
 libavcodec/nellymoserenc.c | 48 +++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 27 deletions(-)

diff --git a/libavcodec/nellymoserenc.c b/libavcodec/nellymoserenc.c
index e60b0b5afa..5848191aeb 100644
--- a/libavcodec/nellymoserenc.c
+++ b/libavcodec/nellymoserenc.c
@@ -52,13 +52,11 @@
 typedef struct NellyMoserEncodeContext {
     AVCodecContext  *avctx;
     int             last_frame;
-    int             bufsel;
-    int             have_saved;
     DSPContext      dsp;
     FFTContext      mdct_ctx;
     DECLARE_ALIGNED(32, float, mdct_out)[NELLY_SAMPLES];
     DECLARE_ALIGNED(32, float, in_buff)[NELLY_SAMPLES];
-    DECLARE_ALIGNED(32, float, buf)[2][3 * NELLY_BUF_LEN];     ///< sample buffer
+    DECLARE_ALIGNED(32, float, buf)[3 * NELLY_BUF_LEN];     ///< sample buffer
     float           (*opt )[NELLY_BANDS];
     uint8_t         (*path)[NELLY_BANDS];
 } NellyMoserEncodeContext;
@@ -115,16 +113,17 @@ static const uint8_t quant_lut_offset[8] = { 0, 0, 1, 4, 11, 32, 81, 230 };
 
 static void apply_mdct(NellyMoserEncodeContext *s)
 {
-    s->dsp.vector_fmul(s->in_buff, s->buf[s->bufsel], ff_sine_128, NELLY_BUF_LEN);
-    s->dsp.vector_fmul_reverse(s->in_buff + NELLY_BUF_LEN, s->buf[s->bufsel] + NELLY_BUF_LEN, ff_sine_128,
-                               NELLY_BUF_LEN);
+    float *in0 = s->buf;
+    float *in1 = s->buf + NELLY_BUF_LEN;
+    float *in2 = s->buf + 2 * NELLY_BUF_LEN;
+
+    s->dsp.vector_fmul        (s->in_buff,                 in0, ff_sine_128, NELLY_BUF_LEN);
+    s->dsp.vector_fmul_reverse(s->in_buff + NELLY_BUF_LEN, in1, ff_sine_128, NELLY_BUF_LEN);
     s->mdct_ctx.mdct_calc(&s->mdct_ctx, s->mdct_out, s->in_buff);
 
-    s->dsp.vector_fmul(s->buf[s->bufsel] + NELLY_BUF_LEN, s->buf[s->bufsel] + NELLY_BUF_LEN,
-                       ff_sine_128, NELLY_BUF_LEN);
-    s->dsp.vector_fmul_reverse(s->buf[s->bufsel] + 2 * NELLY_BUF_LEN, s->buf[1 - s->bufsel], ff_sine_128,
-                               NELLY_BUF_LEN);
-    s->mdct_ctx.mdct_calc(&s->mdct_ctx, s->mdct_out + NELLY_BUF_LEN, s->buf[s->bufsel] + NELLY_BUF_LEN);
+    s->dsp.vector_fmul        (s->in_buff,                 in1, ff_sine_128, NELLY_BUF_LEN);
+    s->dsp.vector_fmul_reverse(s->in_buff + NELLY_BUF_LEN, in2, ff_sine_128, NELLY_BUF_LEN);
+    s->mdct_ctx.mdct_calc(&s->mdct_ctx, s->mdct_out + NELLY_BUF_LEN, s->in_buff);
 }
 
 static av_cold int encode_end(AVCodecContext *avctx)
@@ -161,6 +160,7 @@ static av_cold int encode_init(AVCodecContext *avctx)
     }
 
     avctx->frame_size = NELLY_SAMPLES;
+    avctx->delay      = NELLY_BUF_LEN;
     s->avctx = avctx;
     if ((ret = ff_mdct_init(&s->mdct_ctx, 8, 0, 32768.0)) < 0)
         goto error;
@@ -369,32 +369,26 @@ static int encode_frame(AVCodecContext *avctx, uint8_t *frame, int buf_size, voi
 {
     NellyMoserEncodeContext *s = avctx->priv_data;
     const float *samples = data;
-    int i;
 
     if (s->last_frame)
         return 0;
 
+    memcpy(s->buf, s->buf + NELLY_SAMPLES, NELLY_BUF_LEN * sizeof(*s->buf));
     if (data) {
-        memcpy(s->buf[s->bufsel], samples, avctx->frame_size * sizeof(*samples));
-        for (i = avctx->frame_size; i < NELLY_SAMPLES; i++) {
-            s->buf[s->bufsel][i] = 0;
-        }
-        s->bufsel = 1 - s->bufsel;
-        if (!s->have_saved) {
-            s->have_saved = 1;
-            return 0;
+        memcpy(s->buf + NELLY_BUF_LEN, samples, avctx->frame_size * sizeof(*s->buf));
+        if (avctx->frame_size < NELLY_SAMPLES) {
+            memset(s->buf + NELLY_BUF_LEN + avctx->frame_size, 0,
+                   (NELLY_SAMPLES - avctx->frame_size) * sizeof(*s->buf));
+            if (avctx->frame_size >= NELLY_BUF_LEN)
+                s->last_frame = 1;
         }
     } else {
-        memset(s->buf[s->bufsel], 0, sizeof(s->buf[0][0]) * NELLY_BUF_LEN);
-        s->bufsel = 1 - s->bufsel;
+        memset(s->buf + NELLY_BUF_LEN, 0, NELLY_SAMPLES * sizeof(*s->buf));
         s->last_frame = 1;
     }
 
-    if (s->have_saved) {
-        encode_block(s, frame, buf_size);
-        return NELLY_BLOCK_LEN;
-    }
-    return 0;
+    encode_block(s, frame, buf_size);
+    return NELLY_BLOCK_LEN;
 }
 
 AVCodec ff_nellymoser_encoder = {

From 29e2c8531096a5fb67079551564b4ab3f9acd8a6 Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Mon, 27 Feb 2012 23:39:50 -0500
Subject: [PATCH 18/27] nellymoserenc: zero any leftover packet bytes

fixes writing of uninitialized packet data
---
 libavcodec/nellymoserenc.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libavcodec/nellymoserenc.c b/libavcodec/nellymoserenc.c
index 5848191aeb..c0b174675b 100644
--- a/libavcodec/nellymoserenc.c
+++ b/libavcodec/nellymoserenc.c
@@ -363,6 +363,7 @@ static void encode_block(NellyMoserEncodeContext *s, unsigned char *output, int
     }
 
     flush_put_bits(&pb);
+    memset(put_bits_ptr(&pb), 0, output + output_size - put_bits_ptr(&pb));
 }
 
 static int encode_frame(AVCodecContext *avctx, uint8_t *frame, int buf_size, void *data)

From b0350c1c30908dbe7901c4eb07663bb58e575902 Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Tue, 28 Feb 2012 01:02:28 -0500
Subject: [PATCH 19/27] ra144enc: fix end-of-stream handling

Use CODEC_CAP_DELAY and CODEC_CAP_SMALL_LAST_FRAME to properly pad and flush
the encoder at the end of encoding. This is needed in order to have all input
samples decoded.
---
 libavcodec/ra144.h    |  1 +
 libavcodec/ra144enc.c | 33 +++++++++++++++++++++++++--------
 2 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/libavcodec/ra144.h b/libavcodec/ra144.h
index f6475d45ff..189c73a716 100644
--- a/libavcodec/ra144.h
+++ b/libavcodec/ra144.h
@@ -36,6 +36,7 @@ typedef struct {
     AVCodecContext *avctx;
     AVFrame frame;
     LPCContext lpc_ctx;
+    int last_frame;
 
     unsigned int     old_energy;        ///< previous frame energy
 
diff --git a/libavcodec/ra144enc.c b/libavcodec/ra144enc.c
index ff8316912f..aec2d6d586 100644
--- a/libavcodec/ra144enc.c
+++ b/libavcodec/ra144enc.c
@@ -53,6 +53,7 @@ static av_cold int ra144_encode_init(AVCodecContext * avctx)
         return -1;
     }
     avctx->frame_size = NBLOCKS * BLOCKSIZE;
+    avctx->delay      = avctx->frame_size;
     avctx->bit_rate = 8000;
     ractx = avctx->priv_data;
     ractx->lpc_coef[0] = ractx->lpc_tables[0];
@@ -433,7 +434,7 @@ static int ra144_encode_frame(AVCodecContext *avctx, uint8_t *frame,
 {
     static const uint8_t sizes[LPC_ORDER] = {64, 32, 32, 16, 16, 8, 8, 8, 8, 4};
     static const uint8_t bit_sizes[LPC_ORDER] = {6, 5, 5, 4, 4, 3, 3, 3, 3, 2};
-    RA144Context *ractx;
+    RA144Context *ractx = avctx->priv_data;
     PutBitContext pb;
     int32_t lpc_data[NBLOCKS * BLOCKSIZE];
     int32_t lpc_coefs[LPC_ORDER][MAX_LPC_ORDER];
@@ -445,11 +446,13 @@ static int ra144_encode_frame(AVCodecContext *avctx, uint8_t *frame,
     int energy = 0;
     int i, idx;
 
+    if (ractx->last_frame)
+        return 0;
+
     if (buf_size < FRAMESIZE) {
         av_log(avctx, AV_LOG_ERROR, "output buffer too small\n");
         return 0;
     }
-    ractx = avctx->priv_data;
 
     /**
      * Since the LPC coefficients are calculated on a frame centered over the
@@ -462,11 +465,15 @@ static int ra144_encode_frame(AVCodecContext *avctx, uint8_t *frame,
         lpc_data[i] = ractx->curr_block[BLOCKSIZE + BLOCKSIZE / 2 + i];
         energy += (lpc_data[i] * lpc_data[i]) >> 4;
     }
-    for (i = 2 * BLOCKSIZE + BLOCKSIZE / 2; i < NBLOCKS * BLOCKSIZE; i++) {
-        lpc_data[i] = *((int16_t *)data + i - 2 * BLOCKSIZE - BLOCKSIZE / 2) >>
-                      2;
-        energy += (lpc_data[i] * lpc_data[i]) >> 4;
+    if (data) {
+        int j;
+        for (j = 0; j < avctx->frame_size && i < NBLOCKS * BLOCKSIZE; i++, j++) {
+            lpc_data[i] = samples[j] >> 2;
+            energy += (lpc_data[i] * lpc_data[i]) >> 4;
+        }
     }
+    if (i < NBLOCKS * BLOCKSIZE)
+        memset(&lpc_data[i], 0, (NBLOCKS * BLOCKSIZE - i) * sizeof(*lpc_data));
     energy = ff_energy_tab[quantize(ff_t_sqrt(energy >> 5) >> 10, ff_energy_tab,
                                     32)];
 
@@ -515,8 +522,17 @@ static int ra144_encode_frame(AVCodecContext *avctx, uint8_t *frame,
     ractx->old_energy = energy;
     ractx->lpc_refl_rms[1] = ractx->lpc_refl_rms[0];
     FFSWAP(unsigned int *, ractx->lpc_coef[0], ractx->lpc_coef[1]);
-    for (i = 0; i < NBLOCKS * BLOCKSIZE; i++)
-        ractx->curr_block[i] = samples[i] >> 2;
+
+    /* copy input samples to current block for processing in next call */
+    i = 0;
+    if (data) {
+        for (; i < avctx->frame_size; i++)
+            ractx->curr_block[i] = samples[i] >> 2;
+    } else
+        ractx->last_frame = 1;
+    memset(&ractx->curr_block[i], 0,
+           (NBLOCKS * BLOCKSIZE - i) * sizeof(*ractx->curr_block));
+
     return FRAMESIZE;
 }
 
@@ -529,6 +545,7 @@ AVCodec ff_ra_144_encoder = {
     .init           = ra144_encode_init,
     .encode         = ra144_encode_frame,
     .close          = ra144_encode_close,
+    .capabilities   = CODEC_CAP_DELAY | CODEC_CAP_SMALL_LAST_FRAME,
     .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
                                                      AV_SAMPLE_FMT_NONE },
     .long_name      = NULL_IF_CONFIG_SMALL("RealAudio 1.0 (14.4K)"),

From fe78470a8baf0198e9442a9eece24cc9c8462155 Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Sun, 4 Mar 2012 00:25:45 -0500
Subject: [PATCH 20/27] libopencore-amrnbenc: fix end-of-stream handling

Use CODEC_CAP_DELAY and CODEC_CAP_SMALL_LAST_FRAME to properly pad and flush
the encoder at the end of encoding. This is needed in order to have all input
samples decoded.
---
 libavcodec/libopencore-amr.c | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/libavcodec/libopencore-amr.c b/libavcodec/libopencore-amr.c
index ebbc0d90d7..9e2c5f160b 100644
--- a/libavcodec/libopencore-amr.c
+++ b/libavcodec/libopencore-amr.c
@@ -85,6 +85,7 @@ typedef struct AMRContext {
     int   enc_bitrate;
     int   enc_mode;
     int   enc_dtx;
+    int   enc_last_frame;
 } AMRContext;
 
 static const AVOption options[] = {
@@ -195,6 +196,7 @@ static av_cold int amr_nb_encode_init(AVCodecContext *avctx)
     }
 
     avctx->frame_size  = 160;
+    avctx->delay       =  50;
     avctx->coded_frame = avcodec_alloc_frame();
     if (!avctx->coded_frame)
         return AVERROR(ENOMEM);
@@ -227,17 +229,40 @@ static int amr_nb_encode_frame(AVCodecContext *avctx,
 {
     AMRContext *s = avctx->priv_data;
     int written;
+    int16_t *flush_buf = NULL;
+    const int16_t *samples = data;
 
     if (s->enc_bitrate != avctx->bit_rate) {
         s->enc_mode    = get_bitrate_mode(avctx->bit_rate, avctx);
         s->enc_bitrate = avctx->bit_rate;
     }
 
-    written = Encoder_Interface_Encode(s->enc_state, s->enc_mode, data,
+    if (data) {
+        if (avctx->frame_size < 160) {
+            flush_buf = av_mallocz(160 * sizeof(*flush_buf));
+            if (!flush_buf)
+                return AVERROR(ENOMEM);
+            memcpy(flush_buf, samples, avctx->frame_size * sizeof(*flush_buf));
+            samples = flush_buf;
+            if (avctx->frame_size < 110)
+                s->enc_last_frame = -1;
+        }
+    } else {
+        if (s->enc_last_frame < 0)
+            return 0;
+        flush_buf = av_mallocz(160 * sizeof(*flush_buf));
+        if (!flush_buf)
+            return AVERROR(ENOMEM);
+        samples = flush_buf;
+        s->enc_last_frame = -1;
+    }
+
+    written = Encoder_Interface_Encode(s->enc_state, s->enc_mode, samples,
                                        frame, 0);
     av_dlog(avctx, "amr_nb_encode_frame encoded %u bytes, bitrate %u, first byte was %#02x\n",
             written, s->enc_mode, frame[0]);
 
+    av_freep(&flush_buf);
     return written;
 }
 
@@ -249,6 +274,7 @@ AVCodec ff_libopencore_amrnb_encoder = {
     .init           = amr_nb_encode_init,
     .encode         = amr_nb_encode_frame,
     .close          = amr_nb_encode_close,
+    .capabilities   = CODEC_CAP_DELAY | CODEC_CAP_SMALL_LAST_FRAME,
     .sample_fmts = (const enum AVSampleFormat[]){AV_SAMPLE_FMT_S16,AV_SAMPLE_FMT_NONE},
     .long_name = NULL_IF_CONFIG_SMALL("OpenCORE Adaptive Multi-Rate (AMR) Narrow-Band"),
     .priv_class = &class,

From 1ba08c94f5bb4d1c3c2d3651b5e01edb4ce172e2 Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Wed, 29 Feb 2012 02:56:01 -0500
Subject: [PATCH 21/27] vorbisenc: add output buffer overwrite protection

---
 libavcodec/vorbisenc.c | 59 ++++++++++++++++++++++++++++++------------
 1 file changed, 43 insertions(+), 16 deletions(-)

diff --git a/libavcodec/vorbisenc.c b/libavcodec/vorbisenc.c
index 9257333c1c..1566f446ce 100644
--- a/libavcodec/vorbisenc.c
+++ b/libavcodec/vorbisenc.c
@@ -137,13 +137,16 @@ typedef struct {
 #define RESIDUE_PART_SIZE      32
 #define NUM_RESIDUE_PARTITIONS (RESIDUE_SIZE/RESIDUE_PART_SIZE)
 
-static inline void put_codeword(PutBitContext *pb, vorbis_enc_codebook *cb,
-                                int entry)
+static inline int put_codeword(PutBitContext *pb, vorbis_enc_codebook *cb,
+                               int entry)
 {
     assert(entry >= 0);
     assert(entry < cb->nentries);
     assert(cb->lens[entry]);
+    if (pb->size_in_bits - put_bits_count(pb) < cb->lens[entry])
+        return AVERROR(EINVAL);
     put_bits(pb, cb->lens[entry], cb->codewords[entry]);
+    return 0;
 }
 
 static int cb_lookup_vals(int lookup, int dimentions, int entries)
@@ -751,14 +754,16 @@ static int render_point(int x0, int y0, int x1, int y1, int x)
     return y0 +  (x - x0) * (y1 - y0) / (x1 - x0);
 }
 
-static void floor_encode(vorbis_enc_context *venc, vorbis_enc_floor *fc,
-                         PutBitContext *pb, uint16_t *posts,
-                         float *floor, int samples)
+static int floor_encode(vorbis_enc_context *venc, vorbis_enc_floor *fc,
+                        PutBitContext *pb, uint16_t *posts,
+                        float *floor, int samples)
 {
     int range = 255 / fc->multiplier + 1;
     int coded[MAX_FLOOR_VALUES]; // first 2 values are unused
     int i, counter;
 
+    if (pb->size_in_bits - put_bits_count(pb) < 1 + 2 * ilog(range - 1))
+        return AVERROR(EINVAL);
     put_bits(pb, 1, 1); // non zero
     put_bits(pb, ilog(range - 1), posts[0]);
     put_bits(pb, ilog(range - 1), posts[1]);
@@ -816,7 +821,8 @@ static void floor_encode(vorbis_enc_context *venc, vorbis_enc_floor *fc,
                 cval   |= l << cshift;
                 cshift += c->subclass;
             }
-            put_codeword(pb, book, cval);
+            if (put_codeword(pb, book, cval))
+                return AVERROR(EINVAL);
         }
         for (k = 0; k < c->dim; k++) {
             int book  = c->books[cval & (csub-1)];
@@ -826,12 +832,15 @@ static void floor_encode(vorbis_enc_context *venc, vorbis_enc_floor *fc,
                 continue;
             if (entry == -1)
                 entry = 0;
-            put_codeword(pb, &venc->codebooks[book], entry);
+            if (put_codeword(pb, &venc->codebooks[book], entry))
+                return AVERROR(EINVAL);
         }
     }
 
     ff_vorbis_floor1_render_list(fc->list, fc->values, posts, coded,
                                  fc->multiplier, floor, samples);
+
+    return 0;
 }
 
 static float *put_vector(vorbis_enc_codebook *book, PutBitContext *pb,
@@ -852,13 +861,14 @@ static float *put_vector(vorbis_enc_codebook *book, PutBitContext *pb,
             distance = d;
         }
     }
-    put_codeword(pb, book, entry);
+    if (put_codeword(pb, book, entry))
+        return NULL;
     return &book->dimentions[entry * book->ndimentions];
 }
 
-static void residue_encode(vorbis_enc_context *venc, vorbis_enc_residue *rc,
-                           PutBitContext *pb, float *coeffs, int samples,
-                           int real_ch)
+static int residue_encode(vorbis_enc_context *venc, vorbis_enc_residue *rc,
+                          PutBitContext *pb, float *coeffs, int samples,
+                          int real_ch)
 {
     int pass, i, j, p, k;
     int psize      = rc->partition_size;
@@ -894,7 +904,8 @@ static void residue_encode(vorbis_enc_context *venc, vorbis_enc_residue *rc,
                         entry *= rc->classifications;
                         entry += classes[j][p + i];
                     }
-                    put_codeword(pb, book, entry);
+                    if (put_codeword(pb, book, entry))
+                        return AVERROR(EINVAL);
                 }
             for (i = 0; i < classwords && p < partitions; i++, p++) {
                 for (j = 0; j < channels; j++) {
@@ -909,8 +920,10 @@ static void residue_encode(vorbis_enc_context *venc, vorbis_enc_residue *rc,
 
                     if (rc->type == 0) {
                         for (k = 0; k < psize; k += book->ndimentions) {
-                            float *a = put_vector(book, pb, &buf[k]);
                             int l;
+                            float *a = put_vector(book, pb, &buf[k]);
+                            if (!a)
+                                return AVERROR(EINVAL);
                             for (l = 0; l < book->ndimentions; l++)
                                 buf[k + l] -= a[l];
                         }
@@ -930,6 +943,8 @@ static void residue_encode(vorbis_enc_context *venc, vorbis_enc_residue *rc,
                                 }
                             }
                             pv = put_vector(book, pb, vec);
+                            if (!pv)
+                                return AVERROR(EINVAL);
                             for (dim = book->ndimentions; dim--; ) {
                                 coeffs[a1 + b1] -= *pv++;
                                 if ((a1 += samples) == s) {
@@ -943,6 +958,7 @@ static void residue_encode(vorbis_enc_context *venc, vorbis_enc_residue *rc,
             }
         }
     }
+    return 0;
 }
 
 static int apply_window_and_mdct(vorbis_enc_context *venc, const signed short *audio,
@@ -1017,6 +1033,11 @@ static int vorbis_encode_frame(AVCodecContext *avccontext,
 
     init_put_bits(&pb, packets, buf_size);
 
+    if (pb.size_in_bits - put_bits_count(&pb) < 1 + ilog(venc->nmodes - 1)) {
+        av_log(avccontext, AV_LOG_ERROR, "output buffer is too small\n");
+        return AVERROR(EINVAL);
+    }
+
     put_bits(&pb, 1, 0); // magic bit
 
     put_bits(&pb, ilog(venc->nmodes - 1), 0); // 0 bits, the mode
@@ -1032,7 +1053,10 @@ static int vorbis_encode_frame(AVCodecContext *avccontext,
         vorbis_enc_floor *fc = &venc->floors[mapping->floor[mapping->mux[i]]];
         uint16_t posts[MAX_FLOOR_VALUES];
         floor_fit(venc, fc, &venc->coeffs[i * samples], posts, samples);
-        floor_encode(venc, fc, &pb, posts, &venc->floor[i * samples], samples);
+        if (floor_encode(venc, fc, &pb, posts, &venc->floor[i * samples], samples)) {
+            av_log(avccontext, AV_LOG_ERROR, "output buffer is too small\n");
+            return AVERROR(EINVAL);
+        }
     }
 
     for (i = 0; i < venc->channels * samples; i++)
@@ -1052,8 +1076,11 @@ static int vorbis_encode_frame(AVCodecContext *avccontext,
         }
     }
 
-    residue_encode(venc, &venc->residues[mapping->residue[mapping->mux[0]]],
-                   &pb, venc->coeffs, samples, venc->channels);
+    if (residue_encode(venc, &venc->residues[mapping->residue[mapping->mux[0]]],
+                       &pb, venc->coeffs, samples, venc->channels)) {
+        av_log(avccontext, AV_LOG_ERROR, "output buffer is too small\n");
+        return AVERROR(EINVAL);
+    }
 
     avccontext->coded_frame->pts = venc->sample_count;
     venc->sample_count += avccontext->frame_size;

From 4db4b53dc8ac81f10414b48aa6d954ba2c232a92 Mon Sep 17 00:00:00 2001
From: Kostya Shishkov <kostya.shishkov@gmail.com>
Date: Sat, 3 Mar 2012 19:14:35 +0100
Subject: [PATCH 22/27] proresenc: give user a possibility to alter some
 encoding parameters

This allows the user to select a quantisation matrix from a different profile,
stamp frames with a custom vendor string and change the target bitrate.
---
 libavcodec/proresenc.c | 178 ++++++++++++++++++++++++++++-------------
 1 file changed, 122 insertions(+), 56 deletions(-)

diff --git a/libavcodec/proresenc.c b/libavcodec/proresenc.c
index 8e6f93fe2b..9f26def1df 100644
--- a/libavcodec/proresenc.c
+++ b/libavcodec/proresenc.c
@@ -42,6 +42,67 @@ enum {
     PRORES_PROFILE_HQ,
 };
 
+enum {
+    QUANT_MAT_PROXY = 0,
+    QUANT_MAT_LT,
+    QUANT_MAT_STANDARD,
+    QUANT_MAT_HQ,
+    QUANT_MAT_DEFAULT,
+};
+
+static const uint8_t prores_quant_matrices[][64] = {
+    { // proxy
+         4,  7,  9, 11, 13, 14, 15, 63,
+         7,  7, 11, 12, 14, 15, 63, 63,
+         9, 11, 13, 14, 15, 63, 63, 63,
+        11, 11, 13, 14, 63, 63, 63, 63,
+        11, 13, 14, 63, 63, 63, 63, 63,
+        13, 14, 63, 63, 63, 63, 63, 63,
+        13, 63, 63, 63, 63, 63, 63, 63,
+        63, 63, 63, 63, 63, 63, 63, 63,
+    },
+    { // LT
+         4,  5,  6,  7,  9, 11, 13, 15,
+         5,  5,  7,  8, 11, 13, 15, 17,
+         6,  7,  9, 11, 13, 15, 15, 17,
+         7,  7,  9, 11, 13, 15, 17, 19,
+         7,  9, 11, 13, 14, 16, 19, 23,
+         9, 11, 13, 14, 16, 19, 23, 29,
+         9, 11, 13, 15, 17, 21, 28, 35,
+        11, 13, 16, 17, 21, 28, 35, 41,
+    },
+    { // standard
+         4,  4,  5,  5,  6,  7,  7,  9,
+         4,  4,  5,  6,  7,  7,  9,  9,
+         5,  5,  6,  7,  7,  9,  9, 10,
+         5,  5,  6,  7,  7,  9,  9, 10,
+         5,  6,  7,  7,  8,  9, 10, 12,
+         6,  7,  7,  8,  9, 10, 12, 15,
+         6,  7,  7,  9, 10, 11, 14, 17,
+         7,  7,  9, 10, 11, 14, 17, 21,
+    },
+    { // high quality
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  5,
+         4,  4,  4,  4,  4,  4,  5,  5,
+         4,  4,  4,  4,  4,  5,  5,  6,
+         4,  4,  4,  4,  5,  5,  6,  7,
+         4,  4,  4,  4,  5,  6,  7,  7,
+    },
+    { // codec default
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  4,
+    },
+};
+
 #define NUM_MB_LIMITS 4
 static const int prores_mb_limits[NUM_MB_LIMITS] = {
     1620, // up to 720x576
@@ -56,7 +117,7 @@ static const struct prores_profile {
     int         min_quant;
     int         max_quant;
     int         br_tab[NUM_MB_LIMITS];
-    uint8_t     quant[64];
+    int         quant;
 } prores_profile_info[4] = {
     {
         .full_name = "proxy",
@@ -64,16 +125,7 @@ static const struct prores_profile {
         .min_quant = 4,
         .max_quant = 8,
         .br_tab    = { 300, 242, 220, 194 },
-        .quant     = {
-             4,  7,  9, 11, 13, 14, 15, 63,
-             7,  7, 11, 12, 14, 15, 63, 63,
-             9, 11, 13, 14, 15, 63, 63, 63,
-            11, 11, 13, 14, 63, 63, 63, 63,
-            11, 13, 14, 63, 63, 63, 63, 63,
-            13, 14, 63, 63, 63, 63, 63, 63,
-            13, 63, 63, 63, 63, 63, 63, 63,
-            63, 63, 63, 63, 63, 63, 63, 63,
-        },
+        .quant     = QUANT_MAT_PROXY,
     },
     {
         .full_name = "LT",
@@ -81,16 +133,7 @@ static const struct prores_profile {
         .min_quant = 1,
         .max_quant = 9,
         .br_tab    = { 720, 560, 490, 440 },
-        .quant     = {
-             4,  5,  6,  7,  9, 11, 13, 15,
-             5,  5,  7,  8, 11, 13, 15, 17,
-             6,  7,  9, 11, 13, 15, 15, 17,
-             7,  7,  9, 11, 13, 15, 17, 19,
-             7,  9, 11, 13, 14, 16, 19, 23,
-             9, 11, 13, 14, 16, 19, 23, 29,
-             9, 11, 13, 15, 17, 21, 28, 35,
-            11, 13, 16, 17, 21, 28, 35, 41,
-        },
+        .quant     = QUANT_MAT_LT,
     },
     {
         .full_name = "standard",
@@ -98,16 +141,7 @@ static const struct prores_profile {
         .min_quant = 1,
         .max_quant = 6,
         .br_tab    = { 1050, 808, 710, 632 },
-        .quant     = {
-             4,  4,  5,  5,  6,  7,  7,  9,
-             4,  4,  5,  6,  7,  7,  9,  9,
-             5,  5,  6,  7,  7,  9,  9, 10,
-             5,  5,  6,  7,  7,  9,  9, 10,
-             5,  6,  7,  7,  8,  9, 10, 12,
-             6,  7,  7,  8,  9, 10, 12, 15,
-             6,  7,  7,  9, 10, 11, 14, 17,
-             7,  7,  9, 10, 11, 14, 17, 21,
-        },
+        .quant     = QUANT_MAT_STANDARD,
     },
     {
         .full_name = "high quality",
@@ -115,16 +149,7 @@ static const struct prores_profile {
         .min_quant = 1,
         .max_quant = 6,
         .br_tab    = { 1566, 1216, 1070, 950 },
-        .quant     = {
-             4,  4,  4,  4,  4,  4,  4,  4,
-             4,  4,  4,  4,  4,  4,  4,  4,
-             4,  4,  4,  4,  4,  4,  4,  4,
-             4,  4,  4,  4,  4,  4,  4,  5,
-             4,  4,  4,  4,  4,  4,  5,  5,
-             4,  4,  4,  4,  4,  5,  5,  6,
-             4,  4,  4,  4,  5,  5,  6,  7,
-             4,  4,  4,  4,  5,  6,  7,  7,
-        },
+        .quant     = QUANT_MAT_HQ,
     }
 // for 4444 profile bitrate numbers are { 2350, 1828, 1600, 1425 }
 };
@@ -147,6 +172,7 @@ typedef struct ProresContext {
     DECLARE_ALIGNED(16, uint16_t, emu_buf)[16*16];
     int16_t quants[MAX_STORED_Q][64];
     int16_t custom_q[64];
+    const uint8_t *quant_mat;
 
     ProresDSPContext dsp;
     ScanTable  scantable;
@@ -159,6 +185,9 @@ typedef struct ProresContext {
     int num_planes;
     int bits_per_mb;
 
+    char *vendor;
+    int quant_sel;
+
     int frame_size;
 
     int profile;
@@ -373,7 +402,7 @@ static int encode_slice(AVCodecContext *avctx, const AVFrame *pic,
     } else {
         qmat = ctx->custom_q;
         for (i = 0; i < 64; i++)
-            qmat[i] = ctx->profile_info->quant[i] * quant;
+            qmat[i] = ctx->quant_mat[i] * quant;
     }
 
     for (i = 0; i < ctx->num_planes; i++) {
@@ -591,7 +620,7 @@ static int find_slice_quant(AVCodecContext *avctx, const AVFrame *pic,
             } else {
                 qmat = ctx->custom_q;
                 for (i = 0; i < 64; i++)
-                    qmat[i] = ctx->profile_info->quant[i] * q;
+                    qmat[i] = ctx->quant_mat[i] * q;
             }
             for (i = 0; i < ctx->num_planes; i++) {
                 bits += estimate_slice_plane(ctx, &error, i,
@@ -684,7 +713,7 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     tmp = buf;
     buf += 2;                                   // frame header size will be stored here
     bytestream_put_be16  (&buf, 0);             // version 1
-    bytestream_put_buffer(&buf, "Lavc", 4);     // creator
+    bytestream_put_buffer(&buf, ctx->vendor, 4);
     bytestream_put_be16  (&buf, avctx->width);
     bytestream_put_be16  (&buf, avctx->height);
     bytestream_put_byte  (&buf, ctx->chroma_factor << 6); // frame flags
@@ -694,13 +723,17 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     bytestream_put_byte  (&buf, avctx->colorspace);
     bytestream_put_byte  (&buf, 0x40);          // source format and alpha information
     bytestream_put_byte  (&buf, 0);             // reserved
-    bytestream_put_byte  (&buf, 0x03);          // matrix flags - both matrices are present
-    // luma quantisation matrix
-    for (i = 0; i < 64; i++)
-        bytestream_put_byte(&buf, ctx->profile_info->quant[i]);
-    // chroma quantisation matrix
-    for (i = 0; i < 64; i++)
-        bytestream_put_byte(&buf, ctx->profile_info->quant[i]);
+    if (ctx->quant_sel != QUANT_MAT_DEFAULT) {
+        bytestream_put_byte  (&buf, 0x03);      // matrix flags - both matrices are present
+        // luma quantisation matrix
+        for (i = 0; i < 64; i++)
+            bytestream_put_byte(&buf, ctx->quant_mat[i]);
+        // chroma quantisation matrix
+        for (i = 0; i < 64; i++)
+            bytestream_put_byte(&buf, ctx->quant_mat[i]);
+    } else {
+        bytestream_put_byte  (&buf, 0x00);      // matrix flags - default matrices are used
+    }
     bytestream_put_be16  (&tmp, buf - orig_buf); // write back frame header size
 
     // picture header
@@ -816,10 +849,25 @@ static av_cold int encode_init(AVCodecContext *avctx)
     ctx->slices_width += av_popcount(ctx->mb_width - ctx->slices_width * mps);
     ctx->num_slices    = ctx->mb_height * ctx->slices_width;
 
-    for (i = 0; i < NUM_MB_LIMITS - 1; i++)
-        if (prores_mb_limits[i] >= ctx->mb_width * ctx->mb_height)
-            break;
-    ctx->bits_per_mb   = ctx->profile_info->br_tab[i];
+    if (ctx->quant_sel == -1)
+        ctx->quant_mat = prores_quant_matrices[ctx->profile_info->quant];
+    else
+        ctx->quant_mat = prores_quant_matrices[ctx->quant_sel];
+
+    if (strlen(ctx->vendor) != 4) {
+        av_log(avctx, AV_LOG_ERROR, "vendor ID should be 4 bytes\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (!ctx->bits_per_mb) {
+        for (i = 0; i < NUM_MB_LIMITS - 1; i++)
+            if (prores_mb_limits[i] >= ctx->mb_width * ctx->mb_height)
+                break;
+        ctx->bits_per_mb   = ctx->profile_info->br_tab[i];
+    } else if (ctx->bits_per_mb < 128) {
+        av_log(avctx, AV_LOG_ERROR, "too few bits per MB, please set at least 128\n");
+        return AVERROR_INVALIDDATA;
+    }
 
     ctx->frame_size = ctx->num_slices * (2 + 2 * ctx->num_planes
                                          + (2 * mps * ctx->bits_per_mb) / 8)
@@ -829,7 +877,7 @@ static av_cold int encode_init(AVCodecContext *avctx)
     max_quant = ctx->profile_info->max_quant;
     for (i = min_quant; i < MAX_STORED_Q; i++) {
         for (j = 0; j < 64; j++)
-            ctx->quants[i][j] = ctx->profile_info->quant[j] * i;
+            ctx->quants[i][j] = ctx->quant_mat[j] * i;
     }
 
     avctx->codec_tag   = ctx->profile_info->tag;
@@ -877,6 +925,24 @@ static const AVOption options[] = {
         0, 0, VE, "profile" },
     { "hq",            NULL, 0, AV_OPT_TYPE_CONST, { PRORES_PROFILE_HQ },
         0, 0, VE, "profile" },
+    { "vendor", "vendor ID", OFFSET(vendor),
+        AV_OPT_TYPE_STRING, { .str = "Lavc" }, CHAR_MIN, CHAR_MAX, VE },
+    { "bits_per_mb", "desired bits per macroblock", OFFSET(bits_per_mb),
+        AV_OPT_TYPE_INT, { 0 }, 0, 8192, VE },
+    { "quant_mat", "quantiser matrix", OFFSET(quant_sel), AV_OPT_TYPE_INT,
+        { -1 }, -1, QUANT_MAT_DEFAULT, VE, "quant_mat" },
+    { "auto",          NULL, 0, AV_OPT_TYPE_CONST, { -1 },
+        0, 0, VE, "quant_mat" },
+    { "proxy",         NULL, 0, AV_OPT_TYPE_CONST, { QUANT_MAT_PROXY },
+        0, 0, VE, "quant_mat" },
+    { "lt",            NULL, 0, AV_OPT_TYPE_CONST, { QUANT_MAT_LT },
+        0, 0, VE, "quant_mat" },
+    { "standard",      NULL, 0, AV_OPT_TYPE_CONST, { QUANT_MAT_STANDARD },
+        0, 0, VE, "quant_mat" },
+    { "hq",            NULL, 0, AV_OPT_TYPE_CONST, { QUANT_MAT_HQ },
+        0, 0, VE, "quant_mat" },
+    { "default",       NULL, 0, AV_OPT_TYPE_CONST, { QUANT_MAT_DEFAULT },
+        0, 0, VE, "quant_mat" },
     { NULL }
 };
 

From 1c97b5c4a3c7091932b905d20e5661fc6f9585c2 Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Sat, 3 Mar 2012 21:38:03 -0800
Subject: [PATCH 23/27] swscale: remove "cpu flags" from -sws_flags
 description.

---
 libswscale/options.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libswscale/options.c b/libswscale/options.c
index 8072d436d2..7ed5254aa8 100644
--- a/libswscale/options.c
+++ b/libswscale/options.c
@@ -34,7 +34,7 @@ static const char *sws_context_to_name(void *ptr)
 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 
 static const AVOption options[] = {
-    { "sws_flags",       "scaler/cpu flags",              OFFSET(flags),     AV_OPT_TYPE_FLAGS,  { .dbl = DEFAULT            }, 0,       UINT_MAX,       VE, "sws_flags" },
+    { "sws_flags",       "scaler flags",                  OFFSET(flags),     AV_OPT_TYPE_FLAGS,  { .dbl = DEFAULT            }, 0,       UINT_MAX,       VE, "sws_flags" },
     { "fast_bilinear",   "fast bilinear",                 0,                 AV_OPT_TYPE_CONST,  { .dbl = SWS_FAST_BILINEAR  }, INT_MIN, INT_MAX,        VE, "sws_flags" },
     { "bilinear",        "bilinear",                      0,                 AV_OPT_TYPE_CONST,  { .dbl = SWS_BILINEAR       }, INT_MIN, INT_MAX,        VE, "sws_flags" },
     { "bicubic",         "bicubic",                       0,                 AV_OPT_TYPE_CONST,  { .dbl = SWS_BICUBIC        }, INT_MIN, INT_MAX,        VE, "sws_flags" },

From 87392b1fd5c59004b8e559463b47836e418d1d27 Mon Sep 17 00:00:00 2001
From: Anton Khirnov <anton@khirnov.net>
Date: Sat, 3 Mar 2012 06:26:39 +0100
Subject: [PATCH 24/27] libcdio: add a forgotten AVClass to the private
 context.

---
 libavdevice/libcdio.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libavdevice/libcdio.c b/libavdevice/libcdio.c
index 747adf9141..ae2b7d4e29 100644
--- a/libavdevice/libcdio.c
+++ b/libavdevice/libcdio.c
@@ -37,6 +37,7 @@
 #undef free
 
 typedef struct CDIOContext {
+    AVClass             *class;
     cdrom_drive_t       *drive;
     cdrom_paranoia_t *paranoia;
     int32_t last_sector;

From 02beb9826b29166b5d7c9b306ac1648abb449be0 Mon Sep 17 00:00:00 2001
From: Anton Khirnov <anton@khirnov.net>
Date: Mon, 27 Feb 2012 18:52:13 +0100
Subject: [PATCH 25/27] lavc: deprecate AVCodecContext.sub_id.

In most places where it's used, it's as a pointless write-only field.

Only rv10 decoder actually reads from it, but it stores some internal
version info in it. There is no reason for it to be in a public field.
---
 libavcodec/avcodec.h            | 11 ++++------
 libavcodec/mpeg12.c             |  4 ----
 libavcodec/mpegaudiodec.c       |  2 --
 libavcodec/mpegaudiodecheader.c |  1 -
 libavcodec/mpegvideo_parser.c   |  2 --
 libavcodec/options.c            |  2 ++
 libavcodec/pthread.c            |  1 -
 libavcodec/rv10.c               | 38 ++++++++++++++++++++-------------
 libavcodec/version.h            |  3 +++
 9 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
index 3598aaaa85..491fb16a70 100644
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@@ -1228,15 +1228,12 @@ typedef struct AVCodecContext {
      */
     unsigned int stream_codec_tag;
 
+#if FF_API_SUB_ID
     /**
-     * Some codecs need additional format info. It is stored here.
-     * If any muxer uses this then ALL demuxers/parsers AND encoders for the
-     * specific codec MUST set it correctly otherwise stream copy breaks.
-     * In general use of this field by muxers is not recommended.
-     * - encoding: Set by libavcodec.
-     * - decoding: Set by libavcodec. (FIXME: Is this OK?)
+     * @deprecated this field is unused
      */
-    int sub_id;
+    attribute_deprecated int sub_id;
+#endif
 
     void *priv_data;
 
diff --git a/libavcodec/mpeg12.c b/libavcodec/mpeg12.c
index a5dafbd1fc..c49343f4b2 100644
--- a/libavcodec/mpeg12.c
+++ b/libavcodec/mpeg12.c
@@ -1237,7 +1237,6 @@ static int mpeg_decode_postinit(AVCodecContext *avctx)
          * that behave like P-frames. */
         avctx->has_b_frames = !s->low_delay;
 
-        assert((avctx->sub_id == 1) == (avctx->codec_id == CODEC_ID_MPEG1VIDEO));
         if (avctx->codec_id == CODEC_ID_MPEG1VIDEO) {
             //MPEG-1 fps
             avctx->time_base.den = avpriv_frame_rate_tab[s->frame_rate_index].num;
@@ -1382,7 +1381,6 @@ static void mpeg_decode_sequence_extension(Mpeg1Context *s1)
 
     av_dlog(s->avctx, "sequence extension\n");
     s->codec_id      = s->avctx->codec_id = CODEC_ID_MPEG2VIDEO;
-    s->avctx->sub_id = 2; /* indicates MPEG-2 found */
 
     if (s->avctx->debug & FF_DEBUG_PICT_INFO)
         av_log(s->avctx, AV_LOG_DEBUG, "profile: %d, level: %d vbv buffer: %d, bitrate:%d\n",
@@ -2000,7 +1998,6 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx,
     s->frame_pred_frame_dct = 1;
     s->chroma_format        = 1;
     s->codec_id             = s->avctx->codec_id = CODEC_ID_MPEG1VIDEO;
-    avctx->sub_id           = 1; /* indicates MPEG-1 */
     s->out_format           = FMT_MPEG1;
     s->swap_uv              = 0; // AFAIK VCR2 does not have SEQ_HEADER
     if (s->flags & CODEC_FLAG_LOW_DELAY)
@@ -2060,7 +2057,6 @@ static int vcr2_init_sequence(AVCodecContext *avctx)
     s->frame_pred_frame_dct  = 1;
     s->chroma_format         = 1;
     s->codec_id              = s->avctx->codec_id = CODEC_ID_MPEG2VIDEO;
-    avctx->sub_id            = 2; /* indicates MPEG-2 */
     s1->save_width           = s->width;
     s1->save_height          = s->height;
     s1->save_progressive_seq = s->progressive_sequence;
diff --git a/libavcodec/mpegaudiodec.c b/libavcodec/mpegaudiodec.c
index d6a09c86a8..dd43beb4fe 100644
--- a/libavcodec/mpegaudiodec.c
+++ b/libavcodec/mpegaudiodec.c
@@ -1660,7 +1660,6 @@ static int decode_frame(AVCodecContext * avctx, void *data, int *got_frame_ptr,
     avctx->channel_layout = s->nb_channels == 1 ? AV_CH_LAYOUT_MONO : AV_CH_LAYOUT_STEREO;
     if (!avctx->bit_rate)
         avctx->bit_rate = s->bit_rate;
-    avctx->sub_id = s->layer;
 
     if (s->frame_size <= 0 || s->frame_size > buf_size) {
         av_log(avctx, AV_LOG_ERROR, "incomplete frame\n");
@@ -1733,7 +1732,6 @@ static int decode_frame_adu(AVCodecContext *avctx, void *data,
     avctx->channels    = s->nb_channels;
     if (!avctx->bit_rate)
         avctx->bit_rate = s->bit_rate;
-    avctx->sub_id = s->layer;
 
     s->frame_size = len;
 
diff --git a/libavcodec/mpegaudiodecheader.c b/libavcodec/mpegaudiodecheader.c
index dbd67ff0e3..428137ddaf 100644
--- a/libavcodec/mpegaudiodecheader.c
+++ b/libavcodec/mpegaudiodecheader.c
@@ -142,6 +142,5 @@ int avpriv_mpa_decode_header(AVCodecContext *avctx, uint32_t head, int *sample_r
     *sample_rate = s->sample_rate;
     *channels = s->nb_channels;
     *bit_rate = s->bit_rate;
-    avctx->sub_id = s->layer;
     return s->frame_size;
 }
diff --git a/libavcodec/mpegvideo_parser.c b/libavcodec/mpegvideo_parser.c
index f0b3b202eb..af4b6e42ba 100644
--- a/libavcodec/mpegvideo_parser.c
+++ b/libavcodec/mpegvideo_parser.c
@@ -69,7 +69,6 @@ static void mpegvideo_extract_headers(AVCodecParserContext *s,
                 pc->frame_rate.num = avctx->time_base.num = avpriv_frame_rate_tab[frame_rate_index].den;
                 avctx->bit_rate = ((buf[4]<<10) | (buf[5]<<2) | (buf[6]>>6))*400;
                 avctx->codec_id = CODEC_ID_MPEG1VIDEO;
-                avctx->sub_id = 1;
             }
             break;
         case EXT_START_CODE:
@@ -94,7 +93,6 @@ static void mpegvideo_extract_headers(AVCodecParserContext *s,
                         avctx->time_base.den = pc->frame_rate.den * (frame_rate_ext_n + 1) * 2;
                         avctx->time_base.num = pc->frame_rate.num * (frame_rate_ext_d + 1);
                         avctx->codec_id = CODEC_ID_MPEG2VIDEO;
-                        avctx->sub_id = 2; /* forces MPEG2 */
                     }
                     break;
                 case 0x8: /* picture coding extension */
diff --git a/libavcodec/options.c b/libavcodec/options.c
index d25b64aee8..51187b842c 100644
--- a/libavcodec/options.c
+++ b/libavcodec/options.c
@@ -109,7 +109,9 @@ static const AVOption options[]={
 #endif
 {"noout", "skip bitstream encoding", 0, AV_OPT_TYPE_CONST, {.dbl = CODEC_FLAG2_NO_OUTPUT }, INT_MIN, INT_MAX, V|E, "flags2"},
 {"local_header", "place global headers at every keyframe instead of in extradata", 0, AV_OPT_TYPE_CONST, {.dbl = CODEC_FLAG2_LOCAL_HEADER }, INT_MIN, INT_MAX, V|E, "flags2"},
+#if FF_API_SUB_ID
 {"sub_id", NULL, OFFSET(sub_id), AV_OPT_TYPE_INT, {.dbl = DEFAULT }, INT_MIN, INT_MAX},
+#endif
 {"me_method", "set motion estimation method", OFFSET(me_method), AV_OPT_TYPE_INT, {.dbl = ME_EPZS }, INT_MIN, INT_MAX, V|E, "me_method"},
 {"zero", "zero motion estimation (fastest)", 0, AV_OPT_TYPE_CONST, {.dbl = ME_ZERO }, INT_MIN, INT_MAX, V|E, "me_method" },
 {"full", "full motion estimation (slowest)", 0, AV_OPT_TYPE_CONST, {.dbl = ME_FULL }, INT_MIN, INT_MAX, V|E, "me_method" },
diff --git a/libavcodec/pthread.c b/libavcodec/pthread.c
index f17f34e8e1..4a02823dde 100644
--- a/libavcodec/pthread.c
+++ b/libavcodec/pthread.c
@@ -402,7 +402,6 @@ static int update_context_from_thread(AVCodecContext *dst, AVCodecContext *src,
     int err = 0;
 
     if (dst != src) {
-        dst->sub_id    = src->sub_id;
         dst->time_base = src->time_base;
         dst->width     = src->width;
         dst->height    = src->height;
diff --git a/libavcodec/rv10.c b/libavcodec/rv10.c
index 2b1a09dc69..577522504c 100644
--- a/libavcodec/rv10.c
+++ b/libavcodec/rv10.c
@@ -40,6 +40,11 @@
 
 #define DC_VLC_BITS 14 //FIXME find a better solution
 
+typedef struct RVDecContext {
+    MpegEncContext m;
+    int sub_id;
+} RVDecContext;
+
 static const uint16_t rv_lum_code[256] =
 {
  0x3e7f, 0x0f00, 0x0f01, 0x0f02, 0x0f03, 0x0f04, 0x0f05, 0x0f06,
@@ -293,8 +298,9 @@ static int rv10_decode_picture_header(MpegEncContext *s)
     return mb_count;
 }
 
-static int rv20_decode_picture_header(MpegEncContext *s)
+static int rv20_decode_picture_header(RVDecContext *rv)
 {
+    MpegEncContext *s = &rv->m;
     int seq, mb_pos, i;
     int rpr_bits;
 
@@ -325,10 +331,10 @@ static int rv20_decode_picture_header(MpegEncContext *s)
         return -1;
     }
 
-    if(RV_GET_MINOR_VER(s->avctx->sub_id) >= 2)
+    if(RV_GET_MINOR_VER(rv->sub_id) >= 2)
         s->loop_filter = get_bits1(&s->gb);
 
-    if(RV_GET_MINOR_VER(s->avctx->sub_id) <= 1)
+    if(RV_GET_MINOR_VER(rv->sub_id) <= 1)
         seq = get_bits(&s->gb, 8) << 7;
     else
         seq = get_bits(&s->gb, 13) << 2;
@@ -393,7 +399,7 @@ static int rv20_decode_picture_header(MpegEncContext *s)
 av_log(s->avctx, AV_LOG_DEBUG, "\n");*/
     s->no_rounding= get_bits1(&s->gb);
 
-    if(RV_GET_MINOR_VER(s->avctx->sub_id) <= 1 && s->pict_type == AV_PICTURE_TYPE_B)
+    if(RV_GET_MINOR_VER(rv->sub_id) <= 1 && s->pict_type == AV_PICTURE_TYPE_B)
         skip_bits(&s->gb, 5); // binary decoder reads 3+2 bits here but they don't seem to be used
 
     s->f_code = 1;
@@ -418,7 +424,8 @@ av_log(s->avctx, AV_LOG_DEBUG, "\n");*/
 
 static av_cold int rv10_decode_init(AVCodecContext *avctx)
 {
-    MpegEncContext *s = avctx->priv_data;
+    RVDecContext  *rv = avctx->priv_data;
+    MpegEncContext *s = &rv->m;
     static int done=0;
     int major_ver, minor_ver, micro_ver;
 
@@ -438,11 +445,11 @@ static av_cold int rv10_decode_init(AVCodecContext *avctx)
     s->orig_height= s->height = avctx->coded_height;
 
     s->h263_long_vectors= ((uint8_t*)avctx->extradata)[3] & 1;
-    avctx->sub_id= AV_RB32((uint8_t*)avctx->extradata + 4);
+    rv->sub_id = AV_RB32((uint8_t*)avctx->extradata + 4);
 
-    major_ver = RV_GET_MAJOR_VER(avctx->sub_id);
-    minor_ver = RV_GET_MINOR_VER(avctx->sub_id);
-    micro_ver = RV_GET_MICRO_VER(avctx->sub_id);
+    major_ver = RV_GET_MAJOR_VER(rv->sub_id);
+    minor_ver = RV_GET_MINOR_VER(rv->sub_id);
+    micro_ver = RV_GET_MICRO_VER(rv->sub_id);
 
     s->low_delay = 1;
     switch (major_ver) {
@@ -457,13 +464,13 @@ static av_cold int rv10_decode_init(AVCodecContext *avctx)
         }
         break;
     default:
-        av_log(s->avctx, AV_LOG_ERROR, "unknown header %X\n", avctx->sub_id);
+        av_log(s->avctx, AV_LOG_ERROR, "unknown header %X\n", rv->sub_id);
         av_log_missing_feature(avctx, "RV1/2 version", 1);
         return AVERROR_PATCHWELCOME;
     }
 
     if(avctx->debug & FF_DEBUG_PICT_INFO){
-        av_log(avctx, AV_LOG_DEBUG, "ver:%X ver0:%X\n", avctx->sub_id, avctx->extradata_size >= 4 ? ((uint32_t*)avctx->extradata)[0] : -1);
+        av_log(avctx, AV_LOG_DEBUG, "ver:%X ver0:%X\n", rv->sub_id, avctx->extradata_size >= 4 ? ((uint32_t*)avctx->extradata)[0] : -1);
     }
 
     avctx->pix_fmt = PIX_FMT_YUV420P;
@@ -498,7 +505,8 @@ static av_cold int rv10_decode_end(AVCodecContext *avctx)
 static int rv10_decode_packet(AVCodecContext *avctx,
                              const uint8_t *buf, int buf_size, int buf_size2)
 {
-    MpegEncContext *s = avctx->priv_data;
+    RVDecContext  *rv = avctx->priv_data;
+    MpegEncContext *s = &rv->m;
     int mb_count, mb_pos, left, start_mb_x, active_bits_size;
 
     active_bits_size = buf_size * 8;
@@ -506,7 +514,7 @@ static int rv10_decode_packet(AVCodecContext *avctx,
     if(s->codec_id ==CODEC_ID_RV10)
         mb_count = rv10_decode_picture_header(s);
     else
-        mb_count = rv20_decode_picture_header(s);
+        mb_count = rv20_decode_picture_header(rv);
     if (mb_count < 0) {
         av_log(s->avctx, AV_LOG_ERROR, "HEADER ERROR\n");
         return -1;
@@ -714,7 +722,7 @@ AVCodec ff_rv10_decoder = {
     .name           = "rv10",
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = CODEC_ID_RV10,
-    .priv_data_size = sizeof(MpegEncContext),
+    .priv_data_size = sizeof(RVDecContext),
     .init           = rv10_decode_init,
     .close          = rv10_decode_end,
     .decode         = rv10_decode_frame,
@@ -728,7 +736,7 @@ AVCodec ff_rv20_decoder = {
     .name           = "rv20",
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = CODEC_ID_RV20,
-    .priv_data_size = sizeof(MpegEncContext),
+    .priv_data_size = sizeof(RVDecContext),
     .init           = rv10_decode_init,
     .close          = rv10_decode_end,
     .decode         = rv10_decode_frame,
diff --git a/libavcodec/version.h b/libavcodec/version.h
index f3b40ca511..6fbe01c3a8 100644
--- a/libavcodec/version.h
+++ b/libavcodec/version.h
@@ -59,5 +59,8 @@
 #ifndef FF_API_INTER_THRESHOLD
 #define FF_API_INTER_THRESHOLD  (LIBAVCODEC_VERSION_MAJOR < 55)
 #endif
+#ifndef FF_API_SUB_ID
+#define FF_API_SUB_ID           (LIBAVCODEC_VERSION_MAJOR < 55)
+#endif
 
 #endif /* AVCODEC_VERSION_H */

From 44fe77b350fd812c84cd866b7d03e436acc3bab2 Mon Sep 17 00:00:00 2001
From: Anton Khirnov <anton@khirnov.net>
Date: Fri, 2 Mar 2012 17:00:53 +0100
Subject: [PATCH 26/27] lavc: make codec_is_decoder/encoder() public.

---
 doc/APIchanges       |  3 +++
 libavcodec/avcodec.h | 10 ++++++++++
 libavcodec/utils.c   | 18 +++++++++---------
 libavcodec/version.h |  2 +-
 4 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/doc/APIchanges b/doc/APIchanges
index 00cee5ad90..ef630e8e2a 100644
--- a/doc/APIchanges
+++ b/doc/APIchanges
@@ -12,6 +12,9 @@ libavutil:   2011-04-18
 
 API changes, most recent first:
 
+2012-03-xx - xxxxxxx - lavc 54.7.0 - avcodec.h
+  Add av_codec_is_encoder/decoder().
+
 2012-xx-xx - xxxxxxx - lavc 54.3.0 - avcodec.h
   Add av_packet_shrink_side_data.
 
diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
index 491fb16a70..a99dcbd553 100644
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@@ -4297,4 +4297,14 @@ const AVClass *avcodec_get_class(void);
  */
 int avcodec_is_open(AVCodecContext *s);
 
+/**
+ * @return a non-zero number if codec is an encoder, zero otherwise
+ */
+int av_codec_is_encoder(AVCodec *codec);
+
+/**
+ * @return a non-zero number if codec is a decoder, zero otherwise
+ */
+int av_codec_is_decoder(AVCodec *codec);
+
 #endif /* AVCODEC_AVCODEC_H */
diff --git a/libavcodec/utils.c b/libavcodec/utils.c
index f9927a1383..d4384108df 100644
--- a/libavcodec/utils.c
+++ b/libavcodec/utils.c
@@ -112,12 +112,12 @@ static void avcodec_init(void)
     ff_dsputil_static_init();
 }
 
-static av_always_inline int codec_is_encoder(AVCodec *codec)
+int av_codec_is_encoder(AVCodec *codec)
 {
     return codec && (codec->encode || codec->encode2);
 }
 
-static av_always_inline int codec_is_decoder(AVCodec *codec)
+int av_codec_is_decoder(AVCodec *codec)
 {
     return codec && codec->decode;
 }
@@ -704,7 +704,7 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, AVCodec *codec, AVD
 
     /* if the decoder init function was already called previously,
        free the already allocated subtitle_header before overwriting it */
-    if (codec_is_decoder(codec))
+    if (av_codec_is_decoder(codec))
         av_freep(&avctx->subtitle_header);
 
 #define SANE_NB_CHANNELS 128U
@@ -748,7 +748,7 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, AVCodec *codec, AVD
         ret = AVERROR(EINVAL);
         goto free_and_end;
     }
-    if (codec_is_encoder(avctx->codec)) {
+    if (av_codec_is_encoder(avctx->codec)) {
         int i;
         if (avctx->codec->sample_fmts) {
             for (i = 0; avctx->codec->sample_fmts[i] != AV_SAMPLE_FMT_NONE; i++)
@@ -1367,7 +1367,7 @@ av_cold int avcodec_close(AVCodecContext *avctx)
         av_opt_free(avctx->priv_data);
     av_opt_free(avctx);
     av_freep(&avctx->priv_data);
-    if (codec_is_encoder(avctx->codec))
+    if (av_codec_is_encoder(avctx->codec))
         av_freep(&avctx->extradata);
     avctx->codec = NULL;
     avctx->active_thread_type = 0;
@@ -1385,7 +1385,7 @@ AVCodec *avcodec_find_encoder(enum CodecID id)
     AVCodec *p, *experimental=NULL;
     p = first_avcodec;
     while (p) {
-        if (codec_is_encoder(p) && p->id == id) {
+        if (av_codec_is_encoder(p) && p->id == id) {
             if (p->capabilities & CODEC_CAP_EXPERIMENTAL && !experimental) {
                 experimental = p;
             } else
@@ -1403,7 +1403,7 @@ AVCodec *avcodec_find_encoder_by_name(const char *name)
         return NULL;
     p = first_avcodec;
     while (p) {
-        if (codec_is_encoder(p) && strcmp(name,p->name) == 0)
+        if (av_codec_is_encoder(p) && strcmp(name,p->name) == 0)
             return p;
         p = p->next;
     }
@@ -1415,7 +1415,7 @@ AVCodec *avcodec_find_decoder(enum CodecID id)
     AVCodec *p;
     p = first_avcodec;
     while (p) {
-        if (codec_is_decoder(p) && p->id == id)
+        if (av_codec_is_decoder(p) && p->id == id)
             return p;
         p = p->next;
     }
@@ -1429,7 +1429,7 @@ AVCodec *avcodec_find_decoder_by_name(const char *name)
         return NULL;
     p = first_avcodec;
     while (p) {
-        if (codec_is_decoder(p) && strcmp(name,p->name) == 0)
+        if (av_codec_is_decoder(p) && strcmp(name,p->name) == 0)
             return p;
         p = p->next;
     }
diff --git a/libavcodec/version.h b/libavcodec/version.h
index 6fbe01c3a8..1ecfb215f6 100644
--- a/libavcodec/version.h
+++ b/libavcodec/version.h
@@ -21,7 +21,7 @@
 #define AVCODEC_VERSION_H
 
 #define LIBAVCODEC_VERSION_MAJOR 54
-#define LIBAVCODEC_VERSION_MINOR  6
+#define LIBAVCODEC_VERSION_MINOR  7
 #define LIBAVCODEC_VERSION_MICRO  0
 
 #define LIBAVCODEC_VERSION_INT  AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \

From 3faa141d15bf9945fa54331e51b3f10b9970d5d2 Mon Sep 17 00:00:00 2001
From: Anton Khirnov <anton@khirnov.net>
Date: Fri, 2 Mar 2012 17:05:01 +0100
Subject: [PATCH 27/27] cmdutils: use new av_codec_is_decoder/encoder()
 functions.

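Fixes listing encoders: show_codecs() only checked p->encode, so codecs
that provide only encode2() were not marked as encoders.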
---
 cmdutils.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cmdutils.c b/cmdutils.c
index 985931cce6..ed0813f986 100644
--- a/cmdutils.c
+++ b/cmdutils.c
@@ -658,9 +658,9 @@ void show_codecs(void)
                 decode = encode = cap = 0;
             }
             if (p2 && strcmp(p->name, p2->name) == 0) {
-                if (p->decode)
+                if (av_codec_is_decoder(p))
                     decode = 1;
-                if (p->encode)
+                if (av_codec_is_encoder(p))
                     encode = 1;
                 cap |= p->capabilities;
             }