avutil/mips: refactor msa load and store macros.

Replace STnxm_UB and LDnxm_SH with new macros ST_{H/W/D}{1/2/4/8}.
The old macros are difficult to use because they do not follow a consistent parameter-passing convention.
The changes are as follows:
1. remove LD4x4_SH.
2. replace ST2x4_UB with ST_H4.
3. replace ST4x2_UB with ST_W2.
4. replace ST4x4_UB with ST_W4.
5. replace ST4x8_UB with ST_W8.
6. replace ST6x4_UB with ST_W2 and ST_H2.
7. replace ST8x1_UB with ST_D1.
8. replace ST8x2_UB with ST_D2.
9. replace ST8x4_UB with ST_D4.
10. replace ST8x8_UB with ST_D8.
11. replace ST12x4_UB with ST_D4 and ST_W4.

Example of a new macro: ST_H4(in, idx0, idx1, idx2, idx3, pdst, stride)
ST_H4 stores four half-word elements from vector 'in' to pdst with stride.
About the macro name:
1) 'ST' means store operation.
2) 'H/W/D' means type of vector element is 'half-word/word/double-word'.
3) Number '1/2/4/8' means how many elements will be stored.
About the macro parameters:
1) 'in0, in1...' 128-bit vectors.
2) 'idx0, idx1...' element indices.
3) 'pdst' destination pointer to store to.
4) 'stride' stride of each store operation.

Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
This commit is contained in:
Shiyou Yin 2019-07-17 17:35:00 +08:00 committed by Michael Niedermayer
parent 00ed04d614
commit 153c607525
24 changed files with 989 additions and 1149 deletions

View File

@ -86,10 +86,7 @@ static void h263_h_loop_filter_msa(uint8_t *src, int32_t stride, int32_t qscale)
ILVR_B2_SH(in3, in0, in1, in2, temp0, temp1);
in0 = (v16u8) __msa_ilvr_h(temp1, temp0);
in3 = (v16u8) __msa_ilvl_h(temp1, temp0);
ST4x4_UB(in0, in0, 0, 1, 2, 3, src, stride);
src += 4 * stride;
ST4x4_UB(in3, in3, 0, 1, 2, 3, src, stride);
src += 4 * stride;
ST_W8(in0, in3, 0, 1, 2, 3, 0, 1, 2, 3, src, stride);
}
static void h263_v_loop_filter_msa(uint8_t *src, int32_t stride, int32_t qscale)

View File

@ -85,7 +85,7 @@ static void avc_chroma_hz_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
res_r = __msa_sat_u_h(res_r, 7);
res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
ST2x4_UB(res, 0, dst, stride);
ST_H4(res, 0, 1, 2, 3, dst, stride);
}
static void avc_chroma_hz_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@ -121,7 +121,7 @@ static void avc_chroma_hz_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
res_r = __msa_sat_u_h(res_r, 7);
res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
ST4x2_UB(res, dst, stride);
ST_W2(res, 0, 1, dst, stride);
}
static void avc_chroma_hz_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@ -144,7 +144,7 @@ static void avc_chroma_hz_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
SRARI_H2_UH(res0_r, res1_r, 6);
SAT_UH2_UH(res0_r, res1_r, 7);
out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
ST_W4(out, 0, 1, 2, 3, dst, stride);
}
static void avc_chroma_hz_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@ -168,7 +168,7 @@ static void avc_chroma_hz_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
SRARI_H4_UH(res0, res1, res2, res3, 6);
SAT_UH4_UH(res0, res1, res2, res3, 7);
PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
ST4x8_UB(out0, out1, dst, stride);
ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
}
static void avc_chroma_hz_4w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@ -204,7 +204,7 @@ static void avc_chroma_hz_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
SRARI_H4_UH(res0, res1, res2, res3, 6);
SAT_UH4_UH(res0, res1, res2, res3, 7);
PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
ST8x4_UB(out0, out1, dst, stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
}
static void avc_chroma_hz_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@ -237,7 +237,7 @@ static void avc_chroma_hz_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
SAT_UH4_UH(res4, res5, res6, res7, 7);
PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
ST8x8_UB(out0, out1, out2, out3, dst, stride);
ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}
static void avc_chroma_hz_nonmult_msa(uint8_t *src, uint8_t *dst,
@ -266,7 +266,7 @@ static void avc_chroma_hz_nonmult_msa(uint8_t *src, uint8_t *dst,
SRARI_H4_UH(res0, res1, res2, res3, 6);
SAT_UH4_UH(res0, res1, res2, res3, 7);
PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
ST8x4_UB(out0, out1, dst, stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
dst += (4 * stride);
}
@ -283,7 +283,7 @@ static void avc_chroma_hz_nonmult_msa(uint8_t *src, uint8_t *dst,
res0 = __msa_sat_u_h(res0, 7);
res0 = (v8u16) __msa_pckev_b((v16i8) res0, (v16i8) res0);
ST8x1_UB(res0, dst);
ST_D1(res0, 0, dst);
dst += stride;
}
}
@ -359,7 +359,7 @@ static void avc_chroma_vt_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
ST2x4_UB(res, 0, dst, stride);
ST_H4(res, 0, 1, 2, 3, dst, stride);
}
static void avc_chroma_vt_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@ -394,7 +394,7 @@ static void avc_chroma_vt_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
res_r = __msa_sat_u_h(res_r, 7);
res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
ST4x2_UB(res, dst, stride);
ST_W2(res, 0, 1, dst, stride);
}
static void avc_chroma_vt_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@ -418,7 +418,7 @@ static void avc_chroma_vt_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
SRARI_H2_UH(res0_r, res1_r, 6);
SAT_UH2_UH(res0_r, res1_r, 7);
out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
ST_W4(out, 0, 1, 2, 3, dst, stride);
}
static void avc_chroma_vt_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@ -446,7 +446,7 @@ static void avc_chroma_vt_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
SRARI_H4_UH(res0, res1, res2, res3, 6);
SAT_UH4_UH(res0, res1, res2, res3, 7);
PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
ST4x8_UB(out0, out1, dst, stride);
ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
}
static void avc_chroma_vt_4w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@ -480,7 +480,7 @@ static void avc_chroma_vt_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
SRARI_H4_UH(res0, res1, res2, res3, 6);
SAT_UH4_UH(res0, res1, res2, res3, 7);
PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
ST8x4_UB(out0, out1, dst, stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
}
static void avc_chroma_vt_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@ -512,7 +512,7 @@ static void avc_chroma_vt_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
SAT_UH4_UH(res0, res1, res2, res3, 7);
PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
ST8x8_UB(out0, out1, out2, out3, dst, stride);
ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}
static void avc_chroma_vt_8w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@ -592,7 +592,7 @@ static void avc_chroma_hv_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
ST2x4_UB(res, 0, dst, stride);
ST_H4(res, 0, 1, 2, 3, dst, stride);
}
static void avc_chroma_hv_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@ -634,7 +634,7 @@ static void avc_chroma_hv_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
res_vt0 = __msa_sat_u_h(res_vt0, 7);
res = (v4i32) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
ST4x2_UB(res, dst, stride);
ST_W2(res, 0, 1, dst, stride);
}
static void avc_chroma_hv_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@ -666,7 +666,8 @@ static void avc_chroma_hv_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
SRARI_H2_UH(res_vt0, res_vt1, 6);
SAT_UH2_UH(res_vt0, res_vt1, 7);
PCKEV_B2_SW(res_vt0, res_vt0, res_vt1, res_vt1, res0, res1);
ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, stride);
ST_W2(res0, 0, 1, dst, stride);
ST_W2(res1, 0, 1, dst + 2 * stride, stride);
}
static void avc_chroma_hv_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@ -706,7 +707,7 @@ static void avc_chroma_hv_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, res0, res1);
ST4x8_UB(res0, res1, dst, stride);
ST_W8(res0, res1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
}
static void avc_chroma_hv_4w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@ -766,7 +767,7 @@ static void avc_chroma_hv_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
ST8x4_UB(out0, out1, dst, stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
}
static void avc_chroma_hv_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@ -822,7 +823,7 @@ static void avc_chroma_hv_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
SAT_UH4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 7);
PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
PCKEV_B2_UB(res_vt5, res_vt4, res_vt7, res_vt6, out2, out3);
ST8x8_UB(out0, out1, out2, out3, dst, stride);
ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}
static void avc_chroma_hv_8w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@ -918,7 +919,7 @@ static void avc_chroma_hz_and_aver_dst_2x4_msa(uint8_t *src, uint8_t *dst,
dst0 = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
dst0 = __msa_aver_u_b(dst0, dst_data);
ST2x4_UB(dst0, 0, dst, stride);
ST_H4(dst0, 0, 1, 2, 3, dst, stride);
}
static void avc_chroma_hz_and_aver_dst_2w_msa(uint8_t *src, uint8_t *dst,
@ -962,7 +963,7 @@ static void avc_chroma_hz_and_aver_dst_4x2_msa(uint8_t *src, uint8_t *dst,
res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
dst_data = __msa_aver_u_b((v16u8) res, dst_data);
ST4x2_UB(dst_data, dst, stride);
ST_W2(dst_data, 0, 1, dst, stride);
}
static void avc_chroma_hz_and_aver_dst_4x4_msa(uint8_t *src, uint8_t *dst,
@ -991,7 +992,7 @@ static void avc_chroma_hz_and_aver_dst_4x4_msa(uint8_t *src, uint8_t *dst,
SAT_UH2_UH(res0_r, res1_r, 7);
out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
out = __msa_aver_u_b(out, dst_data);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
ST_W4(out, 0, 1, 2, 3, dst, stride);
}
static void avc_chroma_hz_and_aver_dst_4x8_msa(uint8_t *src, uint8_t *dst,
@ -1023,7 +1024,7 @@ static void avc_chroma_hz_and_aver_dst_4x8_msa(uint8_t *src, uint8_t *dst,
SAT_UH4_UH(res0, res1, res2, res3, 7);
PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
ST4x8_UB(out0, out1, dst, stride);
ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
}
static void avc_chroma_hz_and_aver_dst_4w_msa(uint8_t *src, uint8_t *dst,
@ -1066,7 +1067,7 @@ static void avc_chroma_hz_and_aver_dst_8x4_msa(uint8_t *src, uint8_t *dst,
SAT_UH4_UH(res0, res1, res2, res3, 7);
PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
ST8x4_UB(dst0, dst1, dst, stride);
ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
}
static void avc_chroma_hz_and_aver_dst_8x8_msa(uint8_t *src, uint8_t *dst,
@ -1110,7 +1111,7 @@ static void avc_chroma_hz_and_aver_dst_8x8_msa(uint8_t *src, uint8_t *dst,
PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
AVER_UB2_UB(out2, dst2, out3, dst3, out2, out3);
ST8x8_UB(out0, out1, out2, out3, dst, stride);
ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}
static void avc_chroma_hz_and_aver_dst_8w_msa(uint8_t *src, uint8_t *dst,
@ -1200,7 +1201,7 @@ static void avc_chroma_vt_and_aver_dst_2x4_msa(uint8_t *src, uint8_t *dst,
res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data);
ST2x4_UB(res, 0, dst, stride);
ST_H4(res, 0, 1, 2, 3, dst, stride);
}
static void avc_chroma_vt_and_aver_dst_2w_msa(uint8_t *src, uint8_t *dst,
@ -1243,7 +1244,7 @@ static void avc_chroma_vt_and_aver_dst_4x2_msa(uint8_t *src, uint8_t *dst,
res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
res = __msa_aver_u_b(res, dst_data);
ST4x2_UB(res, dst, stride);
ST_W2(res, 0, 1, dst, stride);
}
static void avc_chroma_vt_and_aver_dst_4x4_msa(uint8_t *src, uint8_t *dst,
@ -1273,7 +1274,7 @@ static void avc_chroma_vt_and_aver_dst_4x4_msa(uint8_t *src, uint8_t *dst,
SAT_UH2_UH(res0_r, res1_r, 7);
out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
out = __msa_aver_u_b(out, dst0);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
ST_W4(out, 0, 1, 2, 3, dst, stride);
}
static void avc_chroma_vt_and_aver_dst_4x8_msa(uint8_t *src, uint8_t *dst,
@ -1309,7 +1310,7 @@ static void avc_chroma_vt_and_aver_dst_4x8_msa(uint8_t *src, uint8_t *dst,
SAT_UH4_UH(res0, res1, res2, res3, 7);
PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
ST4x8_UB(out0, out1, dst, stride);
ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
}
static void avc_chroma_vt_and_aver_dst_4w_msa(uint8_t *src, uint8_t *dst,
@ -1351,7 +1352,7 @@ static void avc_chroma_vt_and_aver_dst_8x4_msa(uint8_t *src, uint8_t *dst,
SAT_UH4_UH(res0, res1, res2, res3, 7);
PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
ST8x4_UB(out0, out1, dst, stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
}
static void avc_chroma_vt_and_aver_dst_8x8_msa(uint8_t *src, uint8_t *dst,
@ -1394,7 +1395,7 @@ static void avc_chroma_vt_and_aver_dst_8x8_msa(uint8_t *src, uint8_t *dst,
PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
AVER_UB2_UB(out2, dst2, out3, dst3, out2, out3);
ST8x8_UB(out0, out1, out2, out3, dst, stride);
ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}
static void avc_chroma_vt_and_aver_dst_8w_msa(uint8_t *src, uint8_t *dst,
@ -1492,7 +1493,7 @@ static void avc_chroma_hv_and_aver_dst_2x4_msa(uint8_t *src, uint8_t *dst,
res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
dst0 = __msa_aver_u_b((v16u8) res, dst0);
ST2x4_UB(dst0, 0, dst, stride);
ST_H4(dst0, 0, 1, 2, 3, dst, stride);
}
static void avc_chroma_hv_and_aver_dst_2w_msa(uint8_t *src, uint8_t *dst,
@ -1545,7 +1546,7 @@ static void avc_chroma_hv_and_aver_dst_4x2_msa(uint8_t *src, uint8_t *dst,
dst0 = (v16u8) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
dst0 = __msa_aver_u_b(dst0, dst_data);
ST4x2_UB(dst0, dst, stride);
ST_W2(dst0, 0, 1, dst, stride);
}
static void avc_chroma_hv_and_aver_dst_4x4_msa(uint8_t *src, uint8_t *dst,
@ -1584,7 +1585,7 @@ static void avc_chroma_hv_and_aver_dst_4x4_msa(uint8_t *src, uint8_t *dst,
SAT_UH2_UH(res_vt0, res_vt1, 7);
out = (v16u8) __msa_pckev_b((v16i8) res_vt1, (v16i8) res_vt0);
out = __msa_aver_u_b(out, dst_data);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
ST_W4(out, 0, 1, 2, 3, dst, stride);
}
static void avc_chroma_hv_and_aver_dst_4x8_msa(uint8_t *src, uint8_t *dst,
@ -1633,7 +1634,7 @@ static void avc_chroma_hv_and_aver_dst_4x8_msa(uint8_t *src, uint8_t *dst,
SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, res0, res1);
AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
ST4x8_UB(res0, res1, dst, stride);
ST_W8(res0, res1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
}
static void avc_chroma_hv_and_aver_dst_4w_msa(uint8_t *src, uint8_t *dst,
@ -1701,7 +1702,7 @@ static void avc_chroma_hv_and_aver_dst_8x4_msa(uint8_t *src, uint8_t *dst,
SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
ST8x4_UB(out0, out1, dst, stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
}
static void avc_chroma_hv_and_aver_dst_8x8_msa(uint8_t *src, uint8_t *dst,
@ -1770,7 +1771,7 @@ static void avc_chroma_hv_and_aver_dst_8x8_msa(uint8_t *src, uint8_t *dst,
PCKEV_B2_UB(res_vt5, res_vt4, res_vt7, res_vt6, out2, out3);
AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
AVER_UB2_UB(out2, dst2, out3, dst3, out2, out3);
ST8x8_UB(out0, out1, out2, out3, dst, stride);
ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}
static void avc_chroma_hv_and_aver_dst_8w_msa(uint8_t *src, uint8_t *dst,
@ -1848,21 +1849,21 @@ static void avg_width4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
ST4x8_UB(dst0, dst1, dst, stride);
ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
} else if (4 == height) {
LW4(src, stride, tp0, tp1, tp2, tp3);
INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
LW4(dst, stride, tp0, tp1, tp2, tp3);
INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
dst0 = __msa_aver_u_b(src0, dst0);
ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
ST_W4(dst0, 0, 1, 2, 3, dst, stride);
} else if (2 == height) {
LW2(src, stride, tp0, tp1);
INSERT_W2_UB(tp0, tp1, src0);
LW2(dst, stride, tp0, tp1);
INSERT_W2_UB(tp0, tp1, dst0);
dst0 = __msa_aver_u_b(src0, dst0);
ST4x2_UB(dst0, dst, stride);
ST_W2(dst0, 0, 1, dst, stride);
}
}
@ -1889,7 +1890,7 @@ static void avg_width8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
INSERT_D2_UB(tp6, tp7, dst3);
AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
dst2, dst3);
ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
} else if (4 == height) {
LD4(src, stride, tp0, tp1, tp2, tp3);
INSERT_D2_UB(tp0, tp1, src0);
@ -1898,7 +1899,7 @@ static void avg_width8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
INSERT_D2_UB(tp0, tp1, dst0);
INSERT_D2_UB(tp2, tp3, dst1);
AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
ST8x4_UB(dst0, dst1, dst, stride);
ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
}
}

View File

@ -45,7 +45,7 @@ static void avc_wgt_4x2_msa(uint8_t *data, int32_t stride,
tmp0 = __msa_srlr_h(tmp0, denom);
tmp0 = (v8i16) __msa_sat_u_h((v8u16) tmp0, 7);
src0 = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
ST4x2_UB(src0, data, stride);
ST_W2(src0, 0, 1, data, stride);
}
static void avc_wgt_4x4_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
@ -71,7 +71,7 @@ static void avc_wgt_4x4_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
tmp1 = __msa_srlr_h(tmp1, denom);
SAT_UH2_SH(tmp0, tmp1, 7);
src0 = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
ST4x4_UB(src0, src0, 0, 1, 2, 3, data, stride);
ST_W4(src0, 0, 1, 2, 3, data, stride);
}
static void avc_wgt_4x8_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
@ -102,7 +102,7 @@ static void avc_wgt_4x8_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
SRLR_H4_SH(tmp0, tmp1, tmp2, tmp3, denom);
SAT_UH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
ST4x8_UB(src0, src1, data, stride);
ST_W8(src0, src1, 0, 1, 2, 3, 0, 1, 2, 3, data, stride);
}
static void avc_wgt_8x4_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
@ -133,7 +133,7 @@ static void avc_wgt_8x4_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
SRLR_H4_SH(tmp0, tmp1, tmp2, tmp3, denom);
SAT_UH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
ST8x4_UB(src0, src1, data, stride);
ST_D4(src0, src1, 0, 1, 0, 1, data, stride);
}
static void avc_wgt_8x8_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
@ -175,7 +175,7 @@ static void avc_wgt_8x8_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, src0, src1,
src2, src3);
ST8x8_UB(src0, src1, src2, src3, data, stride);
ST_D8(src0, src1, src2, src3, 0, 1, 0, 1, 0, 1, 0, 1, data, stride);
}
static void avc_wgt_8x16_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
@ -218,7 +218,7 @@ static void avc_wgt_8x16_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, src0, src1,
src2, src3);
ST8x8_UB(src0, src1, src2, src3, data, stride);
ST_D8(src0, src1, src2, src3, 0, 1, 0, 1, 0, 1, 0, 1, data, stride);
data += 8 * stride;
}
}
@ -253,7 +253,7 @@ static void avc_biwgt_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
tmp0 = __msa_maxi_s_h(tmp0, 0);
tmp0 = __msa_min_s_h(max255, tmp0);
dst0 = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
ST4x2_UB(dst0, dst, stride);
ST_W2(dst0, 0, 1, dst, stride);
}
static void avc_biwgt_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@ -287,7 +287,7 @@ static void avc_biwgt_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
tmp1 >>= denom;
CLIP_SH2_0_255(tmp0, tmp1);
dst0 = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
ST_W4(dst0, 0, 1, 2, 3, dst, stride);
}
static void avc_biwgt_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@ -327,7 +327,7 @@ static void avc_biwgt_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
ST4x8_UB(dst0, dst1, dst, stride);
ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
}
static void avc_biwgt_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@ -365,7 +365,7 @@ static void avc_biwgt_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
ST8x4_UB(dst0, dst1, dst, stride);
ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
}
static void avc_biwgt_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@ -417,7 +417,7 @@ static void avc_biwgt_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
CLIP_SH4_0_255(tmp4, tmp5, tmp6, tmp7);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
PCKEV_B2_UB(tmp5, tmp4, tmp7, tmp6, dst2, dst3);
ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}
static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@ -479,7 +479,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride,
CLIP_SH4_0_255(temp4, temp5, temp6, temp7);
PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
dst0, dst1, dst2, dst3);
ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
dst += 8 * stride;
}
}
@ -955,18 +955,18 @@ static void avc_loopfilter_luma_intra_edge_ver_msa(uint8_t *data,
ILVRL_H2_SH(tp3, tp2, tmp6, tmp7);
src = data - 3;
ST4x4_UB(tmp3, tmp3, 0, 1, 2, 3, src, img_width);
ST2x4_UB(tmp2, 0, src + 4, img_width);
ST_W4(tmp3, 0, 1, 2, 3, src, img_width);
ST_H4(tmp2, 0, 1, 2, 3, src + 4, img_width);
src += 4 * img_width;
ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, src, img_width);
ST2x4_UB(tmp2, 4, src + 4, img_width);
ST_W4(tmp4, 0, 1, 2, 3, src, img_width);
ST_H4(tmp2, 4, 5, 6, 7, src + 4, img_width);
src += 4 * img_width;
ST4x4_UB(tmp6, tmp6, 0, 1, 2, 3, src, img_width);
ST2x4_UB(tmp5, 0, src + 4, img_width);
ST_W4(tmp6, 0, 1, 2, 3, src, img_width);
ST_H4(tmp5, 0, 1, 2, 3, src + 4, img_width);
src += 4 * img_width;
ST4x4_UB(tmp7, tmp7, 0, 1, 2, 3, src, img_width);
ST2x4_UB(tmp5, 4, src + 4, img_width);
ST_W4(tmp7, 0, 1, 2, 3, src, img_width);
ST_H4(tmp5, 4, 5, 6, 7, src + 4, img_width);
}
}
}
@ -1274,9 +1274,9 @@ static void avc_loopfilter_cb_or_cr_intra_edge_ver_msa(uint8_t *data_cb_or_cr,
tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_or_p0_org, (v16i8) p0_or_q0_org);
data_cb_or_cr -= 1;
ST2x4_UB(tmp1, 0, data_cb_or_cr, img_width);
ST_H4(tmp1, 0, 1, 2, 3, data_cb_or_cr, img_width);
data_cb_or_cr += 4 * img_width;
ST2x4_UB(tmp1, 4, data_cb_or_cr, img_width);
ST_H4(tmp1, 4, 5, 6, 7, data_cb_or_cr, img_width);
}
}
@ -2110,9 +2110,9 @@ static void avc_loopfilter_cb_or_cr_inter_edge_ver_msa(uint8_t *data,
q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_org, (v16i8) p0_org);
src = data - 1;
ST2x4_UB(tmp1, 0, src, img_width);
ST_H4(tmp1, 0, 1, 2, 3, src, img_width);
src += 4 * img_width;
ST2x4_UB(tmp1, 4, src, img_width);
ST_H4(tmp1, 4, 5, 6, 7, src, img_width);
}
}
}
@ -2136,7 +2136,7 @@ static void avc_h_loop_filter_chroma422_msa(uint8_t *src, int32_t stride,
}
AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res);
ST2x4_UB(res, 0, (src - 1), stride);
ST_H4(res, 0, 1, 2, 3, (src - 1), stride);
src += (4 * stride);
}
}

View File

@ -237,9 +237,7 @@ static void avc_idct8_addblk_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
CLIP_SH4_0_255(res4, res5, res6, res7);
PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
dst0, dst1, dst2, dst3);
ST8x4_UB(dst0, dst1, dst, dst_stride);
dst += (4 * dst_stride);
ST8x4_UB(dst2, dst3, dst, dst_stride);
ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride)
}
static void avc_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src,
@ -269,9 +267,7 @@ static void avc_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src,
CLIP_SH4_0_255(dst4_r, dst5_r, dst6_r, dst7_r);
PCKEV_B4_SB(dst1_r, dst0_r, dst3_r, dst2_r, dst5_r, dst4_r, dst7_r, dst6_r,
dst0, dst1, dst2, dst3);
ST8x4_UB(dst0, dst1, dst, dst_stride);
dst += (4 * dst_stride);
ST8x4_UB(dst2, dst3, dst, dst_stride);
ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride)
}
void ff_h264_idct_add_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
@ -340,7 +336,7 @@ void ff_h264_idct4x4_addblk_dc_msa(uint8_t *dst, int16_t *src,
ADD2(pred_r, input_dc, pred_l, input_dc, pred_r, pred_l);
CLIP_SH2_0_255(pred_r, pred_l);
out = __msa_pckev_b((v16i8) pred_l, (v16i8) pred_r);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}
void ff_h264_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src,

View File

@ -149,7 +149,7 @@ static void avc_luma_hv_qrt_4x4_msa(const uint8_t *src_x, const uint8_t *src_y,
SAT_SH2_SH(out0, out1, 7);
out = PCKEV_XORI128_UB(out0, out1);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
ST_W4(out, 0, 1, 2, 3, dst, stride);
}
static void avc_luma_hv_qrt_8x8_msa(const uint8_t *src_x, const uint8_t *src_y,
@ -220,7 +220,7 @@ static void avc_luma_hv_qrt_8x8_msa(const uint8_t *src_x, const uint8_t *src_y,
SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
out0 = PCKEV_XORI128_UB(tmp0, tmp1);
out1 = PCKEV_XORI128_UB(tmp2, tmp3);
ST8x4_UB(out0, out1, dst, stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
dst += (4 * stride);
LD_SB4(src_y, stride, src_vt9, src_vt10, src_vt11, src_vt12);
@ -256,8 +256,7 @@ static void avc_luma_hv_qrt_8x8_msa(const uint8_t *src_x, const uint8_t *src_y,
SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
out0 = PCKEV_XORI128_UB(tmp0, tmp1);
out1 = PCKEV_XORI128_UB(tmp2, tmp3);
ST8x4_UB(out0, out1, dst, stride);
dst += (4 * stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
}
static void avc_luma_hv_qrt_16x16_msa(const uint8_t *src_x,
@ -337,7 +336,7 @@ static void avc_luma_hv_qrt_16x16_msa(const uint8_t *src_x,
SAT_SH4_SH(out0, out1, out2, out3, 7);
tmp0 = PCKEV_XORI128_UB(out0, out1);
tmp1 = PCKEV_XORI128_UB(out2, out3);
ST8x4_UB(tmp0, tmp1, dst, stride);
ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, stride);
dst += (4 * stride);
src_vt0 = src_vt4;
@ -419,7 +418,7 @@ static void avc_luma_hv_qrt_and_aver_dst_4x4_msa(const uint8_t *src_x,
res = PCKEV_XORI128_UB(res0, res1);
dst0 = __msa_aver_u_b(res, dst0);
ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
ST_W4(dst0, 0, 1, 2, 3, dst, stride);
}
static void avc_luma_hv_qrt_and_aver_dst_8x8_msa(const uint8_t *src_x,
@ -498,7 +497,7 @@ static void avc_luma_hv_qrt_and_aver_dst_8x8_msa(const uint8_t *src_x,
out0 = PCKEV_XORI128_UB(tmp0, tmp1);
out1 = PCKEV_XORI128_UB(tmp2, tmp3);
AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
ST8x4_UB(dst0, dst1, dst, stride);
ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
dst += (4 * stride);
LD_SB4(src_y, stride, src_vt9, src_vt10, src_vt11, src_vt12);
@ -539,8 +538,7 @@ static void avc_luma_hv_qrt_and_aver_dst_8x8_msa(const uint8_t *src_x,
out0 = PCKEV_XORI128_UB(tmp0, tmp1);
out1 = PCKEV_XORI128_UB(tmp2, tmp3);
AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
ST8x4_UB(dst0, dst1, dst, stride);
dst += (4 * stride);
ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
}
static void avc_luma_hv_qrt_and_aver_dst_16x16_msa(const uint8_t *src_x,
@ -627,7 +625,7 @@ static void avc_luma_hv_qrt_and_aver_dst_16x16_msa(const uint8_t *src_x,
tmp0 = PCKEV_XORI128_UB(out0, out1);
tmp1 = PCKEV_XORI128_UB(out2, out3);
AVER_UB2_UB(tmp0, dst0, tmp1, dst1, dst0, dst1);
ST8x4_UB(dst0, dst1, dst, stride);
ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
dst += (4 * stride);
src_vt0 = src_vt4;
@ -723,7 +721,7 @@ void ff_avg_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src,
AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
dst2, dst3);
ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}
void ff_avg_h264_qpel4_mc00_msa(uint8_t *dst, const uint8_t *src,
@ -739,7 +737,7 @@ void ff_avg_h264_qpel4_mc00_msa(uint8_t *dst, const uint8_t *src,
dst0 = __msa_aver_u_b(src0, dst0);
ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
ST_W4(dst0, 0, 1, 2, 3, dst, stride);
}
void ff_put_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src,
@ -930,7 +928,7 @@ void ff_put_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src,
tmp2 = __msa_aver_s_b(tmp2, src4);
tmp3 = __msa_aver_s_b(tmp3, src5);
XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3);
ST8x8_UB(tmp0, tmp1, tmp2, tmp3, dst, stride);
ST_D8(tmp0, tmp1, tmp2, tmp3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}
void ff_put_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src,
@ -985,7 +983,7 @@ void ff_put_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src,
tmp2 = __msa_aver_s_b(tmp2, src4);
tmp3 = __msa_aver_s_b(tmp3, src5);
XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3);
ST8x8_UB(tmp0, tmp1, tmp2, tmp3, dst, stride);
ST_D8(tmp0, tmp1, tmp2, tmp3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}
void ff_put_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src,
@ -1016,7 +1014,7 @@ void ff_put_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src,
src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
res = __msa_aver_s_b(res, src0);
res = (v16i8) __msa_xori_b((v16u8) res, 128);
ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
ST_W4(res, 0, 1, 2, 3, dst, stride);
}
void ff_put_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src,
@ -1047,7 +1045,7 @@ void ff_put_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src,
src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
res = __msa_aver_s_b(res, src0);
res = (v16i8) __msa_xori_b((v16u8) res, 128);
ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
ST_W4(res, 0, 1, 2, 3, dst, stride);
}
void ff_put_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src,
@ -1153,7 +1151,7 @@ void ff_put_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src,
out1 = PCKEV_XORI128_UB(res2, res3);
out2 = PCKEV_XORI128_UB(res4, res5);
out3 = PCKEV_XORI128_UB(res6, res7);
ST8x8_UB(out0, out1, out2, out3, dst, stride);
ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}
void ff_put_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src,
@ -1178,7 +1176,7 @@ void ff_put_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src,
SRARI_H2_SH(res0, res1, 5);
SAT_SH2_SH(res0, res1, 7);
out = PCKEV_XORI128_UB(res0, res1);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
ST_W4(out, 0, 1, 2, 3, dst, stride);
}
void ff_put_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src,
@ -1378,7 +1376,7 @@ void ff_put_h264_qpel8_mc01_msa(uint8_t *dst, const uint8_t *src,
out2 = __msa_aver_s_b(out2, tmp2);
out3 = __msa_aver_s_b(out3, tmp3);
XORI_B4_128_SB(out0, out1, out2, out3);
ST8x8_UB(out0, out1, out2, out3, dst, stride);
ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}
void ff_put_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src,
@ -1431,7 +1429,7 @@ void ff_put_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src,
out2 = __msa_aver_s_b(out2, tmp2);
out3 = __msa_aver_s_b(out3, tmp3);
XORI_B4_128_SB(out0, out1, out2, out3);
ST8x8_UB(out0, out1, out2, out3, dst, stride);
ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}
void ff_put_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src,
@ -1472,7 +1470,7 @@ void ff_put_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src,
src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5);
src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
out = __msa_aver_u_b(out, (v16u8) src32_r);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
ST_W4(out, 0, 1, 2, 3, dst, stride);
}
void ff_put_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src,
@ -1513,7 +1511,7 @@ void ff_put_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src,
src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6);
src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
out = __msa_aver_u_b(out, (v16u8) src32_r);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
ST_W4(out, 0, 1, 2, 3, dst, stride);
}
void ff_put_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src,
@ -1691,7 +1689,7 @@ void ff_put_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src,
out0 = PCKEV_XORI128_UB(dst0, dst1);
out1 = PCKEV_XORI128_UB(dst2, dst3);
ST8x4_UB(out0, out1, dst, stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
dst += (4 * stride);
hz_out0 = hz_out4;
@ -1804,7 +1802,7 @@ void ff_put_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src,
out0 = PCKEV_XORI128_UB(dst0, dst1);
out1 = PCKEV_XORI128_UB(dst2, dst3);
ST8x4_UB(out0, out1, dst, stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
dst += (4 * stride);
hz_out0 = hz_out4;
@ -1905,7 +1903,7 @@ void ff_put_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src,
out0 = PCKEV_XORI128_UB(dst0, dst1);
out1 = PCKEV_XORI128_UB(dst2, dst3);
ST8x4_UB(out0, out1, dst, stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
dst += (4 * stride);
LD_SB4(src, stride, src9, src10, src11, src12);
@ -1951,7 +1949,7 @@ void ff_put_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src,
out0 = PCKEV_XORI128_UB(dst0, dst1);
out1 = PCKEV_XORI128_UB(dst2, dst3);
ST8x4_UB(out0, out1, dst, stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
}
void ff_put_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src,
@ -2040,7 +2038,7 @@ void ff_put_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src,
out0 = PCKEV_XORI128_UB(dst0, dst1);
out1 = PCKEV_XORI128_UB(dst2, dst3);
ST8x4_UB(out0, out1, dst, stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
dst += (4 * stride);
LD_SB4(src, stride, src9, src10, src11, src12);
@ -2086,7 +2084,7 @@ void ff_put_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src,
out0 = PCKEV_XORI128_UB(dst0, dst1);
out1 = PCKEV_XORI128_UB(dst2, dst3);
ST8x4_UB(out0, out1, dst, stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
}
void ff_put_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src,
@ -2150,7 +2148,7 @@ void ff_put_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src,
dst1 = __msa_aver_s_h(dst1, hz_out4);
res = PCKEV_XORI128_UB(dst0, dst1);
ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
ST_W4(res, 0, 1, 2, 3, dst, stride);
}
void ff_put_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src,
@ -2215,7 +2213,7 @@ void ff_put_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src,
dst1 = __msa_aver_s_h(dst1, hz_out1);
res = PCKEV_XORI128_UB(dst0, dst1);
ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
ST_W4(res, 0, 1, 2, 3, dst, stride);
}
void ff_put_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src,
@ -2332,7 +2330,7 @@ void ff_put_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src,
out1 = PCKEV_XORI128_UB(out2_r, out3_r);
out2 = PCKEV_XORI128_UB(out4_r, out5_r);
out3 = PCKEV_XORI128_UB(out6_r, out7_r);
ST8x8_UB(out0, out1, out2, out3, dst, stride);
ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}
void ff_put_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src,
@ -2369,7 +2367,7 @@ void ff_put_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src,
SRARI_H2_SH(out10, out32, 5);
SAT_SH2_SH(out10, out32, 7);
out = PCKEV_XORI128_UB(out10, out32);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
ST_W4(out, 0, 1, 2, 3, dst, stride);
}
void ff_put_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src,
@ -2601,7 +2599,7 @@ void ff_put_h264_qpel8_mc12_msa(uint8_t *dst, const uint8_t *src,
dst0 = __msa_aver_s_h(dst2, dst0);
dst1 = __msa_aver_s_h(dst3, dst1);
out = PCKEV_XORI128_UB(dst0, dst1);
ST8x2_UB(out, dst, stride);
ST_D2(out, 0, 1, dst, stride);
dst += (2 * stride);
src0 = src2;
@ -2677,7 +2675,7 @@ void ff_put_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src,
dst0 = __msa_aver_s_h(dst2, dst0);
dst1 = __msa_aver_s_h(dst3, dst1);
out = PCKEV_XORI128_UB(dst0, dst1);
ST8x2_UB(out, dst, stride);
ST_D2(out, 0, 1, dst, stride);
dst += (2 * stride);
src0 = src2;
@ -2777,7 +2775,7 @@ void ff_put_h264_qpel4_mc12_msa(uint8_t *dst, const uint8_t *src,
PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
out = PCKEV_XORI128_UB(dst0, dst2);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
ST_W4(out, 0, 1, 2, 3, dst, stride);
}
void ff_put_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src,
@ -2873,7 +2871,7 @@ void ff_put_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src,
PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
out = PCKEV_XORI128_UB(dst0, dst2);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
ST_W4(out, 0, 1, 2, 3, dst, stride);
}
void ff_put_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src,
@ -2961,7 +2959,7 @@ void ff_put_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src,
out0 = PCKEV_XORI128_UB(dst0, dst1);
out1 = PCKEV_XORI128_UB(dst2, dst3);
ST8x4_UB(out0, out1, dst, stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
dst += (4 * stride);
hz_out0 = hz_out4;
@ -3049,7 +3047,7 @@ void ff_put_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src,
dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
out0 = PCKEV_XORI128_UB(dst0, dst1);
out1 = PCKEV_XORI128_UB(dst2, dst3);
ST8x4_UB(out0, out1, dst, stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
dst += (4 * stride);
LD_SB4(src, stride, src0, src1, src2, src3);
@ -3086,7 +3084,7 @@ void ff_put_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src,
dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
out0 = PCKEV_XORI128_UB(dst0, dst1);
out1 = PCKEV_XORI128_UB(dst2, dst3);
ST8x4_UB(out0, out1, dst, stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
}
void ff_put_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src,
@ -3141,7 +3139,7 @@ void ff_put_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src,
filt2);
dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
res = PCKEV_XORI128_UB(dst0, dst1);
ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
ST_W4(res, 0, 1, 2, 3, dst, stride);
}
void ff_avg_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src,
@ -3350,7 +3348,7 @@ void ff_avg_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src,
INSERT_D2_UB(tp2, tp3, dst3);
AVER_UB2_UB(tmp0, dst0, tmp1, dst1, dst0, dst1);
AVER_UB2_UB(tmp2, dst2, tmp3, dst3, dst2, dst3);
ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}
void ff_avg_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src,
@ -3415,7 +3413,7 @@ void ff_avg_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src,
INSERT_D2_UB(tp2, tp3, dst3);
AVER_UB2_UB(tmp0, dst0, tmp1, dst1, dst0, dst1);
AVER_UB2_UB(tmp2, dst2, tmp3, dst3, dst2, dst3);
ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}
void ff_avg_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src,
@ -3451,7 +3449,7 @@ void ff_avg_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src,
LW4(dst, stride, tp0, tp1, tp2, tp3);
INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
dst0 = __msa_aver_u_b((v16u8) res, dst0);
ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
ST_W4(dst0, 0, 1, 2, 3, dst, stride);
}
void ff_avg_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src,
@ -3487,7 +3485,7 @@ void ff_avg_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src,
LW4(dst, stride, tp0, tp1, tp2, tp3);
INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
dst0 = __msa_aver_u_b((v16u8) res, dst0);
ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
ST_W4(dst0, 0, 1, 2, 3, dst, stride);
}
void ff_avg_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src,
@ -3608,7 +3606,7 @@ void ff_avg_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src,
INSERT_D2_UB(tp2, tp3, out7);
AVER_UB2_UB(out0, out2, out1, out3, out0, out1);
AVER_UB2_UB(out4, out6, out5, out7, out4, out5);
ST8x8_UB(out0, out1, out4, out5, dst, stride);
ST_D8(out0, out1, out4, out5, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}
void ff_avg_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src,
@ -3637,7 +3635,7 @@ void ff_avg_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src,
LW4(dst, stride, tp0, tp1, tp2, tp3);
INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
res = __msa_aver_u_b(res, dst0);
ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
ST_W4(res, 0, 1, 2, 3, dst, stride);
}
void ff_avg_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src,
@ -3856,7 +3854,7 @@ void ff_avg_h264_qpel8_mc01_msa(uint8_t *dst, const uint8_t *src,
XORI_B4_128_SB(out0, out1, out2, out3);
AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1,
dst2, dst3);
ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}
void ff_avg_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src,
@ -3922,7 +3920,7 @@ void ff_avg_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src,
XORI_B4_128_SB(out0, out1, out2, out3);
AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1,
dst2, dst3);
ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}
void ff_avg_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src,
@ -3967,7 +3965,7 @@ void ff_avg_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src,
res = PCKEV_XORI128_UB(out10, out32);
res = __msa_aver_u_b(res, (v16u8) src32_r);
dst0 = __msa_aver_u_b(res, dst0);
ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
ST_W4(dst0, 0, 1, 2, 3, dst, stride);
}
void ff_avg_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src,
@ -4013,7 +4011,7 @@ void ff_avg_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src,
src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
res = __msa_aver_u_b(res, (v16u8) src32_r);
dst0 = __msa_aver_u_b(res, dst0);
ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
ST_W4(dst0, 0, 1, 2, 3, dst, stride);
}
void ff_avg_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src,
@ -4196,7 +4194,7 @@ void ff_avg_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src,
out0 = PCKEV_XORI128_UB(tmp0, tmp1);
dst0 = __msa_aver_u_b(out0, dst0);
ST8x2_UB(dst0, dst, stride);
ST_D2(dst0, 0, 1, dst, stride);
dst += (2 * stride);
LD_SB2(src, stride, src7, src8);
@ -4232,7 +4230,7 @@ void ff_avg_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src,
out1 = PCKEV_XORI128_UB(tmp2, tmp3);
dst1 = __msa_aver_u_b(out1, dst1);
ST8x2_UB(dst1, dst, stride);
ST_D2(dst1, 0, 1, dst, stride);
dst += (2 * stride);
hz_out0 = hz_out4;
@ -4326,7 +4324,7 @@ void ff_avg_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src,
INSERT_D2_UB(tp0, tp1, dst0);
out0 = PCKEV_XORI128_UB(tmp0, tmp1);
dst0 = __msa_aver_u_b(out0, dst0);
ST8x2_UB(dst0, dst, stride);
ST_D2(dst0, 0, 1, dst, stride);
dst += (2 * stride);
LD_SB2(src, stride, src7, src8);
@ -4361,7 +4359,7 @@ void ff_avg_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src,
INSERT_D2_UB(tp2, tp3, dst1);
out1 = PCKEV_XORI128_UB(tmp2, tmp3);
dst1 = __msa_aver_u_b(out1, dst1);
ST8x2_UB(dst1, dst, stride);
ST_D2(dst1, 0, 1, dst, stride);
dst += (2 * stride);
hz_out0 = hz_out4;
@ -4468,7 +4466,7 @@ void ff_avg_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src,
out0 = PCKEV_XORI128_UB(tmp0, tmp1);
out1 = PCKEV_XORI128_UB(tmp2, tmp3);
AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
ST8x4_UB(dst0, dst1, dst, stride);
ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
dst += (4 * stride);
LD_SB4(src, stride, src9, src10, src11, src12);
@ -4519,7 +4517,7 @@ void ff_avg_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src,
out0 = PCKEV_XORI128_UB(tmp0, tmp1);
out1 = PCKEV_XORI128_UB(tmp2, tmp3);
AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
ST8x4_UB(dst0, dst1, dst, stride);
ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
}
void ff_avg_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src,
@ -4614,7 +4612,7 @@ void ff_avg_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src,
out0 = PCKEV_XORI128_UB(tmp0, tmp1);
out1 = PCKEV_XORI128_UB(tmp2, tmp3);
AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
ST8x4_UB(dst0, dst1, dst, stride);
ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
dst += (4 * stride);
LD_SB4(src, stride, src9, src10, src11, src12);
@ -4665,7 +4663,7 @@ void ff_avg_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src,
out0 = PCKEV_XORI128_UB(tmp0, tmp1);
out1 = PCKEV_XORI128_UB(tmp2, tmp3);
AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
ST8x4_UB(dst0, dst1, dst, stride);
ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
}
void ff_avg_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src,
@ -4732,7 +4730,7 @@ void ff_avg_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src,
INSERT_W4_UB(tp0, tp1, tp2, tp3, out);
res = PCKEV_XORI128_UB(dst0, dst1);
res = __msa_aver_u_b(res, out);
ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
ST_W4(res, 0, 1, 2, 3, dst, stride);
}
void ff_avg_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src,
@ -4800,7 +4798,7 @@ void ff_avg_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src,
INSERT_W4_UB(tp0, tp1, tp2, tp3, out);
res = PCKEV_XORI128_UB(dst0, dst1);
res = __msa_aver_u_b(res, out);
ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
ST_W4(res, 0, 1, 2, 3, dst, stride);
}
void ff_avg_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src,
@ -4936,7 +4934,7 @@ void ff_avg_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src,
out3 = PCKEV_XORI128_UB(out6_r, out7_r);
AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1,
dst2, dst3);
ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}
void ff_avg_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src,
@ -4977,7 +4975,7 @@ void ff_avg_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src,
INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
res = PCKEV_XORI128_UB(out10, out32);
dst0 = __msa_aver_u_b(res, dst0);
ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
ST_W4(dst0, 0, 1, 2, 3, dst, stride);
}
void ff_avg_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src,
@ -5217,7 +5215,7 @@ void ff_avg_h264_qpel8_mc12_msa(uint8_t *dst, const uint8_t *src,
tmp1 = __msa_aver_s_h(tmp3, tmp1);
out = PCKEV_XORI128_UB(tmp0, tmp1);
out = __msa_aver_u_b(out, dst0);
ST8x2_UB(out, dst, stride);
ST_D2(out, 0, 1, dst, stride);
dst += (2 * stride);
src0 = src2;
@ -5297,7 +5295,7 @@ void ff_avg_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src,
tmp1 = __msa_aver_s_h(tmp3, tmp1);
out = PCKEV_XORI128_UB(tmp0, tmp1);
out = __msa_aver_u_b(out, dst0);
ST8x2_UB(out, dst, stride);
ST_D2(out, 0, 1, dst, stride);
dst += (2 * stride);
src0 = src2;
@ -5401,7 +5399,7 @@ void ff_avg_h264_qpel4_mc12_msa(uint8_t *dst, const uint8_t *src,
PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
out = PCKEV_XORI128_UB(dst0, dst2);
out = __msa_aver_u_b(out, dstv);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
ST_W4(out, 0, 1, 2, 3, dst, stride);
}
void ff_avg_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src,
@ -5500,7 +5498,7 @@ void ff_avg_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src,
PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
out = PCKEV_XORI128_UB(dst0, dst2);
out = __msa_aver_u_b(out, dstv);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
ST_W4(out, 0, 1, 2, 3, dst, stride);
}
void ff_avg_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src,
@ -5592,7 +5590,7 @@ void ff_avg_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src,
out0 = PCKEV_XORI128_UB(res0, res1);
out1 = PCKEV_XORI128_UB(res2, res3);
AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
ST8x4_UB(out0, out1, dst, stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
dst += (4 * stride);
hz_out0 = hz_out4;
@ -5685,7 +5683,7 @@ void ff_avg_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src,
out0 = PCKEV_XORI128_UB(res0, res1);
out1 = PCKEV_XORI128_UB(res2, res3);
AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
ST8x4_UB(dst0, dst1, dst, stride);
ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
dst += (4 * stride);
LD_SB4(src, stride, src0, src1, src2, src3);
@ -5726,7 +5724,7 @@ void ff_avg_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src,
out0 = PCKEV_XORI128_UB(res0, res1);
out1 = PCKEV_XORI128_UB(res2, res3);
AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
ST8x4_UB(dst0, dst1, dst, stride);
ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
}
void ff_avg_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src,
@ -5785,5 +5783,5 @@ void ff_avg_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src,
INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
res = PCKEV_XORI128_UB(res0, res1);
res = __msa_aver_u_b(res, dst0);
ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
ST_W4(res, 0, 1, 2, 3, dst, stride);
}

View File

@ -727,7 +727,7 @@ static void hevc_addblk_4x4_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
ADD2(dst_r0, in0, dst_l0, in1, dst_r0, dst_l0);
CLIP_SH2_0_255(dst_r0, dst_l0);
dst_vec = (v4i32) __msa_pckev_b((v16i8) dst_l0, (v16i8) dst_r0);
ST4x4_UB(dst_vec, dst_vec, 0, 1, 2, 3, dst, stride);
ST_W4(dst_vec, 0, 1, 2, 3, dst, stride);
}
static void hevc_addblk_8x8_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
@ -752,8 +752,7 @@ static void hevc_addblk_8x8_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
dst_r0, dst_l0, dst_r1, dst_l1);
CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
PCKEV_B2_SH(dst_l0, dst_r0, dst_l1, dst_r1, dst_r0, dst_r1);
ST8x4_UB(dst_r0, dst_r1, dst, stride);
dst += (4 * stride);
ST_D4(dst_r0, dst_r1, 0, 1, 0, 1, dst, stride);
LD4(temp_dst, stride, dst0, dst1, dst2, dst3);
INSERT_D2_SD(dst0, dst1, dst_vec0);
@ -764,7 +763,7 @@ static void hevc_addblk_8x8_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
dst_r0, dst_l0, dst_r1, dst_l1);
CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
PCKEV_B2_SH(dst_l0, dst_r0, dst_l1, dst_r1, dst_r0, dst_r1);
ST8x4_UB(dst_r0, dst_r1, dst, stride);
ST_D4(dst_r0, dst_r1, 0, 1, 0, 1, dst + 4 * stride, stride);
}
static void hevc_addblk_16x16_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)

View File

@ -199,11 +199,9 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
dst_val0 = __msa_copy_u_d((v2i64) dst2, 0);
dst_val1 = __msa_copy_u_d((v2i64) dst2, 1);
ST8x4_UB(dst0, dst1, p2, stride);
p2 += (4 * stride);
SD(dst_val0, p2);
p2 += stride;
SD(dst_val1, p2);
ST_D4(dst0, dst1, 0, 1, 0, 1, p2, stride);
SD(dst_val0, p2 + 4 * stride);
SD(dst_val1, p2 + 5 * stride);
/* strong filter ends */
} else if (flag0 == flag1) { /* weak only */
/* weak filter */
@ -288,7 +286,7 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
dst1 = __msa_bmz_v(dst1, dst3, (v16u8) cmp3);
p2 += stride;
ST8x4_UB(dst0, dst1, p2, stride);
ST_D4(dst0, dst1, 0, 1, 0, 1, p2, stride);
/* weak filter ends */
} else { /* strong + weak */
/* strong filter */
@ -442,11 +440,9 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
dst_val0 = __msa_copy_u_d((v2i64) dst2, 0);
dst_val1 = __msa_copy_u_d((v2i64) dst2, 1);
ST8x4_UB(dst0, dst1, p2, stride);
p2 += (4 * stride);
SD(dst_val0, p2);
p2 += stride;
SD(dst_val1, p2);
ST_D4(dst0, dst1, 0, 1, 0, 1, p2, stride);
SD(dst_val0, p2 + 4 * stride);
SD(dst_val1, p2 + 5 * stride);
}
}
}
@ -976,7 +972,7 @@ static void hevc_loopfilter_chroma_hor_msa(uint8_t *src, int32_t stride,
temp1 = (v8i16) __msa_bmnz_v((v16u8) temp1, (v16u8) q0, (v16u8) tc_pos);
temp0 = (v8i16) __msa_pckev_b((v16i8) temp1, (v16i8) temp0);
ST8x2_UB(temp0, p0_ptr, stride);
ST_D2(temp0, 0, 1, p0_ptr, stride);
}
}
@ -1037,9 +1033,7 @@ static void hevc_loopfilter_chroma_ver_msa(uint8_t *src, int32_t stride,
temp0 = (v8i16) __msa_ilvev_b((v16i8) temp1, (v16i8) temp0);
src += 1;
ST2x4_UB(temp0, 0, src, stride);
src += (4 * stride);
ST2x4_UB(temp0, 4, src, stride);
ST_H8(temp0, 0, 1, 2, 3, 4, 5, 6, 7, src, stride);
}
}
@ -1087,7 +1081,7 @@ static void hevc_sao_band_filter_4width_msa(uint8_t *dst, int32_t dst_stride,
LD_UB4(src, src_stride, src0, src1, src2, src3);
/* store results */
ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
}
@ -1102,7 +1096,7 @@ static void hevc_sao_band_filter_4width_msa(uint8_t *dst, int32_t dst_stride,
dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
/* store results */
ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
}
static void hevc_sao_band_filter_8width_msa(uint8_t *dst, int32_t dst_stride,
@ -1153,7 +1147,7 @@ static void hevc_sao_band_filter_8width_msa(uint8_t *dst, int32_t dst_stride,
XORI_B2_128_SB(dst0, dst1);
/* store results */
ST8x4_UB(dst0, dst1, dst, dst_stride);
ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
dst += dst_stride << 2;
}
@ -1173,7 +1167,7 @@ static void hevc_sao_band_filter_8width_msa(uint8_t *dst, int32_t dst_stride,
XORI_B2_128_SB(dst0, dst1);
/* store results */
ST8x4_UB(dst0, dst1, dst, dst_stride);
ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
}
static void hevc_sao_band_filter_16multiple_msa(uint8_t *dst,

View File

@ -86,7 +86,7 @@ static void hevc_bi_copy_4w_msa(uint8_t *src0_ptr,
dst0 = CLIP_SH_0_255_MAX_SATU(dst0);
dst0 = (v8i16) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
ST4x2_UB(dst0, dst, dst_stride);
ST_W2(dst0, 0, 1, dst, dst_stride);
} else if (4 == height) {
LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
@ -97,7 +97,7 @@ static void hevc_bi_copy_4w_msa(uint8_t *src0_ptr,
SLLI_2V(dst0, dst1, 6);
HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, dst0, dst1, 7, dst0, dst1);
dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
} else if (0 == height % 8) {
for (loop_cnt = (height >> 3); loop_cnt--;) {
LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
@ -120,7 +120,7 @@ static void hevc_bi_copy_4w_msa(uint8_t *src0_ptr,
HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2,
dst3, 7, dst0, dst1, dst2, dst3);
PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
ST4x8_UB(dst0, dst1, dst, dst_stride);
ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
}
}
@ -165,9 +165,15 @@ static void hevc_bi_copy_6w_msa(uint8_t *src0_ptr,
7, dst4, dst5, dst6, dst7);
PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
ST6x4_UB(out0, out1, dst, dst_stride);
ST_W2(out0, 0, 2, dst, dst_stride);
ST_H2(out0, 2, 6, dst + 4, dst_stride);
ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
dst += (4 * dst_stride);
ST6x4_UB(out2, out3, dst, dst_stride);
ST_W2(out2, 0, 2, dst, dst_stride);
ST_H2(out2, 2, 6, dst + 4, dst_stride);
ST_W2(out3, 0, 2, dst + 2 * dst_stride, dst_stride);
ST_H2(out3, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
dst += (4 * dst_stride);
}
}
@ -195,7 +201,7 @@ static void hevc_bi_copy_8w_msa(uint8_t *src0_ptr,
SLLI_2V(dst0, dst1, 6);
HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, dst0, dst1, 7, dst0, dst1);
out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
ST8x2_UB(out0, dst, dst_stride);
ST_D2(out0, 0, 1, dst, dst_stride);
} else if (4 == height) {
LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
INSERT_D2_SB(tp0, tp1, src0);
@ -207,7 +213,7 @@ static void hevc_bi_copy_8w_msa(uint8_t *src0_ptr,
HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
7, dst0, dst1, dst2, dst3);
PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
} else if (6 == height) {
LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
src0_ptr += 4 * src_stride;
@ -225,9 +231,8 @@ static void hevc_bi_copy_8w_msa(uint8_t *src0_ptr,
7, dst0, dst1, dst2, dst3);
HEVC_BI_RND_CLIP2_MAX_SATU(in4, in5, dst4, dst5, 7, dst4, dst5);
PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
ST8x4_UB(out0, out1, dst, dst_stride);
dst += (4 * dst_stride);
ST8x2_UB(out2, dst, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
} else if (0 == height % 8) {
uint32_t loop_cnt;
@ -255,7 +260,7 @@ static void hevc_bi_copy_8w_msa(uint8_t *src0_ptr,
dst7, 7, dst4, dst5, dst6, dst7);
PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
ST8x8_UB(out0, out1, out2, out3, dst, dst_stride);
ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
dst += (8 * dst_stride);
}
}
@ -294,7 +299,8 @@ static void hevc_bi_copy_12w_msa(uint8_t *src0_ptr,
7, dst0, dst1, dst2, dst3);
HEVC_BI_RND_CLIP2_MAX_SATU(in4, in5, dst4, dst5, 7, dst4, dst5);
PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
ST12x4_UB(out0, out1, out2, dst, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
dst += (4 * dst_stride);
}
}
@ -378,7 +384,7 @@ static void hevc_bi_copy_24w_msa(uint8_t *src0_ptr,
PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
ST_UB4(out0, out1, out3, out4, dst, dst_stride);
ST8x4_UB(out2, out5, dst + 16, dst_stride);
ST_D4(out2, out5, 0, 1, 0, 1, dst + 16, dst_stride);
dst += (4 * dst_stride);
}
}
@ -588,7 +594,7 @@ static void hevc_hz_bi_8t_4w_msa(uint8_t *src0_ptr,
dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
ST4x8_UB(dst0, dst1, dst, dst_stride);
ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
}
}
@ -656,7 +662,7 @@ static void hevc_hz_bi_8t_8w_msa(uint8_t *src0_ptr,
dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
ST8x4_UB(dst0, dst1, dst, dst_stride);
ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
@ -1242,7 +1248,7 @@ static void hevc_vt_bi_8t_4w_msa(uint8_t *src0_ptr,
dst10, dst32, dst54, dst76);
PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54);
ST4x8_UB(dst10, dst54, dst, dst_stride);
ST_W8(dst10, dst54, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
src2110 = src10998;
@ -1316,7 +1322,7 @@ static void hevc_vt_bi_8t_8w_msa(uint8_t *src0_ptr,
dst0_r, dst1_r, dst2_r, dst3_r);
PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
src10_r = src54_r;
@ -1420,7 +1426,8 @@ static void hevc_vt_bi_8t_12w_msa(uint8_t *src0_ptr,
PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l);
ST12x4_UB(dst0_r, dst1_r, dst0_l, dst, dst_stride);
ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
ST_W4(dst0_l, 0, 1, 2, 3, dst + 8, dst_stride);
dst += (4 * dst_stride);
src10_r = src54_r;
@ -1721,7 +1728,7 @@ static void hevc_hv_bi_8t_4w_msa(uint8_t *src0_ptr,
SRARI_H2_SH(out0, out1, 7);
CLIP_SH2_0_255_MAX_SATU(out0, out1);
out = (v16u8) __msa_pckev_b((v16i8) out1, (v16i8) out0);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
dst10 = dst54;
@ -1849,7 +1856,7 @@ static void hevc_hv_bi_8t_8multx1mult_msa(uint8_t *src0_ptr,
tmp = __msa_srari_h(tmp, 7);
tmp = CLIP_SH_0_255_MAX_SATU(tmp);
out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
ST8x1_UB(out, dst_tmp);
ST_D1(out, 0, dst_tmp);
dst_tmp += dst_stride;
dst0 = dst1;
@ -1995,7 +2002,7 @@ static void hevc_hv_bi_8t_12w_msa(uint8_t *src0_ptr,
tmp = __msa_srari_h(tmp, 7);
tmp = CLIP_SH_0_255_MAX_SATU(tmp);
out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
ST8x1_UB(out, dst_tmp);
ST_D1(out, 0, dst_tmp);
dst_tmp += dst_stride;
dst0 = dst1;
@ -2083,7 +2090,7 @@ static void hevc_hv_bi_8t_12w_msa(uint8_t *src0_ptr,
SRARI_H2_SH(out0, out1, 7);
CLIP_SH2_0_255_MAX_SATU(out0, out1);
out = (v16u8) __msa_pckev_b((v16i8) out1, (v16i8) out0);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
dst10 = dst54;
@ -2211,7 +2218,7 @@ static void hevc_hz_bi_4t_4x2_msa(uint8_t *src0_ptr,
tmp0 = CLIP_SH_0_255(tmp0);
dst0 = __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
ST4x2_UB(dst0, dst, dst_stride);
ST_W2(dst0, 0, 1, dst, dst_stride);
}
static void hevc_hz_bi_4t_4x4_msa(uint8_t *src0_ptr,
@ -2257,7 +2264,7 @@ static void hevc_hz_bi_4t_4x4_msa(uint8_t *src0_ptr,
HEVC_BI_RND_CLIP2(in0, in1, tmp0, tmp1, 7, tmp0, tmp1);
dst0 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
}
static void hevc_hz_bi_4t_4x8multiple_msa(uint8_t *src0_ptr,
@ -2318,7 +2325,7 @@ static void hevc_hz_bi_4t_4x8multiple_msa(uint8_t *src0_ptr,
tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
ST4x8_UB(dst0, dst1, dst, dst_stride);
ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
}
}
@ -2398,7 +2405,10 @@ static void hevc_hz_bi_4t_6w_msa(uint8_t *src0_ptr,
dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
ST6x4_UB(dst0, dst1, dst, dst_stride);
ST_W2(dst0, 0, 2, dst, dst_stride);
ST_H2(dst0, 2, 6, dst + 4, dst_stride);
ST_W2(dst1, 0, 2, dst + 2 * dst_stride, dst_stride);
ST_H2(dst1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
dst += (4 * dst_stride);
}
}
@ -2443,7 +2453,7 @@ static void hevc_hz_bi_4t_8x2_msa(uint8_t *src0_ptr,
HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
ST8x2_UB(dst0, dst, dst_stride);
ST_D2(dst0, 0, 1, dst, dst_stride);
}
static void hevc_hz_bi_4t_8x6_msa(uint8_t *src0_ptr,
@ -2506,9 +2516,8 @@ static void hevc_hz_bi_4t_8x6_msa(uint8_t *src0_ptr,
PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
ST8x4_UB(dst0, dst1, dst, dst_stride);
dst += (4 * dst_stride);
ST8x2_UB(dst2, dst, dst_stride);
ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
ST_D2(dst2, 0, 1, dst + 4 * dst_stride, dst_stride);
}
static void hevc_hz_bi_4t_8x4multiple_msa(uint8_t *src0_ptr,
@ -2564,7 +2573,7 @@ static void hevc_hz_bi_4t_8x4multiple_msa(uint8_t *src0_ptr,
dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
ST8x4_UB(dst0, dst1, dst, dst_stride);
ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
@ -2659,7 +2668,8 @@ static void hevc_hz_bi_4t_12w_msa(uint8_t *src0_ptr,
PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
ST12x4_UB(dst0, dst1, dst2, dst, dst_stride);
ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
ST_W4(dst2, 0, 1, 2, 3, dst + 8, dst_stride);
dst += (4 * dst_stride);
}
}
@ -2825,7 +2835,7 @@ static void hevc_hz_bi_4t_24w_msa(uint8_t *src0_ptr,
dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
ST8x4_UB(dst0, dst1, dst_tmp, dst_stride);
ST_D4(dst0, dst1, 0, 1, 0, 1, dst_tmp, dst_stride);
dst_tmp += (4 * dst_stride);
}
}
@ -2936,7 +2946,7 @@ static void hevc_vt_bi_4t_4x2_msa(uint8_t *src0_ptr,
dst10 = CLIP_SH_0_255(dst10);
dst10 = (v8i16) __msa_pckev_b((v16i8) dst10, (v16i8) dst10);
ST4x2_UB(dst10, dst, dst_stride);
ST_W2(dst10, 0, 1, dst, dst_stride);
}
static void hevc_vt_bi_4t_4x4_msa(uint8_t *src0_ptr,
@ -2985,7 +2995,7 @@ static void hevc_vt_bi_4t_4x4_msa(uint8_t *src0_ptr,
HEVC_BI_RND_CLIP2(in0, in1, dst10, dst32, 7, dst10, dst32);
dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10);
ST4x4_UB(dst10, dst10, 0, 1, 2, 3, dst, dst_stride);
ST_W4(dst10, 0, 1, 2, 3, dst, dst_stride);
}
static void hevc_vt_bi_4t_4x8multiple_msa(uint8_t *src0_ptr,
@ -3056,7 +3066,7 @@ static void hevc_vt_bi_4t_4x8multiple_msa(uint8_t *src0_ptr,
dst10, dst32, dst54, dst76);
PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54);
ST4x8_UB(dst10, dst54, dst, dst_stride);
ST_W8(dst10, dst54, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
}
}
@ -3147,7 +3157,10 @@ static void hevc_vt_bi_4t_6w_msa(uint8_t *src0_ptr,
dst0_r, dst1_r, dst2_r, dst3_r);
PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
ST_W2(dst0_r, 0, 2, dst, dst_stride);
ST_H2(dst0_r, 2, 6, dst + 4, dst_stride);
ST_W2(dst1_r, 0, 2, dst + 2 * dst_stride, dst_stride);
ST_H2(dst1_r, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
dst += (4 * dst_stride);
LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
@ -3171,7 +3184,10 @@ static void hevc_vt_bi_4t_6w_msa(uint8_t *src0_ptr,
dst0_r, dst1_r, dst2_r, dst3_r);
PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
ST_W2(dst0_r, 0, 2, dst, dst_stride);
ST_H2(dst0_r, 2, 6, dst + 4, dst_stride);
ST_W2(dst1_r, 0, 2, dst + 2 * dst_stride, dst_stride);
ST_H2(dst1_r, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
dst += (4 * dst_stride);
}
@ -3216,7 +3232,7 @@ static void hevc_vt_bi_4t_8x2_msa(uint8_t *src0_ptr,
HEVC_BI_RND_CLIP2(in0, in1, dst0_r, dst1_r, 7, dst0_r, dst1_r);
dst0_r = (v8i16) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
ST8x2_UB(dst0_r, dst, dst_stride);
ST_D2(dst0_r, 0, 1, dst, dst_stride);
}
static void hevc_vt_bi_4t_8x6_msa(uint8_t *src0_ptr,
@ -3275,9 +3291,8 @@ static void hevc_vt_bi_4t_8x6_msa(uint8_t *src0_ptr,
PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
dst2_r = (v8i16) __msa_pckev_b((v16i8) dst5_r, (v16i8) dst4_r);
ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
dst += (4 * dst_stride);
ST8x2_UB(dst2_r, dst, dst_stride);
ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
ST_D2(dst2_r, 0, 1, dst + 4 * dst_stride, dst_stride);
}
static void hevc_vt_bi_4t_8x4multiple_msa(uint8_t *src0_ptr,
@ -3337,7 +3352,7 @@ static void hevc_vt_bi_4t_8x4multiple_msa(uint8_t *src0_ptr,
dst0_r, dst1_r, dst2_r, dst3_r);
PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
@ -3436,7 +3451,8 @@ static void hevc_vt_bi_4t_12w_msa(uint8_t *src0_ptr,
PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l);
ST12x4_UB(dst0_r, dst1_r, dst0_l, dst, dst_stride);
ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
ST_W4(dst0_l, 0, 1, 2, 3, dst + 8, dst_stride);
dst += (4 * dst_stride);
src2 = src6;
@ -3610,7 +3626,7 @@ static void hevc_vt_bi_4t_24w_msa(uint8_t *src0_ptr,
PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
dst2_r = (v8i16) __msa_pckev_b((v16i8) dst3_r, (v16i8) dst2_r);
ST_SH2(dst0_r, dst1_r, dst, dst_stride);
ST8x2_UB(dst2_r, dst + 16, dst_stride);
ST_D2(dst2_r, 0, 1, dst + 16, dst_stride);
dst += (2 * dst_stride);
/* 16width */
@ -3650,7 +3666,7 @@ static void hevc_vt_bi_4t_24w_msa(uint8_t *src0_ptr,
PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
dst2_r = (v8i16) __msa_pckev_b((v16i8) dst3_r, (v16i8) dst2_r);
ST_SH2(dst0_r, dst1_r, dst, dst_stride);
ST8x2_UB(dst2_r, dst + 16, dst_stride);
ST_D2(dst2_r, 0, 1, dst + 16, dst_stride);
dst += (2 * dst_stride);
}
}
@ -3829,7 +3845,7 @@ static void hevc_hv_bi_4t_4x2_msa(uint8_t *src0_ptr,
tmp = __msa_srari_h(tmp, 7);
tmp = CLIP_SH_0_255_MAX_SATU(tmp);
out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
ST4x2_UB(out, dst, dst_stride);
ST_W2(out, 0, 1, dst, dst_stride);
}
static void hevc_hv_bi_4t_4x4_msa(uint8_t *src0_ptr,
@ -3905,7 +3921,7 @@ static void hevc_hv_bi_4t_4x4_msa(uint8_t *src0_ptr,
SRARI_H2_SH(tmp0, tmp1, 7);
CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}
static void hevc_hv_bi_4t_4multx8mult_msa(uint8_t *src0_ptr,
@ -4018,7 +4034,7 @@ static void hevc_hv_bi_4t_4multx8mult_msa(uint8_t *src0_ptr,
SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
ST4x8_UB(out0, out1, dst, dst_stride);
ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
dst10_r = dst98_r;
@ -4186,7 +4202,7 @@ static void hevc_hv_bi_4t_6w_msa(uint8_t *src0_ptr,
SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
ST4x8_UB(out0, out1, dst, dst_stride);
ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
src1_ptr += (4 * src2_stride);
@ -4198,9 +4214,7 @@ static void hevc_hv_bi_4t_6w_msa(uint8_t *src0_ptr,
SRARI_H2_SH(tmp4, tmp5, 7);
CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
ST2x4_UB(out2, 0, dst + 4, dst_stride);
dst += 4 * dst_stride;
ST2x4_UB(out2, 4, dst + 4, dst_stride);
ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
}
static void hevc_hv_bi_4t_8x2_msa(uint8_t *src0_ptr,
@ -4274,7 +4288,7 @@ static void hevc_hv_bi_4t_8x2_msa(uint8_t *src0_ptr,
SRARI_H2_SH(tmp0, tmp1, 7);
CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
ST8x2_UB(out, dst, dst_stride);
ST_D2(out, 0, 1, dst, dst_stride);
}
static void hevc_hv_bi_4t_8multx4_msa(uint8_t *src0_ptr,
@ -4368,7 +4382,7 @@ static void hevc_hv_bi_4t_8multx4_msa(uint8_t *src0_ptr,
SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
dst += 8;
}
}
@ -4485,9 +4499,8 @@ static void hevc_hv_bi_4t_8x6_msa(uint8_t *src0_ptr,
CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
ST8x4_UB(out0, out1, dst, dst_stride);
dst += (4 * dst_stride);
ST8x2_UB(out2, dst, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
}
static void hevc_hv_bi_4t_8multx4mult_msa(uint8_t *src0_ptr,
@ -4599,7 +4612,7 @@ static void hevc_hv_bi_4t_8multx4mult_msa(uint8_t *src0_ptr,
SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
ST8x4_UB(out0, out1, dst_tmp, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
dst_tmp += (4 * dst_stride);
dst10_r = dst54_r;
@ -4749,7 +4762,7 @@ static void hevc_hv_bi_4t_12w_msa(uint8_t *src0_ptr,
SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
ST8x4_UB(out0, out1, dst_tmp, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
dst_tmp += (4 * dst_stride);
dst10_r = dst54_r;
@ -4835,7 +4848,7 @@ static void hevc_hv_bi_4t_12w_msa(uint8_t *src0_ptr,
SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
ST4x8_UB(out0, out1, dst, dst_stride);
ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
dst10_r = dst98_r;

View File

@ -126,7 +126,7 @@ static void hevc_biwgt_copy_4w_msa(uint8_t *src0_ptr,
dst0 = (v8i16) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
dst0 = CLIP_SH_0_255_MAX_SATU(dst0);
out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
ST4x2_UB(out0, dst, dst_stride);
ST_W2(out0, 0, 1, dst, dst_stride);
} else if (4 == height) {
LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
@ -138,7 +138,7 @@ static void hevc_biwgt_copy_4w_msa(uint8_t *src0_ptr,
HEVC_BIW_RND_CLIP2_MAX_SATU(dst0, dst1, in0, in1, weight_vec, rnd_vec,
offset_vec, dst0, dst1);
out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
} else if (0 == height % 8) {
for (loop_cnt = (height >> 3); loop_cnt--;) {
LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
@ -162,7 +162,7 @@ static void hevc_biwgt_copy_4w_msa(uint8_t *src0_ptr,
in3, weight_vec, rnd_vec, offset_vec,
dst0, dst1, dst2, dst3);
PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
ST4x8_UB(out0, out1, dst, dst_stride);
ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
}
}
@ -214,7 +214,10 @@ static void hevc_biwgt_copy_6w_msa(uint8_t *src0_ptr,
weight_vec, rnd_vec, offset_vec,
dst0, dst1, dst2, dst3);
PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
ST6x4_UB(out0, out1, dst, dst_stride);
ST_W2(out0, 0, 2, dst, dst_stride);
ST_H2(out0, 2, 6, dst + 4, dst_stride);
ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
dst += (4 * dst_stride);
}
}
@ -261,7 +264,7 @@ static void hevc_biwgt_copy_8w_msa(uint8_t *src0_ptr,
dst0, dst1);
out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
ST8x2_UB(out0, dst, dst_stride);
ST_D2(out0, 0, 1, dst, dst_stride);
} else if (6 == height) {
LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
src0_ptr += 4 * src_stride;
@ -281,9 +284,8 @@ static void hevc_biwgt_copy_8w_msa(uint8_t *src0_ptr,
HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec,
offset_vec, dst4, dst5);
PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
ST8x4_UB(out0, out1, dst, dst_stride);
dst += (4 * dst_stride);
ST8x2_UB(out2, dst, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
} else if (0 == height % 4) {
uint32_t loop_cnt;
@ -302,7 +304,7 @@ static void hevc_biwgt_copy_8w_msa(uint8_t *src0_ptr,
in3, weight_vec, rnd_vec, offset_vec,
dst0, dst1, dst2, dst3);
PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
@ -361,7 +363,8 @@ static void hevc_biwgt_copy_12w_msa(uint8_t *src0_ptr,
HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec,
offset_vec, dst4, dst5);
PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
ST12x4_UB(out0, out1, out2, dst, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
dst += (4 * dst_stride);
}
}
@ -480,7 +483,7 @@ static void hevc_biwgt_copy_24w_msa(uint8_t *src0_ptr,
PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
ST_UB4(out0, out1, out3, out4, dst, dst_stride);
ST8x4_UB(out2, out5, dst + 16, dst_stride);
ST_D4(out2, out5, 0, 1, 0, 1, dst + 16, dst_stride);
dst += (4 * dst_stride);
}
}
@ -720,7 +723,7 @@ static void hevc_hz_biwgt_8t_4w_msa(uint8_t *src0_ptr,
out0, out1);
out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
}
}
@ -800,7 +803,7 @@ static void hevc_hz_biwgt_8t_8w_msa(uint8_t *src0_ptr,
out0, out1, out2, out3);
PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
@ -876,7 +879,7 @@ static void hevc_hz_biwgt_8t_12w_msa(uint8_t *src0_ptr,
weight_vec, rnd_vec, offset_vec, out0, out1, out2,
out3);
PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
LD_SB4(src0_ptr + 8, src_stride, src0, src1, src2, src3);
src0_ptr += (4 * src_stride);
@ -895,7 +898,7 @@ static void hevc_hz_biwgt_8t_12w_msa(uint8_t *src0_ptr,
HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, weight_vec, rnd_vec,
offset_vec, out0, out1);
out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
ST4x4_UB(out0, out0, 0, 1, 2, 3, dst + 8, dst_stride);
ST_W4(out0, 0, 1, 2, 3, dst + 8, dst_stride);
dst += (4 * dst_stride);
}
}
@ -1483,7 +1486,7 @@ static void hevc_vt_biwgt_8t_4w_msa(uint8_t *src0_ptr,
out0, out1, out2, out3);
PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
ST4x8_UB(out0, out1, dst, dst_stride);
ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
src2110 = src10998;
@ -1568,7 +1571,7 @@ static void hevc_vt_biwgt_8t_8w_msa(uint8_t *src0_ptr,
out0, out1, out2, out3);
PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
src10_r = src54_r;
@ -1674,8 +1677,8 @@ static void hevc_vt_biwgt_8t_12w_msa(uint8_t *src0_ptr,
dst2_r = (v4i32) __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
out2 = CLIP_SH_0_255(dst2_r);
PCKEV_B2_SH(out1, out0, out2, out2, out0, out2);
ST8x2_UB(out0, dst, dst_stride);
ST4x2_UB(out2, dst + 8, dst_stride);
ST_D2(out0, 0, 1, dst, dst_stride);
ST_W2(out2, 0, 1, dst + 8, dst_stride);
dst += (2 * dst_stride);
src10_r = src32_r;
@ -2048,7 +2051,7 @@ static void hevc_hv_biwgt_8t_4w_msa(uint8_t *src0_ptr,
CLIP_SW4_0_255_MAX_SATU(dst0, dst1, dst2, dst3);
PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
dst10 = dst54;
@ -2226,7 +2229,7 @@ static void hevc_hv_biwgt_8t_8multx2mult_msa(uint8_t *src0_ptr,
CLIP_SW4_0_255_MAX_SATU(dst0_l, dst0_r, dst1_l, dst1_r);
PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
ST8x2_UB(out, dst_tmp, dst_stride);
ST_D2(out, 0, 1, dst_tmp, dst_stride);
dst_tmp += (2 * dst_stride);
dst0 = dst2;
@ -2412,7 +2415,7 @@ static void hevc_hv_biwgt_8t_12w_msa(uint8_t *src0_ptr,
CLIP_SW4_0_255_MAX_SATU(dst1, dst0, dst3, dst2);
PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
ST8x2_UB(out, dst_tmp, dst_stride);
ST_D2(out, 0, 1, dst_tmp, dst_stride);
dst_tmp += (2 * dst_stride);
dsth0 = dsth2;
@ -2503,7 +2506,7 @@ static void hevc_hv_biwgt_8t_12w_msa(uint8_t *src0_ptr,
CLIP_SW4_0_255_MAX_SATU(dst0, dst1, dst2, dst3);
PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
dst10 = dst54;
@ -2683,7 +2686,7 @@ static void hevc_hz_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
out0 = CLIP_SH_0_255(dst0_r);
out0 = (v8i16) __msa_pckev_b((v16i8) out0, (v16i8) out0);
ST4x2_UB(out0, dst, dst_stride);
ST_W2(out0, 0, 1, dst, dst_stride);
}
static void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
@ -2743,7 +2746,7 @@ static void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
dst0, dst1);
dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
}
static void hevc_hz_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr,
@ -2816,7 +2819,7 @@ static void hevc_hz_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr,
dst0, dst1, dst2, dst3);
PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
ST4x8_UB(dst0, dst1, dst, dst_stride);
ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
}
}
@ -2918,7 +2921,10 @@ static void hevc_hz_biwgt_4t_6w_msa(uint8_t *src0_ptr,
dst0, dst1, dst2, dst3);
PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
ST6x4_UB(dst0, dst1, dst, dst_stride);
ST_W2(dst0, 0, 2, dst, dst_stride);
ST_H2(dst0, 2, 6, dst + 4, dst_stride);
ST_W2(dst1, 0, 2, dst + 2 * dst_stride, dst_stride);
ST_H2(dst1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
dst += (4 * dst_stride);
}
}
@ -2976,7 +2982,7 @@ static void hevc_hz_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
dst0, dst1);
dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
ST8x2_UB(dst0, dst, dst_stride);
ST_D2(dst0, 0, 1, dst, dst_stride);
}
static void hevc_hz_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
@ -3049,9 +3055,8 @@ static void hevc_hz_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
ST8x4_UB(dst0, dst1, dst, dst_stride);
dst += (4 * dst_stride);
ST8x2_UB(dst3, dst, dst_stride);
ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
ST_D2(dst3, 0, 1, dst + 4 * dst_stride, dst_stride);
}
static void hevc_hz_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr,
@ -3119,7 +3124,7 @@ static void hevc_hz_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr,
dst0, dst1, dst2, dst3);
PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
ST8x4_UB(dst0, dst1, dst, dst_stride);
ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
@ -3235,7 +3240,8 @@ static void hevc_hz_biwgt_4t_12w_msa(uint8_t *src0_ptr,
PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
ST12x4_UB(dst0, dst1, dst3, dst, dst_stride);
ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
ST_W4(dst3, 0, 1, 2, 3, dst + 8, dst_stride);
dst += (4 * dst_stride);
}
}
@ -3411,7 +3417,7 @@ static void hevc_hz_biwgt_4t_24w_msa(uint8_t *src0_ptr,
dst0, dst1);
dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
ST8x2_UB(dst0, (dst + 16), dst_stride);
ST_D2(dst0, 0, 1, (dst + 16), dst_stride);
dst += (2 * dst_stride);
}
}
@ -3551,7 +3557,7 @@ static void hevc_vt_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
dst10_r = (v4i32) __msa_pckev_h((v8i16) dst10_l, (v8i16) dst10_r);
out = CLIP_SH_0_255(dst10_r);
out = (v8i16) __msa_pckev_b((v16i8) out, (v16i8) out);
ST4x2_UB(out, dst, dst_stride);
ST_W2(out, 0, 1, dst, dst_stride);
}
static void hevc_vt_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
@ -3617,7 +3623,7 @@ static void hevc_vt_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
dst10, dst32);
dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10);
ST4x4_UB(dst10, dst10, 0, 1, 2, 3, dst, dst_stride);
ST_W4(dst10, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
}
@ -3702,7 +3708,7 @@ static void hevc_vt_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr,
dst10, dst32, dst54, dst76);
PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst32);
ST4x8_UB(dst10, dst32, dst, dst_stride);
ST_W8(dst10, dst32, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
}
}
@ -3807,7 +3813,10 @@ static void hevc_vt_biwgt_4t_6w_msa(uint8_t *src0_ptr,
tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
ST6x4_UB(tmp0, tmp1, dst, dst_stride);
ST_W2(tmp0, 0, 2, dst, dst_stride);
ST_H2(tmp0, 2, 6, dst + 4, dst_stride);
ST_W2(tmp1, 0, 2, dst + 2 * dst_stride, dst_stride);
ST_H2(tmp1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
dst += (4 * dst_stride);
}
}
@ -3866,7 +3875,7 @@ static void hevc_vt_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
tmp0, tmp1);
tmp0 = (v8i16) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
ST8x2_UB(tmp0, dst, dst_stride);
ST_D2(tmp0, 0, 1, dst, dst_stride);
}
static void hevc_vt_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
@ -3936,9 +3945,8 @@ static void hevc_vt_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
tmp3 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
ST8x4_UB(tmp0, tmp1, dst, dst_stride);
dst += (4 * dst_stride);
ST8x2_UB(tmp3, dst, dst_stride);
ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
ST_D2(tmp3, 0, 1, dst + 4 * dst_stride, dst_stride);
}
static void hevc_vt_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr,
@ -4010,7 +4018,7 @@ static void hevc_vt_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr,
tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
ST8x4_UB(tmp0, tmp1, dst, dst_stride);
ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
@ -4132,7 +4140,8 @@ static void hevc_vt_biwgt_4t_12w_msa(uint8_t *src0_ptr,
PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
ST12x4_UB(tmp0, tmp1, tmp2, dst, dst_stride);
ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
ST_W4(tmp2, 0, 1, 2, 3, dst + 8, dst_stride);
dst += (4 * dst_stride);
}
}
@ -4323,7 +4332,7 @@ static void hevc_vt_biwgt_4t_24w_msa(uint8_t *src0_ptr,
/* 8width */
tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2);
ST_SH2(tmp0, tmp1, dst, dst_stride);
ST8x2_UB(tmp2, dst + 16, dst_stride);
ST_D2(tmp2, 0, 1, dst + 16, dst_stride);
dst += (2 * dst_stride);
/* 16width */
@ -4363,7 +4372,7 @@ static void hevc_vt_biwgt_4t_24w_msa(uint8_t *src0_ptr,
/* 8width */
tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2);
ST_SH2(tmp0, tmp1, dst, dst_stride);
ST8x2_UB(tmp2, dst + 16, dst_stride);
ST_D2(tmp2, 0, 1, dst + 16, dst_stride);
dst += (2 * dst_stride);
}
}
@ -4568,7 +4577,7 @@ static void hevc_hv_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
tmp = CLIP_SH_0_255_MAX_SATU(tmp);
out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
ST4x2_UB(out, dst, dst_stride);
ST_W2(out, 0, 1, dst, dst_stride);
}
static void hevc_hv_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
@ -4665,7 +4674,7 @@ static void hevc_hv_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}
static void hevc_hv_biwgt_4t_4multx8mult_msa(uint8_t *src0_ptr,
@ -4803,7 +4812,7 @@ static void hevc_hv_biwgt_4t_4multx8mult_msa(uint8_t *src0_ptr,
tmp2, tmp3);
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
ST4x8_UB(out0, out1, dst, dst_stride);
ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
dst10_r = dst98_r;
@ -5001,7 +5010,7 @@ static void hevc_hv_biwgt_4t_6w_msa(uint8_t *src0_ptr,
tmp2, tmp3);
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
ST4x8_UB(out0, out1, dst, dst_stride);
ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
PCKEV_H2_SW(dst1_l, dst0_l, dst3_l, dst2_l, dst4, dst5);
@ -5023,9 +5032,7 @@ static void hevc_hv_biwgt_4t_6w_msa(uint8_t *src0_ptr,
CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
ST2x4_UB(out2, 0, dst + 4, dst_stride);
dst += 4 * dst_stride;
ST2x4_UB(out2, 4, dst + 4, dst_stride);
ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
}
static void hevc_hv_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
@ -5121,7 +5128,7 @@ static void hevc_hv_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
ST8x2_UB(out, dst, dst_stride);
ST_D2(out, 0, 1, dst, dst_stride);
}
static void hevc_hv_biwgt_4t_8multx4_msa(uint8_t *src0_ptr,
@ -5243,7 +5250,7 @@ static void hevc_hv_biwgt_4t_8multx4_msa(uint8_t *src0_ptr,
tmp0, tmp1, tmp2, tmp3);
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
dst += 8;
}
}
@ -5394,9 +5401,8 @@ static void hevc_hv_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp4, tmp5);
CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
ST8x4_UB(out0, out1, dst, dst_stride);
dst += (4 * dst_stride);
ST8x2_UB(out2, dst, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
}
static void hevc_hv_biwgt_4t_8multx4mult_msa(uint8_t *src0_ptr,
@ -5533,7 +5539,7 @@ static void hevc_hv_biwgt_4t_8multx4mult_msa(uint8_t *src0_ptr,
tmp0, tmp1, tmp2, tmp3);
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
ST8x4_UB(out0, out1, dst_tmp, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
dst_tmp += (4 * dst_stride);
dst10_r = dst54_r;
@ -5720,7 +5726,7 @@ static void hevc_hv_biwgt_4t_12w_msa(uint8_t *src0_ptr,
tmp0, tmp1, tmp2, tmp3);
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
ST8x4_UB(out0, out1, dst_tmp, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
dst_tmp += (4 * dst_stride);
dst10_r = dst54_r;
@ -5816,7 +5822,7 @@ static void hevc_hv_biwgt_4t_12w_msa(uint8_t *src0_ptr,
tmp0, tmp1, tmp2, tmp3);
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
ST4x8_UB(out0, out1, dst, dst_stride);
ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
dst10_r = dst98_r;

View File

@ -309,7 +309,7 @@ static void common_hz_8t_4x4_msa(uint8_t *src, int32_t src_stride,
SRARI_H2_SH(out0, out1, 6);
SAT_SH2_SH(out0, out1, 7);
out = PCKEV_XORI128_UB(out0, out1);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}
static void common_hz_8t_4x8_msa(uint8_t *src, int32_t src_stride,
@ -344,10 +344,9 @@ static void common_hz_8t_4x8_msa(uint8_t *src, int32_t src_stride,
SRARI_H4_SH(out0, out1, out2, out3, 6);
SAT_SH4_SH(out0, out1, out2, out3, 7);
out = PCKEV_XORI128_UB(out0, out1);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
out = PCKEV_XORI128_UB(out2, out3);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
}
static void common_hz_8t_4x16_msa(uint8_t *src, int32_t src_stride,
@ -382,11 +381,10 @@ static void common_hz_8t_4x16_msa(uint8_t *src, int32_t src_stride,
SRARI_H4_SH(out0, out1, out2, out3, 6);
SAT_SH4_SH(out0, out1, out2, out3, 7);
out = PCKEV_XORI128_UB(out0, out1);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
out = PCKEV_XORI128_UB(out2, out3);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
dst += (8 * dst_stride);
LD_SB4(src, src_stride, src0, src1, src2, src3);
XORI_B4_128_SB(src0, src1, src2, src3);
@ -402,10 +400,9 @@ static void common_hz_8t_4x16_msa(uint8_t *src, int32_t src_stride,
SRARI_H4_SH(out0, out1, out2, out3, 6);
SAT_SH4_SH(out0, out1, out2, out3, 7);
out = PCKEV_XORI128_UB(out0, out1);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
out = PCKEV_XORI128_UB(out2, out3);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
}
static void common_hz_8t_4w_msa(uint8_t *src, int32_t src_stride,
@ -468,7 +465,7 @@ static void common_hz_8t_8w_msa(uint8_t *src, int32_t src_stride,
SAT_SH4_SH(out0, out1, out2, out3, 7);
tmp0 = PCKEV_XORI128_UB(out0, out1);
tmp1 = PCKEV_XORI128_UB(out2, out3);
ST8x4_UB(tmp0, tmp1, dst, dst_stride);
ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
@ -546,8 +543,8 @@ static void common_hz_8t_12w_msa(uint8_t *src, int32_t src_stride,
tmp1 = PCKEV_XORI128_UB(out2, out3);
tmp2 = PCKEV_XORI128_UB(out4, out5);
ST8x4_UB(tmp0, tmp1, dst, dst_stride);
ST4x4_UB(tmp2, tmp2, 0, 1, 2, 3, dst + 8, dst_stride);
ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
ST_W4(tmp2, 0, 1, 2, 3, dst + 8, dst_stride);
dst += (4 * dst_stride);
}
}
@ -670,7 +667,7 @@ static void common_hz_8t_24w_msa(uint8_t *src, int32_t src_stride,
SAT_SH4_SH(out0, out8, out2, out9, 7);
SAT_SH2_SH(out1, out3, 7);
out = PCKEV_XORI128_UB(out8, out9);
ST8x2_UB(out, dst + 16, dst_stride);
ST_D2(out, 0, 1, dst + 16, dst_stride);
out = PCKEV_XORI128_UB(out0, out1);
ST_UB(out, dst);
dst += dst_stride;
@ -965,10 +962,8 @@ static void common_vt_8t_4w_msa(uint8_t *src, int32_t src_stride,
SAT_SH2_SH(out54, out76, 7);
out0 = PCKEV_XORI128_UB(out10, out32);
out1 = PCKEV_XORI128_UB(out54, out76);
ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
ST4x4_UB(out1, out1, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
src2110 = src10998;
src4332 = src12111110;
@ -1019,7 +1014,7 @@ static void common_vt_8t_8w_msa(uint8_t *src, int32_t src_stride,
SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
ST8x4_UB(tmp0, tmp1, dst, dst_stride);
ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
src10_r = src54_r;
@ -1458,10 +1453,8 @@ static void hevc_hv_uni_8t_4w_msa(uint8_t *src,
PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
out1 = PCKEV_XORI128_UB(dst4_r, dst5_r);
ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
ST4x4_UB(out1, out1, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
dst10_r = dst98_r;
dst32_r = dst1110_r;
@ -1595,7 +1588,7 @@ static void hevc_hv_uni_8t_8multx2mult_msa(uint8_t *src,
PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
out = PCKEV_XORI128_UB(dst0, dst1);
ST8x2_UB(out, dst_tmp, dst_stride);
ST_D2(out, 0, 1, dst_tmp, dst_stride);
dst_tmp += (2 * dst_stride);
dst0 = dst2;
@ -1741,7 +1734,7 @@ static void hevc_hv_uni_8t_12w_msa(uint8_t *src,
PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
out0 = PCKEV_XORI128_UB(dst0, dst1);
ST8x2_UB(out0, dst_tmp, dst_stride);
ST_D2(out0, 0, 1, dst_tmp, dst_stride);
dst_tmp += (2 * dst_stride);
dst0 = dst2;
@ -1845,10 +1838,8 @@ static void hevc_hv_uni_8t_12w_msa(uint8_t *src,
PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
out1 = PCKEV_XORI128_UB(dst4_r, dst5_r);
ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
ST4x4_UB(out1, out1, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
dst10_r = dst98_r;
dst32_r = dst1110_r;
@ -1944,7 +1935,7 @@ static void common_hz_4t_4x2_msa(uint8_t *src, int32_t src_stride,
res0 = __msa_srari_h(res0, 6);
res0 = __msa_sat_s_h(res0, 7);
out = PCKEV_XORI128_UB(res0, res0);
ST4x2_UB(out, dst, dst_stride);
ST_W2(out, 0, 1, dst, dst_stride);
}
static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride,
@ -1971,7 +1962,7 @@ static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride,
SRARI_H2_SH(out0, out1, 6);
SAT_SH2_SH(out0, out1, 7);
out = PCKEV_XORI128_UB(out0, out1);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}
static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride,
@ -2004,10 +1995,9 @@ static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride,
SRARI_H4_SH(out0, out1, out2, out3, 6);
SAT_SH4_SH(out0, out1, out2, out3, 7);
out = PCKEV_XORI128_UB(out0, out1);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
out = PCKEV_XORI128_UB(out2, out3);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
}
static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride,
@ -2038,11 +2028,10 @@ static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride,
SRARI_H4_SH(out0, out1, out2, out3, 6);
SAT_SH4_SH(out0, out1, out2, out3, 7);
out = PCKEV_XORI128_UB(out0, out1);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
out = PCKEV_XORI128_UB(out2, out3);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
dst += (8 * dst_stride);
LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
src += (8 * src_stride);
@ -2054,10 +2043,9 @@ static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride,
SRARI_H4_SH(out0, out1, out2, out3, 6);
SAT_SH4_SH(out0, out1, out2, out3, 7);
out = PCKEV_XORI128_UB(out0, out1);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
out = PCKEV_XORI128_UB(out2, out3);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
}
static void common_hz_4t_4w_msa(uint8_t *src, int32_t src_stride,
@ -2102,7 +2090,10 @@ static void common_hz_4t_6w_msa(uint8_t *src, int32_t src_stride,
SAT_SH4_SH(out0, out1, out2, out3, 7);
out4 = PCKEV_XORI128_UB(out0, out1);
out5 = PCKEV_XORI128_UB(out2, out3);
ST6x4_UB(out4, out5, dst, dst_stride);
ST_W2(out4, 0, 2, dst, dst_stride);
ST_H2(out4, 2, 6, dst + 4, dst_stride);
ST_W2(out5, 0, 2, dst + 2 * dst_stride, dst_stride);
ST_H2(out5, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
dst += (4 * dst_stride);
LD_SB4(src, src_stride, src0, src1, src2, src3);
@ -2115,8 +2106,10 @@ static void common_hz_4t_6w_msa(uint8_t *src, int32_t src_stride,
SAT_SH4_SH(out0, out1, out2, out3, 7);
out4 = PCKEV_XORI128_UB(out0, out1);
out5 = PCKEV_XORI128_UB(out2, out3);
ST6x4_UB(out4, out5, dst, dst_stride);
dst += (4 * dst_stride);
ST_W2(out4, 0, 2, dst, dst_stride);
ST_H2(out4, 2, 6, dst + 4, dst_stride);
ST_W2(out5, 0, 2, dst + 2 * dst_stride, dst_stride);
ST_H2(out5, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
}
static void common_hz_4t_8x2mult_msa(uint8_t *src, int32_t src_stride,
@ -2148,7 +2141,7 @@ static void common_hz_4t_8x2mult_msa(uint8_t *src, int32_t src_stride,
SRARI_H2_SH(vec0, vec1, 6);
SAT_SH2_SH(vec0, vec1, 7);
out = PCKEV_XORI128_UB(vec0, vec1);
ST8x2_UB(out, dst, dst_stride);
ST_D2(out, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
}
}
@ -2182,7 +2175,7 @@ static void common_hz_4t_8x4mult_msa(uint8_t *src, int32_t src_stride,
SAT_SH4_SH(out0, out1, out2, out3, 7);
tmp0 = PCKEV_XORI128_UB(out0, out1);
tmp1 = PCKEV_XORI128_UB(out2, out3);
ST8x4_UB(tmp0, tmp1, dst, dst_stride);
ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
@ -2235,7 +2228,7 @@ static void common_hz_4t_12w_msa(uint8_t *src, int32_t src_stride,
SRARI_H2_SH(out0, out1, 6);
SAT_SH2_SH(out0, out1, 7);
tmp0 = PCKEV_XORI128_UB(out0, out1);
ST4x4_UB(tmp0, tmp0, 0, 1, 2, 3, dst + 8, dst_stride);
ST_W4(tmp0, 0, 1, 2, 3, dst + 8, dst_stride);
VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec4, vec5);
VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec7);
@ -2249,7 +2242,7 @@ static void common_hz_4t_12w_msa(uint8_t *src, int32_t src_stride,
SAT_SH4_SH(out2, out3, out4, out5, 7);
tmp0 = PCKEV_XORI128_UB(out2, out3);
tmp1 = PCKEV_XORI128_UB(out4, out5);
ST8x4_UB(tmp0, tmp1, dst, dst_stride);
ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
@ -2395,7 +2388,7 @@ static void common_hz_4t_24w_msa(uint8_t *src, int32_t src_stride,
SAT_SH4_SH(out0, out1, out2, out3, 7);
tmp0 = PCKEV_XORI128_UB(out0, out1);
tmp1 = PCKEV_XORI128_UB(out2, out3);
ST8x4_UB(tmp0, tmp1, dst1, dst_stride);
ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst1, dst_stride);
dst1 += (4 * dst_stride);
}
}
@ -2496,7 +2489,7 @@ static void common_vt_4t_4x2_msa(uint8_t *src, int32_t src_stride,
out10 = __msa_srari_h(out10, 6);
out10 = __msa_sat_s_h(out10, 7);
out = PCKEV_XORI128_UB(out10, out10);
ST4x2_UB(out, dst, dst_stride);
ST_W2(out, 0, 1, dst, dst_stride);
}
static void common_vt_4t_4x4multiple_msa(uint8_t *src, int32_t src_stride,
@ -2540,7 +2533,7 @@ static void common_vt_4t_4x4multiple_msa(uint8_t *src, int32_t src_stride,
SRARI_H2_SH(out10, out32, 6);
SAT_SH2_SH(out10, out32, 7);
out = PCKEV_XORI128_UB(out10, out32);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
}
}
@ -2596,7 +2589,10 @@ static void common_vt_4t_6w_msa(uint8_t *src, int32_t src_stride,
SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
ST6x4_UB(out0, out1, dst, dst_stride);
ST_W2(out0, 0, 2, dst, dst_stride);
ST_H2(out0, 2, 6, dst + 4, dst_stride);
ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
dst += (4 * dst_stride);
LD_SB2(src, src_stride, src3, src4);
@ -2619,7 +2615,10 @@ static void common_vt_4t_6w_msa(uint8_t *src, int32_t src_stride,
SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
ST6x4_UB(out0, out1, dst, dst_stride);
ST_W2(out0, 0, 2, dst, dst_stride);
ST_H2(out0, 2, 6, dst + 4, dst_stride);
ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
}
static void common_vt_4t_8x2_msa(uint8_t *src, int32_t src_stride,
@ -2645,7 +2644,7 @@ static void common_vt_4t_8x2_msa(uint8_t *src, int32_t src_stride,
SRARI_H2_SH(tmp0, tmp1, 6);
SAT_SH2_SH(tmp0, tmp1, 7);
out = PCKEV_XORI128_UB(tmp0, tmp1);
ST8x2_UB(out, dst, dst_stride);
ST_D2(out, 0, 1, dst, dst_stride);
}
static void common_vt_4t_8x6_msa(uint8_t *src, int32_t src_stride,
@ -2737,7 +2736,7 @@ static void common_vt_4t_8x4mult_msa(uint8_t *src, int32_t src_stride,
SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
ST8x4_UB(tmp0, tmp1, dst, dst_stride);
ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
src10_r = src98_r;
@ -2811,9 +2810,9 @@ static void common_vt_4t_12w_msa(uint8_t *src, int32_t src_stride,
SAT_SH2_SH(dst0_l, dst1_l, 7);
out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
ST8x4_UB(out0, out1, dst, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
out0 = PCKEV_XORI128_UB(dst0_l, dst1_l);
ST4x4_UB(out0, out0, 0, 1, 2, 3, dst + 8, dst_stride);
ST_W4(out0, 0, 1, 2, 3, dst + 8, dst_stride);
dst += (4 * dst_stride);
src2 = src6;
@ -2982,12 +2981,12 @@ static void common_vt_4t_24w_msa(uint8_t *src, int32_t src_stride,
out = PCKEV_XORI128_UB(out0_r, out0_l);
ST_UB(out, dst);
out = PCKEV_XORI128_UB(out2_r, out2_r);
ST8x1_UB(out, dst + 16);
ST_D1(out, 0, dst + 16);
dst += dst_stride;
out = PCKEV_XORI128_UB(out1_r, out1_l);
ST_UB(out, dst);
out = PCKEV_XORI128_UB(out3_r, out3_r);
ST8x1_UB(out, dst + 16);
ST_D1(out, 0, dst + 16);
dst += dst_stride;
}
}
@ -3137,7 +3136,7 @@ static void hevc_hv_uni_4t_4x2_msa(uint8_t *src,
tmp = __msa_srari_h(tmp, 6);
tmp = __msa_sat_s_h(tmp, 7);
out = PCKEV_XORI128_UB(tmp, tmp);
ST4x2_UB(out, dst, dst_stride);
ST_W2(out, 0, 1, dst, dst_stride);
}
static void hevc_hv_uni_4t_4x4_msa(uint8_t *src,
@ -3196,7 +3195,7 @@ static void hevc_hv_uni_4t_4x4_msa(uint8_t *src,
SRARI_H2_SH(tmp0, tmp1, 6);
SAT_SH2_SH(tmp0, tmp1, 7);
out = PCKEV_XORI128_UB(tmp0, tmp1);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}
static void hevc_hv_uni_4t_4multx8mult_msa(uint8_t *src,
@ -3288,7 +3287,7 @@ static void hevc_hv_uni_4t_4multx8mult_msa(uint8_t *src,
SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
out0 = PCKEV_XORI128_UB(tmp0, tmp1);
out1 = PCKEV_XORI128_UB(tmp2, tmp3);
ST4x8_UB(out0, out1, dst, dst_stride);
ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
dst10_r = dst98_r;
@ -3432,10 +3431,8 @@ static void hevc_hv_uni_4t_6w_msa(uint8_t *src,
out0 = PCKEV_XORI128_UB(tmp0, tmp1);
out1 = PCKEV_XORI128_UB(tmp2, tmp3);
out2 = PCKEV_XORI128_UB(tmp4, tmp5);
ST4x8_UB(out0, out1, dst, dst_stride);
ST2x4_UB(out2, 0, dst + 4, dst_stride);
dst += 4 * dst_stride;
ST2x4_UB(out2, 4, dst + 4, dst_stride);
ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
}
static void hevc_hv_uni_4t_8x2_msa(uint8_t *src,
@ -3497,7 +3494,7 @@ static void hevc_hv_uni_4t_8x2_msa(uint8_t *src,
SRARI_H2_SH(out0_r, out1_r, 6);
SAT_SH2_SH(out0_r, out1_r, 7);
out = PCKEV_XORI128_UB(out0_r, out1_r);
ST8x2_UB(out, dst, dst_stride);
ST_D2(out, 0, 1, dst, dst_stride);
}
static void hevc_hv_uni_4t_8multx4_msa(uint8_t *src,
@ -3580,7 +3577,7 @@ static void hevc_hv_uni_4t_8multx4_msa(uint8_t *src,
SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
out0 = PCKEV_XORI128_UB(tmp0, tmp1);
out1 = PCKEV_XORI128_UB(tmp2, tmp3);
ST8x4_UB(out0, out1, dst, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
dst += 8;
}
}
@ -3684,9 +3681,8 @@ static void hevc_hv_uni_4t_8x6_msa(uint8_t *src,
out1 = PCKEV_XORI128_UB(out2_r, out3_r);
out2 = PCKEV_XORI128_UB(out4_r, out5_r);
ST8x4_UB(out0, out1, dst, dst_stride);
dst += (4 * dst_stride);
ST8x2_UB(out2, dst, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
}
static void hevc_hv_uni_4t_8multx4mult_msa(uint8_t *src,
@ -3788,7 +3784,7 @@ static void hevc_hv_uni_4t_8multx4mult_msa(uint8_t *src,
SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
out0 = PCKEV_XORI128_UB(out0_r, out1_r);
out1 = PCKEV_XORI128_UB(out2_r, out3_r);
ST8x4_UB(out0, out1, dst_tmp, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
dst_tmp += (4 * dst_stride);
dst10_r = dst54_r;
@ -3919,7 +3915,7 @@ static void hevc_hv_uni_4t_12w_msa(uint8_t *src,
SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
out0 = PCKEV_XORI128_UB(tmp0, tmp1);
out1 = PCKEV_XORI128_UB(tmp2, tmp3);
ST8x4_UB(out0, out1, dst_tmp, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
dst_tmp += (4 * dst_stride);
dst10_r = dst54_r;
@ -3985,7 +3981,7 @@ static void hevc_hv_uni_4t_12w_msa(uint8_t *src,
SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
out0 = PCKEV_XORI128_UB(tmp0, tmp1);
out1 = PCKEV_XORI128_UB(tmp2, tmp3);
ST4x8_UB(out0, out1, dst, dst_stride);
ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
dst10_r = dst98_r;

View File

@ -90,7 +90,7 @@ static void hevc_uniwgt_copy_4w_msa(uint8_t *src,
dst0 += offset_vec;
dst0 = CLIP_SH_0_255_MAX_SATU(dst0);
out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
ST4x2_UB(out0, dst, dst_stride);
ST_W2(out0, 0, 1, dst, dst_stride);
} else if (4 == height) {
LW4(src, src_stride, tp0, tp1, tp2, tp3);
INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
@ -99,7 +99,7 @@ static void hevc_uniwgt_copy_4w_msa(uint8_t *src,
HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec,
rnd_vec, dst0, dst1);
out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
} else if (0 == (height % 8)) {
for (loop_cnt = (height >> 3); loop_cnt--;) {
LW4(src, src_stride, tp0, tp1, tp2, tp3);
@ -115,7 +115,7 @@ static void hevc_uniwgt_copy_4w_msa(uint8_t *src,
offset_vec, rnd_vec, dst0, dst1,
dst2, dst3);
PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
ST4x8_UB(out0, out1, dst, dst_stride);
ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += 8 * dst_stride;
}
}
@ -170,9 +170,15 @@ static void hevc_uniwgt_copy_6w_msa(uint8_t *src,
PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
ST6x4_UB(out0, out1, dst, dst_stride);
ST_W2(out0, 0, 2, dst, dst_stride);
ST_H2(out0, 2, 6, dst + 4, dst_stride);
ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
dst += (4 * dst_stride);
ST6x4_UB(out2, out3, dst, dst_stride);
ST_W2(out2, 0, 2, dst, dst_stride);
ST_H2(out2, 2, 6, dst + 4, dst_stride);
ST_W2(out3, 0, 2, dst + 2 * dst_stride, dst_stride);
ST_H2(out3, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
dst += (4 * dst_stride);
}
}
@ -207,7 +213,7 @@ static void hevc_uniwgt_copy_8w_msa(uint8_t *src,
HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec,
rnd_vec, dst0, dst1);
out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
ST8x2_UB(out0, dst, dst_stride);
ST_D2(out0, 0, 1, dst, dst_stride);
} else if (4 == height) {
LD4(src, src_stride, tp0, tp1, tp2, tp3);
INSERT_D2_SB(tp0, tp1, src0);
@ -219,7 +225,7 @@ static void hevc_uniwgt_copy_8w_msa(uint8_t *src,
offset_vec, rnd_vec, dst0, dst1, dst2,
dst3);
PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
} else if (6 == height) {
LD4(src, src_stride, tp0, tp1, tp2, tp3);
src += 4 * src_stride;
@ -238,9 +244,8 @@ static void hevc_uniwgt_copy_8w_msa(uint8_t *src,
HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
rnd_vec, dst4, dst5);
PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
ST8x4_UB(out0, out1, dst, dst_stride);
dst += (4 * dst_stride);
ST8x2_UB(out2, dst, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
} else if (0 == height % 8) {
for (loop_cnt = (height >> 3); loop_cnt--;) {
LD4(src, src_stride, tp0, tp1, tp2, tp3);
@ -266,10 +271,9 @@ static void hevc_uniwgt_copy_8w_msa(uint8_t *src,
dst6, dst7);
PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
ST8x4_UB(out0, out1, dst, dst_stride);
dst += (4 * dst_stride);
ST8x4_UB(out2, out3, dst, dst_stride);
dst += (4 * dst_stride);
ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1,
dst, dst_stride);
dst += (8 * dst_stride);
}
}
}
@ -313,7 +317,8 @@ static void hevc_uniwgt_copy_12w_msa(uint8_t *src,
rnd_vec, dst4, dst5);
PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
ST12x4_UB(out0, out1, out2, dst, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
dst += (4 * dst_stride);
}
}
@ -409,7 +414,7 @@ static void hevc_uniwgt_copy_24w_msa(uint8_t *src,
PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
ST_UB4(out0, out1, out3, out4, dst, dst_stride);
ST8x4_UB(out2, out5, dst + 16, dst_stride);
ST_D4(out2, out5, 0, 1, 0, 1, dst + 16, dst_stride);
dst += (4 * dst_stride);
}
}
@ -651,7 +656,7 @@ static void hevc_hz_uniwgt_8t_4w_msa(uint8_t *src,
dst3);
PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
ST4x8_UB(out0, out1, dst, dst_stride);
ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
}
}
@ -729,7 +734,7 @@ static void hevc_hz_uniwgt_8t_8w_msa(uint8_t *src,
dst3);
PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
@ -822,8 +827,8 @@ static void hevc_hz_uniwgt_8t_12w_msa(uint8_t *src,
rnd_vec, dst4, dst5);
PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
ST8x4_UB(out0, out1, dst, dst_stride);
ST4x4_UB(out2, out2, 0, 1, 2, 3, dst + 8, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
dst += (4 * dst_stride);
}
}
@ -994,7 +999,7 @@ static void hevc_hz_uniwgt_8t_24w_msa(uint8_t *src,
PCKEV_B3_UB(dst1, dst0, dst4, dst3, dst5, dst2, out0, out1, out2);
ST_UB2(out0, out1, dst, dst_stride);
ST8x2_UB(out2, dst + 16, dst_stride);
ST_D2(out2, 0, 1, dst + 16, dst_stride);
dst += (2 * dst_stride);
}
}
@ -1368,7 +1373,7 @@ static void hevc_vt_uniwgt_8t_4w_msa(uint8_t *src,
dst3);
PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
ST4x8_UB(out0, out1, dst, dst_stride);
ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
src2110 = src10998;
@ -1444,7 +1449,7 @@ static void hevc_vt_uniwgt_8t_8w_msa(uint8_t *src,
dst3);
PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
src10_r = src54_r;
@ -1543,8 +1548,8 @@ static void hevc_vt_uniwgt_8t_12w_msa(uint8_t *src,
rnd_vec, dst4, dst5);
PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
ST8x4_UB(out0, out1, dst, dst_stride);
ST4x4_UB(out2, out2, 0, 1, 2, 3, dst + 8, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
dst += (4 * dst_stride);
src10_r = src54_r;
@ -1861,7 +1866,7 @@ static void hevc_hv_uniwgt_8t_4w_msa(uint8_t *src,
CLIP_SW4_0_255_MAX_SATU(dst0_r, dst1_r, dst2_r, dst3_r);
PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
out = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
dst10_r = dst54_r;
@ -2013,7 +2018,7 @@ static void hevc_hv_uniwgt_8t_8multx2mult_msa(uint8_t *src,
PCKEV_H2_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
dst0_r = (v4i32) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
ST8x2_UB(dst0_r, dst_tmp, dst_stride);
ST_D2(dst0_r, 0, 1, dst_tmp, dst_stride);
dst_tmp += (2 * dst_stride);
dst10_r = dst32_r;
@ -2163,7 +2168,7 @@ static void hevc_hv_uniwgt_8t_12w_msa(uint8_t *src,
CLIP_SW2_0_255_MAX_SATU(dst0_r, dst0_l);
dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
out = (v16u8) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
ST8x1_UB(out, dst_tmp);
ST_D1(out, 0, dst_tmp);
dst_tmp += dst_stride;
dst0 = dst1;
@ -2244,7 +2249,7 @@ static void hevc_hv_uniwgt_8t_12w_msa(uint8_t *src,
CLIP_SW4_0_255_MAX_SATU(dst0_r, dst1_r, dst2_r, dst3_r);
PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
out = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
dst10_r = dst54_r;
@ -2391,7 +2396,7 @@ static void hevc_hz_uniwgt_4t_4x2_msa(uint8_t *src,
dst0 = __msa_adds_s_h(dst0, offset_vec);
dst0 = CLIP_SH_0_255_MAX_SATU(dst0);
out = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
ST4x2_UB(out, dst, dst_stride);
ST_W2(out, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
@ -2448,7 +2453,7 @@ static void hevc_hz_uniwgt_4t_4x4_msa(uint8_t *src,
dst0, dst1);
out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
}
@ -2515,7 +2520,7 @@ static void hevc_hz_uniwgt_4t_4x8multiple_msa(uint8_t *src,
dst0, dst1, dst2, dst3);
PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
ST4x8_UB(out0, out1, dst, dst_stride);
ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
}
}
@ -2613,9 +2618,15 @@ static void hevc_hz_uniwgt_4t_6w_msa(uint8_t *src,
PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
ST6x4_UB(out0, out1, dst, dst_stride);
ST_W2(out0, 0, 2, dst, dst_stride);
ST_H2(out0, 2, 6, dst + 4, dst_stride);
ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
dst += (4 * dst_stride);
ST6x4_UB(out2, out3, dst, dst_stride);
ST_W2(out2, 0, 2, dst, dst_stride);
ST_H2(out2, 2, 6, dst + 4, dst_stride);
ST_W2(out3, 0, 2, dst + 2 * dst_stride, dst_stride);
ST_H2(out3, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
}
static void hevc_hz_uniwgt_4t_8x2_msa(uint8_t *src,
@ -2670,7 +2681,7 @@ static void hevc_hz_uniwgt_4t_8x2_msa(uint8_t *src,
dst0, dst1);
out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
ST8x2_UB(out, dst, dst_stride);
ST_D2(out, 0, 1, dst, dst_stride);
}
static void hevc_hz_uniwgt_4t_8x4_msa(uint8_t *src,
@ -2727,7 +2738,7 @@ static void hevc_hz_uniwgt_4t_8x4_msa(uint8_t *src,
dst0, dst1, dst2, dst3);
PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
}
static void hevc_hz_uniwgt_4t_8x6_msa(uint8_t *src,
@ -2796,9 +2807,8 @@ static void hevc_hz_uniwgt_4t_8x6_msa(uint8_t *src,
dst4, dst5);
PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
ST8x4_UB(out0, out1, dst, dst_stride);
dst += (4 * dst_stride);
ST8x2_UB(out2, dst, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
}
static void hevc_hz_uniwgt_4t_8x8multiple_msa(uint8_t *src,
@ -2876,7 +2886,7 @@ static void hevc_hz_uniwgt_4t_8x8multiple_msa(uint8_t *src,
PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
ST8x8_UB(out0, out1, out2, out3, dst, dst_stride);
ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
dst += (8 * dst_stride);
}
}
@ -2981,7 +2991,8 @@ static void hevc_hz_uniwgt_4t_12w_msa(uint8_t *src,
rnd_vec, dst4, dst5);
PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
ST12x4_UB(out0, out1, out2, dst, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
dst += (4 * dst_stride);
}
}
@ -3142,7 +3153,7 @@ static void hevc_hz_uniwgt_4t_24w_msa(uint8_t *src,
PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
ST_UB2(out0, out1, dst, dst_stride);
ST8x2_UB(out2, dst + 16, dst_stride);
ST_D2(out2, 0, 1, dst + 16, dst_stride);
dst += (2 * dst_stride);
}
}
@ -3286,7 +3297,7 @@ static void hevc_vt_uniwgt_4t_4x2_msa(uint8_t *src,
dst0 = __msa_adds_s_h(dst0, offset_vec);
dst0 = CLIP_SH_0_255_MAX_SATU(dst0);
out = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
ST4x2_UB(out, dst, dst_stride);
ST_W2(out, 0, 1, dst, dst_stride);
}
static void hevc_vt_uniwgt_4t_4x4_msa(uint8_t *src,
@ -3340,7 +3351,7 @@ static void hevc_vt_uniwgt_4t_4x4_msa(uint8_t *src,
dst0, dst1);
out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}
static void hevc_vt_uniwgt_4t_4x8multiple_msa(uint8_t *src,
@ -3411,7 +3422,7 @@ static void hevc_vt_uniwgt_4t_4x8multiple_msa(uint8_t *src,
dst0, dst1, dst2, dst3);
PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
ST4x8_UB(out0, out1, dst, dst_stride);
ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
src2 = src10;
@ -3509,9 +3520,15 @@ static void hevc_vt_uniwgt_4t_6w_msa(uint8_t *src,
PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
ST6x4_UB(out0, out1, dst, dst_stride);
ST_W2(out0, 0, 2, dst, dst_stride);
ST_H2(out0, 2, 6, dst + 4, dst_stride);
ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
dst += (4 * dst_stride);
ST6x4_UB(out2, out3, dst, dst_stride);
ST_W2(out2, 0, 2, dst, dst_stride);
ST_H2(out2, 2, 6, dst + 4, dst_stride);
ST_W2(out3, 0, 2, dst + 2 * dst_stride, dst_stride);
ST_H2(out3, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
}
static void hevc_vt_uniwgt_4t_8x2_msa(uint8_t *src,
@ -3562,7 +3579,7 @@ static void hevc_vt_uniwgt_4t_8x2_msa(uint8_t *src,
dst0, dst1);
out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
ST8x2_UB(out, dst, dst_stride);
ST_D2(out, 0, 1, dst, dst_stride);
}
static void hevc_vt_uniwgt_4t_8x4_msa(uint8_t *src,
@ -3617,7 +3634,7 @@ static void hevc_vt_uniwgt_4t_8x4_msa(uint8_t *src,
offset_vec, rnd_vec, dst0, dst1, dst2,
dst3);
PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
}
static void hevc_vt_uniwgt_4t_8x6_msa(uint8_t *src,
@ -3679,9 +3696,8 @@ static void hevc_vt_uniwgt_4t_8x6_msa(uint8_t *src,
HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec, rnd_vec,
dst4, dst5);
PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
ST8x4_UB(out0, out1, dst, dst_stride);
dst += (4 * dst_stride);
ST8x2_UB(out2, dst, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
}
static void hevc_vt_uniwgt_4t_8x8mult_msa(uint8_t *src,
@ -3754,7 +3770,7 @@ static void hevc_vt_uniwgt_4t_8x8mult_msa(uint8_t *src,
dst7);
PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
ST8x8_UB(out0, out1, out2, out3, dst, dst_stride);
ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
dst += (8 * dst_stride);
src2 = src10;
@ -3861,7 +3877,8 @@ static void hevc_vt_uniwgt_4t_12w_msa(uint8_t *src,
HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
rnd_vec, dst4, dst5);
PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
ST12x4_UB(out0, out1, out2, dst, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
dst += (4 * dst_stride);
ILVRL_B2_SB(src7, src6, src76_r, src76_l);
@ -3882,7 +3899,8 @@ static void hevc_vt_uniwgt_4t_12w_msa(uint8_t *src,
HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst10, dst11, weight_vec, offset_vec,
rnd_vec, dst10, dst11);
PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
ST12x4_UB(out3, out4, out5, dst, dst_stride);
ST_D4(out3, out4, 0, 1, 0, 1, dst, dst_stride);
ST_W4(out5, 0, 1, 2, 3, dst + 8, dst_stride);
dst += (4 * dst_stride);
src2 = src10;
@ -4062,7 +4080,7 @@ static void hevc_vt_uniwgt_4t_24w_msa(uint8_t *src,
out2, out3);
PCKEV_B2_UB(dst9, dst8, dst11, dst10, out4, out5);
ST_UB4(out0, out1, out2, out3, dst, dst_stride);
ST8x4_UB(out4, out5, dst + 16, dst_stride);
ST_D4(out4, out5, 0, 1, 0, 1, dst + 16, dst_stride);
dst += (4 * dst_stride);
src2 = src6;
@ -4231,7 +4249,7 @@ static void hevc_hv_uniwgt_4t_4x2_msa(uint8_t *src,
tmp += offset_vec;
tmp = CLIP_SH_0_255_MAX_SATU(tmp);
out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
ST4x2_UB(out, dst, dst_stride);
ST_W2(out, 0, 1, dst, dst_stride);
}
static void hevc_hv_uniwgt_4t_4x4_msa(uint8_t *src,
@ -4300,7 +4318,7 @@ static void hevc_hv_uniwgt_4t_4x4_msa(uint8_t *src,
ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}
static void hevc_hv_uniwgt_4t_4multx8mult_msa(uint8_t *src,
@ -4401,7 +4419,7 @@ static void hevc_hv_uniwgt_4t_4multx8mult_msa(uint8_t *src,
ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
ST4x8_UB(out0, out1, dst, dst_stride);
ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
dst10_r = dst98_r;
@ -4559,10 +4577,8 @@ static void hevc_hv_uniwgt_4t_6w_msa(uint8_t *src,
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
PCKEV_B3_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, out0, out1, out2);
ST4x8_UB(out0, out1, dst, dst_stride);
ST2x4_UB(out2, 0, dst + 4, dst_stride);
dst += 4 * dst_stride;
ST2x4_UB(out2, 4, dst + 4, dst_stride);
ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
}
static void hevc_hv_uniwgt_4t_8x2_msa(uint8_t *src,
@ -4638,7 +4654,7 @@ static void hevc_hv_uniwgt_4t_8x2_msa(uint8_t *src,
ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
ST8x2_UB(out, dst, dst_stride);
ST_D2(out, 0, 1, dst, dst_stride);
}
static void hevc_hv_uniwgt_4t_8multx4_msa(uint8_t *src,
@ -4731,7 +4747,7 @@ static void hevc_hv_uniwgt_4t_8multx4_msa(uint8_t *src,
ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
dst += 8;
}
}
@ -4848,9 +4864,8 @@ static void hevc_hv_uniwgt_4t_8x6_msa(uint8_t *src,
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
PCKEV_B3_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, out0, out1, out2);
ST8x4_UB(out0, out1, dst, dst_stride);
dst += (4 * dst_stride);
ST8x2_UB(out2, dst, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
}
static void hevc_hv_uniwgt_4t_8multx4mult_msa(uint8_t *src,
@ -4960,7 +4975,7 @@ static void hevc_hv_uniwgt_4t_8multx4mult_msa(uint8_t *src,
ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
ST8x4_UB(out0, out1, dst_tmp, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
dst_tmp += (4 * dst_stride);
dst10_r = dst54_r;
@ -5107,7 +5122,7 @@ static void hevc_hv_uniwgt_4t_12w_msa(uint8_t *src,
ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
ST8x4_UB(out0, out1, dst_tmp, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
dst_tmp += (4 * dst_stride);
dst10_r = dst54_r;
@ -5174,7 +5189,7 @@ static void hevc_hv_uniwgt_4t_12w_msa(uint8_t *src,
ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
ST4x8_UB(out0, out1, dst, dst_stride);
ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
dst10_r = dst98_r;

View File

@ -44,7 +44,7 @@ static void hevc_copy_4w_msa(uint8_t *src, int32_t src_stride,
src0 = (v16i8) __msa_ilvr_w((v4i32) src1, (v4i32) src0);
in0 = (v8i16) __msa_ilvr_b(zero, src0);
in0 <<= 6;
ST8x2_UB(in0, dst, 2 * dst_stride);
ST_D2(in0, 0, 1, dst, dst_stride);
} else if (4 == height) {
v16i8 src0, src1, src2, src3;
v8i16 in0, in1;
@ -55,7 +55,7 @@ static void hevc_copy_4w_msa(uint8_t *src, int32_t src_stride,
ILVR_B2_SH(zero, src0, zero, src1, in0, in1);
in0 <<= 6;
in1 <<= 6;
ST8x4_UB(in0, in1, dst, 2 * dst_stride);
ST_D4(in0, in1, 0, 1, 0, 1, dst, dst_stride);
} else if (0 == height % 8) {
v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
v8i16 in0, in1, in2, in3;
@ -71,7 +71,7 @@ static void hevc_copy_4w_msa(uint8_t *src, int32_t src_stride,
ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
in0, in1, in2, in3);
SLLI_4V(in0, in1, in2, in3, 6);
ST8x8_UB(in0, in1, in2, in3, dst, 2 * dst_stride);
ST_D8(in0, in1, in2, in3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
dst += (8 * dst_stride);
}
}
@ -183,7 +183,7 @@ static void hevc_copy_12w_msa(uint8_t *src, int32_t src_stride,
in0 <<= 6;
in1 <<= 6;
ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
ST8x4_UB(in0, in1, dst + 8, 2 * dst_stride);
ST_D4(in0, in1, 0, 1, 0, 1, dst + 8, dst_stride);
dst += (4 * dst_stride);
ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
@ -194,7 +194,7 @@ static void hevc_copy_12w_msa(uint8_t *src, int32_t src_stride,
in0 <<= 6;
in1 <<= 6;
ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
ST8x4_UB(in0, in1, dst + 8, 2 * dst_stride);
ST_D4(in0, in1, 0, 1, 0, 1, dst + 8, dst_stride);
dst += (4 * dst_stride);
}
}
@ -495,7 +495,7 @@ static void hevc_hz_8t_4w_msa(uint8_t *src, int32_t src_stride,
DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
dst3, dst3, dst3, dst3);
ST8x8_UB(dst0, dst1, dst2, dst3, dst, 2 * dst_stride);
ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
dst += (8 * dst_stride);
}
}
@ -1047,7 +1047,7 @@ static void hevc_vt_8t_4w_msa(uint8_t *src, int32_t src_stride,
DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);
ST8x8_UB(dst10, dst32, dst54, dst76, dst, 2 * dst_stride);
ST_D8(dst10, dst32, dst54, dst76, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
dst += (8 * dst_stride);
src2110 = src10998;
@ -1191,7 +1191,7 @@ static void hevc_vt_8t_12w_msa(uint8_t *src, int32_t src_stride,
dst1_l, dst1_l, dst1_l, dst1_l);
ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
ST8x4_UB(dst0_l, dst1_l, dst + 8, 2 * dst_stride);
ST_D4(dst0_l, dst1_l, 0, 1, 0, 1, dst + 8, dst_stride);
dst += (4 * dst_stride);
src10_r = src54_r;
@ -1363,7 +1363,6 @@ static void hevc_hv_8t_4w_msa(uint8_t *src, int32_t src_stride,
int32_t height)
{
uint32_t loop_cnt;
int32_t dst_stride_in_bytes = 2 * dst_stride;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
v8i16 filt0, filt1, filt2, filt3;
v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
@ -1452,7 +1451,7 @@ static void hevc_hv_8t_4w_msa(uint8_t *src, int32_t src_stride,
filt_h0, filt_h1, filt_h2, filt_h3);
SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst2_r);
ST8x4_UB(dst0_r, dst2_r, dst, dst_stride_in_bytes);
ST_D4(dst0_r, dst2_r, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
dst10_r = dst54_r;
@ -1607,7 +1606,6 @@ static void hevc_hv_8t_12w_msa(uint8_t *src, int32_t src_stride,
int32_t height)
{
uint32_t loop_cnt;
int32_t dst_stride_in_bytes = 2 * dst_stride;
uint8_t *src_tmp;
int16_t *dst_tmp;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
@ -1784,7 +1782,7 @@ static void hevc_hv_8t_12w_msa(uint8_t *src, int32_t src_stride,
filt_h1, filt_h2, filt_h3);
SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst2_r);
ST8x4_UB(dst0_r, dst2_r, dst, dst_stride_in_bytes);
ST_D4(dst0_r, dst2_r, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
dst10_r = dst54_r;
@ -1872,7 +1870,7 @@ static void hevc_hz_4t_4x2_msa(uint8_t *src,
dst0 = const_vec;
DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
ST8x2_UB(dst0, dst, 2 * dst_stride);
ST_D2(dst0, 0, 1, dst, dst_stride);
}
static void hevc_hz_4t_4x4_msa(uint8_t *src,
@ -1909,7 +1907,7 @@ static void hevc_hz_4t_4x4_msa(uint8_t *src,
dst1 = const_vec;
DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
ST8x4_UB(dst0, dst1, dst, 2 * dst_stride);
ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
}
static void hevc_hz_4t_4x8multiple_msa(uint8_t *src,
@ -1956,7 +1954,7 @@ static void hevc_hz_4t_4x8multiple_msa(uint8_t *src,
dst3 = const_vec;
DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
ST8x8_UB(dst0, dst1, dst2, dst3, dst, 2 * dst_stride);
ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
dst += (8 * dst_stride);
}
}
@ -2218,7 +2216,7 @@ static void hevc_hz_4t_12w_msa(uint8_t *src,
DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
ST8x4_UB(dst4, dst5, dst + 8, 2 * dst_stride);
ST_D4(dst4, dst5, 0, 1, 0, 1, dst + 8, dst_stride);
dst += (4 * dst_stride);
}
}
@ -2478,7 +2476,7 @@ static void hevc_vt_4t_4x2_msa(uint8_t *src,
dst10 = const_vec;
DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
ST8x2_UB(dst10, dst, 2 * dst_stride);
ST_D2(dst10, 0, 1, dst, dst_stride);
}
static void hevc_vt_4t_4x4_msa(uint8_t *src,
@ -2515,7 +2513,7 @@ static void hevc_vt_4t_4x4_msa(uint8_t *src,
dst32 = const_vec;
DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
ST8x4_UB(dst10, dst32, dst, 2 * dst_stride);
ST_D4(dst10, dst32, 0, 1, 0, 1, dst, dst_stride);
}
static void hevc_vt_4t_4x8_msa(uint8_t *src,
@ -2564,8 +2562,7 @@ static void hevc_vt_4t_4x8_msa(uint8_t *src,
DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
DPADD_SB2_SH(src8776, src10998, filt0, filt1, dst76, dst76);
ST8x8_UB(dst10, dst32, dst54, dst76, dst, 2 * dst_stride);
dst += (8 * dst_stride);
ST_D8(dst10, dst32, dst54, dst76, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
}
static void hevc_vt_4t_4x16_msa(uint8_t *src, int32_t src_stride,
@ -2610,7 +2607,7 @@ static void hevc_vt_4t_4x16_msa(uint8_t *src, int32_t src_stride,
DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
DPADD_SB2_SH(src8776, src10998, filt0, filt1, dst76, dst76);
ST8x8_UB(dst10, dst32, dst54, dst76, dst, 2 * dst_stride);
ST_D8(dst10, dst32, dst54, dst76, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
dst += (8 * dst_stride);
src2 = src10;
@ -2635,8 +2632,7 @@ static void hevc_vt_4t_4x16_msa(uint8_t *src, int32_t src_stride,
DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
DPADD_SB2_SH(src8776, src10998, filt0, filt1, dst76, dst76);
ST8x8_UB(dst10, dst32, dst54, dst76, dst, 2 * dst_stride);
dst += (8 * dst_stride);
ST_D8(dst10, dst32, dst54, dst76, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
}
static void hevc_vt_4t_4w_msa(uint8_t *src,
@ -2955,7 +2951,7 @@ static void hevc_vt_4t_12w_msa(uint8_t *src,
DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst1_l, dst1_l);
ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
ST8x4_UB(dst0_l, dst1_l, dst + 8, (2 * dst_stride));
ST_D4(dst0_l, dst1_l, 0, 1, 0, 1, dst + 8, dst_stride);
dst += (4 * dst_stride);
src2 = src6;
@ -3243,7 +3239,6 @@ static void hevc_hv_4t_4x2_msa(uint8_t *src,
const int8_t *filter_x,
const int8_t *filter_y)
{
int32_t dst_stride_in_bytes = 2 * dst_stride;
v16i8 src0, src1, src2, src3, src4;
v8i16 filt0, filt1;
v8i16 filt_h0, filt_h1;
@ -3288,7 +3283,7 @@ static void hevc_hv_4t_4x2_msa(uint8_t *src,
dst0 >>= 6;
dst1 >>= 6;
dst0 = (v4i32) __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
ST8x2_UB(dst0, dst, dst_stride_in_bytes);
ST_D2(dst0, 0, 1, dst, dst_stride);
}
static void hevc_hv_4t_4x4_msa(uint8_t *src,
@ -3298,7 +3293,6 @@ static void hevc_hv_4t_4x4_msa(uint8_t *src,
const int8_t *filter_x,
const int8_t *filter_y)
{
int32_t dst_stride_in_bytes = 2 * dst_stride;
v16i8 src0, src1, src2, src3, src4, src5, src6;
v8i16 filt0, filt1;
v8i16 filt_h0, filt_h1;
@ -3351,7 +3345,7 @@ static void hevc_hv_4t_4x4_msa(uint8_t *src,
dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
SRA_4V(dst0, dst1, dst2, dst3, 6);
PCKEV_H2_SW(dst1, dst0, dst3, dst2, dst0, dst2);
ST8x4_UB(dst0, dst2, dst, dst_stride_in_bytes);
ST_D4(dst0, dst2, 0, 1, 0, 1, dst, dst_stride);
}
@ -3442,7 +3436,7 @@ static void hevc_hv_4t_4multx8mult_msa(uint8_t *src,
SRA_4V(dst4, dst5, dst6, dst7, 6);
PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
dst0, dst1, dst2, dst3);
ST8x8_UB(dst0, dst1, dst2, dst3, dst, 2 * dst_stride);
ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
dst += (8 * dst_stride);
dst10_r = dst98_r;
@ -3479,7 +3473,6 @@ static void hevc_hv_4t_6w_msa(uint8_t *src,
const int8_t *filter_y,
int32_t height)
{
int32_t dst_stride_in_bytes = 2 * dst_stride;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
v8i16 filt0, filt1;
v8i16 filt_h0, filt_h1;
@ -3590,11 +3583,11 @@ static void hevc_hv_4t_6w_msa(uint8_t *src,
PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
ST8x4_UB(tmp0, tmp1, dst, dst_stride_in_bytes);
ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, dst + 4, dst_stride_in_bytes);
ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
ST_W4(tmp4, 0, 1, 2, 3, dst + 4, dst_stride);
dst += 4 * dst_stride;
ST8x4_UB(tmp2, tmp3, dst, dst_stride_in_bytes);
ST4x4_UB(tmp5, tmp5, 0, 1, 2, 3, dst + 4, dst_stride_in_bytes);
ST_D4(tmp2, tmp3, 0, 1, 0, 1, dst, dst_stride);
ST_W4(tmp5, 0, 1, 2, 3, dst + 4, dst_stride);
}
static void hevc_hv_4t_8x2_msa(uint8_t *src,
@ -4164,7 +4157,7 @@ static void hevc_hv_4t_12w_msa(uint8_t *src,
SRA_4V(tmp4, tmp5, tmp6, tmp7, 6);
PCKEV_H4_SW(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, tmp0, tmp1,
tmp2, tmp3);
ST8x8_UB(tmp0, tmp1, tmp2, tmp3, dst, 2 * dst_stride);
ST_D8(tmp0, tmp1, tmp2, tmp3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
dst += (8 * dst_stride);
dst10_r = dst98_r;

View File

@ -589,7 +589,7 @@ static void hevc_intra_pred_plane_4x4_msa(const uint8_t *src_top,
PCKEV_D2_SH(res1, res0, res3, res2, res0, res1);
SRARI_H2_SH(res0, res1, 3);
src_vec0 = __msa_pckev_b((v16i8) res1, (v16i8) res0);
ST4x4_UB(src_vec0, src_vec0, 0, 1, 2, 3, dst, stride);
ST_W4(src_vec0, 0, 1, 2, 3, dst, stride);
}
static void hevc_intra_pred_plane_8x8_msa(const uint8_t *src_top,
@ -656,7 +656,8 @@ static void hevc_intra_pred_plane_8x8_msa(const uint8_t *src_top,
PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
src_vec0, src_vec1, src_vec2, src_vec3);
ST8x8_UB(src_vec0, src_vec1, src_vec2, src_vec3, dst, stride);
ST_D8(src_vec0, src_vec1, src_vec2, src_vec3, 0, 1, 0, 1,
0, 1, 0, 1, dst, stride);
}
static void hevc_intra_pred_plane_16x16_msa(const uint8_t *src_top,
@ -1007,7 +1008,7 @@ static void hevc_intra_pred_angular_upper_4width_msa(const uint8_t *src_top,
SRARI_H2_SH(diff1, diff3, 5);
dst_val0 = __msa_pckev_b((v16i8) diff3, (v16i8) diff1);
ST4x4_UB(dst_val0, dst_val0, 0, 1, 2, 3, dst, stride);
ST_W4(dst_val0, 0, 1, 2, 3, dst, stride);
}
static void hevc_intra_pred_angular_upper_8width_msa(const uint8_t *src_top,
@ -1104,7 +1105,7 @@ static void hevc_intra_pred_angular_upper_8width_msa(const uint8_t *src_top,
SRARI_H4_SH(diff1, diff3, diff5, diff7, 5);
PCKEV_B2_UB(diff3, diff1, diff7, diff5, dst_val0, dst_val1);
ST8x4_UB(dst_val0, dst_val1, dst, stride);
ST_D4(dst_val0, dst_val1, 0, 1, 0, 1, dst, stride);
dst += (4 * stride);
}
}
@ -1425,9 +1426,8 @@ static void hevc_intra_pred_angular_lower_4width_msa(const uint8_t *src_top,
dst_val0 = __msa_pckev_b((v16i8) diff2, (v16i8) diff2);
dst_val1 = __msa_pckod_b((v16i8) diff2, (v16i8) diff2);
ST4x2_UB(dst_val0, dst, stride);
dst += (2 * stride);
ST4x2_UB(dst_val1, dst, stride);
ST_W2(dst_val0, 0, 1, dst, stride);
ST_W2(dst_val1, 0, 1, dst + 2 * stride, stride);
}
static void hevc_intra_pred_angular_lower_8width_msa(const uint8_t *src_top,
@ -1526,7 +1526,7 @@ static void hevc_intra_pred_angular_lower_8width_msa(const uint8_t *src_top,
dst_val0, dst_val1, dst_val2, dst_val3);
ILVR_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff0, diff1);
ILVRL_H2_SH(diff1, diff0, diff3, diff4);
ST4x8_UB(diff3, diff4, dst_org, stride);
ST_W8(diff3, diff4, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
dst += 4;
}
}
@ -1640,9 +1640,9 @@ static void hevc_intra_pred_angular_lower_16width_msa(const uint8_t *src_top,
ILVL_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff2, diff3);
ILVRL_H2_SH(diff1, diff0, diff4, diff5);
ILVRL_H2_SH(diff3, diff2, diff6, diff7);
ST4x8_UB(diff4, diff5, dst_org, stride);
ST_W8(diff4, diff5, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
dst_org += (8 * stride);
ST4x8_UB(diff6, diff7, dst_org, stride);
ST_W8(diff6, diff7, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
dst += 4;
}
}
@ -1746,23 +1746,14 @@ static void hevc_intra_pred_angular_lower_32width_msa(const uint8_t *src_top,
ILVRL_B2_SH(dst_val2, dst_val0, diff0, diff1);
ILVRL_B2_SH(dst_val3, dst_val1, diff2, diff3);
ST2x4_UB(diff0, 0, dst_org, stride);
dst_org += (4 * stride);
ST2x4_UB(diff0, 4, dst_org, stride);
dst_org += (4 * stride);
ST2x4_UB(diff1, 0, dst_org, stride);
dst_org += (4 * stride);
ST2x4_UB(diff1, 4, dst_org, stride);
dst_org += (4 * stride);
ST2x4_UB(diff2, 0, dst_org, stride);
dst_org += (4 * stride);
ST2x4_UB(diff2, 4, dst_org, stride);
dst_org += (4 * stride);
ST2x4_UB(diff3, 0, dst_org, stride);
dst_org += (4 * stride);
ST2x4_UB(diff3, 4, dst_org, stride);
dst_org += (4 * stride);
ST_H8(diff0, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride)
dst_org += (8 * stride);
ST_H8(diff1, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride)
dst_org += (8 * stride);
ST_H8(diff2, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride)
dst_org += (8 * stride);
ST_H8(diff3, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride)
dst_org += (8 * stride);
dst += 2;
}

View File

@ -49,7 +49,7 @@
PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m); \
PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \
AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \
ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \
ST_D4(tmp0_m, tmp1_m, 0, 1, 0, 1, pdst_m, stride); \
}
static void common_hz_bil_4w_msa(const uint8_t *src, int32_t src_stride,
@ -584,7 +584,7 @@ static void common_hv_bil_8w_msa(const uint8_t *src, int32_t src_stride,
sum0, sum1, sum2, sum3);
SRARI_H4_UH(sum0, sum1, sum2, sum3, 2);
PCKEV_B2_SB(sum1, sum0, sum3, sum2, src0, src1);
ST8x4_UB(src0, src1, dst, dst_stride);
ST_D4(src0, src1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
src0 = src4;
}
@ -689,9 +689,9 @@ static void common_hv_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride,
SRA_4V(sum0, sum1, sum2, sum3, 2);
SRA_4V(sum4, sum5, sum6, sum7, 2);
PCKEV_B2_SB(sum1, sum0, sum3, sum2, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
PCKEV_B2_SB(sum5, sum4, sum7, sum6, out0, out1);
ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
}
static void common_hv_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride,
@ -723,7 +723,7 @@ static void common_hv_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride,
SRA_4V(sum0, sum1, sum2, sum3, 2);
PCKEV_B2_SB(sum1, sum0, sum3, sum2, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
}
static void common_hv_bil_no_rnd_16x16_msa(const uint8_t *src,

File diff suppressed because it is too large Load Diff

View File

@ -47,7 +47,6 @@ static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type)
v4i32 cnst8w = {8, 8, 8, 8};
v4i32 cnst2048w = {2048, 2048, 2048, 2048};
v4i32 cnst128w = {128, 128, 128, 128};
int nstride = stride;
/* Extended input data */
LD_SH8(input, 8, r0, r1, r2, r3, r4, r5, r6, r7);
@ -386,20 +385,14 @@ static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type)
VSHF_B2_SB(r2_l, r6_l, r3_l, r7_l, mask, mask, d6, d7);
/* Final sequence of operations over-write original dst */
ST8x1_UB(d0, dst);
ST8x1_UB(d1, dst + nstride);
nstride += stride;
ST8x1_UB(d2, dst + nstride);
nstride += stride;
ST8x1_UB(d3, dst + nstride);
nstride += stride;
ST8x1_UB(d4, dst + nstride);
nstride += stride;
ST8x1_UB(d5, dst + nstride);
nstride += stride;
ST8x1_UB(d6, dst + nstride);
nstride += stride;
ST8x1_UB(d7, dst + nstride);
ST_D1(d0, 0, dst);
ST_D1(d1, 0, dst + stride);
ST_D1(d2, 0, dst + 2 * stride);
ST_D1(d3, 0, dst + 3 * stride);
ST_D1(d4, 0, dst + 4 * stride);
ST_D1(d5, 0, dst + 5 * stride);
ST_D1(d6, 0, dst + 6 * stride);
ST_D1(d7, 0, dst + 7 * stride);
}
void ff_vp3_idct_put_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
@ -424,7 +417,6 @@ void ff_vp3_idct_dc_add_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
v4i32 r0, r1, r2, r3, r4, r5, r6, r7;
v16i8 mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0};
v16i8 zero = {0};
int nstride = line_size;
LD_SB8(dest, line_size, d0, d1, d2, d3, d4, d5, d6, d7);
ILVR_B4_SW(zero, d0, zero, d1, zero, d2, zero, d3,
@ -480,20 +472,14 @@ void ff_vp3_idct_dc_add_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
VSHF_B2_SB(e6, r6, e7, r7, mask, mask, d6, d7);
/* Final sequence of operations over-write original dst */
ST8x1_UB(d0, dest);
ST8x1_UB(d1, dest + nstride);
nstride += line_size;
ST8x1_UB(d2, dest + nstride);
nstride += line_size;
ST8x1_UB(d3, dest + nstride);
nstride += line_size;
ST8x1_UB(d4, dest + nstride);
nstride += line_size;
ST8x1_UB(d5, dest + nstride);
nstride += line_size;
ST8x1_UB(d6, dest + nstride);
nstride += line_size;
ST8x1_UB(d7, dest + nstride);
ST_D1(d0, 0, dest);
ST_D1(d1, 0, dest + line_size);
ST_D1(d2, 0, dest + 2 * line_size);
ST_D1(d3, 0, dest + 3 * line_size);
ST_D1(d4, 0, dest + 4 * line_size);
ST_D1(d5, 0, dest + 5 * line_size);
ST_D1(d6, 0, dest + 6 * line_size);
ST_D1(d7, 0, dest + 7 * line_size);
block[0] = 0;
}
@ -537,8 +523,8 @@ void ff_vp3_v_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride,
VSHF_B2_SB(f0, f1, g0, g1, mask, mask, d1, d2);
/* Final move to first_pixel */
ST8x1_UB(d1, first_pixel + nstride);
ST8x1_UB(d2, first_pixel);
ST_D1(d1, 0, first_pixel + nstride);
ST_D1(d2, 0, first_pixel);
}
void ff_vp3_h_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride,
@ -583,8 +569,8 @@ void ff_vp3_h_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride,
g1 = CLIP_SW_0_255(g1);
VSHF_B2_SB(f0, g0, f1, g1, mask, mask, d1, d2);
/* Final move to first_pixel */
ST2x4_UB(d1, 0, first_pixel - 1, stride);
ST2x4_UB(d2, 0, first_pixel - 1 + 4 * stride, stride);
ST_H4(d1, 0, 1, 2, 3, first_pixel - 1, stride);
ST_H4(d2, 0, 1, 2, 3, first_pixel - 1 + 4 * stride, stride);
}
void ff_put_no_rnd_pixels_l2_msa(uint8_t *dst, const uint8_t *src1,
@ -641,10 +627,8 @@ void ff_put_no_rnd_pixels_l2_msa(uint8_t *dst, const uint8_t *src1,
f2 = (v4i32) __msa_and_v((v16u8)a3, (v16u8)b3);
t3 = t3 + (v4u32)f2;
ST4x4_UB(t0, t0, 0, 1, 2, 3, dst, stride);
ST4x4_UB(t1, t1, 0, 1, 2, 3, dst + 4 * stride, stride);
ST4x4_UB(t2, t2, 0, 1, 2, 3, dst + 4, stride);
ST4x4_UB(t3, t3, 0, 1, 2, 3, dst + 4 + 4 * stride, stride);
ST_W8(t0, t1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
ST_W8(t2, t3, 0, 1, 2, 3, 0, 1, 2, 3, dst + 4, stride);
} else {
int i;

View File

@ -76,7 +76,8 @@ void ff_vp8_idct_add_msa(uint8_t *dst, int16_t input[16], ptrdiff_t stride)
res2 = CLIP_SW_0_255(res2);
res3 = CLIP_SW_0_255(res3);
VSHF_B2_SB(res0, res1, res2, res3, mask, mask, dest0, dest1);
ST4x4_UB(dest0, dest1, 0, 1, 0, 1, dst, stride);
ST_W2(dest0, 0, 1, dst, stride);
ST_W2(dest1, 0, 1, dst + 2 * stride, stride);
memset(input, 0, 4 * 4 * sizeof(*input));
}
@ -97,7 +98,8 @@ void ff_vp8_idct_dc_add_msa(uint8_t *dst, int16_t in_dc[16], ptrdiff_t stride)
ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3);
CLIP_SH4_0_255(res0, res1, res2, res3);
VSHF_B2_SB(res0, res1, res2, res3, mask, mask, dest0, dest1);
ST4x4_UB(dest0, dest1, 0, 1, 0, 1, dst, stride);
ST_W2(dest0, 0, 1, dst, stride);
ST_W2(dest1, 0, 1, dst + 2 * stride, stride);
in_dc[0] = 0;
}

View File

@ -540,14 +540,8 @@ void ff_vp8_h_loop_filter_simple_msa(uint8_t *src, ptrdiff_t pitch,
ILVRL_B2_SH(q0, p0, tmp1, tmp0);
src -= 1;
ST2x4_UB(tmp1, 0, src, pitch);
src += 4 * pitch;
ST2x4_UB(tmp1, 4, src, pitch);
src += 4 * pitch;
ST2x4_UB(tmp0, 0, src, pitch);
src += 4 * pitch;
ST2x4_UB(tmp0, 4, src, pitch);
src += 4 * pitch;
ST_H8(tmp1, 0, 1, 2, 3, 4, 5, 6, 7, src, pitch)
ST_H8(tmp0, 0, 1, 2, 3, 4, 5, 6, 7, src + 8 * pitch, pitch)
}
void ff_vp8_v_loop_filter8uv_inner_msa(uint8_t *src_u, uint8_t *src_v,
@ -596,7 +590,6 @@ void ff_vp8_h_loop_filter8uv_inner_msa(uint8_t *src_u, uint8_t *src_v,
ptrdiff_t pitch, int b_limit_in,
int limit_in, int thresh_in)
{
uint8_t *temp_src_u, *temp_src_v;
v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
v16u8 mask, hev, flat, thresh, limit, b_limit;
v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
@ -623,15 +616,8 @@ void ff_vp8_h_loop_filter8uv_inner_msa(uint8_t *src_u, uint8_t *src_v,
tmp1 = (v4i32) __msa_ilvl_b((v16i8) q1, (v16i8) q0);
ILVRL_H2_SW(tmp1, tmp0, tmp4, tmp5);
temp_src_u = src_u - 2;
ST4x4_UB(tmp2, tmp2, 0, 1, 2, 3, temp_src_u, pitch);
temp_src_u += 4 * pitch;
ST4x4_UB(tmp3, tmp3, 0, 1, 2, 3, temp_src_u, pitch);
temp_src_v = src_v - 2;
ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, temp_src_v, pitch);
temp_src_v += 4 * pitch;
ST4x4_UB(tmp5, tmp5, 0, 1, 2, 3, temp_src_v, pitch);
ST_W8(tmp2, tmp3, 0, 1, 2, 3, 0, 1, 2, 3, src_u - 2, pitch);
ST_W8(tmp4, tmp5, 0, 1, 2, 3, 0, 1, 2, 3, src_v - 2, pitch);
}
void ff_vp8_v_loop_filter16_inner_msa(uint8_t *src, ptrdiff_t pitch,
@ -684,7 +670,6 @@ void ff_vp8_h_loop_filter16_inner_msa(uint8_t *src, ptrdiff_t pitch,
ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);
src -= 2;
ST4x8_UB(tmp2, tmp3, src, pitch);
src += (8 * pitch);
ST4x8_UB(tmp4, tmp5, src, pitch);
ST_W8(tmp2, tmp3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch)
ST_W8(tmp4, tmp5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch)
}

View File

@ -181,7 +181,7 @@ static void common_hz_6t_4x4_msa(uint8_t *src, int32_t src_stride,
SRARI_H2_SH(out0, out1, 7);
SAT_SH2_SH(out0, out1, 7);
out = PCKEV_XORI128_UB(out0, out1);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}
static void common_hz_6t_4x8_msa(uint8_t *src, int32_t src_stride,
@ -214,10 +214,9 @@ static void common_hz_6t_4x8_msa(uint8_t *src, int32_t src_stride,
SRARI_H4_SH(out0, out1, out2, out3, 7);
SAT_SH4_SH(out0, out1, out2, out3, 7);
out = PCKEV_XORI128_UB(out0, out1);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
out = PCKEV_XORI128_UB(out2, out3);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
}
void ff_put_vp8_epel4_h6_msa(uint8_t *dst, ptrdiff_t dst_stride,
@ -263,7 +262,7 @@ void ff_put_vp8_epel8_h6_msa(uint8_t *dst, ptrdiff_t dst_stride,
SAT_SH4_SH(out0, out1, out2, out3, 7);
tmp0 = PCKEV_XORI128_UB(out0, out1);
tmp1 = PCKEV_XORI128_UB(out2, out3);
ST8x4_UB(tmp0, tmp1, dst, dst_stride);
ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
@ -276,7 +275,7 @@ void ff_put_vp8_epel8_h6_msa(uint8_t *dst, ptrdiff_t dst_stride,
SAT_SH4_SH(out0, out1, out2, out3, 7);
tmp0 = PCKEV_XORI128_UB(out0, out1);
tmp1 = PCKEV_XORI128_UB(out2, out3);
ST8x4_UB(tmp0, tmp1, dst, dst_stride);
ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
@ -368,7 +367,7 @@ void ff_put_vp8_epel4_v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
SRARI_H2_SH(out10, out32, 7);
SAT_SH2_SH(out10, out32, 7);
out = PCKEV_XORI128_UB(out10, out32);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
src2110 = src6554;
@ -416,7 +415,7 @@ void ff_put_vp8_epel8_v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
ST8x4_UB(tmp0, tmp1, dst, dst_stride);
ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
src10_r = src76_r;
@ -567,7 +566,7 @@ void ff_put_vp8_epel4_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
SRARI_H2_SH(tmp0, tmp1, 7);
SAT_SH2_SH(tmp0, tmp1, 7);
out = PCKEV_XORI128_UB(tmp0, tmp1);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
hz_out3 = hz_out7;
@ -651,7 +650,7 @@ void ff_put_vp8_epel8_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
ST8x4_UB(vec0, vec1, dst, dst_stride);
ST_D4(vec0, vec1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
hz_out4 = hz_out8;
@ -702,7 +701,7 @@ static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride,
SRARI_H2_SH(out0, out1, 7);
SAT_SH2_SH(out0, out1, 7);
out = PCKEV_XORI128_UB(out0, out1);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}
static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride,
@ -735,10 +734,9 @@ static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride,
SRARI_H4_SH(out0, out1, out2, out3, 7);
SAT_SH4_SH(out0, out1, out2, out3, 7);
out = PCKEV_XORI128_UB(out0, out1);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
out = PCKEV_XORI128_UB(out2, out3);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
}
static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride,
@ -769,10 +767,10 @@ static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride,
SRARI_H4_SH(out0, out1, out2, out3, 7);
SAT_SH4_SH(out0, out1, out2, out3, 7);
out = PCKEV_XORI128_UB(out0, out1);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
out = PCKEV_XORI128_UB(out2, out3);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
@ -785,10 +783,10 @@ static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride,
SRARI_H4_SH(out0, out1, out2, out3, 7);
SAT_SH4_SH(out0, out1, out2, out3, 7);
out = PCKEV_XORI128_UB(out0, out1);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
out = PCKEV_XORI128_UB(out2, out3);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}
void ff_put_vp8_epel4_h4_msa(uint8_t *dst, ptrdiff_t dst_stride,
@ -836,7 +834,7 @@ void ff_put_vp8_epel8_h4_msa(uint8_t *dst, ptrdiff_t dst_stride,
SAT_SH4_SH(out0, out1, out2, out3, 7);
tmp0 = PCKEV_XORI128_UB(out0, out1);
tmp1 = PCKEV_XORI128_UB(out2, out3);
ST8x4_UB(tmp0, tmp1, dst, dst_stride);
ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
@ -932,7 +930,7 @@ void ff_put_vp8_epel4_v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
SRARI_H2_SH(out10, out32, 7);
SAT_SH2_SH(out10, out32, 7);
out = PCKEV_XORI128_UB(out10, out32);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
}
}
@ -974,7 +972,7 @@ void ff_put_vp8_epel8_v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
ST8x4_UB(tmp0, tmp1, dst, dst_stride);
ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
src10_r = src98_r;
@ -1093,7 +1091,7 @@ void ff_put_vp8_epel4_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
SRARI_H2_SH(tmp0, tmp1, 7);
SAT_SH2_SH(tmp0, tmp1, 7);
out = PCKEV_XORI128_UB(tmp0, tmp1);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
hz_out1 = hz_out5;
@ -1160,7 +1158,7 @@ void ff_put_vp8_epel8_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
out0 = PCKEV_XORI128_UB(tmp0, tmp1);
out1 = PCKEV_XORI128_UB(tmp2, tmp3);
ST8x4_UB(out0, out1, dst, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
vec0 = vec4;
@ -1240,7 +1238,8 @@ void ff_put_vp8_epel4_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
SAT_SH2_SH(tmp0, tmp1, 7);
PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
XORI_B2_128_UB(res0, res1);
ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
ST_W2(res0, 0, 1, dst, dst_stride);
ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
dst += (4 * dst_stride);
hz_out1 = hz_out5;
@ -1316,7 +1315,7 @@ void ff_put_vp8_epel8_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
out0 = PCKEV_XORI128_UB(tmp0, tmp1);
out1 = PCKEV_XORI128_UB(tmp2, tmp3);
ST8x4_UB(out0, out1, dst, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
@ -1391,7 +1390,7 @@ void ff_put_vp8_epel4_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
SRARI_H2_SH(tmp0, tmp1, 7);
SAT_SH2_SH(tmp0, tmp1, 7);
out = PCKEV_XORI128_UB(tmp0, tmp1);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
hz_out3 = hz_out7;
@ -1464,7 +1463,7 @@ void ff_put_vp8_epel8_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
ST8x4_UB(vec0, vec1, dst, dst_stride);
ST_D4(vec0, vec1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
hz_out4 = hz_out8;
@ -1509,7 +1508,8 @@ static void common_hz_2t_4x4_msa(uint8_t *src, int32_t src_stride,
DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
SRARI_H2_UH(vec2, vec3, 7);
PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
ST_W2(res0, 0, 1, dst, dst_stride);
ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
}
static void common_hz_2t_4x8_msa(uint8_t *src, int32_t src_stride,
@ -1535,9 +1535,10 @@ static void common_hz_2t_4x8_msa(uint8_t *src, int32_t src_stride,
SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
res0, res1, res2, res3);
ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
ST_W2(res0, 0, 1, dst, dst_stride);
ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
ST_W2(res2, 0, 1, dst + 4 * dst_stride, dst_stride);
ST_W2(res3, 0, 1, dst + 6 * dst_stride, dst_stride);
}
void ff_put_vp8_bilinear4_h_msa(uint8_t *dst, ptrdiff_t dst_stride,
@ -1574,7 +1575,7 @@ static void common_hz_2t_8x4_msa(uint8_t *src, int32_t src_stride,
vec0, vec1, vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
ST8x4_UB(src0, src1, dst, dst_stride);
ST_D4(src0, src1, 0, 1, 0, 1, dst, dst_stride);
}
static void common_hz_2t_8x8mult_msa(uint8_t *src, int32_t src_stride,
@ -1604,8 +1605,7 @@ static void common_hz_2t_8x8mult_msa(uint8_t *src, int32_t src_stride,
src += (4 * src_stride);
PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride);
dst += (4 * dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
@ -1613,8 +1613,8 @@ static void common_hz_2t_8x8mult_msa(uint8_t *src, int32_t src_stride,
vec0, vec1, vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride);
dst += (4 * dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
dst += (8 * dst_stride);
if (16 == height) {
LD_SB4(src, src_stride, src0, src1, src2, src3);
@ -1629,7 +1629,7 @@ static void common_hz_2t_8x8mult_msa(uint8_t *src, int32_t src_stride,
src += (4 * src_stride);
PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
@ -1637,7 +1637,7 @@ static void common_hz_2t_8x8mult_msa(uint8_t *src, int32_t src_stride,
vec0, vec1, vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
}
}
@ -1745,7 +1745,7 @@ static void common_vt_2t_4x4_msa(uint8_t *src, int32_t src_stride,
SRARI_H2_UH(tmp0, tmp1, 7);
SAT_UH2_UH(tmp0, tmp1, 7);
src2110 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
ST_W4(src2110, 0, 1, 2, 3, dst, dst_stride);
}
static void common_vt_2t_4x8_msa(uint8_t *src, int32_t src_stride,
@ -1779,8 +1779,7 @@ static void common_vt_2t_4x8_msa(uint8_t *src, int32_t src_stride,
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
ST_W8(src2110, src4332, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
}
void ff_put_vp8_bilinear4_v_msa(uint8_t *dst, ptrdiff_t dst_stride,
@ -1817,7 +1816,7 @@ static void common_vt_2t_8x4_msa(uint8_t *src, int32_t src_stride,
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
}
static void common_vt_2t_8x8mult_msa(uint8_t *src, int32_t src_stride,
@ -1851,16 +1850,15 @@ static void common_vt_2t_8x8mult_msa(uint8_t *src, int32_t src_stride,
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride);
dst += (4 * dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
tmp0, tmp1, tmp2, tmp3);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride);
dst += (4 * dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
dst += (8 * dst_stride);
src0 = src8;
}
@ -1964,7 +1962,8 @@ static void common_hv_2ht_2vt_4x4_msa(uint8_t *src, int32_t src_stride,
SRARI_H2_UH(tmp0, tmp1, 7);
SAT_UH2_UH(tmp0, tmp1, 7);
PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
ST_W2(res0, 0, 1, dst, dst_stride);
ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
}
static void common_hv_2ht_2vt_4x8_msa(uint8_t *src, int32_t src_stride,
@ -2008,9 +2007,10 @@ static void common_hv_2ht_2vt_4x8_msa(uint8_t *src, int32_t src_stride,
SAT_UH4_UH(vec4, vec5, vec6, vec7, 7);
PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
res0, res1, res2, res3);
ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
ST_W2(res0, 0, 1, dst, dst_stride);
ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
ST_W2(res2, 0, 1, dst + 4 * dst_stride, dst_stride);
ST_W2(res3, 0, 1, dst + 6 * dst_stride, dst_stride);
}
void ff_put_vp8_bilinear4_hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
@ -2070,7 +2070,7 @@ static void common_hv_2ht_2vt_8x4_msa(uint8_t *src, int32_t src_stride,
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
}
static void common_hv_2ht_2vt_8x8mult_msa(uint8_t *src, int32_t src_stride,
@ -2127,8 +2127,7 @@ static void common_hv_2ht_2vt_8x8mult_msa(uint8_t *src, int32_t src_stride,
SRARI_H2_UH(tmp3, tmp4, 7);
SAT_UH2_UH(tmp3, tmp4, 7);
PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride);
dst += (4 * dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
@ -2149,8 +2148,8 @@ static void common_hv_2ht_2vt_8x8mult_msa(uint8_t *src, int32_t src_stride,
SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, 7);
SAT_UH4_UH(tmp5, tmp6, tmp7, tmp8, 7);
PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride);
dst += (4 * dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
dst += (8 * dst_stride);
}
}

View File

@ -241,7 +241,7 @@ static const int32_t sinpi_4_9 = 15212;
res0_m, res1_m, res2_m, res3_m); \
CLIP_SH4_0_255(res0_m, res1_m, res2_m, res3_m); \
PCKEV_B2_SB(res1_m, res0_m, res3_m, res2_m, tmp0_m, tmp1_m); \
ST8x4_UB(tmp0_m, tmp1_m, dst_m, dst_stride); \
ST_D4(tmp0_m, tmp1_m, 0, 1, 0, 1, dst_m, dst_stride); \
}
#define VP9_IDCT4x4(in0, in1, in2, in3, out0, out1, out2, out3) \
@ -364,7 +364,10 @@ static void vp9_idct4x4_colcol_addblk_msa(int16_t *input, uint8_t *dst,
v8i16 zero = { 0 };
/* load vector elements of 4x4 block */
LD4x4_SH(input, in0, in1, in2, in3);
in0 = LD_SH(input);
in2 = LD_SH(input + 8);
in1 = (v8i16) __msa_ilvl_d((v2i64) in0, (v2i64) in0);
in3 = (v8i16) __msa_ilvl_d((v2i64) in2, (v2i64) in2);
ST_SH2(zero, zero, input, 8);
/* rows */
VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
@ -383,7 +386,10 @@ static void vp9_iadst4x4_colcol_addblk_msa(int16_t *input, uint8_t *dst,
v8i16 zero = { 0 };
/* load vector elements of 4x4 block */
LD4x4_SH(input, in0, in1, in2, in3);
in0 = LD_SH(input);
in2 = LD_SH(input + 8);
in1 = (v8i16) __msa_ilvl_d((v2i64) in0, (v2i64) in0);
in3 = (v8i16) __msa_ilvl_d((v2i64) in2, (v2i64) in2);
ST_SH2(zero, zero, input, 8);
/* rows */
VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3);
@ -402,7 +408,10 @@ static void vp9_iadst_idct_4x4_add_msa(int16_t *input, uint8_t *dst,
v8i16 zero = { 0 };
/* load vector elements of 4x4 block */
LD4x4_SH(input, in0, in1, in2, in3);
in0 = LD_SH(input);
in2 = LD_SH(input + 8);
in1 = (v8i16) __msa_ilvl_d((v2i64) in0, (v2i64) in0);
in3 = (v8i16) __msa_ilvl_d((v2i64) in2, (v2i64) in2);
ST_SH2(zero, zero, input, 8);
/* cols */
VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3);
@ -421,7 +430,10 @@ static void vp9_idct_iadst_4x4_add_msa(int16_t *input, uint8_t *dst,
v8i16 zero = { 0 };
/* load vector elements of 4x4 block */
LD4x4_SH(input, in0, in1, in2, in3);
in0 = LD_SH(input);
in2 = LD_SH(input + 8);
in1 = (v8i16) __msa_ilvl_d((v2i64) in0, (v2i64) in0);
in3 = (v8i16) __msa_ilvl_d((v2i64) in2, (v2i64) in2);
ST_SH2(zero, zero, input, 8);
/* cols */
VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
@ -753,13 +765,13 @@ static void vp9_iadst8x8_colcol_addblk_msa(int16_t *input, uint8_t *dst,
res0 += out0;
res0 = CLIP_SH_0_255(res0);
res0 = (v8i16) __msa_pckev_b((v16i8) res0, (v16i8) res0);
ST8x1_UB(res0, dst);
ST_D1(res0, 0, dst);
res7 = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) dst7);
res7 += out7;
res7 = CLIP_SH_0_255(res7);
res7 = (v8i16) __msa_pckev_b((v16i8) res7, (v16i8) res7);
ST8x1_UB(res7, dst + 7 * dst_stride);
ST_D1(res7, 0, dst + 7 * dst_stride);
cnst1 = __msa_fill_h(cospi_24_64);
cnst0 = __msa_fill_h(cospi_8_64);
@ -782,8 +794,8 @@ static void vp9_iadst8x8_colcol_addblk_msa(int16_t *input, uint8_t *dst,
ADD2(res1, out1, res6, out6, res1, res6);
CLIP_SH2_0_255(res1, res6);
PCKEV_B2_SH(res1, res1, res6, res6, res1, res6);
ST8x1_UB(res1, dst + dst_stride);
ST8x1_UB(res6, dst + 6 * dst_stride);
ST_D1(res1, 0, dst + dst_stride);
ST_D1(res6, 0, dst + 6 * dst_stride);
cnst0 = __msa_fill_h(cospi_16_64);
cnst1 = -cnst0;
@ -801,8 +813,8 @@ static void vp9_iadst8x8_colcol_addblk_msa(int16_t *input, uint8_t *dst,
ADD2(res3, out3, res4, out4, res3, res4);
CLIP_SH2_0_255(res3, res4);
PCKEV_B2_SH(res3, res3, res4, res4, res3, res4);
ST8x1_UB(res3, dst + 3 * dst_stride);
ST8x1_UB(res4, dst + 4 * dst_stride);
ST_D1(res3, 0, dst + 3 * dst_stride);
ST_D1(res4, 0, dst + 4 * dst_stride);
out2 = VP9_DOT_SHIFT_RIGHT_PCK_H(temp2, temp3, cnst0);
out5 = VP9_DOT_SHIFT_RIGHT_PCK_H(temp2, temp3, cnst1);
@ -814,8 +826,8 @@ static void vp9_iadst8x8_colcol_addblk_msa(int16_t *input, uint8_t *dst,
ADD2(res2, out2, res5, out5, res2, res5);
CLIP_SH2_0_255(res2, res5);
PCKEV_B2_SH(res2, res2, res5, res5, res2, res5);
ST8x1_UB(res2, dst + 2 * dst_stride);
ST8x1_UB(res5, dst + 5 * dst_stride);
ST_D1(res2, 0, dst + 2 * dst_stride);
ST_D1(res5, 0, dst + 5 * dst_stride);
}
static void vp9_iadst_idct_8x8_add_msa(int16_t *input, uint8_t *dst,
@ -1354,8 +1366,8 @@ static void vp9_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
ADD2(res0, out0, res1, out1, res0, res1);
CLIP_SH2_0_255(res0, res1);
PCKEV_B2_SH(res0, res0, res1, res1, res0, res1);
ST8x1_UB(res0, dst);
ST8x1_UB(res1, dst + 15 * dst_stride);
ST_D1(res0, 0, dst);
ST_D1(res1, 0, dst + 15 * dst_stride);
k0 = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64);
k1 = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64);
@ -1371,8 +1383,8 @@ static void vp9_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
ADD2(res8, out8, res9, out9, res8, res9);
CLIP_SH2_0_255(res8, res9);
PCKEV_B2_SH(res8, res8, res9, res9, res8, res9);
ST8x1_UB(res8, dst + dst_stride);
ST8x1_UB(res9, dst + 14 * dst_stride);
ST_D1(res8, 0, dst + dst_stride);
ST_D1(res9, 0, dst + 14 * dst_stride);
k0 = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);
k1 = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);
@ -1386,8 +1398,8 @@ static void vp9_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
ADD2(res4, out4, res5, out5, res4, res5);
CLIP_SH2_0_255(res4, res5);
PCKEV_B2_SH(res4, res4, res5, res5, res4, res5);
ST8x1_UB(res4, dst + 3 * dst_stride);
ST8x1_UB(res5, dst + 12 * dst_stride);
ST_D1(res4, 0, dst + 3 * dst_stride);
ST_D1(res5, 0, dst + 12 * dst_stride);
VP9_MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15);
out13 = -out13;
@ -1398,8 +1410,8 @@ static void vp9_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
ADD2(res12, out12, res13, out13, res12, res13);
CLIP_SH2_0_255(res12, res13);
PCKEV_B2_SH(res12, res12, res13, res13, res12, res13);
ST8x1_UB(res12, dst + 2 * dst_stride);
ST8x1_UB(res13, dst + 13 * dst_stride);
ST_D1(res12, 0, dst + 2 * dst_stride);
ST_D1(res13, 0, dst + 13 * dst_stride);
k0 = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);
k3 = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64);
@ -1411,8 +1423,8 @@ static void vp9_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
ADD2(res6, out6, res7, out7, res6, res7);
CLIP_SH2_0_255(res6, res7);
PCKEV_B2_SH(res6, res6, res7, res7, res6, res7);
ST8x1_UB(res6, dst + 4 * dst_stride);
ST8x1_UB(res7, dst + 11 * dst_stride);
ST_D1(res6, 0, dst + 4 * dst_stride);
ST_D1(res7, 0, dst + 11 * dst_stride);
VP9_MADD_SHORT(out10, out11, k0, k3, out10, out11);
SRARI_H2_SH(out10, out11, 6);
@ -1422,8 +1434,8 @@ static void vp9_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
ADD2(res10, out10, res11, out11, res10, res11);
CLIP_SH2_0_255(res10, res11);
PCKEV_B2_SH(res10, res10, res11, res11, res10, res11);
ST8x1_UB(res10, dst + 6 * dst_stride);
ST8x1_UB(res11, dst + 9 * dst_stride);
ST_D1(res10, 0, dst + 6 * dst_stride);
ST_D1(res11, 0, dst + 9 * dst_stride);
k1 = VP9_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64);
k2 = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);
@ -1435,8 +1447,8 @@ static void vp9_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
ADD2(res2, out2, res3, out3, res2, res3);
CLIP_SH2_0_255(res2, res3);
PCKEV_B2_SH(res2, res2, res3, res3, res2, res3);
ST8x1_UB(res2, dst + 7 * dst_stride);
ST8x1_UB(res3, dst + 8 * dst_stride);
ST_D1(res2, 0, dst + 7 * dst_stride);
ST_D1(res3, 0, dst + 8 * dst_stride);
VP9_MADD_SHORT(out14, out15, k1, k2, out14, out15);
SRARI_H2_SH(out14, out15, 6);
@ -1446,8 +1458,8 @@ static void vp9_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
ADD2(res14, out14, res15, out15, res14, res15);
CLIP_SH2_0_255(res14, res15);
PCKEV_B2_SH(res14, res14, res15, res15, res14, res15);
ST8x1_UB(res14, dst + 5 * dst_stride);
ST8x1_UB(res15, dst + 10 * dst_stride);
ST_D1(res14, 0, dst + 5 * dst_stride);
ST_D1(res15, 0, dst + 10 * dst_stride);
}
static void vp9_iadst16x16_colcol_addblk_msa(int16_t *input, uint8_t *dst,

View File

@ -378,7 +378,8 @@ void ff_tm_4x4_msa(uint8_t *dst, ptrdiff_t dst_stride,
IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride);
ST_W2(tmp0, 0, 2, dst, dst_stride);
ST_W2(tmp1, 0, 2, dst + 2 * dst_stride, dst_stride);
}
void ff_tm_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride,
@ -409,7 +410,7 @@ void ff_tm_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride,
IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
ST8x4_UB(tmp0, tmp1, dst, dst_stride);
ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}

View File

@ -1219,9 +1219,7 @@ void ff_loop_filter_h_4_8_msa(uint8_t *src, ptrdiff_t pitch,
ILVRL_H2_SH(vec1, vec0, vec2, vec3);
src -= 2;
ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
src += 4 * pitch;
ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
}
void ff_loop_filter_h_44_16_msa(uint8_t *src, ptrdiff_t pitch,
@ -1266,9 +1264,8 @@ void ff_loop_filter_h_44_16_msa(uint8_t *src, ptrdiff_t pitch,
src -= 2;
ST4x8_UB(tmp2, tmp3, src, pitch);
src += (8 * pitch);
ST4x8_UB(tmp4, tmp5, src, pitch);
ST_W8(tmp2, tmp3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
ST_W8(tmp4, tmp5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
}
void ff_loop_filter_h_8_8_msa(uint8_t *src, ptrdiff_t pitch,
@ -1313,9 +1310,7 @@ void ff_loop_filter_h_8_8_msa(uint8_t *src, ptrdiff_t pitch,
ILVRL_H2_SH(vec1, vec0, vec2, vec3);
src -= 2;
ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
src += 4 * pitch;
ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
} else {
ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
@ -1343,11 +1338,11 @@ void ff_loop_filter_h_8_8_msa(uint8_t *src, ptrdiff_t pitch,
vec4 = (v8i16) __msa_ilvr_b((v16i8) q2, (v16i8) q1);
src -= 3;
ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
ST2x4_UB(vec4, 0, src + 4, pitch);
ST_W4(vec2, 0, 1, 2, 3, src, pitch);
ST_H4(vec4, 0, 1, 2, 3, src + 4, pitch);
src += (4 * pitch);
ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
ST2x4_UB(vec4, 4, src + 4, pitch);
ST_W4(vec3, 0, 1, 2, 3, src, pitch);
ST_H4(vec4, 4, 5, 6, 7, src + 4, pitch);
}
}
@ -1410,9 +1405,8 @@ void ff_loop_filter_h_88_16_msa(uint8_t *src, ptrdiff_t pitch,
ILVRL_H2_SH(vec1, vec0, vec4, vec5);
src -= 2;
ST4x8_UB(vec2, vec3, src, pitch);
src += 8 * pitch;
ST4x8_UB(vec4, vec5, src, pitch);
ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
} else {
ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
@ -1451,17 +1445,17 @@ void ff_loop_filter_h_88_16_msa(uint8_t *src, ptrdiff_t pitch,
ILVRL_B2_SH(q2, q1, vec2, vec5);
src -= 3;
ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
ST2x4_UB(vec2, 0, src + 4, pitch);
ST_W4(vec3, 0, 1, 2, 3, src, pitch);
ST_H4(vec2, 0, 1, 2, 3, src + 4, pitch);
src += (4 * pitch);
ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
ST2x4_UB(vec2, 4, src + 4, pitch);
ST_W4(vec4, 0, 1, 2, 3, src, pitch);
ST_H4(vec2, 4, 5, 6, 7, src + 4, pitch);
src += (4 * pitch);
ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
ST2x4_UB(vec5, 0, src + 4, pitch);
ST_W4(vec6, 0, 1, 2, 3, src, pitch);
ST_H4(vec5, 0, 1, 2, 3, src + 4, pitch);
src += (4 * pitch);
ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
ST2x4_UB(vec5, 4, src + 4, pitch);
ST_W4(vec7, 0, 1, 2, 3, src, pitch);
ST_H4(vec5, 4, 5, 6, 7, src + 4, pitch);
}
}
@ -1523,9 +1517,8 @@ void ff_loop_filter_h_84_16_msa(uint8_t *src, ptrdiff_t pitch,
ILVRL_H2_SH(vec1, vec0, vec4, vec5);
src -= 2;
ST4x8_UB(vec2, vec3, src, pitch);
src += 8 * pitch;
ST4x8_UB(vec4, vec5, src, pitch);
ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
} else {
ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
@ -1555,17 +1548,17 @@ void ff_loop_filter_h_84_16_msa(uint8_t *src, ptrdiff_t pitch,
ILVRL_B2_SH(q2, q1, vec2, vec5);
src -= 3;
ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
ST2x4_UB(vec2, 0, src + 4, pitch);
ST_W4(vec3, 0, 1, 2, 3, src, pitch);
ST_H4(vec2, 0, 1, 2, 3, src + 4, pitch);
src += (4 * pitch);
ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
ST2x4_UB(vec2, 4, src + 4, pitch);
ST_W4(vec4, 0, 1, 2, 3, src, pitch);
ST_H4(vec2, 4, 5, 6, 7, src + 4, pitch);
src += (4 * pitch);
ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
ST2x4_UB(vec5, 0, src + 4, pitch);
ST_W4(vec6, 0, 1, 2, 3, src, pitch);
ST_H4(vec5, 0, 1, 2, 3, src + 4, pitch);
src += (4 * pitch);
ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
ST2x4_UB(vec5, 4, src + 4, pitch);
ST_W4(vec7, 0, 1, 2, 3, src, pitch);
ST_H4(vec5, 4, 5, 6, 7, src + 4, pitch);
}
}
@ -1627,9 +1620,8 @@ void ff_loop_filter_h_48_16_msa(uint8_t *src, ptrdiff_t pitch,
ILVRL_H2_SH(vec1, vec0, vec4, vec5);
src -= 2;
ST4x8_UB(vec2, vec3, src, pitch);
src += 8 * pitch;
ST4x8_UB(vec4, vec5, src, pitch);
ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
} else {
ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
p0_l);
@ -1661,17 +1653,17 @@ void ff_loop_filter_h_48_16_msa(uint8_t *src, ptrdiff_t pitch,
ILVRL_B2_SH(q2, q1, vec2, vec5);
src -= 3;
ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
ST2x4_UB(vec2, 0, src + 4, pitch);
ST_W4(vec3, 0, 1, 2, 3, src, pitch);
ST_H4(vec2, 0, 1, 2, 3, src + 4, pitch);
src += (4 * pitch);
ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
ST2x4_UB(vec2, 4, src + 4, pitch);
ST_W4(vec4, 0, 1, 2, 3, src, pitch);
ST_H4(vec2, 4, 5, 6, 7, src + 4, pitch);
src += (4 * pitch);
ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
ST2x4_UB(vec5, 0, src + 4, pitch);
ST_W4(vec6, 0, 1, 2, 3, src, pitch);
ST_H4(vec5, 0, 1, 2, 3, src + 4, pitch);
src += (4 * pitch);
ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
ST2x4_UB(vec5, 4, src + 4, pitch);
ST_W4(vec7, 0, 1, 2, 3, src, pitch);
ST_H4(vec5, 4, 5, 6, 7, src + 4, pitch);
}
}
@ -1811,7 +1803,7 @@ static int32_t vp9_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
if (__msa_test_bz_v(flat)) {
ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
ILVRL_H2_SH(vec1, vec0, vec2, vec3);
ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org);
ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, (src_org - 2), pitch_org);
return 1;
} else {
ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
@ -1878,11 +1870,11 @@ static int32_t vp9_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch
vec2 = (v8i16) __msa_ilvr_b((v16i8) q2, (v16i8) q1);
src_org -= 3;
ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
ST2x4_UB(vec2, 0, (src_org + 4), pitch);
ST_W4(vec3, 0, 1, 2, 3, src_org, pitch);
ST_H4(vec2, 0, 1, 2, 3, (src_org + 4), pitch);
src_org += (4 * pitch);
ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
ST2x4_UB(vec2, 4, (src_org + 4), pitch);
ST_W4(vec4, 0, 1, 2, 3, src_org, pitch);
ST_H4(vec2, 4, 5, 6, 7, (src_org + 4), pitch);
return 1;
} else {
@ -1908,7 +1900,7 @@ static int32_t vp9_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch
r_out = __msa_srari_h((v8i16) tmp1_r, 4);
r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
ST8x1_UB(p6, src);
ST_D1(p6, 0, src);
src += 16;
/* p5 */
@ -1920,7 +1912,7 @@ static int32_t vp9_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch
r_out = __msa_srari_h((v8i16) tmp1_r, 4);
r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
ST8x1_UB(p5, src);
ST_D1(p5, 0, src);
src += 16;
/* p4 */
@ -1932,7 +1924,7 @@ static int32_t vp9_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch
r_out = __msa_srari_h((v8i16) tmp1_r, 4);
r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
ST8x1_UB(p4, src);
ST_D1(p4, 0, src);
src += 16;
/* p3 */
@ -1944,7 +1936,7 @@ static int32_t vp9_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch
r_out = __msa_srari_h((v8i16) tmp1_r, 4);
r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
ST8x1_UB(p3, src);
ST_D1(p3, 0, src);
src += 16;
/* p2 */
@ -1957,7 +1949,7 @@ static int32_t vp9_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch
r_out = __msa_srari_h((v8i16) tmp1_r, 4);
r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
ST8x1_UB(filter8, src);
ST_D1(filter8, 0, src);
src += 16;
/* p1 */
@ -1970,7 +1962,7 @@ static int32_t vp9_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch
r_out = __msa_srari_h((v8i16) tmp1_r, 4);
r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
ST8x1_UB(filter8, src);
ST_D1(filter8, 0, src);
src += 16;
/* p0 */
@ -1983,7 +1975,7 @@ static int32_t vp9_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch
r_out = __msa_srari_h((v8i16) tmp1_r, 4);
r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
ST8x1_UB(filter8, src);
ST_D1(filter8, 0, src);
src += 16;
/* q0 */
@ -1996,7 +1988,7 @@ static int32_t vp9_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch
r_out = __msa_srari_h((v8i16) tmp1_r, 4);
r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
ST8x1_UB(filter8, src);
ST_D1(filter8, 0, src);
src += 16;
/* q1 */
@ -2008,7 +2000,7 @@ static int32_t vp9_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch
r_out = __msa_srari_h((v8i16) tmp1_r, 4);
r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
ST8x1_UB(filter8, src);
ST_D1(filter8, 0, src);
src += 16;
/* q2 */
@ -2020,7 +2012,7 @@ static int32_t vp9_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch
r_out = __msa_srari_h((v8i16) tmp1_r, 4);
r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
ST8x1_UB(filter8, src);
ST_D1(filter8, 0, src);
src += 16;
/* q3 */
@ -2031,7 +2023,7 @@ static int32_t vp9_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch
r_out = __msa_srari_h((v8i16) tmp1_r, 4);
r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
ST8x1_UB(q3, src);
ST_D1(q3, 0, src);
src += 16;
/* q4 */
@ -2042,7 +2034,7 @@ static int32_t vp9_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch
r_out = __msa_srari_h((v8i16) tmp1_r, 4);
r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
ST8x1_UB(q4, src);
ST_D1(q4, 0, src);
src += 16;
/* q5 */
@ -2053,7 +2045,7 @@ static int32_t vp9_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch
r_out = __msa_srari_h((v8i16) tmp1_r, 4);
r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
ST8x1_UB(q5, src);
ST_D1(q5, 0, src);
src += 16;
/* q6 */
@ -2064,7 +2056,7 @@ static int32_t vp9_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch
r_out = __msa_srari_h((v8i16) tmp1_r, 4);
r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
ST8x1_UB(q6, src);
ST_D1(q6, 0, src);
return 0;
}
@ -2137,9 +2129,8 @@ static int32_t vp9_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
ILVRL_H2_SH(vec1, vec0, vec4, vec5);
src_org -= 2;
ST4x8_UB(vec2, vec3, src_org, pitch);
src_org += 8 * pitch;
ST4x8_UB(vec4, vec5, src_org, pitch);
ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src_org, pitch);
ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src_org + 8 * pitch, pitch);
return 1;
} else {
@ -2218,17 +2209,17 @@ static int32_t vp9_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitc
ILVRL_B2_SH(q2, q1, vec2, vec5);
src_org -= 3;
ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
ST2x4_UB(vec2, 0, (src_org + 4), pitch);
ST_W4(vec3, 0, 1, 2, 3, src_org, pitch);
ST_H4(vec2, 0, 1, 2, 3, (src_org + 4), pitch);
src_org += (4 * pitch);
ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
ST2x4_UB(vec2, 4, (src_org + 4), pitch);
ST_W4(vec4, 0, 1, 2, 3, src_org, pitch);
ST_H4(vec2, 4, 5, 6, 7, (src_org + 4), pitch);
src_org += (4 * pitch);
ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src_org, pitch);
ST2x4_UB(vec5, 0, (src_org + 4), pitch);
ST_W4(vec6, 0, 1, 2, 3, src_org, pitch);
ST_H4(vec5, 0, 1, 2, 3, (src_org + 4), pitch);
src_org += (4 * pitch);
ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src_org, pitch);
ST2x4_UB(vec5, 4, (src_org + 4), pitch);
ST_W4(vec7, 0, 1, 2, 3, src_org, pitch);
ST_H4(vec5, 4, 5, 6, 7, (src_org + 4), pitch);
return 1;
} else {

View File

@ -153,7 +153,7 @@ static const int8_t vp9_bilinear_filters_msa[15][2] = {
\
PCKEV_B2_UB(in1, in0, in3, in2, tmp0_m, tmp1_m); \
AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m); \
ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \
ST_D4(tmp0_m, tmp1_m, 0, 1, 0, 1, pdst_m, stride); \
}
static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
@ -182,7 +182,7 @@ static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
SRARI_H2_SH(out0, out1, 7);
SAT_SH2_SH(out0, out1, 7);
out = PCKEV_XORI128_UB(out0, out1);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}
static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride,
@ -217,10 +217,9 @@ static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride,
SRARI_H4_SH(out0, out1, out2, out3, 7);
SAT_SH4_SH(out0, out1, out2, out3, 7);
out = PCKEV_XORI128_UB(out0, out1);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
out = PCKEV_XORI128_UB(out2, out3);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
}
static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride,
@ -262,7 +261,7 @@ static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride,
SAT_SH4_SH(out0, out1, out2, out3, 7);
tmp0 = PCKEV_XORI128_UB(out0, out1);
tmp1 = PCKEV_XORI128_UB(out2, out3);
ST8x4_UB(tmp0, tmp1, dst, dst_stride);
ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
}
static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
@ -296,7 +295,7 @@ static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
SAT_SH4_SH(out0, out1, out2, out3, 7);
tmp0 = PCKEV_XORI128_UB(out0, out1);
tmp1 = PCKEV_XORI128_UB(out2, out3);
ST8x4_UB(tmp0, tmp1, dst, dst_stride);
ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
@ -510,7 +509,7 @@ static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride,
SRARI_H2_SH(out10, out32, 7);
SAT_SH2_SH(out10, out32, 7);
out = PCKEV_XORI128_UB(out10, out32);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
src2110 = src6554;
@ -562,7 +561,7 @@ static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride,
SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
ST8x4_UB(tmp0, tmp1, dst, dst_stride);
ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
src10_r = src54_r;
@ -825,7 +824,7 @@ static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride,
SRARI_H2_SH(tmp0, tmp1, 7);
SAT_SH2_SH(tmp0, tmp1, 7);
out = PCKEV_XORI128_UB(tmp0, tmp1);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
hz_out5 = hz_out9;
@ -920,7 +919,7 @@ static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride,
SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
ST8x4_UB(vec0, vec1, dst, dst_stride);
ST_D4(vec0, vec1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
hz_out6 = hz_out10;
@ -1016,7 +1015,7 @@ static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src,
SAT_SH2_SH(res0, res1, 7);
res = PCKEV_XORI128_UB(res0, res1);
res = (v16u8) __msa_aver_u_b(res, dst0);
ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);
ST_W4(res, 0, 1, 2, 3, dst, dst_stride);
}
static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src,
@ -1061,7 +1060,7 @@ static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src,
ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
XORI_B2_128_UB(res0, res2);
AVER_UB2_UB(res0, dst0, res2, dst1, res0, res2);
ST4x8_UB(res0, res2, dst, dst_stride);
ST_W8(res0, res2, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
}
static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src,
@ -1348,7 +1347,7 @@ static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src,
out = PCKEV_XORI128_UB(out10, out32);
out = __msa_aver_u_b(out, dst0);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
src2110 = src6554;
@ -1619,7 +1618,7 @@ static void common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src,
SAT_SH2_SH(res0, res1, 7);
res = PCKEV_XORI128_UB(res0, res1);
res = (v16u8) __msa_aver_u_b(res, dst0);
ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);
ST_W4(res, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
hz_out5 = hz_out9;
@ -1812,7 +1811,8 @@ static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
SRARI_H2_UH(vec2, vec3, 7);
PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
ST_W2(res0, 0, 1, dst, dst_stride);
ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
}
static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
@ -1838,9 +1838,10 @@ static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
res0, res1, res2, res3);
ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
ST_W2(res0, 0, 1, dst, dst_stride);
ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
ST_W2(res2, 0, 1, dst + 4 * dst_stride, dst_stride);
ST_W2(res3, 0, 1, dst + 6 * dst_stride, dst_stride);
}
void ff_put_bilin_4h_msa(uint8_t *dst, ptrdiff_t dst_stride,
@ -1877,7 +1878,7 @@ static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
vec0, vec1, vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
ST8x4_UB(src0, src1, dst, dst_stride);
ST_D4(src0, src1, 0, 1, 0, 1, dst, dst_stride);
}
static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
@ -1906,8 +1907,7 @@ static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
src += (4 * src_stride);
PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride);
dst += (4 * dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
@ -1915,8 +1915,8 @@ static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
vec0, vec1, vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride);
dst += (4 * dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
dst += (8 * dst_stride);
if (16 == height) {
LD_SB4(src, src_stride, src0, src1, src2, src3);
@ -1931,7 +1931,7 @@ static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
src += (4 * src_stride);
PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
@ -1939,7 +1939,7 @@ static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
vec0, vec1, vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
}
}
@ -2137,7 +2137,7 @@ static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
SRARI_H2_UH(tmp0, tmp1, 7);
SAT_UH2_UH(tmp0, tmp1, 7);
src2110 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
ST_W4(src2110, 0, 1, 2, 3, dst, dst_stride);
}
static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
@ -2171,8 +2171,7 @@ static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
ST_W8(src2110, src4332, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
}
void ff_put_bilin_4v_msa(uint8_t *dst, ptrdiff_t dst_stride,
@ -2209,7 +2208,7 @@ static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
}
static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
@ -2243,16 +2242,15 @@ static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride);
dst += (4 * dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
tmp0, tmp1, tmp2, tmp3);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride);
dst += (4 * dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
dst += (8 * dst_stride);
src0 = src8;
}
@ -2514,7 +2512,8 @@ static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride,
SRARI_H2_UH(tmp0, tmp1, 7);
SAT_UH2_UH(tmp0, tmp1, 7);
PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
ST_W2(res0, 0, 1, dst, dst_stride);
ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
}
static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride,
@ -2557,9 +2556,10 @@ static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride,
SAT_UH4_UH(vec4, vec5, vec6, vec7, 7);
PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
res0, res1, res2, res3);
ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
ST_W2(res0, 0, 1, dst, dst_stride);
ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
ST_W2(res2, 0, 1, dst + 4 * dst_stride, dst_stride);
ST_W2(res3, 0, 1, dst + 6 * dst_stride, dst_stride);
}
void ff_put_bilin_4hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
@ -2618,7 +2618,7 @@ static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride,
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
}
static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src, int32_t src_stride,
@ -2674,8 +2674,7 @@ static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src, int32_t src_stride
SRARI_H2_UH(tmp3, tmp4, 7);
SAT_UH2_UH(tmp3, tmp4, 7);
PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride);
dst += (4 * dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
@ -2696,8 +2695,8 @@ static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src, int32_t src_stride
SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, 7);
SAT_UH4_UH(tmp5, tmp6, tmp7, tmp8, 7);
PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride);
dst += (4 * dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
dst += (8 * dst_stride);
}
}
@ -2842,7 +2841,7 @@ static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src,
res = (v16u8) __msa_pckev_b((v16i8) vec3, (v16i8) vec2);
res = (v16u8) __msa_aver_u_b(res, dst0);
ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);
ST_W4(res, 0, 1, 2, 3, dst, dst_stride);
}
static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
@ -2876,7 +2875,7 @@ static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
res2, res3);
ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
AVER_UB2_UB(res0, dst0, res2, dst1, res0, res2);
ST4x8_UB(res0, res2, dst, dst_stride);
ST_W8(res0, res2, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
}
void ff_avg_bilin_4h_msa(uint8_t *dst, ptrdiff_t dst_stride,
@ -3202,7 +3201,7 @@ static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src,
out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
out = __msa_aver_u_b(out, dst0);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}
static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src,
@ -3241,7 +3240,7 @@ static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src,
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332);
ST4x8_UB(src2110, src4332, dst, dst_stride);
ST_W8(src2110, src4332, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
}
void ff_avg_bilin_4v_msa(uint8_t *dst, ptrdiff_t dst_stride,
@ -3620,7 +3619,7 @@ static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(const uint8_t *src,
out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
out = __msa_aver_u_b(out, dst0);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}
static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(const uint8_t *src,
@ -3672,7 +3671,7 @@ static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(const uint8_t *src,
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, res0, res1);
AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
ST4x8_UB(res0, res1, dst, dst_stride);
ST_W8(res0, res1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
}
void ff_avg_bilin_4hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
@ -4071,14 +4070,14 @@ static void avg_width4_msa(const uint8_t *src, int32_t src_stride,
LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
ST4x8_UB(dst0, dst1, dst, dst_stride);
ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
} else if (4 == height) {
LW4(src, src_stride, tp0, tp1, tp2, tp3);
INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
dst0 = __msa_aver_u_b(src0, dst0);
ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
}
}
@ -4109,7 +4108,7 @@ static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
INSERT_D2_UB(tp6, tp7, dst3);
AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0,
dst1, dst2, dst3);
ST8x8_UB(dst0, dst1, dst2, dst3, dst, dst_stride);
ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
dst += 8 * dst_stride;
}
} else if (4 == height) {
@ -4120,7 +4119,7 @@ static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
INSERT_D2_UB(tp0, tp1, dst0);
INSERT_D2_UB(tp2, tp3, dst1);
AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
ST8x4_UB(dst0, dst1, dst, dst_stride);
ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
}
}

View File

@ -349,19 +349,6 @@
}
#define LD_SH16(...) LD_V16(v8i16, __VA_ARGS__)
/* Description : Load as 4x4 block of signed halfword elements from 1D source
data into 4 vectors (Each vector with 4 signed halfwords)
Arguments : Inputs - psrc
Outputs - out0, out1, out2, out3
*/
/* Load a 4x4 block of signed half-words starting at psrc into four vectors.
 * Each output ends up with one 4-element row in its low 64 bits: out0 and
 * out2 are loaded directly (covering rows 0-1 and 2-3), while out1 and out3
 * duplicate the upper double-word of out0/out2 (isolating rows 1 and 3).
 * NOTE(review): assumes psrc points at 16-bit elements so 'psrc + 8'
 * advances by two 4-element rows — confirm against callers. */
#define LD4x4_SH(psrc, out0, out1, out2, out3) \
{ \
out0 = LD_SH(psrc); \
out2 = LD_SH(psrc + 8); \
out1 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out0); \
out3 = (v8i16) __msa_ilvl_d((v2i64) out2, (v2i64) out2); \
}
/* Description : Store vectors with stride
Arguments : Inputs - in0, in1, stride
Outputs - pdst (destination pointer to store to)
@ -405,198 +392,127 @@
#define ST_SH8(...) ST_V8(v8i16, __VA_ARGS__)
#define ST_SW8(...) ST_V8(v4i32, __VA_ARGS__)
/* Description : Store as 2x4 byte block to destination memory from input vector
Arguments : Inputs - in, stidx, pdst, stride
Return Type - unsigned byte
Details : Index stidx halfword element from 'in' vector is copied and
stored on first line
Index stidx+1 halfword element from 'in' vector is copied and
stored on second line
Index stidx+2 halfword element from 'in' vector is copied and
stored on third line
Index stidx+3 halfword element from 'in' vector is copied and
stored on fourth line
*/
#define ST2x4_UB(in, stidx, pdst, stride) \
{ \
uint16_t out0_m, out1_m, out2_m, out3_m; \
uint8_t *pblk_2x4_m = (uint8_t *) (pdst); \
\
out0_m = __msa_copy_u_h((v8i16) in, (stidx)); \
out1_m = __msa_copy_u_h((v8i16) in, (stidx + 1)); \
out2_m = __msa_copy_u_h((v8i16) in, (stidx + 2)); \
out3_m = __msa_copy_u_h((v8i16) in, (stidx + 3)); \
\
SH(out0_m, pblk_2x4_m); \
SH(out1_m, pblk_2x4_m + stride); \
SH(out2_m, pblk_2x4_m + 2 * stride); \
SH(out3_m, pblk_2x4_m + 3 * stride); \
/* Description : Store half word elements of vector with stride
* Arguments : Inputs - in source vector
* - pdst (destination pointer to store to)
* - stride
* Details : Stores half word 'idx0' from 'in' to (pdst)
* Stores half word 'idx1' from 'in' to (pdst + stride)
* Similar for other elements
*/
/* Store the half-word at element index 'idx' of vector 'in' to (pdst). */
#define ST_H1(in, idx, pdst)                 \
{                                            \
    uint16_t h0_m;                           \
    h0_m = __msa_copy_u_h((v8i16) in, idx);  \
    SH(h0_m, (pdst));                        \
}
/* Store half-word elements 'idx0' and 'idx1' of vector 'in' to two
 * stride-separated locations starting at (pdst). */
#define ST_H2(in, idx0, idx1, pdst, stride)   \
{                                             \
    uint16_t h0_m, h1_m;                      \
    h0_m = __msa_copy_u_h((v8i16) in, idx0);  \
    h1_m = __msa_copy_u_h((v8i16) in, idx1);  \
    SH(h0_m, (pdst));                         \
    SH(h1_m, (pdst) + stride);                \
}
/* Store half-word elements 'idx0'..'idx3' of vector 'in' to four
 * stride-separated locations starting at (pdst). */
#define ST_H4(in, idx0, idx1, idx2, idx3, pdst, stride)  \
{                                                        \
    uint16_t h0_m, h1_m, h2_m, h3_m;                     \
    h0_m = __msa_copy_u_h((v8i16) in, idx0);             \
    h1_m = __msa_copy_u_h((v8i16) in, idx1);             \
    h2_m = __msa_copy_u_h((v8i16) in, idx2);             \
    h3_m = __msa_copy_u_h((v8i16) in, idx3);             \
    SH(h0_m, (pdst));                                    \
    SH(h1_m, (pdst) + stride);                           \
    SH(h2_m, (pdst) + 2 * stride);                       \
    SH(h3_m, (pdst) + 3 * stride);                       \
}
/* Store eight half-word elements of 'in' (indices idx0..idx7) to eight
 * consecutive stride-separated rows starting at (pdst); composed from two
 * ST_H4 invocations, the second offset by 4 rows. */
#define ST_H8(in, idx0, idx1, idx2, idx3, idx4, idx5, \
idx6, idx7, pdst, stride) \
{ \
ST_H4(in, idx0, idx1, idx2, idx3, pdst, stride) \
ST_H4(in, idx4, idx5, idx6, idx7, (pdst) + 4*stride, stride) \
}
/* Description : Store as 4x2 byte block to destination memory from input vector
Arguments : Inputs - in, pdst, stride
Return Type - unsigned byte
Details : Index 0 word element from input vector is copied and stored
on first line
Index 1 word element from input vector is copied and stored
on second line
*/
#define ST4x2_UB(in, pdst, stride) \
/* Description : Store word elements of vector with stride
* Arguments : Inputs - in source vector
* - pdst (destination pointer to store to)
* - stride
* Details : Stores word 'idx0' from 'in' to (pdst)
* Stores word 'idx1' from 'in' to (pdst + stride)
* Similar for other elements
*/
/* Store the word at element index 'idx' of vector 'in' to (pdst). */
#define ST_W1(in, idx, pdst)                 \
{                                            \
    uint32_t w0_m;                           \
    w0_m = __msa_copy_u_w((v4i32) in, idx);  \
    SW(w0_m, (pdst));                        \
}
/* Store word elements 'idx0' and 'idx1' of vector 'in' to two
 * stride-separated locations starting at (pdst). */
#define ST_W2(in, idx0, idx1, pdst, stride)   \
{                                             \
    uint32_t w0_m, w1_m;                      \
    w0_m = __msa_copy_u_w((v4i32) in, idx0);  \
    w1_m = __msa_copy_u_w((v4i32) in, idx1);  \
    SW(w0_m, (pdst));                         \
    SW(w1_m, (pdst) + stride);                \
}
/* Store word elements 'idx0'..'idx3' of vector 'in' to four
 * stride-separated locations starting at (pdst). */
#define ST_W4(in, idx0, idx1, idx2, idx3, pdst, stride)  \
{                                                        \
    uint32_t w0_m, w1_m, w2_m, w3_m;                     \
    w0_m = __msa_copy_u_w((v4i32) in, idx0);             \
    w1_m = __msa_copy_u_w((v4i32) in, idx1);             \
    w2_m = __msa_copy_u_w((v4i32) in, idx2);             \
    w3_m = __msa_copy_u_w((v4i32) in, idx3);             \
    SW(w0_m, (pdst));                                    \
    SW(w1_m, (pdst) + stride);                           \
    SW(w2_m, (pdst) + 2 * stride);                       \
    SW(w3_m, (pdst) + 3 * stride);                       \
}
/* Store word elements idx0..idx3 of 'in0' then idx4..idx7 of 'in1' to eight
 * consecutive stride-separated rows starting at (pdst).
 * Fix: parenthesize the 'pdst' macro argument in the offset expression
 * (consistent with ST_H8) so composite pointer expressions expand safely. */
#define ST_W8(in0, in1, idx0, idx1, idx2, idx3,                     \
              idx4, idx5, idx6, idx7, pdst, stride)                 \
{                                                                   \
    ST_W4(in0, idx0, idx1, idx2, idx3, pdst, stride)                \
    ST_W4(in1, idx4, idx5, idx6, idx7, (pdst) + 4 * stride, stride) \
}
/* Description : Store double word elements of vector with stride
* Arguments : Inputs - in source vector
* - pdst (destination pointer to store to)
* - stride
* Details : Stores double word 'idx0' from 'in' to (pdst)
* Stores double word 'idx1' from 'in' to (pdst + stride)
* Similar for other elements
*/
/* Store the double-word at element index 'idx' of vector 'in' to (pdst).
 * Fix: the previous text interleaved leftover lines of the removed
 * ST4x2_UB macro (a conflicting 'uint32_t out0_m' declaration plus stray
 * SW stores) with the new body — a guaranteed compile error and wrong
 * stores.  Only the double-word copy/store belongs here. */
#define ST_D1(in, idx, pdst)                 \
{                                            \
    uint64_t out0_m;                         \
    out0_m = __msa_copy_u_d((v2i64) in, idx); \
    SD(out0_m, (pdst));                      \
}
/* Description : Store as 4x4 byte block to destination memory from input vector
Arguments : Inputs - in0, in1, pdst, stride
Return Type - unsigned byte
Details : Idx0 word element from input vector 'in0' is copied and stored
on first line
Idx1 word element from input vector 'in0' is copied and stored
on second line
Idx2 word element from input vector 'in1' is copied and stored
on third line
Idx3 word element from input vector 'in1' is copied and stored
on fourth line
*/
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \
{ \
uint32_t out0_m, out1_m, out2_m, out3_m; \
uint8_t *pblk_4x4_m = (uint8_t *) (pdst); \
\
out0_m = __msa_copy_u_w((v4i32) in0, idx0); \
out1_m = __msa_copy_u_w((v4i32) in0, idx1); \
out2_m = __msa_copy_u_w((v4i32) in1, idx2); \
out3_m = __msa_copy_u_w((v4i32) in1, idx3); \
\
SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \
}
#define ST4x8_UB(in0, in1, pdst, stride) \
{ \
uint8_t *pblk_4x8 = (uint8_t *) (pdst); \
\
ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride); \
ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \
}
/* Description : Store as 6x4 byte block to destination memory from input
vectors
Arguments : Inputs - in0, in1, pdst, stride
Return Type - unsigned byte
Details : Index 0 word element from input vector 'in0' is copied and
stored on first line followed by index 2 halfword element
Index 2 word element from input vector 'in0' is copied and
stored on second line followed by index 2 halfword element
Index 0 word element from input vector 'in1' is copied and
stored on third line followed by index 2 halfword element
Index 2 word element from input vector 'in1' is copied and
stored on fourth line followed by index 2 halfword element
*/
#define ST6x4_UB(in0, in1, pdst, stride) \
{ \
uint32_t out0_m, out1_m, out2_m, out3_m; \
uint16_t out4_m, out5_m, out6_m, out7_m; \
uint8_t *pblk_6x4_m = (uint8_t *) (pdst); \
\
out0_m = __msa_copy_u_w((v4i32) in0, 0); \
out1_m = __msa_copy_u_w((v4i32) in0, 2); \
out2_m = __msa_copy_u_w((v4i32) in1, 0); \
out3_m = __msa_copy_u_w((v4i32) in1, 2); \
\
out4_m = __msa_copy_u_h((v8i16) in0, 2); \
out5_m = __msa_copy_u_h((v8i16) in0, 6); \
out6_m = __msa_copy_u_h((v8i16) in1, 2); \
out7_m = __msa_copy_u_h((v8i16) in1, 6); \
\
SW(out0_m, pblk_6x4_m); \
SH(out4_m, (pblk_6x4_m + 4)); \
pblk_6x4_m += stride; \
SW(out1_m, pblk_6x4_m); \
SH(out5_m, (pblk_6x4_m + 4)); \
pblk_6x4_m += stride; \
SW(out2_m, pblk_6x4_m); \
SH(out6_m, (pblk_6x4_m + 4)); \
pblk_6x4_m += stride; \
SW(out3_m, pblk_6x4_m); \
SH(out7_m, (pblk_6x4_m + 4)); \
}
/* Description : Store as 8x1 byte block to destination memory from input vector
Arguments : Inputs - in, pdst
Details : Index 0 double word element from input vector 'in' is copied
and stored to destination memory at (pdst)
*/
#define ST8x1_UB(in, pdst) \
{ \
uint64_t out0_m; \
out0_m = __msa_copy_u_d((v2i64) in, 0); \
SD(out0_m, pdst); \
}
/* Description : Store as 8x2 byte block to destination memory from input vector
Arguments : Inputs - in, pdst, stride
Details : Index 0 double word element from input vector 'in' is copied
and stored to destination memory at (pdst)
Index 1 double word element from input vector 'in' is copied
and stored to destination memory at (pdst + stride)
*/
#define ST8x2_UB(in, pdst, stride) \
/* Store double-word elements 'idx0' and 'idx1' of vector 'in' to two
 * stride-separated locations starting at (pdst).
 * Fix: the previous text interleaved leftover lines of the removed
 * ST8x2_UB macro (the 'pblk_8x2_m' pointer, hard-coded index 0/1 copies
 * and duplicate SD stores) with the new body; keep only the
 * idx-parameterized copies and stores. */
#define ST_D2(in, idx0, idx1, pdst, stride)    \
{                                              \
    uint64_t out0_m, out1_m;                   \
    out0_m = __msa_copy_u_d((v2i64) in, idx0); \
    out1_m = __msa_copy_u_d((v2i64) in, idx1); \
    SD(out0_m, (pdst));                        \
    SD(out1_m, (pdst) + stride);               \
}
/* Description : Store as 8x4 byte block to destination memory from input
vectors
Arguments : Inputs - in0, in1, pdst, stride
Details : Index 0 double word element from input vector 'in0' is copied
and stored to destination memory at (pblk_8x4_m)
Index 1 double word element from input vector 'in0' is copied
and stored to destination memory at (pblk_8x4_m + stride)
Index 0 double word element from input vector 'in1' is copied
and stored to destination memory at (pblk_8x4_m + 2 * stride)
Index 1 double word element from input vector 'in1' is copied
and stored to destination memory at (pblk_8x4_m + 3 * stride)
*/
#define ST8x4_UB(in0, in1, pdst, stride) \
/* Store double-word elements 'idx0'/'idx1' of 'in0' and 'idx2'/'idx3' of
 * 'in1' to four stride-separated locations starting at (pdst).
 * Fix: the previous text interleaved leftover lines of the removed
 * ST8x4_UB macro (the 'pblk_8x4_m' pointer, hard-coded index copies and
 * the old SD4 store) with the new body — duplicate assignments and a
 * stray identifier.  Keep only the idx-parameterized version. */
#define ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \
{                                                   \
    uint64_t out0_m, out1_m, out2_m, out3_m;        \
    out0_m = __msa_copy_u_d((v2i64) in0, idx0);     \
    out1_m = __msa_copy_u_d((v2i64) in0, idx1);     \
    out2_m = __msa_copy_u_d((v2i64) in1, idx2);     \
    out3_m = __msa_copy_u_d((v2i64) in1, idx3);     \
    SD(out0_m, (pdst));                             \
    SD(out1_m, (pdst) + stride);                    \
    SD(out2_m, (pdst) + 2 * stride);                \
    SD(out3_m, (pdst) + 3 * stride);                \
}
#define ST8x8_UB(in0, in1, in2, in3, pdst, stride) \
{ \
uint8_t *pblk_8x8_m = (uint8_t *) (pdst); \
\
ST8x4_UB(in0, in1, pblk_8x8_m, stride); \
ST8x4_UB(in2, in3, pblk_8x8_m + 4 * stride, stride); \
}
#define ST12x4_UB(in0, in1, in2, pdst, stride) \
{ \
uint8_t *pblk_12x4_m = (uint8_t *) (pdst); \
\
/* left 8x4 */ \
ST8x4_UB(in0, in1, pblk_12x4_m, stride); \
/* right 4x4 */ \
ST4x4_UB(in2, in2, 0, 1, 2, 3, pblk_12x4_m + 8, stride); \
/* Store double-word elements idx0..idx3 of 'in0'/'in1' then idx4..idx7 of
 * 'in2'/'in3' to eight consecutive stride-separated rows starting at (pdst).
 * Fix: parenthesize the 'pdst' macro argument in the offset expression
 * (consistent with ST_H8) so composite pointer expressions expand safely. */
#define ST_D8(in0, in1, in2, in3, idx0, idx1, idx2, idx3,               \
              idx4, idx5, idx6, idx7, pdst, stride)                     \
{                                                                       \
    ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)               \
    ST_D4(in2, in3, idx4, idx5, idx6, idx7, (pdst) + 4 * stride, stride) \
}
/* Description : Store as 12x8 byte block to destination memory from
@ -2926,7 +2842,7 @@
tmp0_m = PCKEV_XORI128_UB(in0, in1); \
tmp1_m = PCKEV_XORI128_UB(in2, in3); \
AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m); \
ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \
ST_D4(tmp0_m, tmp1_m, 0, 1, 0, 1, pdst_m, stride); \
}
/* Description : Pack even byte elements, extract 0 & 2 index words from pair