lavc/h264dsp: stick R-V V (RISC-V Vector) weight functions to 16-bit precision

T-Head C908 (ns):
h264_weight2_8_c:        1607.8
h264_weight2_8_rvv_i32:   515.0 (before)
h264_weight2_8_rvv_i32:   348.5 (after)
h264_weight4_8_c:        2255.8
h264_weight4_8_rvv_i32:  1015.0 (before)
h264_weight4_8_rvv_i32:   691.0 (after)
h264_weight8_8_c:        3857.5
h264_weight8_8_rvv_i32:  2218.8 (before)
h264_weight8_8_rvv_i32:  1561.3 (after)
h264_weight16_8_c:       7431.5
h264_weight16_8_rvv_i32: 2737.3 (before)
h264_weight16_8_rvv_i32: 1848.3 (after)

SpacemiT X60 (ns):
h264_weight2_8_c:        1624.1
h264_weight2_8_rvv_i32:   352.6 (before)
h264_weight2_8_rvv_i32:   259.3 (after)
h264_weight4_8_c:        2259.3
h264_weight4_8_rvv_i32:   685.8 (before)
h264_weight4_8_rvv_i32:   530.3 (after)
h264_weight8_8_c:        4103.3
h264_weight8_8_rvv_i32:  1581.8 (before)
h264_weight8_8_rvv_i32:  1238.6 (after)
h264_weight16_8_c:       7624.3
h264_weight16_8_rvv_i32: 2738.1 (before)
h264_weight16_8_rvv_i32: 1853.3 (after)
This commit is contained in:
Rémi Denis-Courmont 2024-07-30 20:29:02 +03:00
parent afd45c7ff7
commit 677f28b310

View File

@ -32,17 +32,15 @@ func ff_h264_weight_pixels_simple_8_rvv, zve32x
csrwi vxrm, 0 csrwi vxrm, 0
sll a5, a5, a3 sll a5, a5, a3
1: 1:
vsetvli zero, a6, e32, m4, ta, ma vsetvli zero, a6, e16, m2, ta, ma
vle8.v v8, (a0) vle8.v v8, (a0)
addi a2, a2, -1 addi a2, a2, -1
vmv.v.x v16, a5
vsetvli zero, zero, e16, m2, ta, ma
vzext.vf2 v24, v8 vzext.vf2 v24, v8
vwmaccsu.vx v16, a4, v24 vmul.vx v16, v24, a4
vnclip.wx v16, v16, a3 vsadd.vx v16, v16, a5
vmax.vx v16, v16, zero vmax.vx v16, v16, zero
vsetvli zero, zero, e8, m1, ta, ma vsetvli zero, zero, e8, m1, ta, ma
vnclipu.wi v8, v16, 0 vnclipu.wx v8, v16, a3
vse8.v v8, (a0) vse8.v v8, (a0)
add a0, a0, a1 add a0, a0, a1
bnez a2, 1b bnez a2, 1b
@ -85,23 +83,20 @@ func ff_h264_weight_pixels_8_rvv, zve32x
mv t0, a0 mv t0, a0
mv t6, a6 mv t6, a6
2: 2:
vsetvli t2, a2, e32, m8, ta, ma vsetvli t2, a2, e16, m8, ta, ma
vlsseg2e8.v v0, (t0), a1 vlsseg2e8.v v0, (t0), a1
addi t6, t6, -2 addi t6, t6, -2
vmv.v.x v16, a5 vzext.vf2 v16, v0
vmv.v.x v24, a5 vzext.vf2 v24, v4
vsetvli zero, zero, e16, m4, ta, ma vmul.vx v16, v16, a4
vzext.vf2 v8, v0 vmul.vx v24, v24, a4
vzext.vf2 v12, v2 vsadd.vx v16, v16, a5
vwmaccsu.vx v16, a4, v8 vsadd.vx v24, v24, a5
vwmaccsu.vx v24, a4, v12 vmax.vx v16, v16, zero
vnclip.wx v8, v16, a3 vmax.vx v24, v24, zero
vnclip.wx v12, v24, a3 vsetvli zero, zero, e8, m4, ta, ma
vmax.vx v8, v8, zero vnclipu.wx v0, v16, a3
vmax.vx v12, v12, zero vnclipu.wx v4, v24, a3
vsetvli zero, zero, e8, m2, ta, ma
vnclipu.wi v0, v8, 0
vnclipu.wi v2, v12, 0
vssseg2e8.v v0, (t0), a1 vssseg2e8.v v0, (t0), a1
addi t0, t0, 2 addi t0, t0, 2
bnez t6, 2b bnez t6, 2b