/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10 
11 #include "./vp8_rtcd.h"
12 #include "vp8/common/mips/msa/vp8_macros_msa.h"
13 #include "vp8/encoder/block.h"
14 
fast_quantize_b_msa(int16_t * coeff_ptr,int16_t * round,int16_t * quant,int16_t * de_quant,int16_t * q_coeff,int16_t * dq_coeff)15 static int8_t fast_quantize_b_msa(int16_t *coeff_ptr, int16_t *round,
16                                   int16_t *quant, int16_t *de_quant,
17                                   int16_t *q_coeff, int16_t *dq_coeff) {
18   int32_t cnt, eob;
19   v16i8 inv_zig_zag = { 0, 1, 5, 6, 2, 4, 7, 12, 3, 8, 11, 13, 9, 10, 14, 15 };
20   v8i16 round0, round1;
21   v8i16 sign_z0, sign_z1;
22   v8i16 q_coeff0, q_coeff1;
23   v8i16 x0, x1, de_quant0, de_quant1;
24   v8i16 coeff0, coeff1, z0, z1;
25   v8i16 quant0, quant1, quant2, quant3;
26   v8i16 zero = { 0 };
27   v8i16 inv_zig_zag0, inv_zig_zag1;
28   v8i16 zigzag_mask0 = { 0, 1, 4, 8, 5, 2, 3, 6 };
29   v8i16 zigzag_mask1 = { 9, 12, 13, 10, 7, 11, 14, 15 };
30   v8i16 temp0_h, temp1_h, temp2_h, temp3_h;
31   v4i32 temp0_w, temp1_w, temp2_w, temp3_w;
32 
33   ILVRL_B2_SH(zero, inv_zig_zag, inv_zig_zag0, inv_zig_zag1);
34   eob = -1;
35   LD_SH2(coeff_ptr, 8, coeff0, coeff1);
36   VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, z0,
37              z1);
38   LD_SH2(round, 8, coeff0, coeff1);
39   VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, round0,
40              round1);
41   LD_SH2(quant, 8, coeff0, coeff1);
42   VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, quant0,
43              quant2);
44   sign_z0 = z0 >> 15;
45   sign_z1 = z1 >> 15;
46   x0 = __msa_add_a_h(z0, zero);
47   x1 = __msa_add_a_h(z1, zero);
48   ILVL_H2_SH(quant0, quant0, quant2, quant2, quant1, quant3);
49   ILVR_H2_SH(quant0, quant0, quant2, quant2, quant0, quant2);
50   ILVL_H2_SH(round0, x0, round1, x1, temp1_h, temp3_h);
51   ILVR_H2_SH(round0, x0, round1, x1, temp0_h, temp2_h);
52   DOTP_SH4_SW(temp0_h, temp1_h, temp2_h, temp3_h, quant0, quant1, quant2,
53               quant3, temp0_w, temp1_w, temp2_w, temp3_w);
54   SRA_4V(temp0_w, temp1_w, temp2_w, temp3_w, 16);
55   PCKEV_H2_SH(temp1_w, temp0_w, temp3_w, temp2_w, x0, x1);
56   x0 = x0 ^ sign_z0;
57   x1 = x1 ^ sign_z1;
58   SUB2(x0, sign_z0, x1, sign_z1, x0, x1);
59   VSHF_H2_SH(x0, x1, x0, x1, inv_zig_zag0, inv_zig_zag1, q_coeff0, q_coeff1);
60   ST_SH2(q_coeff0, q_coeff1, q_coeff, 8);
61   LD_SH2(de_quant, 8, de_quant0, de_quant1);
62   q_coeff0 *= de_quant0;
63   q_coeff1 *= de_quant1;
64   ST_SH2(q_coeff0, q_coeff1, dq_coeff, 8);
65 
66   for (cnt = 0; cnt < 16; ++cnt) {
67     if ((cnt <= 7) && (x1[7 - cnt] != 0)) {
68       eob = (15 - cnt);
69       break;
70     }
71 
72     if ((cnt > 7) && (x0[7 - (cnt - 8)] != 0)) {
73       eob = (7 - (cnt - 8));
74       break;
75     }
76   }
77 
78   return (int8_t)(eob + 1);
79 }
80 
exact_regular_quantize_b_msa(int16_t * zbin_boost,int16_t * coeff_ptr,int16_t * zbin,int16_t * round,int16_t * quant,int16_t * quant_shift,int16_t * de_quant,int16_t zbin_oq_in,int16_t * q_coeff,int16_t * dq_coeff)81 static int8_t exact_regular_quantize_b_msa(
82     int16_t *zbin_boost, int16_t *coeff_ptr, int16_t *zbin, int16_t *round,
83     int16_t *quant, int16_t *quant_shift, int16_t *de_quant, int16_t zbin_oq_in,
84     int16_t *q_coeff, int16_t *dq_coeff) {
85   int32_t cnt, eob;
86   int16_t *boost_temp = zbin_boost;
87   v16i8 inv_zig_zag = { 0, 1, 5, 6, 2, 4, 7, 12, 3, 8, 11, 13, 9, 10, 14, 15 };
88   v8i16 round0, round1;
89   v8i16 sign_z0, sign_z1;
90   v8i16 q_coeff0, q_coeff1;
91   v8i16 z_bin0, z_bin1, zbin_o_q;
92   v8i16 x0, x1, sign_x0, sign_x1, de_quant0, de_quant1;
93   v8i16 coeff0, coeff1, z0, z1;
94   v8i16 quant0, quant1, quant2, quant3;
95   v8i16 zero = { 0 };
96   v8i16 inv_zig_zag0, inv_zig_zag1;
97   v8i16 zigzag_mask0 = { 0, 1, 4, 8, 5, 2, 3, 6 };
98   v8i16 zigzag_mask1 = { 9, 12, 13, 10, 7, 11, 14, 15 };
99   v8i16 temp0_h, temp1_h, temp2_h, temp3_h;
100   v4i32 temp0_w, temp1_w, temp2_w, temp3_w;
101 
102   ILVRL_B2_SH(zero, inv_zig_zag, inv_zig_zag0, inv_zig_zag1);
103   zbin_o_q = __msa_fill_h(zbin_oq_in);
104   eob = -1;
105   LD_SH2(coeff_ptr, 8, coeff0, coeff1);
106   VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, z0,
107              z1);
108   LD_SH2(round, 8, coeff0, coeff1);
109   VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, round0,
110              round1);
111   LD_SH2(quant, 8, coeff0, coeff1);
112   VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, quant0,
113              quant2);
114   LD_SH2(zbin, 8, coeff0, coeff1);
115   VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, z_bin0,
116              z_bin1);
117   sign_z0 = z0 >> 15;
118   sign_z1 = z1 >> 15;
119   x0 = __msa_add_a_h(z0, zero);
120   x1 = __msa_add_a_h(z1, zero);
121   SUB2(x0, z_bin0, x1, z_bin1, z_bin0, z_bin1);
122   SUB2(z_bin0, zbin_o_q, z_bin1, zbin_o_q, z_bin0, z_bin1);
123   ILVL_H2_SH(quant0, quant0, quant2, quant2, quant1, quant3);
124   ILVR_H2_SH(quant0, quant0, quant2, quant2, quant0, quant2);
125   ILVL_H2_SH(round0, x0, round1, x1, temp1_h, temp3_h);
126   ILVR_H2_SH(round0, x0, round1, x1, temp0_h, temp2_h);
127   DOTP_SH4_SW(temp0_h, temp1_h, temp2_h, temp3_h, quant0, quant1, quant2,
128               quant3, temp0_w, temp1_w, temp2_w, temp3_w);
129   SRA_4V(temp0_w, temp1_w, temp2_w, temp3_w, 16);
130   PCKEV_H2_SH(temp1_w, temp0_w, temp3_w, temp2_w, temp0_h, temp2_h);
131   LD_SH2(quant_shift, 8, coeff0, coeff1);
132   VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, quant0,
133              quant2);
134   ILVL_H2_SH(quant0, quant0, quant2, quant2, quant1, quant3);
135   ILVR_H2_SH(quant0, quant0, quant2, quant2, quant0, quant2);
136   ADD2(x0, round0, x1, round1, x0, x1);
137   ILVL_H2_SH(temp0_h, x0, temp2_h, x1, temp1_h, temp3_h);
138   ILVR_H2_SH(temp0_h, x0, temp2_h, x1, temp0_h, temp2_h);
139   DOTP_SH4_SW(temp0_h, temp1_h, temp2_h, temp3_h, quant0, quant1, quant2,
140               quant3, temp0_w, temp1_w, temp2_w, temp3_w);
141   SRA_4V(temp0_w, temp1_w, temp2_w, temp3_w, 16);
142   PCKEV_H2_SH(temp1_w, temp0_w, temp3_w, temp2_w, x0, x1);
143   sign_x0 = x0 ^ sign_z0;
144   sign_x1 = x1 ^ sign_z1;
145   SUB2(sign_x0, sign_z0, sign_x1, sign_z1, sign_x0, sign_x1);
146   for (cnt = 0; cnt < 16; ++cnt) {
147     if (cnt <= 7) {
148       if (boost_temp[0] <= z_bin0[cnt]) {
149         if (x0[cnt]) {
150           eob = cnt;
151           boost_temp = zbin_boost;
152         } else {
153           boost_temp++;
154         }
155       } else {
156         sign_x0[cnt] = 0;
157         boost_temp++;
158       }
159     } else {
160       if (boost_temp[0] <= z_bin1[cnt - 8]) {
161         if (x1[cnt - 8]) {
162           eob = cnt;
163           boost_temp = zbin_boost;
164         } else {
165           boost_temp++;
166         }
167       } else {
168         sign_x1[cnt - 8] = 0;
169         boost_temp++;
170       }
171     }
172   }
173 
174   VSHF_H2_SH(sign_x0, sign_x1, sign_x0, sign_x1, inv_zig_zag0, inv_zig_zag1,
175              q_coeff0, q_coeff1);
176   ST_SH2(q_coeff0, q_coeff1, q_coeff, 8);
177   LD_SH2(de_quant, 8, de_quant0, de_quant1);
178   MUL2(de_quant0, q_coeff0, de_quant1, q_coeff1, de_quant0, de_quant1);
179   ST_SH2(de_quant0, de_quant1, dq_coeff, 8);
180 
181   return (int8_t)(eob + 1);
182 }
183 
/*
 * MSA entry point for the fast quantizer.
 *
 * Reads the coefficients, rounding values and fast quantizer table from b,
 * reads the dequantizer table from d, writes the quantized and dequantized
 * coefficients into d->qcoeff and d->dqcoeff, and stores the value returned
 * by fast_quantize_b_msa() in *d->eob.
 */
void vp8_fast_quantize_b_msa(BLOCK *b, BLOCKD *d) {
  /* Argument order: coeff, round, quant, de_quant, q_coeff, dq_coeff. */
  *d->eob = fast_quantize_b_msa(b->coeff, b->round, b->quant_fast, d->dequant,
                                d->qcoeff, d->dqcoeff);
}
195 
/*
 * MSA entry point for the regular quantizer.
 *
 * Reads the zero-run boost table, coefficients, zero-bin, rounding, quantizer
 * and quantizer-shift tables plus the extra zero-bin amount from b, reads the
 * dequantizer table from d, writes the quantized and dequantized coefficients
 * into d->qcoeff and d->dqcoeff, and stores the value returned by
 * exact_regular_quantize_b_msa() in *d->eob.
 */
void vp8_regular_quantize_b_msa(BLOCK *b, BLOCKD *d) {
  /* Argument order: zbin_boost, coeff, zbin, round, quant, quant_shift,
   * de_quant, zbin_oq_in, q_coeff, dq_coeff. */
  *d->eob = exact_regular_quantize_b_msa(
      b->zrun_zbin_boost, b->coeff, b->zbin, b->round, b->quant,
      b->quant_shift, d->dequant, b->zbin_extra, d->qcoeff, d->dqcoeff);
}
212