1 /*
2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "./vp8_rtcd.h"
12 #include "vp8/common/blockd.h"
13 #include "vp8/common/mips/msa/vp8_macros_msa.h"
14 
/* Fixed-point multipliers for the inverse DCT below. Products formed with
 * them are shifted right by 16 (see VP8_IDCT_1D_W). Going by the names they
 * approximate sqrt(2) * cos(pi / 8) - 1 and sqrt(2) * sin(pi / 8) at that
 * scale.
 * Note that sinpi8sqrt2 is larger than INT16_MAX, so it cannot be held in a
 * signed 16-bit lane. */
static const int32_t cospi8sqrt2minus1 = 20091;
static const int32_t sinpi8sqrt2 = 35468;
17 
/* Runs TRANSPOSE8X4_SH_SH on the four v8i16 inputs and then regroups the
 * 64-bit halves of its results: out0/out2 are built with ILVR_D2_SH and
 * out1/out3 with __msa_ilvl_d from the same intermediate pairs.
 *
 * All intermediates are computed before any output is written, so the
 * output arguments may name the same variables as the inputs.
 *
 * Wrapped in do { } while (0) so that `TRANSPOSE_TWO_4x4_H(...);` is a single
 * statement and stays valid inside an unbraced if/else. */
#define TRANSPOSE_TWO_4x4_H(in0, in1, in2, in3, out0, out1, out2, out3)  \
do {                                                                     \
    v8i16 s4_m, s5_m, s6_m, s7_m;                                        \
                                                                         \
    TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, s4_m, s5_m, s6_m, s7_m);      \
    ILVR_D2_SH(s6_m, s4_m, s7_m, s5_m, out0, out2);                      \
    out1 = (v8i16)__msa_ilvl_d((v2i64)s6_m, (v2i64)s4_m);                \
    out3 = (v8i16)__msa_ilvl_d((v2i64)s7_m, (v2i64)s5_m);                \
} while (0)
27 
/* Statement expression yielding a v8i16: every halfword of `in` multiplied by
 * sinpi8sqrt2 in 32-bit lanes, shifted right by 16, and packed back to
 * halfwords with __msa_pckev_h.
 *
 * The widening step interleaves `in` with a zero vector and then shifts each
 * word right arithmetically by 16; presumably the interleave leaves the
 * source halfword in the upper half of each word, so the shift sign-extends
 * it (confirm against ILVRL_H2_SW). */
#define EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in)             \
({                                                                \
    v8i16 zero_h_m = { 0 };                                       \
    v4i32 scale_w_m = __msa_fill_w(sinpi8sqrt2);                  \
    v4i32 prod_r_m, prod_l_m;                                     \
    v8i16 packed_m;                                               \
                                                                  \
    ILVRL_H2_SW(in, zero_h_m, prod_r_m, prod_l_m);                \
    prod_r_m = ((prod_r_m >> 16) * scale_w_m) >> 16;              \
    prod_l_m = ((prod_l_m >> 16) * scale_w_m) >> 16;              \
    packed_m = __msa_pckev_h((v8i16)prod_l_m, (v8i16)prod_r_m);   \
                                                                  \
    packed_m;                                                     \
})
44 
/* One 1-D pass of the VP8 inverse DCT on v8i16 (16-bit lane) vectors.
 *
 *   a1 = in0 + in2
 *   b1 = in0 - in2
 *   c1 = in1 * sinpi8sqrt2  -  (in3 + in3 * cospi8sqrt2minus1)
 *   d1 = (in1 + in1 * cospi8sqrt2minus1)  +  in3 * sinpi8sqrt2
 *
 * and the four outputs are produced from (a1, b1, c1, d1) by BUTTERFLY_4.
 *
 * The cospi8sqrt2minus1 products use __msa_mul_q_h followed by a further
 * >> 1. The sinpi8sqrt2 products go through
 * EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W, which multiplies in 32-bit
 * lanes (35468 does not fit in a signed 16-bit lane).
 *
 * Wrapped in do { } while (0) so that `VP8_IDCT_1D_H(...);` is a single
 * statement and stays valid inside an unbraced if/else. */
#define VP8_IDCT_1D_H(in0, in1, in2, in3, out0, out1, out2, out3)  \
do {                                                               \
    v8i16 a1_m, b1_m, c1_m, d1_m;                                  \
    v8i16 c_tmp1_m, c_tmp2_m, d_tmp1_m, d_tmp2_m;                  \
    v8i16 const_cospi8sqrt2minus1_m;                               \
                                                                   \
    const_cospi8sqrt2minus1_m = __msa_fill_h(cospi8sqrt2minus1);   \
    a1_m = in0 + in2;                                              \
    b1_m = in0 - in2;                                              \
    c_tmp1_m = EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in1);     \
    c_tmp2_m = __msa_mul_q_h(in3, const_cospi8sqrt2minus1_m);      \
    c_tmp2_m = c_tmp2_m >> 1;                                      \
    c_tmp2_m = in3 + c_tmp2_m;                                     \
    c1_m = c_tmp1_m - c_tmp2_m;                                    \
    d_tmp1_m = __msa_mul_q_h(in1, const_cospi8sqrt2minus1_m);      \
    d_tmp1_m = d_tmp1_m >> 1;                                      \
    d_tmp1_m = in1 + d_tmp1_m;                                     \
    d_tmp2_m = EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in3);     \
    d1_m = d_tmp1_m + d_tmp2_m;                                    \
    BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3);   \
} while (0)
66 
/* One 1-D pass of the VP8 inverse DCT on v4i32 (32-bit lane) vectors.
 *
 *   a1 = in0 + in2
 *   b1 = in0 - in2
 *   c1 = ((in1 * sinpi8sqrt2) >> 16)
 *        - (in3 + ((in3 * cospi8sqrt2minus1) >> 16))
 *   d1 = (in1 + ((in1 * cospi8sqrt2minus1) >> 16))
 *        + ((in3 * sinpi8sqrt2) >> 16)
 *
 * and the four outputs are produced from (a1, b1, c1, d1) by BUTTERFLY_4.
 *
 * Wrapped in do { } while (0) so that `VP8_IDCT_1D_W(...);` is a single
 * statement and stays valid inside an unbraced if/else. */
#define VP8_IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3)  \
do {                                                               \
    v4i32 a1_m, b1_m, c1_m, d1_m;                                  \
    v4i32 c_tmp1_m, c_tmp2_m, d_tmp1_m, d_tmp2_m;                  \
    v4i32 const_cospi8sqrt2minus1_m, sinpi8_sqrt2_m;               \
                                                                   \
    const_cospi8sqrt2minus1_m = __msa_fill_w(cospi8sqrt2minus1);   \
    sinpi8_sqrt2_m = __msa_fill_w(sinpi8sqrt2);                    \
    a1_m = in0 + in2;                                              \
    b1_m = in0 - in2;                                              \
    c_tmp1_m = (in1 * sinpi8_sqrt2_m) >> 16;                       \
    c_tmp2_m = in3 + ((in3 * const_cospi8sqrt2minus1_m) >> 16);    \
    c1_m = c_tmp1_m - c_tmp2_m;                                    \
    d_tmp1_m = in1 + ((in1 * const_cospi8sqrt2minus1_m) >> 16);    \
    d_tmp2_m = (in3 * sinpi8_sqrt2_m) >> 16;                       \
    d1_m = d_tmp1_m + d_tmp2_m;                                    \
    BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3);   \
} while (0)
85 
idct4x4_addblk_msa(int16_t * input,uint8_t * pred,int32_t pred_stride,uint8_t * dest,int32_t dest_stride)86 static void idct4x4_addblk_msa(int16_t *input, uint8_t *pred,
87                                int32_t pred_stride,
88                                uint8_t *dest, int32_t dest_stride)
89 {
90     v8i16 input0, input1;
91     v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
92     v4i32 res0, res1, res2, res3;
93     v16i8 zero = { 0 };
94     v16i8 pred0, pred1, pred2, pred3, dest0, dest1, dest2, dest3;
95     v16i8 mask = { 0, 4, 8, 12, 20, 21, 22, 23, 24,
96                    25, 26, 27, 28, 29, 30, 31 };
97 
98     LD_SH2(input, 8, input0, input1);
99     UNPCK_SH_SW(input0, in0, in1);
100     UNPCK_SH_SW(input1, in2, in3);
101     VP8_IDCT_1D_W(in0, in1, in2, in3, hz0, hz1, hz2, hz3);
102     TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
103     VP8_IDCT_1D_W(hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3);
104     SRARI_W4_SW(vt0, vt1, vt2, vt3, 3);
105     TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
106     LD_SB4(pred, pred_stride, pred0, pred1, pred2, pred3);
107     ILVR_B4_SW(zero, pred0, zero, pred1, zero, pred2, zero, pred3, res0, res1,
108                res2, res3);
109     ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3, res0, res1,
110                res2, res3);
111     ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
112     res0 = CLIP_SW_0_255(res0);
113     res1 = CLIP_SW_0_255(res1);
114     res2 = CLIP_SW_0_255(res2);
115     res3 = CLIP_SW_0_255(res3);
116     LD_SB4(dest, dest_stride, dest0, dest1, dest2, dest3);
117     VSHF_B2_SB(res0, dest0, res1, dest1, mask, mask, dest0, dest1);
118     VSHF_B2_SB(res2, dest2, res3, dest3, mask, mask, dest2, dest3);
119     ST_SB4(dest0, dest1, dest2, dest3, dest, dest_stride);
120 }
121 
idct4x4_addconst_msa(int16_t in_dc,uint8_t * pred,int32_t pred_stride,uint8_t * dest,int32_t dest_stride)122 static void idct4x4_addconst_msa(int16_t in_dc, uint8_t *pred,
123                                  int32_t pred_stride,
124                                  uint8_t *dest, int32_t dest_stride)
125 {
126     v8i16 vec;
127     v8i16 res0, res1, res2, res3;
128     v16i8 zero = { 0 };
129     v16i8 pred0, pred1, pred2, pred3, dest0, dest1, dest2, dest3;
130     v16i8 mask = { 0, 2, 4, 6, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 };
131 
132     vec = __msa_fill_h(in_dc);
133     vec = __msa_srari_h(vec, 3);
134     LD_SB4(pred, pred_stride, pred0, pred1, pred2, pred3);
135     ILVR_B4_SH(zero, pred0, zero, pred1, zero, pred2, zero, pred3, res0, res1,
136                res2, res3);
137     ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3);
138     CLIP_SH4_0_255(res0, res1, res2, res3);
139     LD_SB4(dest, dest_stride, dest0, dest1, dest2, dest3);
140     VSHF_B2_SB(res0, dest0, res1, dest1, mask, mask, dest0, dest1);
141     VSHF_B2_SB(res2, dest2, res3, dest3, mask, mask, dest2, dest3);
142     ST_SB4(dest0, dest1, dest2, dest3, dest, dest_stride);
143 }
144 
vp8_short_inv_walsh4x4_msa(int16_t * input,int16_t * mb_dq_coeff)145 void vp8_short_inv_walsh4x4_msa(int16_t *input, int16_t *mb_dq_coeff)
146 {
147     v8i16 input0, input1;
148     v4i32 in0, in1, in2, in3, a1, b1, c1, d1;
149     v4i32 hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
150 
151     LD_SH2(input, 8, input0, input1);
152     UNPCK_SH_SW(input0, in0, in1);
153     UNPCK_SH_SW(input1, in2, in3);
154     BUTTERFLY_4(in0, in1, in2, in3, a1, b1, c1, d1);
155     BUTTERFLY_4(a1, d1, c1, b1, hz0, hz1, hz3, hz2);
156     TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
157     BUTTERFLY_4(hz0, hz1, hz2, hz3, a1, b1, c1, d1);
158     BUTTERFLY_4(a1, d1, c1, b1, vt0, vt1, vt3, vt2);
159     ADD4(vt0, 3, vt1, 3, vt2, 3, vt3, 3, vt0, vt1, vt2, vt3);
160     SRA_4V(vt0, vt1, vt2, vt3, 3);
161     mb_dq_coeff[0] = __msa_copy_s_h((v8i16)vt0, 0);
162     mb_dq_coeff[16] = __msa_copy_s_h((v8i16)vt1, 0);
163     mb_dq_coeff[32] = __msa_copy_s_h((v8i16)vt2, 0);
164     mb_dq_coeff[48] = __msa_copy_s_h((v8i16)vt3, 0);
165     mb_dq_coeff[64] = __msa_copy_s_h((v8i16)vt0, 2);
166     mb_dq_coeff[80] = __msa_copy_s_h((v8i16)vt1, 2);
167     mb_dq_coeff[96] = __msa_copy_s_h((v8i16)vt2, 2);
168     mb_dq_coeff[112] = __msa_copy_s_h((v8i16)vt3, 2);
169     mb_dq_coeff[128] = __msa_copy_s_h((v8i16)vt0, 4);
170     mb_dq_coeff[144] = __msa_copy_s_h((v8i16)vt1, 4);
171     mb_dq_coeff[160] = __msa_copy_s_h((v8i16)vt2, 4);
172     mb_dq_coeff[176] = __msa_copy_s_h((v8i16)vt3, 4);
173     mb_dq_coeff[192] = __msa_copy_s_h((v8i16)vt0, 6);
174     mb_dq_coeff[208] = __msa_copy_s_h((v8i16)vt1, 6);
175     mb_dq_coeff[224] = __msa_copy_s_h((v8i16)vt2, 6);
176     mb_dq_coeff[240] = __msa_copy_s_h((v8i16)vt3, 6);
177 }
178 
dequant_idct4x4_addblk_msa(int16_t * input,int16_t * dequant_input,uint8_t * dest,int32_t dest_stride)179 static void dequant_idct4x4_addblk_msa(int16_t *input, int16_t *dequant_input,
180                                        uint8_t *dest, int32_t dest_stride)
181 {
182     v8i16 input0, input1, dequant_in0, dequant_in1, mul0, mul1;
183     v8i16 in0, in1, in2, in3;
184     v8i16 hz0_h, hz1_h, hz2_h, hz3_h;
185     v16i8 dest0, dest1, dest2, dest3;
186     v4i32 hz0_w, hz1_w, hz2_w, hz3_w;
187     v4i32 vt0, vt1, vt2, vt3, res0, res1, res2, res3;
188     v2i64 zero = { 0 };
189     v16i8 mask = { 0, 4, 8, 12, 20, 21, 22, 23, 24,
190                    25, 26, 27, 28, 29, 30, 31 };
191 
192     LD_SH2(input, 8, input0, input1);
193     LD_SH2(dequant_input, 8, dequant_in0, dequant_in1);
194     MUL2(input0, dequant_in0, input1, dequant_in1, mul0, mul1);
195     PCKEV_D2_SH(zero, mul0, zero, mul1, in0, in2);
196     PCKOD_D2_SH(zero, mul0, zero, mul1, in1, in3);
197     VP8_IDCT_1D_H(in0, in1, in2, in3, hz0_h, hz1_h, hz2_h, hz3_h);
198     PCKEV_D2_SH(hz1_h, hz0_h, hz3_h, hz2_h, mul0, mul1);
199     UNPCK_SH_SW(mul0, hz0_w, hz1_w);
200     UNPCK_SH_SW(mul1, hz2_w, hz3_w);
201     TRANSPOSE4x4_SW_SW(hz0_w, hz1_w, hz2_w, hz3_w, hz0_w, hz1_w, hz2_w, hz3_w);
202     VP8_IDCT_1D_W(hz0_w, hz1_w, hz2_w, hz3_w, vt0, vt1, vt2, vt3);
203     SRARI_W4_SW(vt0, vt1, vt2, vt3, 3);
204     TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
205     LD_SB4(dest, dest_stride, dest0, dest1, dest2, dest3);
206     ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3, res0, res1,
207                res2, res3);
208     ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3, res0, res1,
209                res2, res3);
210     ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
211     res0 = CLIP_SW_0_255(res0);
212     res1 = CLIP_SW_0_255(res1);
213     res2 = CLIP_SW_0_255(res2);
214     res3 = CLIP_SW_0_255(res3);
215     VSHF_B2_SB(res0, dest0, res1, dest1, mask, mask, dest0, dest1);
216     VSHF_B2_SB(res2, dest2, res3, dest3, mask, mask, dest2, dest3);
217     ST_SB4(dest0, dest1, dest2, dest3, dest, dest_stride);
218 }
219 
dequant_idct4x4_addblk_2x_msa(int16_t * input,int16_t * dequant_input,uint8_t * dest,int32_t dest_stride)220 static void dequant_idct4x4_addblk_2x_msa(int16_t *input,
221                                           int16_t *dequant_input,
222                                           uint8_t *dest, int32_t dest_stride)
223 {
224     v16u8 dest0, dest1, dest2, dest3;
225     v8i16 in0, in1, in2, in3;
226     v8i16 mul0, mul1, mul2, mul3, dequant_in0, dequant_in1;
227     v8i16 hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
228     v8i16 res0, res1, res2, res3;
229     v4i32 hz0l, hz1l, hz2l, hz3l, hz0r, hz1r, hz2r, hz3r;
230     v4i32 vt0l, vt1l, vt2l, vt3l, vt0r, vt1r, vt2r, vt3r;
231     v16i8 zero = { 0 };
232 
233     LD_SH4(input, 8, in0, in1, in2, in3);
234     LD_SH2(dequant_input, 8, dequant_in0, dequant_in1);
235     MUL4(in0, dequant_in0, in1, dequant_in1, in2, dequant_in0, in3, dequant_in1,
236          mul0, mul1, mul2, mul3);
237     PCKEV_D2_SH(mul2, mul0, mul3, mul1, in0, in2);
238     PCKOD_D2_SH(mul2, mul0, mul3, mul1, in1, in3);
239     VP8_IDCT_1D_H(in0, in1, in2, in3, hz0, hz1, hz2, hz3);
240     TRANSPOSE_TWO_4x4_H(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
241     UNPCK_SH_SW(hz0, hz0r, hz0l);
242     UNPCK_SH_SW(hz1, hz1r, hz1l);
243     UNPCK_SH_SW(hz2, hz2r, hz2l);
244     UNPCK_SH_SW(hz3, hz3r, hz3l);
245     VP8_IDCT_1D_W(hz0l, hz1l, hz2l, hz3l, vt0l, vt1l, vt2l, vt3l);
246     SRARI_W4_SW(vt0l, vt1l, vt2l, vt3l, 3);
247     VP8_IDCT_1D_W(hz0r, hz1r, hz2r, hz3r, vt0r, vt1r, vt2r, vt3r);
248     SRARI_W4_SW(vt0r, vt1r, vt2r, vt3r, 3);
249     PCKEV_H4_SH(vt0l, vt0r, vt1l, vt1r, vt2l, vt2r, vt3l, vt3r, vt0, vt1, vt2,
250                 vt3);
251     TRANSPOSE_TWO_4x4_H(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
252     LD_UB4(dest, dest_stride, dest0, dest1, dest2, dest3);
253     ILVR_B4_SH(zero, dest0, zero, dest1, zero, dest2, zero, dest3, res0, res1,
254                res2, res3);
255     ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
256     CLIP_SH4_0_255(res0, res1, res2, res3);
257     PCKEV_B4_SH(res0, res0, res1, res1, res2, res2, res3, res3, res0, res1,
258                 res2, res3);
259     PCKOD_D2_UB(dest0, res0, dest1, res1, dest0, dest1);
260     PCKOD_D2_UB(dest2, res2, dest3, res3, dest2, dest3);
261     ST_UB4(dest0, dest1, dest2, dest3, dest, dest_stride);
262 
263     __asm__ __volatile__(
264         "sw   $zero,    0(%[input])  \n\t"
265         "sw   $zero,    4(%[input])  \n\t"
266         "sw   $zero,    8(%[input])  \n\t"
267         "sw   $zero,   12(%[input])  \n\t"
268         "sw   $zero,   16(%[input])  \n\t"
269         "sw   $zero,   20(%[input])  \n\t"
270         "sw   $zero,   24(%[input])  \n\t"
271         "sw   $zero,   28(%[input])  \n\t"
272         "sw   $zero,   32(%[input])  \n\t"
273         "sw   $zero,   36(%[input])  \n\t"
274         "sw   $zero,   40(%[input])  \n\t"
275         "sw   $zero,   44(%[input])  \n\t"
276         "sw   $zero,   48(%[input])  \n\t"
277         "sw   $zero,   52(%[input])  \n\t"
278         "sw   $zero,   56(%[input])  \n\t"
279         "sw   $zero,   60(%[input])  \n\t"::
280 
281         [input] "r"(input)
282     );
283 }
284 
dequant_idct_addconst_2x_msa(int16_t * input,int16_t * dequant_input,uint8_t * dest,int32_t dest_stride)285 static void dequant_idct_addconst_2x_msa(int16_t *input, int16_t *dequant_input,
286                                          uint8_t *dest, int32_t dest_stride)
287 {
288     v8i16 input_dc0, input_dc1, vec;
289     v16u8 dest0, dest1, dest2, dest3;
290     v16i8 zero = { 0 };
291     v8i16 res0, res1, res2, res3;
292 
293     input_dc0 = __msa_fill_h(input[0] * dequant_input[0]);
294     input_dc1 = __msa_fill_h(input[16] * dequant_input[0]);
295     SRARI_H2_SH(input_dc0, input_dc1, 3);
296     vec = (v8i16)__msa_pckev_d((v2i64)input_dc1, (v2i64)input_dc0);
297     input[0] = 0;
298     input[16] = 0;
299     LD_UB4(dest, dest_stride, dest0, dest1, dest2, dest3);
300     ILVR_B4_SH(zero, dest0, zero, dest1, zero, dest2, zero, dest3, res0,
301                res1, res2, res3);
302     ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3);
303     CLIP_SH4_0_255(res0, res1, res2, res3);
304     PCKEV_B4_SH(res0, res0, res1, res1, res2, res2, res3, res3, res0, res1,
305                 res2, res3);
306     PCKOD_D2_UB(dest0, res0, dest1, res1, dest0, dest1);
307     PCKOD_D2_UB(dest2, res2, dest3, res3, dest2, dest3);
308     ST_UB4(dest0, dest1, dest2, dest3, dest, dest_stride);
309 }
310 
/* Exported 4x4 inverse DCT + add: forwards all arguments unchanged to
 * idct4x4_addblk_msa(), which transforms `input`, adds it to the rows at
 * `pred_ptr` and writes the clamped result to `dst_ptr`. */
void vp8_short_idct4x4llm_msa(int16_t *input, uint8_t *pred_ptr,
                              int32_t pred_stride, uint8_t *dst_ptr,
                              int32_t dst_stride)
{
    idct4x4_addblk_msa(input,
                       pred_ptr, pred_stride,
                       dst_ptr, dst_stride);
}
317 
/* Exported DC-only inverse DCT + add: forwards all arguments unchanged to
 * idct4x4_addconst_msa(), which adds the rounded `input_dc` to the rows at
 * `pred_ptr` and writes the clamped result to `dst_ptr`. */
void vp8_dc_only_idct_add_msa(int16_t input_dc, uint8_t *pred_ptr,
                              int32_t pred_stride, uint8_t *dst_ptr,
                              int32_t dst_stride)
{
    idct4x4_addconst_msa(input_dc,
                         pred_ptr, pred_stride,
                         dst_ptr, dst_stride);
}
324 
/* Dequantizes one block: the 16 halfwords at d->qcoeff are multiplied
 * element-wise by the 16 halfwords at DQC and the products are stored to
 * d->dqcoeff. */
void vp8_dequantize_b_msa(BLOCKD *d, int16_t *DQC)
{
    v8i16 factor_lo, factor_hi;
    v8i16 coef_lo, coef_hi;
    v8i16 out_lo, out_hi;

    LD_SH2(d->qcoeff, 8, coef_lo, coef_hi);
    LD_SH2(DQC, 8, factor_lo, factor_hi);
    MUL2(factor_lo, coef_lo, factor_hi, coef_hi, out_lo, out_hi);
    ST_SH2(out_lo, out_hi, d->dqcoeff, 8);
}
334 
/* Dequantizes the 4x4 block at `input` with the factors at `dq`,
 * inverse-transforms it and adds it to `dest` (via
 * dequant_idct4x4_addblk_msa), then sets the 16 coefficients at `input`
 * (32 bytes) to zero. */
void vp8_dequant_idct_add_msa(int16_t *input, int16_t *dq,
                              uint8_t *dest, int32_t stride)
{
    dequant_idct4x4_addblk_msa(input, dq, dest, stride);

    /* Clear the consumed coefficients. The "memory" clobber is required: the
     * asm writes through `input`, and without it the compiler is entitled to
     * assume that memory is unchanged and to move the coefficient loads above
     * (or cached values) across this statement. */
    __asm__ __volatile__ (
        "sw     $zero,    0(%[input])     \n\t"
        "sw     $zero,    4(%[input])     \n\t"
        "sw     $zero,    8(%[input])     \n\t"
        "sw     $zero,   12(%[input])     \n\t"
        "sw     $zero,   16(%[input])     \n\t"
        "sw     $zero,   20(%[input])     \n\t"
        "sw     $zero,   24(%[input])     \n\t"
        "sw     $zero,   28(%[input])     \n\t"

        :
        : [input] "r" (input)
        : "memory"
    );
}
354 
/* Reconstructs a 16x16 area at `dst` as four rows of four 4x4 blocks,
 * processing horizontally adjacent blocks two at a time.
 *
 * `eobs` is read one byte per block (presumably the end-of-block count of
 * each block), 16 bytes in total. For each pair of blocks:
 *   - both bytes zero: the pair is skipped;
 *   - either byte has a bit other than bit 0 set (value outside {0, 1}): the
 *     full dequantize + IDCT path is used;
 *   - otherwise: the DC-only path is used.
 * `q` advances by 32 coefficients per pair and `dst` by four rows per
 * iteration.
 *
 * The bytes are tested individually instead of reading `eobs` through an
 * int16_t pointer: that cast violates strict aliasing and assumes 2-byte
 * alignment of a char buffer. The decisions taken are identical. */
void vp8_dequant_idct_add_y_block_msa(int16_t *q, int16_t *dq,
                                      uint8_t *dst, int32_t stride,
                                      char *eobs)
{
    int32_t block_row;
    int32_t pair_eobs;

    for (block_row = 0; block_row < 4; ++block_row)
    {
        /* Left pair: blocks 0 and 1 of this row. */
        pair_eobs = eobs[0] | eobs[1];

        if (pair_eobs & 0xfe)
        {
            dequant_idct4x4_addblk_2x_msa(q, dq, dst, stride);
        }
        else if (pair_eobs)
        {
            dequant_idct_addconst_2x_msa(q, dq, dst, stride);
        }

        q += 32;

        /* Right pair: blocks 2 and 3 of this row, 8 pixels further right. */
        pair_eobs = eobs[2] | eobs[3];

        if (pair_eobs & 0xfe)
        {
            dequant_idct4x4_addblk_2x_msa(q, dq, dst + 8, stride);
        }
        else if (pair_eobs)
        {
            dequant_idct_addconst_2x_msa(q, dq, dst + 8, stride);
        }

        q += 32;
        dst += (4 * stride);
        eobs += 4;
    }
}
395 
/* Reconstructs the two chroma planes, each as two rows of two 4x4 blocks,
 * processing horizontally adjacent blocks two at a time: the top and bottom
 * pair at `dstu`, then the top and bottom pair at `dstv`.
 *
 * `eobs` is read one byte per block (presumably the end-of-block count of
 * each block), 8 bytes in total. For each pair of blocks:
 *   - both bytes zero: the pair is skipped;
 *   - either byte has a bit other than bit 0 set (value outside {0, 1}): the
 *     full dequantize + IDCT path is used;
 *   - otherwise: the DC-only path is used.
 * `q` advances by 32 coefficients per pair.
 *
 * The bytes are tested individually instead of reading `eobs` through an
 * int16_t pointer: that cast violates strict aliasing and assumes 2-byte
 * alignment of a char buffer. The decisions taken are identical. */
void vp8_dequant_idct_add_uv_block_msa(int16_t *q, int16_t *dq,
                                       uint8_t *dstu, uint8_t *dstv,
                                       int32_t stride, char *eobs)
{
    uint8_t *pair_dst[4];
    int32_t pair;
    int32_t pair_eobs;

    /* Destination of each block pair, in coefficient order. */
    pair_dst[0] = dstu;
    pair_dst[1] = dstu + (stride * 4);
    pair_dst[2] = dstv;
    pair_dst[3] = dstv + (stride * 4);

    for (pair = 0; pair < 4; ++pair)
    {
        pair_eobs = eobs[2 * pair] | eobs[2 * pair + 1];

        if (pair_eobs & 0xfe)
        {
            dequant_idct4x4_addblk_2x_msa(q, dq, pair_dst[pair], stride);
        }
        else if (pair_eobs)
        {
            dequant_idct_addconst_2x_msa(q, dq, pair_dst[pair], stride);
        }

        q += 32;
    }
}
458