1 /*
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "./vp8_rtcd.h"
12 #include "vp8/common/blockd.h"
13 #include "vp8/common/mips/msa/vp8_macros_msa.h"
14
/* Fixed-point multipliers used by the VP8 inverse DCT, scaled by 2^16:
 *   sinpi8sqrt2       ~= sqrt(2) * sin(pi / 8) * 65536
 *   cospi8sqrt2minus1 ~= (sqrt(2) * cos(pi / 8) - 1) * 65536
 * The "minus one" form keeps the cosine factor below 1.0; the transform
 * macros add the operand back after multiplying. */
static const int32_t sinpi8sqrt2 = 35468;
static const int32_t cospi8sqrt2minus1 = 20091;
17
/* Rearranges four v8i16 rows into four v8i16 outputs, intended (per the
 * macro name) to transpose two side-by-side 4x4 blocks of 16-bit values.
 * TRANSPOSE8X4_SH_SH produces four intermediates; the outputs are then built
 * from their doubleword halves: out0/out2 via ILVR_D2_SH and out1/out3 via
 * __msa_ilvl_d.
 * All inputs are consumed before any output is written, so the outputs may
 * name the same variables as the inputs. */
#define TRANSPOSE_TWO_4x4_H(in0, in1, in2, in3, out0, out1, out2, out3)  \
    do                                                                   \
    {                                                                    \
        v8i16 s4_m, s5_m, s6_m, s7_m;                                    \
                                                                         \
        TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, s4_m, s5_m, s6_m, s7_m);  \
        out1 = (v8i16)__msa_ilvl_d((v2i64)s6_m, (v2i64)s4_m);            \
        out3 = (v8i16)__msa_ilvl_d((v2i64)s7_m, (v2i64)s5_m);            \
        ILVR_D2_SH(s6_m, s4_m, s7_m, s5_m, out0, out2);                  \
    } while (0)
27
/* Statement expression yielding a v8i16: each 16-bit lane of `in` multiplied
 * by sinpi8sqrt2 with the product shifted right by 16.
 * The multiply is done on 32-bit lanes: `in` is interleaved with zero into
 * two v4i32 halves, each half is shifted right by 16 (presumably leaving the
 * sign-extended 16-bit value in each word -- depends on ILVRL_H2_SW operand
 * order), multiplied, shifted right by 16 again, and the even halfwords of
 * both halves are packed back into one v8i16. */
#define EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in)              \
    ({                                                             \
        v8i16 zero_m = { 0 };                                      \
        v4i32 sinpi8_sqrt2_m = __msa_fill_w(sinpi8sqrt2);          \
        v4i32 tmp1_m, tmp2_m;                                      \
                                                                   \
        ILVRL_H2_SW(in, zero_m, tmp1_m, tmp2_m);                   \
        tmp1_m = ((tmp1_m >> 16) * sinpi8_sqrt2_m) >> 16;          \
        tmp2_m = ((tmp2_m >> 16) * sinpi8_sqrt2_m) >> 16;          \
                                                                   \
        __msa_pckev_h((v8i16)tmp2_m, (v8i16)tmp1_m);               \
    })
44
/* One 1-D pass of the VP8 inverse DCT on 16-bit lanes (v8i16).
 *   a1 = in0 + in2
 *   b1 = in0 - in2
 *   c1 = in1 * sinpi8sqrt2 - in3 * (1 + cospi8sqrt2minus1)
 *   d1 = in1 * (1 + cospi8sqrt2minus1) + in3 * sinpi8sqrt2
 * The sinpi8sqrt2 products go through the 32-bit helper macro above. The
 * cospi8sqrt2minus1 products use __msa_mul_q_h followed by >> 1, with the
 * operand added back afterwards.
 * a1..d1 are combined into out0..out3 by BUTTERFLY_4. */
#define VP8_IDCT_1D_H(in0, in1, in2, in3, out0, out1, out2, out3)            \
    do                                                                       \
    {                                                                        \
        v8i16 const_cospi8sqrt2minus1_m = __msa_fill_h(cospi8sqrt2minus1);   \
        v8i16 a1_m, b1_m, c1_m, d1_m;                                        \
        v8i16 c_tmp1_m, c_tmp2_m, d_tmp1_m, d_tmp2_m;                        \
                                                                             \
        a1_m = (in0) + (in2);                                                \
        b1_m = (in0) - (in2);                                                \
                                                                             \
        c_tmp1_m = EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in1);           \
        c_tmp2_m =                                                           \
            (in3) + (__msa_mul_q_h(in3, const_cospi8sqrt2minus1_m) >> 1);    \
        c1_m = c_tmp1_m - c_tmp2_m;                                          \
                                                                             \
        d_tmp1_m =                                                           \
            (in1) + (__msa_mul_q_h(in1, const_cospi8sqrt2minus1_m) >> 1);    \
        d_tmp2_m = EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in3);           \
        d1_m = d_tmp1_m + d_tmp2_m;                                          \
                                                                             \
        BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3);         \
    } while (0)
66
/* One 1-D pass of the VP8 inverse DCT on 32-bit lanes (v4i32).
 *   a1 = in0 + in2
 *   b1 = in0 - in2
 *   c1 = ((in1 * sinpi8sqrt2) >> 16)
 *        - (in3 + ((in3 * cospi8sqrt2minus1) >> 16))
 *   d1 = (in1 + ((in1 * cospi8sqrt2minus1) >> 16))
 *        + ((in3 * sinpi8sqrt2) >> 16)
 * a1..d1 are combined into out0..out3 by BUTTERFLY_4. */
#define VP8_IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3)            \
    do                                                                       \
    {                                                                        \
        v4i32 const_cospi8sqrt2minus1_m = __msa_fill_w(cospi8sqrt2minus1);   \
        v4i32 sinpi8_sqrt2_m = __msa_fill_w(sinpi8sqrt2);                    \
        v4i32 a1_m, b1_m, c1_m, d1_m;                                        \
                                                                             \
        a1_m = (in0) + (in2);                                                \
        b1_m = (in0) - (in2);                                                \
        c1_m = (((in1) * sinpi8_sqrt2_m) >> 16) -                            \
               ((in3) + (((in3) * const_cospi8sqrt2minus1_m) >> 16));        \
        d1_m = ((in1) + (((in1) * const_cospi8sqrt2minus1_m) >> 16)) +       \
               (((in3) * sinpi8_sqrt2_m) >> 16);                             \
                                                                             \
        BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3);         \
    } while (0)
85
idct4x4_addblk_msa(int16_t * input,uint8_t * pred,int32_t pred_stride,uint8_t * dest,int32_t dest_stride)86 static void idct4x4_addblk_msa(int16_t *input, uint8_t *pred,
87 int32_t pred_stride,
88 uint8_t *dest, int32_t dest_stride)
89 {
90 v8i16 input0, input1;
91 v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
92 v4i32 res0, res1, res2, res3;
93 v16i8 zero = { 0 };
94 v16i8 pred0, pred1, pred2, pred3, dest0, dest1, dest2, dest3;
95 v16i8 mask = { 0, 4, 8, 12, 20, 21, 22, 23, 24,
96 25, 26, 27, 28, 29, 30, 31 };
97
98 LD_SH2(input, 8, input0, input1);
99 UNPCK_SH_SW(input0, in0, in1);
100 UNPCK_SH_SW(input1, in2, in3);
101 VP8_IDCT_1D_W(in0, in1, in2, in3, hz0, hz1, hz2, hz3);
102 TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
103 VP8_IDCT_1D_W(hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3);
104 SRARI_W4_SW(vt0, vt1, vt2, vt3, 3);
105 TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
106 LD_SB4(pred, pred_stride, pred0, pred1, pred2, pred3);
107 ILVR_B4_SW(zero, pred0, zero, pred1, zero, pred2, zero, pred3, res0, res1,
108 res2, res3);
109 ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3, res0, res1,
110 res2, res3);
111 ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
112 res0 = CLIP_SW_0_255(res0);
113 res1 = CLIP_SW_0_255(res1);
114 res2 = CLIP_SW_0_255(res2);
115 res3 = CLIP_SW_0_255(res3);
116 LD_SB4(dest, dest_stride, dest0, dest1, dest2, dest3);
117 VSHF_B2_SB(res0, dest0, res1, dest1, mask, mask, dest0, dest1);
118 VSHF_B2_SB(res2, dest2, res3, dest3, mask, mask, dest2, dest3);
119 ST_SB4(dest0, dest1, dest2, dest3, dest, dest_stride);
120 }
121
idct4x4_addconst_msa(int16_t in_dc,uint8_t * pred,int32_t pred_stride,uint8_t * dest,int32_t dest_stride)122 static void idct4x4_addconst_msa(int16_t in_dc, uint8_t *pred,
123 int32_t pred_stride,
124 uint8_t *dest, int32_t dest_stride)
125 {
126 v8i16 vec;
127 v8i16 res0, res1, res2, res3;
128 v16i8 zero = { 0 };
129 v16i8 pred0, pred1, pred2, pred3, dest0, dest1, dest2, dest3;
130 v16i8 mask = { 0, 2, 4, 6, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 };
131
132 vec = __msa_fill_h(in_dc);
133 vec = __msa_srari_h(vec, 3);
134 LD_SB4(pred, pred_stride, pred0, pred1, pred2, pred3);
135 ILVR_B4_SH(zero, pred0, zero, pred1, zero, pred2, zero, pred3, res0, res1,
136 res2, res3);
137 ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3);
138 CLIP_SH4_0_255(res0, res1, res2, res3);
139 LD_SB4(dest, dest_stride, dest0, dest1, dest2, dest3);
140 VSHF_B2_SB(res0, dest0, res1, dest1, mask, mask, dest0, dest1);
141 VSHF_B2_SB(res2, dest2, res3, dest3, mask, mask, dest2, dest3);
142 ST_SB4(dest0, dest1, dest2, dest3, dest, dest_stride);
143 }
144
vp8_short_inv_walsh4x4_msa(int16_t * input,int16_t * mb_dq_coeff)145 void vp8_short_inv_walsh4x4_msa(int16_t *input, int16_t *mb_dq_coeff)
146 {
147 v8i16 input0, input1;
148 v4i32 in0, in1, in2, in3, a1, b1, c1, d1;
149 v4i32 hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
150
151 LD_SH2(input, 8, input0, input1);
152 UNPCK_SH_SW(input0, in0, in1);
153 UNPCK_SH_SW(input1, in2, in3);
154 BUTTERFLY_4(in0, in1, in2, in3, a1, b1, c1, d1);
155 BUTTERFLY_4(a1, d1, c1, b1, hz0, hz1, hz3, hz2);
156 TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
157 BUTTERFLY_4(hz0, hz1, hz2, hz3, a1, b1, c1, d1);
158 BUTTERFLY_4(a1, d1, c1, b1, vt0, vt1, vt3, vt2);
159 ADD4(vt0, 3, vt1, 3, vt2, 3, vt3, 3, vt0, vt1, vt2, vt3);
160 SRA_4V(vt0, vt1, vt2, vt3, 3);
161 mb_dq_coeff[0] = __msa_copy_s_h((v8i16)vt0, 0);
162 mb_dq_coeff[16] = __msa_copy_s_h((v8i16)vt1, 0);
163 mb_dq_coeff[32] = __msa_copy_s_h((v8i16)vt2, 0);
164 mb_dq_coeff[48] = __msa_copy_s_h((v8i16)vt3, 0);
165 mb_dq_coeff[64] = __msa_copy_s_h((v8i16)vt0, 2);
166 mb_dq_coeff[80] = __msa_copy_s_h((v8i16)vt1, 2);
167 mb_dq_coeff[96] = __msa_copy_s_h((v8i16)vt2, 2);
168 mb_dq_coeff[112] = __msa_copy_s_h((v8i16)vt3, 2);
169 mb_dq_coeff[128] = __msa_copy_s_h((v8i16)vt0, 4);
170 mb_dq_coeff[144] = __msa_copy_s_h((v8i16)vt1, 4);
171 mb_dq_coeff[160] = __msa_copy_s_h((v8i16)vt2, 4);
172 mb_dq_coeff[176] = __msa_copy_s_h((v8i16)vt3, 4);
173 mb_dq_coeff[192] = __msa_copy_s_h((v8i16)vt0, 6);
174 mb_dq_coeff[208] = __msa_copy_s_h((v8i16)vt1, 6);
175 mb_dq_coeff[224] = __msa_copy_s_h((v8i16)vt2, 6);
176 mb_dq_coeff[240] = __msa_copy_s_h((v8i16)vt3, 6);
177 }
178
dequant_idct4x4_addblk_msa(int16_t * input,int16_t * dequant_input,uint8_t * dest,int32_t dest_stride)179 static void dequant_idct4x4_addblk_msa(int16_t *input, int16_t *dequant_input,
180 uint8_t *dest, int32_t dest_stride)
181 {
182 v8i16 input0, input1, dequant_in0, dequant_in1, mul0, mul1;
183 v8i16 in0, in1, in2, in3;
184 v8i16 hz0_h, hz1_h, hz2_h, hz3_h;
185 v16i8 dest0, dest1, dest2, dest3;
186 v4i32 hz0_w, hz1_w, hz2_w, hz3_w;
187 v4i32 vt0, vt1, vt2, vt3, res0, res1, res2, res3;
188 v2i64 zero = { 0 };
189 v16i8 mask = { 0, 4, 8, 12, 20, 21, 22, 23, 24,
190 25, 26, 27, 28, 29, 30, 31 };
191
192 LD_SH2(input, 8, input0, input1);
193 LD_SH2(dequant_input, 8, dequant_in0, dequant_in1);
194 MUL2(input0, dequant_in0, input1, dequant_in1, mul0, mul1);
195 PCKEV_D2_SH(zero, mul0, zero, mul1, in0, in2);
196 PCKOD_D2_SH(zero, mul0, zero, mul1, in1, in3);
197 VP8_IDCT_1D_H(in0, in1, in2, in3, hz0_h, hz1_h, hz2_h, hz3_h);
198 PCKEV_D2_SH(hz1_h, hz0_h, hz3_h, hz2_h, mul0, mul1);
199 UNPCK_SH_SW(mul0, hz0_w, hz1_w);
200 UNPCK_SH_SW(mul1, hz2_w, hz3_w);
201 TRANSPOSE4x4_SW_SW(hz0_w, hz1_w, hz2_w, hz3_w, hz0_w, hz1_w, hz2_w, hz3_w);
202 VP8_IDCT_1D_W(hz0_w, hz1_w, hz2_w, hz3_w, vt0, vt1, vt2, vt3);
203 SRARI_W4_SW(vt0, vt1, vt2, vt3, 3);
204 TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
205 LD_SB4(dest, dest_stride, dest0, dest1, dest2, dest3);
206 ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3, res0, res1,
207 res2, res3);
208 ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3, res0, res1,
209 res2, res3);
210 ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
211 res0 = CLIP_SW_0_255(res0);
212 res1 = CLIP_SW_0_255(res1);
213 res2 = CLIP_SW_0_255(res2);
214 res3 = CLIP_SW_0_255(res3);
215 VSHF_B2_SB(res0, dest0, res1, dest1, mask, mask, dest0, dest1);
216 VSHF_B2_SB(res2, dest2, res3, dest3, mask, mask, dest2, dest3);
217 ST_SB4(dest0, dest1, dest2, dest3, dest, dest_stride);
218 }
219
dequant_idct4x4_addblk_2x_msa(int16_t * input,int16_t * dequant_input,uint8_t * dest,int32_t dest_stride)220 static void dequant_idct4x4_addblk_2x_msa(int16_t *input,
221 int16_t *dequant_input,
222 uint8_t *dest, int32_t dest_stride)
223 {
224 v16u8 dest0, dest1, dest2, dest3;
225 v8i16 in0, in1, in2, in3;
226 v8i16 mul0, mul1, mul2, mul3, dequant_in0, dequant_in1;
227 v8i16 hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
228 v8i16 res0, res1, res2, res3;
229 v4i32 hz0l, hz1l, hz2l, hz3l, hz0r, hz1r, hz2r, hz3r;
230 v4i32 vt0l, vt1l, vt2l, vt3l, vt0r, vt1r, vt2r, vt3r;
231 v16i8 zero = { 0 };
232
233 LD_SH4(input, 8, in0, in1, in2, in3);
234 LD_SH2(dequant_input, 8, dequant_in0, dequant_in1);
235 MUL4(in0, dequant_in0, in1, dequant_in1, in2, dequant_in0, in3, dequant_in1,
236 mul0, mul1, mul2, mul3);
237 PCKEV_D2_SH(mul2, mul0, mul3, mul1, in0, in2);
238 PCKOD_D2_SH(mul2, mul0, mul3, mul1, in1, in3);
239 VP8_IDCT_1D_H(in0, in1, in2, in3, hz0, hz1, hz2, hz3);
240 TRANSPOSE_TWO_4x4_H(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
241 UNPCK_SH_SW(hz0, hz0r, hz0l);
242 UNPCK_SH_SW(hz1, hz1r, hz1l);
243 UNPCK_SH_SW(hz2, hz2r, hz2l);
244 UNPCK_SH_SW(hz3, hz3r, hz3l);
245 VP8_IDCT_1D_W(hz0l, hz1l, hz2l, hz3l, vt0l, vt1l, vt2l, vt3l);
246 SRARI_W4_SW(vt0l, vt1l, vt2l, vt3l, 3);
247 VP8_IDCT_1D_W(hz0r, hz1r, hz2r, hz3r, vt0r, vt1r, vt2r, vt3r);
248 SRARI_W4_SW(vt0r, vt1r, vt2r, vt3r, 3);
249 PCKEV_H4_SH(vt0l, vt0r, vt1l, vt1r, vt2l, vt2r, vt3l, vt3r, vt0, vt1, vt2,
250 vt3);
251 TRANSPOSE_TWO_4x4_H(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
252 LD_UB4(dest, dest_stride, dest0, dest1, dest2, dest3);
253 ILVR_B4_SH(zero, dest0, zero, dest1, zero, dest2, zero, dest3, res0, res1,
254 res2, res3);
255 ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
256 CLIP_SH4_0_255(res0, res1, res2, res3);
257 PCKEV_B4_SH(res0, res0, res1, res1, res2, res2, res3, res3, res0, res1,
258 res2, res3);
259 PCKOD_D2_UB(dest0, res0, dest1, res1, dest0, dest1);
260 PCKOD_D2_UB(dest2, res2, dest3, res3, dest2, dest3);
261 ST_UB4(dest0, dest1, dest2, dest3, dest, dest_stride);
262
263 __asm__ __volatile__(
264 "sw $zero, 0(%[input]) \n\t"
265 "sw $zero, 4(%[input]) \n\t"
266 "sw $zero, 8(%[input]) \n\t"
267 "sw $zero, 12(%[input]) \n\t"
268 "sw $zero, 16(%[input]) \n\t"
269 "sw $zero, 20(%[input]) \n\t"
270 "sw $zero, 24(%[input]) \n\t"
271 "sw $zero, 28(%[input]) \n\t"
272 "sw $zero, 32(%[input]) \n\t"
273 "sw $zero, 36(%[input]) \n\t"
274 "sw $zero, 40(%[input]) \n\t"
275 "sw $zero, 44(%[input]) \n\t"
276 "sw $zero, 48(%[input]) \n\t"
277 "sw $zero, 52(%[input]) \n\t"
278 "sw $zero, 56(%[input]) \n\t"
279 "sw $zero, 60(%[input]) \n\t"::
280
281 [input] "r"(input)
282 );
283 }
284
dequant_idct_addconst_2x_msa(int16_t * input,int16_t * dequant_input,uint8_t * dest,int32_t dest_stride)285 static void dequant_idct_addconst_2x_msa(int16_t *input, int16_t *dequant_input,
286 uint8_t *dest, int32_t dest_stride)
287 {
288 v8i16 input_dc0, input_dc1, vec;
289 v16u8 dest0, dest1, dest2, dest3;
290 v16i8 zero = { 0 };
291 v8i16 res0, res1, res2, res3;
292
293 input_dc0 = __msa_fill_h(input[0] * dequant_input[0]);
294 input_dc1 = __msa_fill_h(input[16] * dequant_input[0]);
295 SRARI_H2_SH(input_dc0, input_dc1, 3);
296 vec = (v8i16)__msa_pckev_d((v2i64)input_dc1, (v2i64)input_dc0);
297 input[0] = 0;
298 input[16] = 0;
299 LD_UB4(dest, dest_stride, dest0, dest1, dest2, dest3);
300 ILVR_B4_SH(zero, dest0, zero, dest1, zero, dest2, zero, dest3, res0,
301 res1, res2, res3);
302 ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3);
303 CLIP_SH4_0_255(res0, res1, res2, res3);
304 PCKEV_B4_SH(res0, res0, res1, res1, res2, res2, res3, res3, res0, res1,
305 res2, res3);
306 PCKOD_D2_UB(dest0, res0, dest1, res1, dest0, dest1);
307 PCKOD_D2_UB(dest2, res2, dest3, res3, dest2, dest3);
308 ST_UB4(dest0, dest1, dest2, dest3, dest, dest_stride);
309 }
310
/*
 * Public entry point for the full 4x4 inverse DCT plus prediction add.
 * Forwards all arguments unchanged to idct4x4_addblk_msa().
 *
 * input        16 coefficients.
 * pred_ptr     prediction pixels, rows `pred_stride` bytes apart.
 * dst_ptr      output pixels, rows `dst_stride` bytes apart.
 */
void vp8_short_idct4x4llm_msa(int16_t *input, uint8_t *pred_ptr,
                              int32_t pred_stride, uint8_t *dst_ptr,
                              int32_t dst_stride)
{
    idct4x4_addblk_msa(input,
                       pred_ptr, pred_stride,
                       dst_ptr, dst_stride);
}
317
/*
 * Public entry point for the DC-only inverse transform plus prediction add.
 * Forwards all arguments unchanged to idct4x4_addconst_msa().
 *
 * input_dc     DC coefficient.
 * pred_ptr     prediction pixels, rows `pred_stride` bytes apart.
 * dst_ptr      output pixels, rows `dst_stride` bytes apart.
 */
void vp8_dc_only_idct_add_msa(int16_t input_dc, uint8_t *pred_ptr,
                              int32_t pred_stride, uint8_t *dst_ptr,
                              int32_t dst_stride)
{
    idct4x4_addconst_msa(input_dc,
                         pred_ptr, pred_stride,
                         dst_ptr, dst_stride);
}
324
/*
 * Dequantises one block: d->dqcoeff[i] = DQC[i] * d->qcoeff[i] for the 16
 * values covered by two v8i16 vectors.
 *
 * d    block descriptor; qcoeff is read, dqcoeff is written.
 * DQC  16 dequantisation factors.
 */
void vp8_dequantize_b_msa(BLOCKD *d, int16_t *DQC)
{
    v8i16 quant_lo, quant_hi;
    v8i16 factor_lo, factor_hi;
    v8i16 prod_lo, prod_hi;

    LD_SH2(d->qcoeff, 8, quant_lo, quant_hi);
    LD_SH2(DQC, 8, factor_lo, factor_hi);

    MUL2(factor_lo, quant_lo, factor_hi, quant_hi, prod_lo, prod_hi);

    ST_SH2(prod_lo, prod_hi, d->dqcoeff, 8);
}
334
/*
 * Dequantises and inverse-transforms one 4x4 block, adds the residual to the
 * pixels in `dest`, then zeroes the 16 input coefficients.
 *
 * input   16 quantised coefficients. Cleared on return by eight 32-bit
 *         stores, so it must be 4-byte aligned.
 * dq      16 dequantisation factors.
 * dest    pixels to update, rows `stride` bytes apart.
 */
void vp8_dequant_idct_add_msa(int16_t *input, int16_t *dq,
                              uint8_t *dest, int32_t stride)
{
    dequant_idct4x4_addblk_msa(input, dq, dest, stride);

    /* Zero the consumed coefficients (8 words = 32 bytes). The "memory"
     * clobber tells the compiler that this asm writes memory, so the loads
     * from input[] above cannot be scheduled after it and callers never see
     * a stale cached copy. */
    __asm__ __volatile__ (
        "sw $zero, 0(%[input]) \n\t"
        "sw $zero, 4(%[input]) \n\t"
        "sw $zero, 8(%[input]) \n\t"
        "sw $zero, 12(%[input]) \n\t"
        "sw $zero, 16(%[input]) \n\t"
        "sw $zero, 20(%[input]) \n\t"
        "sw $zero, 24(%[input]) \n\t"
        "sw $zero, 28(%[input]) \n\t"

        :
        : [input] "r" (input)
        : "memory"
    );
}
354
/*
 * Reconstructs a 4x4 grid of 4x4 blocks, handled as eight horizontal pairs
 * (two pairs per row of blocks, the second one 8 pixels to the right).
 *
 * For every pair the two corresponding `eobs` bytes select the path:
 *   - both zero: the pair is skipped;
 *   - any bit other than bit 0 set in either byte (i.e. a value that is not
 *     0 or 1): full dequantise + inverse DCT for both blocks;
 *   - otherwise: DC-only reconstruction for both blocks.
 *
 * q       coefficients, 32 per pair, consumed consecutively.
 * dq      dequantisation factors shared by all blocks.
 * dst     top-left pixel; rows are `stride` bytes apart.
 * eobs    16 per-block bytes (presumably end-of-block counts -- confirm
 *         against the caller), read two at a time.
 *
 * The bytes are tested individually rather than through an int16_t pointer
 * cast of `eobs`: this yields the same decisions without violating the
 * effective-type (strict aliasing) rules or assuming `eobs` is 2-byte
 * aligned.
 */
void vp8_dequant_idct_add_y_block_msa(int16_t *q, int16_t *dq,
                                      uint8_t *dst, int32_t stride,
                                      char *eobs)
{
    int32_t row, half;

    for (row = 0; row < 4; ++row)
    {
        for (half = 0; half < 2; ++half)
        {
            /* Non-zero iff either byte is non-zero; bits 1..7 are set iff
             * either byte has a bit other than bit 0 set. */
            const int32_t eob_bits = eobs[0] | eobs[1];
            uint8_t *pair_dst = dst + 8 * half;

            if (eob_bits & 0xfe)
            {
                dequant_idct4x4_addblk_2x_msa(q, dq, pair_dst, stride);
            }
            else if (eob_bits)
            {
                dequant_idct_addconst_2x_msa(q, dq, pair_dst, stride);
            }

            q += 32;
            eobs += 2;
        }

        dst += (4 * stride);
    }
}
395
/*
 * Reconstructs two 2x2 grids of 4x4 blocks, one at `dstu` and one at `dstv`,
 * handled as four horizontal pairs: pairs 0 and 1 go to dstu (rows 0..3 and
 * 4..7), pairs 2 and 3 go to dstv (rows 0..3 and 4..7).
 *
 * For every pair the two corresponding `eobs` bytes select the path:
 *   - both zero: the pair is skipped;
 *   - any bit other than bit 0 set in either byte (i.e. a value that is not
 *     0 or 1): full dequantise + inverse DCT for both blocks;
 *   - otherwise: DC-only reconstruction for both blocks.
 *
 * q       coefficients, 32 per pair, consumed consecutively.
 * dq      dequantisation factors shared by all blocks.
 * dstu    top-left pixel of the first plane; rows are `stride` bytes apart.
 * dstv    top-left pixel of the second plane; same stride.
 * eobs    8 per-block bytes (presumably end-of-block counts -- confirm
 *         against the caller), read two at a time.
 *
 * The bytes are tested individually rather than through an int16_t pointer
 * cast of `eobs`: this yields the same decisions without violating the
 * effective-type (strict aliasing) rules or assuming `eobs` is 2-byte
 * aligned.
 */
void vp8_dequant_idct_add_uv_block_msa(int16_t *q, int16_t *dq,
                                       uint8_t *dstu, uint8_t *dstv,
                                       int32_t stride, char *eobs)
{
    int32_t pair;

    for (pair = 0; pair < 4; ++pair)
    {
        /* Non-zero iff either byte is non-zero; bits 1..7 are set iff
         * either byte has a bit other than bit 0 set. */
        const int32_t eob_bits = eobs[2 * pair] | eobs[2 * pair + 1];
        int16_t *pair_q = q + 32 * pair;
        uint8_t *pair_dst = (pair < 2) ? dstu : dstv;

        /* The second pair of each plane sits four rows further down. */
        if (pair & 1)
        {
            pair_dst += (stride * 4);
        }

        if (eob_bits & 0xfe)
        {
            dequant_idct4x4_addblk_2x_msa(pair_q, dq, pair_dst, stride);
        }
        else if (eob_bits)
        {
            dequant_idct_addconst_2x_msa(pair_q, dq, pair_dst, stride);
        }
    }
}
458