/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <stdlib.h>
#include "./vp8_rtcd.h"
#include "vp8/common/mips/msa/vp8_macros_msa.h"

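/* Pseudo-random rounding/dither values (0..14) used by the vertical noise
 * filter (vp8_mbpost_proc_down_msa); each call starts reading the table at a
 * rand()-chosen offset. */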
static const int16_t vp8_rv_msa[] =
{
    8, 5, 2, 2, 8, 12, 4, 9, 8, 3,
    0, 3, 9, 0, 0, 0, 8, 3, 14, 4,
    10, 1, 11, 14, 1, 14, 9, 6, 12, 11,
    8, 6, 10, 0, 0, 8, 9, 0, 3, 14,
    8, 11, 13, 4, 2, 9, 0, 3, 9, 6,
    1, 2, 3, 14, 13, 1, 8, 2, 9, 7,
    3, 3, 1, 13, 13, 6, 6, 5, 2, 7,
    11, 9, 11, 8, 7, 3, 2, 0, 13, 13,
    14, 4, 12, 5, 12, 10, 8, 10, 13, 10,
    4, 14, 4, 10, 0, 8, 11, 1, 13, 7,
    7, 14, 6, 14, 13, 2, 13, 5, 4, 4,
    0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
    8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
    3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
    3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
    13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
    5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
    9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
    4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
    3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
    11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
    5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
    0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
    10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
    4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
    0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
    8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
    3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
    3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
    13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
    5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
    9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
    4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
    3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
    11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
    5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
    0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
    10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
    4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
    3, 8, 3, 7, 8, 5, 11, 4, 12, 3,
    11, 9, 14, 8, 14, 13, 4, 3, 1, 2,
    14, 6, 5, 4, 4, 11, 4, 6, 2, 1,
    5, 8, 8, 12, 13, 5, 14, 10, 12, 13,
    0, 9, 5, 5, 11, 10, 13, 9, 10, 13,
};

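/* Transpose an 8x16 byte block: the eight 16-byte input rows become sixteen
 * output vectors, each carrying one 8-byte row of the transposed block in
 * its low doubleword. */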
#define VP8_TRANSPOSE8x16_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,  \
                                out0, out1, out2, out3,                  \
                                out4, out5, out6, out7,                  \
                                out8, out9, out10, out11,                \
                                out12, out13, out14, out15)              \
{                                                                        \
    v8i16 temp0, temp1, temp2, temp3, temp4;                             \
    v8i16 temp5, temp6, temp7, temp8, temp9;                             \
                                                                         \
    ILVR_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6,                   \
               temp0, temp1, temp2, temp3);                              \
    ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5);                \
    ILVRL_W2_SH(temp5, temp4, temp6, temp7);                             \
    ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5);                \
    ILVRL_W2_SH(temp5, temp4, temp8, temp9);                             \
    ILVL_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6,                   \
               temp0, temp1, temp2, temp3);                              \
    ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5);                \
    ILVRL_W2_UB(temp5, temp4, out8, out10);                              \
    ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5);                \
    ILVRL_W2_UB(temp5, temp4, out12, out14);                             \
    out0 = (v16u8)temp6;                                                 \
    out2 = (v16u8)temp7;                                                 \
    out4 = (v16u8)temp8;                                                 \
    out6 = (v16u8)temp9;                                                 \
    out9 = (v16u8)__msa_ilvl_d((v2i64)out8, (v2i64)out8);                \
    out11 = (v16u8)__msa_ilvl_d((v2i64)out10, (v2i64)out10);             \
    out13 = (v16u8)__msa_ilvl_d((v2i64)out12, (v2i64)out12);             \
    out15 = (v16u8)__msa_ilvl_d((v2i64)out14, (v2i64)out14);             \
    out1 = (v16u8)__msa_ilvl_d((v2i64)out0, (v2i64)out0);                \
    out3 = (v16u8)__msa_ilvl_d((v2i64)out2, (v2i64)out2);                \
    out5 = (v16u8)__msa_ilvl_d((v2i64)out4, (v2i64)out4);                \
    out7 = (v16u8)__msa_ilvl_d((v2i64)out6, (v2i64)out6);                \
}

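/* One filter tap of the down/across blur: 'out' becomes the average of
 * 'src_in' and the mean of its four neighbours, but every byte lane falls
 * back to the original 'src_in' value unless all four absolute differences
 * from the neighbours are below the threshold in 'ref'. */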
#define VP8_AVER_IF_RETAIN(above2_in, above1_in, src_in,   \
                           below1_in, below2_in, ref, out) \
{                                                          \
    v16u8 temp0, temp1;                                    \
                                                           \
    temp1 = __msa_aver_u_b(above2_in, above1_in);          \
    temp0 = __msa_aver_u_b(below2_in, below1_in);          \
    temp1 = __msa_aver_u_b(temp1, temp0);                  \
    out = __msa_aver_u_b(src_in, temp1);                   \
    temp0 = __msa_asub_u_b(src_in, above2_in);             \
    temp1 = __msa_asub_u_b(src_in, above1_in);             \
    temp0 = (temp0 < ref);                                 \
    temp1 = (temp1 < ref);                                 \
    temp0 = temp0 & temp1;                                 \
    temp1 = __msa_asub_u_b(src_in, below1_in);             \
    temp1 = (temp1 < ref);                                 \
    temp0 = temp0 & temp1;                                 \
    temp1 = __msa_asub_u_b(src_in, below2_in);             \
    temp1 = (temp1 < ref);                                 \
    temp0 = temp0 & temp1;                                 \
    out = __msa_bmz_v(out, src_in, temp0);                 \
}

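/* Transpose helper for the luma across pass: from sixteen 16-byte rows it
 * builds vectors for the first twelve columns (each output holds the 16
 * bytes of one column), written back in place to in0..in11. */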
#define TRANSPOSE12x16_B(in0, in1, in2, in3, in4, in5, in6, in7,        \
                         in8, in9, in10, in11, in12, in13, in14, in15)  \
{                                                                       \
    v8i16 temp0, temp1, temp2, temp3, temp4;                            \
    v8i16 temp5, temp6, temp7, temp8, temp9;                            \
                                                                        \
    ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1);                       \
    ILVRL_H2_SH(temp1, temp0, temp2, temp3);                            \
    ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1);                       \
    ILVRL_H2_SH(temp1, temp0, temp4, temp5);                            \
    ILVRL_W2_SH(temp4, temp2, temp0, temp1);                            \
    ILVRL_W2_SH(temp5, temp3, temp2, temp3);                            \
    ILVR_B2_SH(in9, in8, in11, in10, temp4, temp5);                     \
    ILVR_B2_SH(in9, in8, in11, in10, temp4, temp5);                     \
    ILVRL_H2_SH(temp5, temp4, temp6, temp7);                            \
    ILVR_B2_SH(in13, in12, in15, in14, temp4, temp5);                   \
    ILVRL_H2_SH(temp5, temp4, temp8, temp9);                            \
    ILVRL_W2_SH(temp8, temp6, temp4, temp5);                            \
    ILVRL_W2_SH(temp9, temp7, temp6, temp7);                            \
    ILVL_B2_SH(in1, in0, in3, in2, temp8, temp9);                       \
    ILVR_D2_UB(temp4, temp0, temp5, temp1, in0, in2);                   \
    in1 = (v16u8)__msa_ilvl_d((v2i64)temp4, (v2i64)temp0);              \
    in3 = (v16u8)__msa_ilvl_d((v2i64)temp5, (v2i64)temp1);              \
    ILVL_B2_SH(in5, in4, in7, in6, temp0, temp1);                       \
    ILVR_D2_UB(temp6, temp2, temp7, temp3, in4, in6);                   \
    in5 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp2);              \
    in7 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp3);              \
    ILVL_B4_SH(in9, in8, in11, in10, in13, in12, in15, in14,            \
               temp2, temp3, temp4, temp5);                             \
    ILVR_H4_SH(temp9, temp8, temp1, temp0, temp3, temp2, temp5, temp4,  \
               temp6, temp7, temp8, temp9);                             \
    ILVR_W2_SH(temp7, temp6, temp9, temp8, temp0, temp1);               \
    in8 = (v16u8)__msa_ilvr_d((v2i64)temp1, (v2i64)temp0);              \
    in9 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp0);              \
    ILVL_W2_SH(temp7, temp6, temp9, temp8, temp2, temp3);               \
    in10 = (v16u8)__msa_ilvr_d((v2i64)temp3, (v2i64)temp2);             \
    in11 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp2);             \
}

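/* Transpose helper for the chroma across pass: from eight rows it builds
 * vectors for the first twelve columns (8 bytes each), written back in
 * place to in0..in11. */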
#define VP8_TRANSPOSE12x8_UB_UB(in0, in1, in2, in3, in4, in5,    \
                                in6, in7, in8, in9, in10, in11)  \
{                                                                \
    v8i16 temp0, temp1, temp2, temp3;                            \
    v8i16 temp4, temp5, temp6, temp7;                            \
                                                                 \
    ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1);                \
    ILVRL_H2_SH(temp1, temp0, temp2, temp3);                     \
    ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1);                \
    ILVRL_H2_SH(temp1, temp0, temp4, temp5);                     \
    ILVRL_W2_SH(temp4, temp2, temp0, temp1);                     \
    ILVRL_W2_SH(temp5, temp3, temp2, temp3);                     \
    ILVL_B2_SH(in1, in0, in3, in2, temp4, temp5);                \
    temp4 = __msa_ilvr_h(temp5, temp4);                          \
    ILVL_B2_SH(in5, in4, in7, in6, temp6, temp7);                \
    temp5 = __msa_ilvr_h(temp7, temp6);                          \
    ILVRL_W2_SH(temp5, temp4, temp6, temp7);                     \
    in0 = (v16u8)temp0;                                          \
    in2 = (v16u8)temp1;                                          \
    in4 = (v16u8)temp2;                                          \
    in6 = (v16u8)temp3;                                          \
    in8 = (v16u8)temp6;                                          \
    in10 = (v16u8)temp7;                                         \
    in1 = (v16u8)__msa_ilvl_d((v2i64)temp0, (v2i64)temp0);       \
    in3 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp1);       \
    in5 = (v16u8)__msa_ilvl_d((v2i64)temp2, (v2i64)temp2);       \
    in7 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp3);       \
    in9 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp6);       \
    in11 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp7);      \
}

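/* Down (vertical) then across (horizontal) post-processing filter for an
 * 8-row chroma macroblock row. The vertical pass works on 16-column chunks
 * plus an 8-column tail block; the rows are then transposed, filtered with
 * the per-column limits from f[], and transposed back while being stored. */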
static void postproc_down_across_chroma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
                                            int32_t src_stride,
                                            int32_t dst_stride,
                                            int32_t cols, uint8_t *f)
{
    uint8_t *p_src = src_ptr;
    uint8_t *p_dst = dst_ptr;
    uint8_t *f_orig = f;
    uint8_t *p_dst_st = dst_ptr;
    uint16_t col;
    uint64_t out0, out1, out2, out3;
    v16u8 above2, above1, below2, below1, src, ref, ref_temp;
    v16u8 inter0, inter1, inter2, inter3, inter4, inter5;
    v16u8 inter6, inter7, inter8, inter9, inter10, inter11;

    for (col = (cols / 16); col--;)
    {
        ref = LD_UB(f);
        LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
        src = LD_UB(p_src);
        LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
        above2 = LD_UB(p_src + 3 * src_stride);
        VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
        above1 = LD_UB(p_src + 4 * src_stride);
        VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
        src = LD_UB(p_src + 5 * src_stride);
        VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
        below1 = LD_UB(p_src + 6 * src_stride);
        VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
        below2 = LD_UB(p_src + 7 * src_stride);
        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
        above2 = LD_UB(p_src + 8 * src_stride);
        VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
        above1 = LD_UB(p_src + 9 * src_stride);
        VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
        ST_UB8(inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7,
               p_dst, dst_stride);

        p_dst += 16;
        p_src += 16;
        f += 16;
    }

    if (0 != (cols / 16))
    {
        ref = LD_UB(f);
        LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
        src = LD_UB(p_src);
        LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
        above2 = LD_UB(p_src + 3 * src_stride);
        VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
        above1 = LD_UB(p_src + 4 * src_stride);
        VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
        src = LD_UB(p_src + 5 * src_stride);
        VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
        below1 = LD_UB(p_src + 6 * src_stride);
        VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
        below2 = LD_UB(p_src + 7 * src_stride);
        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
        above2 = LD_UB(p_src + 8 * src_stride);
        VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
        above1 = LD_UB(p_src + 9 * src_stride);
        VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
        out0 = __msa_copy_u_d((v2i64)inter0, 0);
        out1 = __msa_copy_u_d((v2i64)inter1, 0);
        out2 = __msa_copy_u_d((v2i64)inter2, 0);
        out3 = __msa_copy_u_d((v2i64)inter3, 0);
        SD4(out0, out1, out2, out3, p_dst, dst_stride);

        out0 = __msa_copy_u_d((v2i64)inter4, 0);
        out1 = __msa_copy_u_d((v2i64)inter5, 0);
        out2 = __msa_copy_u_d((v2i64)inter6, 0);
        out3 = __msa_copy_u_d((v2i64)inter7, 0);
        SD4(out0, out1, out2, out3, p_dst + 4 * dst_stride, dst_stride);
    }

    f = f_orig;
    p_dst = dst_ptr - 2;
    LD_UB8(p_dst, dst_stride,
           inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7);

    for (col = 0; col < (cols / 8); ++col)
    {
        ref = LD_UB(f);
        f += 8;
        VP8_TRANSPOSE12x8_UB_UB(inter0, inter1, inter2, inter3,
                                inter4, inter5, inter6, inter7,
                                inter8, inter9, inter10, inter11);
        if (0 == col)
        {
            above2 = inter2;
            above1 = inter2;
        }
        else
        {
            above2 = inter0;
            above1 = inter1;
        }
        src = inter2;
        below1 = inter3;
        below2 = inter4;
        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 0);
        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2,
                           ref_temp, inter2);
        above2 = inter5;
        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 1);
        VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2,
                           ref_temp, inter3);
        above1 = inter6;
        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 2);
        VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1,
                           ref_temp, inter4);
        src = inter7;
        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 3);
        VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src,
                           ref_temp, inter5);
        below1 = inter8;
        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 4);
        VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1,
                           ref_temp, inter6);
        below2 = inter9;
        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 5);
        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2,
                           ref_temp, inter7);
        if (col == (cols / 8 - 1))
        {
            above2 = inter9;
        }
        else
        {
            above2 = inter10;
        }
        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 6);
        VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2,
                           ref_temp, inter8);
        if (col == (cols / 8 - 1))
        {
            above1 = inter9;
        }
        else
        {
            above1 = inter11;
        }
        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 7);
        VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1,
                           ref_temp, inter9);
        TRANSPOSE8x8_UB_UB(inter2, inter3, inter4, inter5, inter6, inter7,
                           inter8, inter9, inter2, inter3, inter4, inter5,
                           inter6, inter7, inter8, inter9);
        p_dst += 8;
        LD_UB2(p_dst, dst_stride, inter0, inter1);
        ST8x1_UB(inter2, p_dst_st);
        ST8x1_UB(inter3, (p_dst_st + 1 * dst_stride));
        LD_UB2(p_dst + 2 * dst_stride, dst_stride, inter2, inter3);
        ST8x1_UB(inter4, (p_dst_st + 2 * dst_stride));
        ST8x1_UB(inter5, (p_dst_st + 3 * dst_stride));
        LD_UB2(p_dst + 4 * dst_stride, dst_stride, inter4, inter5);
        ST8x1_UB(inter6, (p_dst_st + 4 * dst_stride));
        ST8x1_UB(inter7, (p_dst_st + 5 * dst_stride));
        LD_UB2(p_dst + 6 * dst_stride, dst_stride, inter6, inter7);
        ST8x1_UB(inter8, (p_dst_st + 6 * dst_stride));
        ST8x1_UB(inter9, (p_dst_st + 7 * dst_stride));
        p_dst_st += 8;
    }
}

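/* Luma variant of the down/across filter: same structure as the chroma
 * version above, but covering 16 rows per macroblock row. */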
static void postproc_down_across_luma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
                                          int32_t src_stride,
                                          int32_t dst_stride,
                                          int32_t cols, uint8_t *f)
{
    uint8_t *p_src = src_ptr;
    uint8_t *p_dst = dst_ptr;
    uint8_t *p_dst_st = dst_ptr;
    uint8_t *f_orig = f;
    uint16_t col;
    v16u8 above2, above1, below2, below1;
    v16u8 src, ref, ref_temp;
    v16u8 inter0, inter1, inter2, inter3, inter4, inter5, inter6;
    v16u8 inter7, inter8, inter9, inter10, inter11;
    v16u8 inter12, inter13, inter14, inter15;

    for (col = (cols / 16); col--;)
    {
        ref = LD_UB(f);
        LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
        src = LD_UB(p_src);
        LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
        above2 = LD_UB(p_src + 3 * src_stride);
        VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
        above1 = LD_UB(p_src + 4 * src_stride);
        VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
        src = LD_UB(p_src + 5 * src_stride);
        VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
        below1 = LD_UB(p_src + 6 * src_stride);
        VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
        below2 = LD_UB(p_src + 7 * src_stride);
        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
        above2 = LD_UB(p_src + 8 * src_stride);
        VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
        above1 = LD_UB(p_src + 9 * src_stride);
        VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
        src = LD_UB(p_src + 10 * src_stride);
        VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter8);
        below1 = LD_UB(p_src + 11 * src_stride);
        VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter9);
        below2 = LD_UB(p_src + 12 * src_stride);
        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter10);
        above2 = LD_UB(p_src + 13 * src_stride);
        VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter11);
        above1 = LD_UB(p_src + 14 * src_stride);
        VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter12);
        src = LD_UB(p_src + 15 * src_stride);
        VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter13);
        below1 = LD_UB(p_src + 16 * src_stride);
        VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter14);
        below2 = LD_UB(p_src + 17 * src_stride);
        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter15);
        ST_UB8(inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7,
               p_dst, dst_stride);
        ST_UB8(inter8, inter9, inter10, inter11, inter12, inter13,
               inter14, inter15, p_dst + 8 * dst_stride, dst_stride);
        p_src += 16;
        p_dst += 16;
        f += 16;
    }

    f = f_orig;
    p_dst = dst_ptr - 2;
    LD_UB8(p_dst, dst_stride,
           inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7);
    LD_UB8(p_dst + 8 * dst_stride, dst_stride,
           inter8, inter9, inter10, inter11, inter12, inter13,
           inter14, inter15);

    for (col = 0; col < cols / 8; ++col)
    {
        ref = LD_UB(f);
        f += 8;
        TRANSPOSE12x16_B(inter0, inter1, inter2, inter3, inter4, inter5,
                         inter6, inter7, inter8, inter9, inter10, inter11,
                         inter12, inter13, inter14, inter15);
        if (0 == col)
        {
            above2 = inter2;
            above1 = inter2;
        }
        else
        {
            above2 = inter0;
            above1 = inter1;
        }

        src = inter2;
        below1 = inter3;
        below2 = inter4;
        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 0);
        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2,
                           ref_temp, inter2);
        above2 = inter5;
        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 1);
        VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2,
                           ref_temp, inter3);
        above1 = inter6;
        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 2);
        VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1,
                           ref_temp, inter4);
        src = inter7;
        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 3);
        VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src,
                           ref_temp, inter5);
        below1 = inter8;
        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 4);
        VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1,
                           ref_temp, inter6);
        below2 = inter9;
        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 5);
        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2,
                           ref_temp, inter7);
        if (col == (cols / 8 - 1))
        {
            above2 = inter9;
        }
        else
        {
            above2 = inter10;
        }
        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 6);
        VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2,
                           ref_temp, inter8);
        if (col == (cols / 8 - 1))
        {
            above1 = inter9;
        }
        else
        {
            above1 = inter11;
        }
        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 7);
        VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1,
                           ref_temp, inter9);
        VP8_TRANSPOSE8x16_UB_UB(inter2, inter3, inter4, inter5,
                                inter6, inter7, inter8, inter9,
                                inter2, inter3, inter4, inter5,
                                inter6, inter7, inter8, inter9,
                                inter10, inter11, inter12, inter13,
                                inter14, inter15, above2, above1);

        p_dst += 8;
        LD_UB2(p_dst, dst_stride, inter0, inter1);
        ST8x1_UB(inter2, p_dst_st);
        ST8x1_UB(inter3, (p_dst_st + 1 * dst_stride));
        LD_UB2(p_dst + 2 * dst_stride, dst_stride, inter2, inter3);
        ST8x1_UB(inter4, (p_dst_st + 2 * dst_stride));
        ST8x1_UB(inter5, (p_dst_st + 3 * dst_stride));
        LD_UB2(p_dst + 4 * dst_stride, dst_stride, inter4, inter5);
        ST8x1_UB(inter6, (p_dst_st + 4 * dst_stride));
        ST8x1_UB(inter7, (p_dst_st + 5 * dst_stride));
        LD_UB2(p_dst + 6 * dst_stride, dst_stride, inter6, inter7);
        ST8x1_UB(inter8, (p_dst_st + 6 * dst_stride));
        ST8x1_UB(inter9, (p_dst_st + 7 * dst_stride));
        LD_UB2(p_dst + 8 * dst_stride, dst_stride, inter8, inter9);
        ST8x1_UB(inter10, (p_dst_st + 8 * dst_stride));
        ST8x1_UB(inter11, (p_dst_st + 9 * dst_stride));
        LD_UB2(p_dst + 10 * dst_stride, dst_stride, inter10, inter11);
        ST8x1_UB(inter12, (p_dst_st + 10 * dst_stride));
        ST8x1_UB(inter13, (p_dst_st + 11 * dst_stride));
        LD_UB2(p_dst + 12 * dst_stride, dst_stride, inter12, inter13);
        ST8x1_UB(inter14, (p_dst_st + 12 * dst_stride));
        ST8x1_UB(inter15, (p_dst_st + 13 * dst_stride));
        LD_UB2(p_dst + 14 * dst_stride, dst_stride, inter14, inter15);
        ST8x1_UB(above2, (p_dst_st + 14 * dst_stride));
        ST8x1_UB(above1, (p_dst_st + 15 * dst_stride));
        p_dst_st += 8;
    }
}

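/* Entry point: size selects the 8-row chroma path or the 16-row luma path. */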
void vp8_post_proc_down_and_across_mb_row_msa(uint8_t *src, uint8_t *dst,
                                              int32_t src_stride,
                                              int32_t dst_stride,
                                              int32_t cols, uint8_t *f,
                                              int32_t size)
{
    if (8 == size)
    {
        postproc_down_across_chroma_msa(src, dst, src_stride, dst_stride,
                                        cols, f);
    }
    else if (16 == size)
    {
        postproc_down_across_luma_msa(src, dst, src_stride, dst_stride,
                                      cols, f);
    }
}

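/* Horizontal noise filter: keeps a running sum and sum of squares over a
 * 15-pixel window around each pixel; where 15 * sum_sq - sum * sum stays
 * below flimit, the pixel is replaced by (sum + pixel + 8) >> 4. Row ends
 * are padded by replicating the edge pixels before filtering. */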
void vp8_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch,
                                   int32_t rows, int32_t cols, int32_t flimit)
{
    int32_t row, col, cnt;
    uint8_t *src_dup = src_ptr;
    v16u8 src0, src, tmp_orig;
    v16u8 tmp = { 0 };
    v16i8 zero = { 0 };
    v8u16 sum_h, src_r_h, src_l_h;
    v4u32 src_r_w, src_l_w;
    v4i32 flimit_vec;

    flimit_vec = __msa_fill_w(flimit);
    for (row = rows; row--;)
    {
        int32_t sum_sq = 0;
        int32_t sum = 0;
        src0 = (v16u8)__msa_fill_b(src_dup[0]);
        ST8x1_UB(src0, (src_dup - 8));

        src0 = (v16u8)__msa_fill_b(src_dup[cols - 1]);
        ST_UB(src0, src_dup + cols);
        src_dup[cols + 16] = src_dup[cols - 1];
        tmp_orig = (v16u8)__msa_ldi_b(0);
        tmp_orig[15] = tmp[15];
        src = LD_UB(src_dup - 8);
        src[15] = 0;
        ILVRL_B2_UH(zero, src, src_r_h, src_l_h);
        src_r_w = __msa_dotp_u_w(src_r_h, src_r_h);
        src_l_w = __msa_dotp_u_w(src_l_h, src_l_h);
        sum_sq = HADD_SW_S32(src_r_w);
        sum_sq += HADD_SW_S32(src_l_w);
        sum_h = __msa_hadd_u_h(src, src);
        sum = HADD_UH_U32(sum_h);
        {
            v16u8 src7, src8, src_r, src_l;
            v16i8 mask;
            v8u16 add_r, add_l;
            v8i16 sub_r, sub_l, sum_r, sum_l, mask0, mask1;
            v4i32 sum_sq0, sum_sq1, sum_sq2, sum_sq3;
            v4i32 sub0, sub1, sub2, sub3;
            v4i32 sum0_w, sum1_w, sum2_w, sum3_w;
            v4i32 mul0, mul1, mul2, mul3;
            v4i32 total0, total1, total2, total3;
            v8i16 const8 = __msa_fill_h(8);

            src7 = LD_UB(src_dup + 7);
            src8 = LD_UB(src_dup - 8);
            for (col = 0; col < (cols >> 4); ++col)
            {
                ILVRL_B2_UB(src7, src8, src_r, src_l);
                HSUB_UB2_SH(src_r, src_l, sub_r, sub_l);

                sum_r[0] = sum + sub_r[0];
                for (cnt = 0; cnt < 7; ++cnt)
                {
                    sum_r[cnt + 1] = sum_r[cnt] + sub_r[cnt + 1];
                }
                sum_l[0] = sum_r[7] + sub_l[0];
                for (cnt = 0; cnt < 7; ++cnt)
                {
                    sum_l[cnt + 1] = sum_l[cnt] + sub_l[cnt + 1];
                }
                sum = sum_l[7];
                src = LD_UB(src_dup + 16 * col);
                ILVRL_B2_UH(zero, src, src_r_h, src_l_h);
                src7 = (v16u8)((const8 + sum_r + (v8i16)src_r_h) >> 4);
                src8 = (v16u8)((const8 + sum_l + (v8i16)src_l_h) >> 4);
                tmp = (v16u8)__msa_pckev_b((v16i8)src8, (v16i8)src7);

                HADD_UB2_UH(src_r, src_l, add_r, add_l);
                UNPCK_SH_SW(sub_r, sub0, sub1);
                UNPCK_SH_SW(sub_l, sub2, sub3);
                ILVR_H2_SW(zero, add_r, zero, add_l, sum0_w, sum2_w);
                ILVL_H2_SW(zero, add_r, zero, add_l, sum1_w, sum3_w);
                MUL4(sum0_w, sub0, sum1_w, sub1, sum2_w, sub2, sum3_w, sub3,
                     mul0, mul1, mul2, mul3);
                sum_sq0[0] = sum_sq + mul0[0];
                for (cnt = 0; cnt < 3; ++cnt)
                {
                    sum_sq0[cnt + 1] = sum_sq0[cnt] + mul0[cnt + 1];
                }
                sum_sq1[0] = sum_sq0[3] + mul1[0];
                for (cnt = 0; cnt < 3; ++cnt)
                {
                    sum_sq1[cnt + 1] = sum_sq1[cnt] + mul1[cnt + 1];
                }
                sum_sq2[0] = sum_sq1[3] + mul2[0];
                for (cnt = 0; cnt < 3; ++cnt)
                {
                    sum_sq2[cnt + 1] = sum_sq2[cnt] + mul2[cnt + 1];
                }
                sum_sq3[0] = sum_sq2[3] + mul3[0];
                for (cnt = 0; cnt < 3; ++cnt)
                {
                    sum_sq3[cnt + 1] = sum_sq3[cnt] + mul3[cnt + 1];
                }
                sum_sq = sum_sq3[3];

                UNPCK_SH_SW(sum_r, sum0_w, sum1_w);
                UNPCK_SH_SW(sum_l, sum2_w, sum3_w);
                total0 = sum_sq0 * __msa_ldi_w(15);
                total0 -= sum0_w * sum0_w;
                total1 = sum_sq1 * __msa_ldi_w(15);
                total1 -= sum1_w * sum1_w;
                total2 = sum_sq2 * __msa_ldi_w(15);
                total2 -= sum2_w * sum2_w;
                total3 = sum_sq3 * __msa_ldi_w(15);
                total3 -= sum3_w * sum3_w;
                total0 = (total0 < flimit_vec);
                total1 = (total1 < flimit_vec);
                total2 = (total2 < flimit_vec);
                total3 = (total3 < flimit_vec);
                PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1);
                mask = __msa_pckev_b((v16i8)mask1, (v16i8)mask0);
                tmp = __msa_bmz_v(tmp, src, (v16u8)mask);

                if (col == 0)
                {
                    uint64_t src_d;

                    src_d = __msa_copy_u_d((v2i64)tmp_orig, 1);
                    SD(src_d, (src_dup - 8));
                }

                src7 = LD_UB(src_dup + 16 * (col + 1) + 7);
                src8 = LD_UB(src_dup + 16 * (col + 1) - 8);
                ST_UB(tmp, (src_dup + (16 * col)));
            }

            src_dup += pitch;
        }
    }
}

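/* Vertical counterpart of the noise filter, processing 16 columns at a
 * time: the same 15-tap variance test against flimit decides whether a
 * pixel is replaced, and vp8_rv_msa supplies the rounding term added before
 * the final >> 4. Top and bottom borders are padded with the first/last
 * row. */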
void vp8_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows,
                              int32_t cols, int32_t flimit)
{
    int32_t row, col, cnt, i;
    const int16_t *rv3 = &vp8_rv_msa[63 & rand()];
    v4i32 flimit_vec;
    v16u8 dst7, dst8, dst_r_b, dst_l_b;
    v16i8 mask;
    v8u16 add_r, add_l;
    v8i16 dst_r_h, dst_l_h, sub_r, sub_l, mask0, mask1;
    v4i32 sub0, sub1, sub2, sub3, total0, total1, total2, total3;

    flimit_vec = __msa_fill_w(flimit);

    for (col = 0; col < (cols >> 4); ++col)
    {
        uint8_t *dst_tmp = &dst_ptr[col << 4];
        v16u8 dst;
        v16i8 zero = { 0 };
        v16u8 tmp[16];
        v8i16 mult0, mult1, rv2_0, rv2_1;
        v8i16 sum0_h = { 0 };
        v8i16 sum1_h = { 0 };
        v4i32 mul0 = { 0 };
        v4i32 mul1 = { 0 };
        v4i32 mul2 = { 0 };
        v4i32 mul3 = { 0 };
        v4i32 sum0_w, sum1_w, sum2_w, sum3_w;
        v4i32 add0, add1, add2, add3;
        const int16_t *rv2[16];

        dst = LD_UB(dst_tmp);
        for (cnt = (col << 4), i = 0; i < 16; ++cnt)
        {
            rv2[i] = rv3 + ((cnt * 17) & 127);
            ++i;
        }
        for (cnt = -8; cnt < 0; ++cnt)
        {
            ST_UB(dst, dst_tmp + cnt * pitch);
        }

        dst = LD_UB((dst_tmp + (rows - 1) * pitch));
        for (cnt = rows; cnt < rows + 17; ++cnt)
        {
            ST_UB(dst, dst_tmp + cnt * pitch);
        }
        for (cnt = -8; cnt <= 6; ++cnt)
        {
            dst = LD_UB(dst_tmp + (cnt * pitch));
            UNPCK_UB_SH(dst, dst_r_h, dst_l_h);
            MUL2(dst_r_h, dst_r_h, dst_l_h, dst_l_h, mult0, mult1);
            mul0 += (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)mult0);
            mul1 += (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)mult0);
            mul2 += (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)mult1);
            mul3 += (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)mult1);
            ADD2(sum0_h, dst_r_h, sum1_h, dst_l_h, sum0_h, sum1_h);
        }

        for (row = 0; row < (rows + 8); ++row)
        {
            for (i = 0; i < 8; ++i)
            {
                rv2_0[i] = *(rv2[i] + (row & 127));
                rv2_1[i] = *(rv2[i + 8] + (row & 127));
            }
            dst7 = LD_UB(dst_tmp + (7 * pitch));
            dst8 = LD_UB(dst_tmp - (8 * pitch));
            ILVRL_B2_UB(dst7, dst8, dst_r_b, dst_l_b);

            HSUB_UB2_SH(dst_r_b, dst_l_b, sub_r, sub_l);
            UNPCK_SH_SW(sub_r, sub0, sub1);
            UNPCK_SH_SW(sub_l, sub2, sub3);
            sum0_h += sub_r;
            sum1_h += sub_l;

            HADD_UB2_UH(dst_r_b, dst_l_b, add_r, add_l);

            ILVRL_H2_SW(zero, add_r, add0, add1);
            ILVRL_H2_SW(zero, add_l, add2, add3);
            mul0 += add0 * sub0;
            mul1 += add1 * sub1;
            mul2 += add2 * sub2;
            mul3 += add3 * sub3;
            dst = LD_UB(dst_tmp);
            ILVRL_B2_SH(zero, dst, dst_r_h, dst_l_h);
            dst7 = (v16u8)((rv2_0 + sum0_h + dst_r_h) >> 4);
            dst8 = (v16u8)((rv2_1 + sum1_h + dst_l_h) >> 4);
            tmp[row & 15] = (v16u8)__msa_pckev_b((v16i8)dst8, (v16i8)dst7);

            UNPCK_SH_SW(sum0_h, sum0_w, sum1_w);
            UNPCK_SH_SW(sum1_h, sum2_w, sum3_w);
            total0 = mul0 * __msa_ldi_w(15);
            total0 -= sum0_w * sum0_w;
            total1 = mul1 * __msa_ldi_w(15);
            total1 -= sum1_w * sum1_w;
            total2 = mul2 * __msa_ldi_w(15);
            total2 -= sum2_w * sum2_w;
            total3 = mul3 * __msa_ldi_w(15);
            total3 -= sum3_w * sum3_w;
            total0 = (total0 < flimit_vec);
            total1 = (total1 < flimit_vec);
            total2 = (total2 < flimit_vec);
            total3 = (total3 < flimit_vec);
            PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1);
            mask = __msa_pckev_b((v16i8)mask1, (v16i8)mask0);
            tmp[row & 15] = __msa_bmz_v(tmp[row & 15], dst, (v16u8)mask);

            if (row >= 8)
            {
                ST_UB(tmp[(row - 8) & 15], (dst_tmp - 8 * pitch));
            }

            dst_tmp += pitch;
        }
    }
}

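/* Adds dither noise to a plane, two rows per iteration, reading the noise
 * buffer at a rand()-chosen offset for each row. Pixels are first clamped
 * away from the black/white extremes (blackclamp/whiteclamp) so that adding
 * the signed noise cannot wrap around. */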
void vp8_plane_add_noise_msa(uint8_t *start_ptr, char *noise,
                             char blackclamp[16], char whiteclamp[16],
                             char bothclamp[16],
                             uint32_t width, uint32_t height,
                             int32_t pitch)
{
    uint32_t i, j;

    for (i = 0; i < height / 2; ++i)
    {
        uint8_t *pos0_ptr = start_ptr + (2 * i) * pitch;
        int8_t *ref0_ptr = (int8_t *) (noise + (rand() & 0xff));
        uint8_t *pos1_ptr = start_ptr + (2 * i + 1) * pitch;
        int8_t *ref1_ptr = (int8_t *) (noise + (rand() & 0xff));
        for (j = width / 16; j--;)
        {
            v16i8 temp00_s, temp01_s;
            v16u8 temp00, temp01, black_clamp, white_clamp;
            v16u8 pos0, ref0, pos1, ref1;
            v16i8 const127 = __msa_ldi_b(127);

            pos0 = LD_UB(pos0_ptr);
            ref0 = LD_UB(ref0_ptr);
            pos1 = LD_UB(pos1_ptr);
            ref1 = LD_UB(ref1_ptr);
            black_clamp = (v16u8)__msa_fill_b(blackclamp[0]);
            white_clamp = (v16u8)__msa_fill_b(whiteclamp[0]);
            temp00 = (pos0 < black_clamp);
            pos0 = __msa_bmnz_v(pos0, black_clamp, temp00);
            temp01 = (pos1 < black_clamp);
            pos1 = __msa_bmnz_v(pos1, black_clamp, temp01);
            XORI_B2_128_UB(pos0, pos1);
            temp00_s = __msa_adds_s_b((v16i8)white_clamp, const127);
            temp00 = (v16u8)(temp00_s < pos0);
            pos0 = (v16u8)__msa_bmnz_v((v16u8)pos0, (v16u8)temp00_s, temp00);
            temp01_s = __msa_adds_s_b((v16i8)white_clamp, const127);
            temp01 = (temp01_s < pos1);
            pos1 = (v16u8)__msa_bmnz_v((v16u8)pos1, (v16u8)temp01_s, temp01);
            XORI_B2_128_UB(pos0, pos1);
            pos0 += ref0;
            ST_UB(pos0, pos0_ptr);
            pos1 += ref1;
            ST_UB(pos1, pos1_ptr);
            pos0_ptr += 16;
            pos1_ptr += 16;
            ref0_ptr += 16;
            ref1_ptr += 16;
        }
    }
}