1 /*
2 * Copyright 2016 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <string.h>
12
13 #include "libyuv/row.h"
14
15 // This module is for GCC MSA
16 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
17 #include "libyuv/macros_msa.h"
18
19 #ifdef __cplusplus
20 namespace libyuv {
21 extern "C" {
22 #endif
23
24 #define ALPHA_VAL (-1)
25
// Fill YUV -> RGB conversion constants into vectors.
// Broadcasts each scalar coefficient/bias read from |yuvconst| across all
// four lanes of a v4i32 so the per-pixel math in YUVTORGB can be done
// element-wise:
//   ub/vr/ug/vg - U/V channel coefficients, bb/bg/br - per-channel biases,
//   yg          - Y scale factor (fixed point).
#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, bb, bg, br, yg) \
  {                                                              \
    ub = __msa_fill_w(yuvconst->kUVToB[0]);                      \
    vr = __msa_fill_w(yuvconst->kUVToR[1]);                      \
    ug = __msa_fill_w(yuvconst->kUVToG[0]);                      \
    vg = __msa_fill_w(yuvconst->kUVToG[1]);                      \
    bb = __msa_fill_w(yuvconst->kUVBiasB[0]);                    \
    bg = __msa_fill_w(yuvconst->kUVBiasG[0]);                    \
    br = __msa_fill_w(yuvconst->kUVBiasR[0]);                    \
    yg = __msa_fill_w(yuvconst->kYToRgb[0]);                     \
  }
38
// Load YUV 422 pixel data: 8 Y bytes plus 4 U and 4 V bytes (one U/V pair
// per two Y samples).  The loaded bytes are placed in the low lanes of the
// output vectors; all remaining lanes are zeroed.
#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v)    \
  {                                                                \
    uint64_t y_m;                                                  \
    uint32_t u_m, v_m;                                             \
    v4i32 zero_m = {0};                                            \
    y_m = LD(psrc_y);                                              \
    u_m = LW(psrc_u);                                              \
    v_m = LW(psrc_v);                                              \
    out_y = (v16u8)__msa_insert_d((v2i64)zero_m, 0, (int64_t)y_m); \
    out_u = (v16u8)__msa_insert_w(zero_m, 0, (int32_t)u_m);        \
    out_v = (v16u8)__msa_insert_w(zero_m, 0, (int32_t)v_m);        \
  }
52
// Clip input vector elements between 0 to 255.
// Clamps every 32-bit lane of the six v4i32 arguments to [0, 255] in place:
// first a saturating max against 0, then a min against 255.
#define CLIP_0TO255(in0, in1, in2, in3, in4, in5) \
  {                                               \
    v4i32 max_m = __msa_ldi_w(0xFF);              \
                                                  \
    in0 = __msa_maxi_s_w(in0, 0);                 \
    in1 = __msa_maxi_s_w(in1, 0);                 \
    in2 = __msa_maxi_s_w(in2, 0);                 \
    in3 = __msa_maxi_s_w(in3, 0);                 \
    in4 = __msa_maxi_s_w(in4, 0);                 \
    in5 = __msa_maxi_s_w(in5, 0);                 \
    in0 = __msa_min_s_w(max_m, in0);              \
    in1 = __msa_min_s_w(max_m, in1);              \
    in2 = __msa_min_s_w(max_m, in2);              \
    in3 = __msa_min_s_w(max_m, in3);              \
    in4 = __msa_min_s_w(max_m, in4);              \
    in5 = __msa_min_s_w(max_m, in5);              \
  }
71
// Convert 8 pixels of YUV 420 to RGB.
// in_y holds 8 Y bytes in its low half; in_uv holds 8 interleaved U/V bytes.
// Pipeline (fixed point):
//   1. Widen Y to 32-bit lanes, multiply by |yg| and shift down by 16.
//   2. Multiply the widened U/V lanes by the paired (ub,vr) coefficients,
//      and dot-product the U/V halfword pairs with (ug,vg) for the green
//      cross term; ilvev/ilvod/ilvr/ilvl redistribute the products so each
//      channel's term lines up with its Y lane.
//   3. Subtract the chroma terms from the scaled Y, add the per-channel
//      biases bb/bg/br, shift down by 6, clamp to [0, 255] and pack each
//      channel into 8 halfwords: out_b / out_g / out_r.
#define YUVTORGB(in_y, in_uv, ubvr, ugvg, bb, bg, br, yg, out_b, out_g, out_r) \
  {                                                                            \
    v8i16 vec0_m, vec1_m;                                                      \
    v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m;                              \
    v4i32 reg5_m, reg6_m, reg7_m;                                              \
    v16i8 zero_m = {0};                                                        \
                                                                               \
    vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y);                    \
    vec1_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_uv);                 \
    reg0_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec0_m);                \
    reg1_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec0_m);                \
    reg2_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec1_m);                \
    reg3_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec1_m);                \
    reg0_m *= yg;                                                              \
    reg1_m *= yg;                                                              \
    reg2_m *= ubvr;                                                            \
    reg3_m *= ubvr;                                                            \
    reg0_m = __msa_srai_w(reg0_m, 16);                                         \
    reg1_m = __msa_srai_w(reg1_m, 16);                                         \
    reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg);                       \
    reg5_m = __msa_ilvev_w(reg2_m, reg2_m);                                    \
    reg6_m = __msa_ilvev_w(reg3_m, reg3_m);                                    \
    reg7_m = __msa_ilvr_w(reg4_m, reg4_m);                                     \
    reg2_m = __msa_ilvod_w(reg2_m, reg2_m);                                    \
    reg3_m = __msa_ilvod_w(reg3_m, reg3_m);                                    \
    reg4_m = __msa_ilvl_w(reg4_m, reg4_m);                                     \
    reg5_m = reg0_m - reg5_m;                                                  \
    reg6_m = reg1_m - reg6_m;                                                  \
    reg2_m = reg0_m - reg2_m;                                                  \
    reg3_m = reg1_m - reg3_m;                                                  \
    reg7_m = reg0_m - reg7_m;                                                  \
    reg4_m = reg1_m - reg4_m;                                                  \
    reg5_m += bb;                                                              \
    reg6_m += bb;                                                              \
    reg7_m += bg;                                                              \
    reg4_m += bg;                                                              \
    reg2_m += br;                                                              \
    reg3_m += br;                                                              \
    reg5_m = __msa_srai_w(reg5_m, 6);                                          \
    reg6_m = __msa_srai_w(reg6_m, 6);                                          \
    reg7_m = __msa_srai_w(reg7_m, 6);                                          \
    reg4_m = __msa_srai_w(reg4_m, 6);                                          \
    reg2_m = __msa_srai_w(reg2_m, 6);                                          \
    reg3_m = __msa_srai_w(reg3_m, 6);                                          \
    CLIP_0TO255(reg5_m, reg6_m, reg7_m, reg4_m, reg2_m, reg3_m);               \
    out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m);                       \
    out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m);                       \
    out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m);                       \
  }
122
// Pack and Store 8 ARGB values.
// in0..in3 carry one channel each as 8 halfwords.  The even bytes of each
// halfword are interleaved to give byte order in0,in1,in2,in3 per pixel
// (B,G,R,A for ARGB output) and 32 bytes are stored at pdst_argb.
#define STOREARGB(in0, in1, in2, in3, pdst_argb)              \
  {                                                           \
    v8i16 vec0_m, vec1_m;                                     \
    v16u8 dst0_m, dst1_m;                                     \
    vec0_m = (v8i16)__msa_ilvev_b((v16i8)in1, (v16i8)in0);    \
    vec1_m = (v8i16)__msa_ilvev_b((v16i8)in3, (v16i8)in2);    \
    dst0_m = (v16u8)__msa_ilvr_h(vec1_m, vec0_m);             \
    dst1_m = (v16u8)__msa_ilvl_h(vec1_m, vec0_m);             \
    ST_UB2(dst0_m, dst1_m, pdst_argb, 16);                    \
  }
134
// Takes ARGB input and calculates Y.
// Each 4-byte pixel is viewed as two halfwords: pckev_h gathers the first
// byte pair of every pixel, pckod_h the second pair.  Two unsigned dot
// products against (const0, const1) accumulate the weighted channels,
// |const2| adds the rounding bias and |shift| performs the fixed-point
// shift.  Emits 16 luma bytes into y_out.
#define ARGBTOY(argb0, argb1, argb2, argb3, const0, const1, const2, shift, \
                y_out)                                                     \
  {                                                                        \
    v16u8 vec0_m, vec1_m, vec2_m, vec3_m;                                  \
    v8u16 reg0_m, reg1_m;                                                  \
                                                                           \
    vec0_m = (v16u8)__msa_pckev_h((v8i16)argb1, (v8i16)argb0);             \
    vec1_m = (v16u8)__msa_pckev_h((v8i16)argb3, (v8i16)argb2);             \
    vec2_m = (v16u8)__msa_pckod_h((v8i16)argb1, (v8i16)argb0);             \
    vec3_m = (v16u8)__msa_pckod_h((v8i16)argb3, (v8i16)argb2);             \
    reg0_m = __msa_dotp_u_h(vec0_m, const0);                               \
    reg1_m = __msa_dotp_u_h(vec1_m, const0);                               \
    reg0_m = __msa_dpadd_u_h(reg0_m, vec2_m, const1);                      \
    reg1_m = __msa_dpadd_u_h(reg1_m, vec3_m, const1);                      \
    reg0_m += const2;                                                      \
    reg1_m += const2;                                                      \
    reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, shift);                    \
    reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, shift);                    \
    y_out = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m);            \
  }
156
// Loads current and next row of ARGB input and averages it to calculate U and V
// Reads 32 ARGB pixels (128 bytes) from each of the rows at s_ptr and t_ptr,
// sums every 2x2 block of pixels (vertical byte interleave + hadd for the
// row sum, pckev_d/pckod_d add for the horizontal neighbor) and shifts the
// sum right by 2, producing 4 vectors of 16 averaged ARGB pixels in
// argb0..argb3.
// NOTE: the body previously hard-coded the names `s` and `t` instead of the
// declared macro parameters, which only compiled when callers happened to
// pass variables with exactly those names; it now uses s_ptr/t_ptr.
#define READ_ARGB(s_ptr, t_ptr, argb0, argb1, argb2, argb3)              \
  {                                                                      \
    v16u8 src0_m, src1_m, src2_m, src3_m, src4_m, src5_m, src6_m, src7_m; \
    v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
    v16u8 vec8_m, vec9_m;                                                \
    v8u16 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m, reg5_m, reg6_m, reg7_m; \
    v8u16 reg8_m, reg9_m;                                                \
                                                                         \
    /* First 16 pixels of each row. */                                   \
    src0_m = (v16u8)__msa_ld_b((void*)s_ptr, 0);                         \
    src1_m = (v16u8)__msa_ld_b((void*)s_ptr, 16);                        \
    src2_m = (v16u8)__msa_ld_b((void*)s_ptr, 32);                        \
    src3_m = (v16u8)__msa_ld_b((void*)s_ptr, 48);                        \
    src4_m = (v16u8)__msa_ld_b((void*)t_ptr, 0);                         \
    src5_m = (v16u8)__msa_ld_b((void*)t_ptr, 16);                        \
    src6_m = (v16u8)__msa_ld_b((void*)t_ptr, 32);                        \
    src7_m = (v16u8)__msa_ld_b((void*)t_ptr, 48);                        \
    vec0_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m);          \
    vec1_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m);          \
    vec2_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m);          \
    vec3_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m);          \
    vec4_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m);          \
    vec5_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m);          \
    vec6_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m);          \
    vec7_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m);          \
    /* hadd of interleaved rows = per-byte vertical sum. */              \
    reg0_m = __msa_hadd_u_h(vec0_m, vec0_m);                             \
    reg1_m = __msa_hadd_u_h(vec1_m, vec1_m);                             \
    reg2_m = __msa_hadd_u_h(vec2_m, vec2_m);                             \
    reg3_m = __msa_hadd_u_h(vec3_m, vec3_m);                             \
    reg4_m = __msa_hadd_u_h(vec4_m, vec4_m);                             \
    reg5_m = __msa_hadd_u_h(vec5_m, vec5_m);                             \
    reg6_m = __msa_hadd_u_h(vec6_m, vec6_m);                             \
    reg7_m = __msa_hadd_u_h(vec7_m, vec7_m);                             \
    /* Add horizontal neighbors, then >>2 for the 2x2 average. */        \
    reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m);         \
    reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m);         \
    reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m);        \
    reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m);        \
    reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m);         \
    reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m);         \
    reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m);        \
    reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m);        \
    reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2);                      \
    reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2);                      \
    reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2);                      \
    reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2);                      \
    argb0 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m);          \
    argb1 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m);          \
    /* Second 16 pixels of each row. */                                  \
    src0_m = (v16u8)__msa_ld_b((void*)s_ptr, 64);                        \
    src1_m = (v16u8)__msa_ld_b((void*)s_ptr, 80);                        \
    src2_m = (v16u8)__msa_ld_b((void*)s_ptr, 96);                        \
    src3_m = (v16u8)__msa_ld_b((void*)s_ptr, 112);                       \
    src4_m = (v16u8)__msa_ld_b((void*)t_ptr, 64);                        \
    src5_m = (v16u8)__msa_ld_b((void*)t_ptr, 80);                        \
    src6_m = (v16u8)__msa_ld_b((void*)t_ptr, 96);                        \
    src7_m = (v16u8)__msa_ld_b((void*)t_ptr, 112);                       \
    vec2_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m);          \
    vec3_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m);          \
    vec4_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m);          \
    vec5_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m);          \
    vec6_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m);          \
    vec7_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m);          \
    vec8_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m);          \
    vec9_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m);          \
    reg0_m = __msa_hadd_u_h(vec2_m, vec2_m);                             \
    reg1_m = __msa_hadd_u_h(vec3_m, vec3_m);                             \
    reg2_m = __msa_hadd_u_h(vec4_m, vec4_m);                             \
    reg3_m = __msa_hadd_u_h(vec5_m, vec5_m);                             \
    reg4_m = __msa_hadd_u_h(vec6_m, vec6_m);                             \
    reg5_m = __msa_hadd_u_h(vec7_m, vec7_m);                             \
    reg6_m = __msa_hadd_u_h(vec8_m, vec8_m);                             \
    reg7_m = __msa_hadd_u_h(vec9_m, vec9_m);                             \
    reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m);         \
    reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m);         \
    reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m);        \
    reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m);        \
    reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m);         \
    reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m);         \
    reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m);        \
    reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m);        \
    reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2);                      \
    reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2);                      \
    reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2);                      \
    reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2);                      \
    argb2 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m);          \
    argb3 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m);          \
  }
243
// Takes ARGB input and calculates U and V.
// The shf0..shf3 shuffle masks gather the channel bytes needed for each
// plane from the four input pixel vectors.  Unsigned dot products with
// const0..const2 apply the channel weights; |const3| supplies the bias.
// The subtractions implement the negative coefficients of the U/V
// equations.  The high (odd) bytes of the halfword results are packed to
// 16 bytes each in v_out and u_out.
#define ARGBTOUV(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \
                 shf0, shf1, shf2, shf3, v_out, u_out)                       \
  {                                                                          \
    v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;    \
    v8u16 reg0_m, reg1_m, reg2_m, reg3_m;                                    \
                                                                             \
    vec0_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb1, (v16i8)argb0);          \
    vec1_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb3, (v16i8)argb2);          \
    vec2_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb1, (v16i8)argb0);          \
    vec3_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb3, (v16i8)argb2);          \
    vec4_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb1, (v16i8)argb0);          \
    vec5_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb3, (v16i8)argb2);          \
    vec6_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb1, (v16i8)argb0);          \
    vec7_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb3, (v16i8)argb2);          \
    reg0_m = __msa_dotp_u_h(vec0_m, const1);                                 \
    reg1_m = __msa_dotp_u_h(vec1_m, const1);                                 \
    reg2_m = __msa_dotp_u_h(vec4_m, const1);                                 \
    reg3_m = __msa_dotp_u_h(vec5_m, const1);                                 \
    reg0_m += const3;                                                        \
    reg1_m += const3;                                                        \
    reg2_m += const3;                                                        \
    reg3_m += const3;                                                        \
    reg0_m -= __msa_dotp_u_h(vec2_m, const0);                                \
    reg1_m -= __msa_dotp_u_h(vec3_m, const0);                                \
    reg2_m -= __msa_dotp_u_h(vec6_m, const2);                                \
    reg3_m -= __msa_dotp_u_h(vec7_m, const2);                                \
    v_out = (v16u8)__msa_pckod_b((v16i8)reg1_m, (v16i8)reg0_m);              \
    u_out = (v16u8)__msa_pckod_b((v16i8)reg3_m, (v16i8)reg2_m);              \
  }
274
// Load I444 pixel data: 8 bytes each of Y, U and V (one U/V sample per Y).
// Each group of 8 bytes is placed in the low half of its output vector;
// the upper lanes are zeroed.
#define READI444(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \
  {                                                           \
    uint64_t y_m, u_m, v_m;                                   \
    v2i64 zero_m = {0};                                       \
    y_m = LD(psrc_y);                                         \
    u_m = LD(psrc_u);                                         \
    v_m = LD(psrc_v);                                         \
    out_y = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)y_m);   \
    out_u = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)u_m);   \
    out_v = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)v_m);   \
  }
287
// Reverses a row of bytes: dst[i] = src[width - 1 - i].
// Works backwards through src in 64-byte chunks, byte-reversing each
// 16-byte vector with a reversal shuffle mask.
void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) {
  int count;
  v16u8 in0, in1, in2, in3;
  v16u8 out0, out1, out2, out3;
  v16i8 rev_mask = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
  src += width - 64;

  for (count = 0; count < width; count += 64) {
    // Load the 4 vectors in reversed order so stores come out sequential.
    LD_UB4(src, 16, in3, in2, in1, in0);
    VSHF_B2_UB(in0, in0, in1, in1, rev_mask, rev_mask, out0, out1);
    VSHF_B2_UB(in2, in2, in3, in3, rev_mask, rev_mask, out2, out3);
    ST_UB4(out0, out1, out2, out3, dst, 16);
    dst += 64;
    src -= 64;
  }
}
304
// Reverses a row of 4-byte ARGB pixels (bytes within a pixel keep their
// order).  Works backwards through src, 16 pixels (64 bytes) per pass,
// using a pixel-reversal shuffle mask.
void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) {
  int count;
  v16u8 in0, in1, in2, in3;
  v16u8 out0, out1, out2, out3;
  v16i8 rev_mask = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
  src += width * 4 - 64;

  for (count = 0; count < width; count += 16) {
    // Load the 4 vectors in reversed order so stores come out sequential.
    LD_UB4(src, 16, in3, in2, in1, in0);
    VSHF_B2_UB(in0, in0, in1, in1, rev_mask, rev_mask, out0, out1);
    VSHF_B2_UB(in2, in2, in3, in3, rev_mask, rev_mask, out2, out3);
    ST_UB4(out0, out1, out2, out3, dst, 16);
    dst += 64;
    src -= 64;
  }
}
321
// Interleaves planar I422 (Y + half-width U/V) into packed YUY2
// (Y0 U0 Y1 V0 ...).  Emits 64 output bytes (32 pixels) per iteration.
void I422ToYUY2Row_MSA(const uint8_t* src_y,
                       const uint8_t* src_u,
                       const uint8_t* src_v,
                       uint8_t* dst_yuy2,
                       int width) {
  int i;
  v16u8 u_vec, v_vec, y_lo, y_hi, uv_lo, uv_hi;
  v16u8 out0, out1, out2, out3;

  for (i = 0; i < width; i += 32) {
    u_vec = LD_UB(src_u);
    v_vec = LD_UB(src_v);
    LD_UB2(src_y, 16, y_lo, y_hi);
    // Pair U with V first, then weave the UV stream into the Y stream.
    ILVRL_B2_UB(v_vec, u_vec, uv_lo, uv_hi);
    ILVRL_B2_UB(uv_lo, y_lo, out0, out1);
    ILVRL_B2_UB(uv_hi, y_hi, out2, out3);
    ST_UB4(out0, out1, out2, out3, dst_yuy2, 16);
    src_u += 16;
    src_v += 16;
    src_y += 32;
    dst_yuy2 += 64;
  }
}
345
// Interleaves planar I422 (Y + half-width U/V) into packed UYVY
// (U0 Y0 V0 Y1 ...).  Emits 64 output bytes (32 pixels) per iteration.
void I422ToUYVYRow_MSA(const uint8_t* src_y,
                       const uint8_t* src_u,
                       const uint8_t* src_v,
                       uint8_t* dst_uyvy,
                       int width) {
  int i;
  v16u8 u_vec, v_vec, y_lo, y_hi, uv_lo, uv_hi;
  v16u8 out0, out1, out2, out3;

  for (i = 0; i < width; i += 32) {
    u_vec = LD_UB(src_u);
    v_vec = LD_UB(src_v);
    LD_UB2(src_y, 16, y_lo, y_hi);
    // Pair U with V first; UYVY puts the chroma byte before each luma byte.
    ILVRL_B2_UB(v_vec, u_vec, uv_lo, uv_hi);
    ILVRL_B2_UB(y_lo, uv_lo, out0, out1);
    ILVRL_B2_UB(y_hi, uv_hi, out2, out3);
    ST_UB4(out0, out1, out2, out3, dst_uyvy, 16);
    src_u += 16;
    src_v += 16;
    src_y += 32;
    dst_uyvy += 64;
  }
}
369
// Converts I422 to ARGB, 8 pixels (8 Y, 4 U, 4 V) per iteration, with a
// constant opaque alpha channel.
void I422ToARGBRow_MSA(const uint8_t* src_y,
                       const uint8_t* src_u,
                       const uint8_t* src_v,
                       uint8_t* dst_argb,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  int x;
  v16u8 src0, src1, src2;
  v8i16 vec0, vec1, vec2;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;
  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);  // 0xFF in every byte.

  // Broadcast the conversion constants, then pair (UB,VR) and (UG,VG) into
  // single vectors in the layout YUVTORGB expects.
  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 8) {
    READYUV422(src_y, src_u, src_v, src0, src1, src2);
    // Interleave U and V bytes as YUVTORGB wants them.
    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec1, vec2);
    // Memory byte order B,G,R,A.
    STOREARGB(vec0, vec1, vec2, alpha, dst_argb);
    src_y += 8;
    src_u += 4;
    src_v += 4;
    dst_argb += 32;
  }
}
400
// Converts I422 to RGBA, 8 pixels (8 Y, 4 U, 4 V) per iteration, with a
// constant opaque alpha channel.  Identical to I422ToARGBRow_MSA except
// alpha is stored first (memory byte order A,B,G,R).
void I422ToRGBARow_MSA(const uint8_t* src_y,
                       const uint8_t* src_u,
                       const uint8_t* src_v,
                       uint8_t* dst_argb,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  int x;
  v16u8 src0, src1, src2;
  v8i16 vec0, vec1, vec2;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;
  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);  // 0xFF in every byte.

  // Broadcast conversion constants and pair them for YUVTORGB.
  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 8) {
    READYUV422(src_y, src_u, src_v, src0, src1, src2);
    // Interleave U and V bytes as YUVTORGB wants them.
    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec1, vec2);
    // Alpha leads: memory byte order A,B,G,R.
    STOREARGB(alpha, vec0, vec1, vec2, dst_argb);
    src_y += 8;
    src_u += 4;
    src_v += 4;
    dst_argb += 32;
  }
}
431
// Converts I422 plus a per-pixel alpha plane to ARGB, 8 pixels per
// iteration (8 Y, 4 U, 4 V, 8 A).
void I422AlphaToARGBRow_MSA(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
                            const uint8_t* src_a,
                            uint8_t* dst_argb,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  int x;
  int64_t data_a;
  v16u8 src0, src1, src2, src3;
  v8i16 vec0, vec1, vec2;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;
  v4i32 zero = {0};

  // Broadcast conversion constants and pair them for YUVTORGB.
  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 8) {
    data_a = LD(src_a);  // 8 alpha bytes for this batch.
    READYUV422(src_y, src_u, src_v, src0, src1, src2);
    // Interleave U and V bytes as YUVTORGB wants them.
    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
    src3 = (v16u8)__msa_insert_d((v2i64)zero, 0, data_a);
    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec1, vec2);
    // Duplicate each alpha byte so STOREARGB's even-byte interleave picks
    // up the right value for every pixel.
    src3 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src3);
    STOREARGB(vec0, vec1, vec2, src3, dst_argb);
    src_y += 8;
    src_u += 4;
    src_v += 4;
    src_a += 8;
    dst_argb += 32;
  }
}
468
// Converts I422 to 24-bit RGB (3 bytes per pixel, no alpha), 16 pixels per
// iteration via two 8-pixel YUVTORGB passes.  The destination parameter is
// named dst_argb for signature consistency but receives RGB24 data.
void I422ToRGB24Row_MSA(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int32_t width) {
  int x;
  int64_t data_u, data_v;
  v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;
  v16u8 reg0, reg1, reg2, reg3;
  v2i64 zero = {0};
  // Shuffle masks that squeeze the interleaved B,G bytes (first operand)
  // and R bytes (second operand, indices >= 16) into packed 3-byte pixels
  // across the three 16-byte output vectors.
  v16i8 shuffler0 = {0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10};
  v16i8 shuffler1 = {0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7, 8, 25, 9, 10};
  v16i8 shuffler2 = {26, 6, 7, 27, 8, 9, 28, 10,
                     11, 29, 12, 13, 30, 14, 15, 31};

  // Broadcast conversion constants and pair them for YUVTORGB.
  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((v16u8*)src_y, 0);  // 16 Y bytes.
    data_u = LD(src_u);                          // 8 U bytes.
    data_v = LD(src_v);                          // 8 V bytes.
    src1 = (v16u8)__msa_insert_d(zero, 0, data_u);
    src2 = (v16u8)__msa_insert_d(zero, 0, data_v);
    // Interleave U/V, then slide the upper halves down for the second pass.
    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
    src3 = (v16u8)__msa_sldi_b((v16i8)src0, (v16i8)src0, 8);
    src4 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src1, 8);
    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec1, vec2);
    YUVTORGB(src3, src4, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec3, vec4, vec5);
    // reg0/reg2: interleaved B,G for each half; reg3: all 16 R bytes;
    // reg1: B,G window straddling the two halves for the middle vector.
    reg0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
    reg2 = (v16u8)__msa_ilvev_b((v16i8)vec4, (v16i8)vec3);
    reg3 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec2);
    reg1 = (v16u8)__msa_sldi_b((v16i8)reg2, (v16i8)reg0, 11);
    dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)reg3, (v16i8)reg0);
    dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)reg3, (v16i8)reg1);
    dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)reg3, (v16i8)reg2);
    ST_UB2(dst0, dst1, dst_argb, 16);
    ST_UB(dst2, (dst_argb + 32));
    src_y += 16;
    src_u += 8;
    src_v += 8;
    dst_argb += 48;  // 16 pixels * 3 bytes.
  }
}
521
// TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R.
// Converts I422 to RGB565 (16 bits/pixel: R in bits 15-11, G in 10-5,
// B in 4-0), 8 pixels per iteration.
void I422ToRGB565Row_MSA(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
                         uint8_t* dst_rgb565,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  int x;
  v16u8 src0, src1, src2, dst0;
  v8i16 vec0, vec1, vec2;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;

  // Broadcast conversion constants and pair them for YUVTORGB.
  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 8) {
    READYUV422(src_y, src_u, src_v, src0, src1, src2);
    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
    // Output order (out_b, out_g, out_r) -> vec0 = B, vec2 = G, vec1 = R.
    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec2, vec1);
    vec0 = __msa_srai_h(vec0, 3);  // B -> 5 bits.
    vec1 = __msa_srai_h(vec1, 3);  // R -> 5 bits.
    vec2 = __msa_srai_h(vec2, 2);  // G -> 6 bits.
    vec1 = __msa_slli_h(vec1, 11);
    vec2 = __msa_slli_h(vec2, 5);
    vec0 |= vec1;
    dst0 = (v16u8)(vec2 | vec0);  // halfword = B | G << 5 | R << 11
    ST_UB(dst0, dst_rgb565);
    src_y += 8;
    src_u += 4;
    src_v += 4;
    dst_rgb565 += 16;  // 8 pixels * 2 bytes.
  }
}
559
// TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G.
// Converts I422 to ARGB4444 (16 bits/pixel: A in bits 15-12, R in 11-8,
// G in 7-4, B in 3-0; alpha fixed at 0xF), 8 pixels per iteration.
void I422ToARGB4444Row_MSA(const uint8_t* src_y,
                           const uint8_t* src_u,
                           const uint8_t* src_v,
                           uint8_t* dst_argb4444,
                           const struct YuvConstants* yuvconstants,
                           int width) {
  int x;
  v16u8 src0, src1, src2, dst0;
  v8i16 vec0, vec1, vec2;
  v8u16 reg0, reg1, reg2;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;
  v8u16 const_0xF000 = (v8u16)__msa_fill_h(0xF000);  // Opaque 4-bit alpha.

  // Broadcast conversion constants and pair them for YUVTORGB.
  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 8) {
    READYUV422(src_y, src_u, src_v, src0, src1, src2);
    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec1, vec2);
    reg0 = (v8u16)__msa_srai_h(vec0, 4);  // B -> 4 bits.
    reg1 = (v8u16)__msa_srai_h(vec1, 4);  // G -> 4 bits.
    reg2 = (v8u16)__msa_srai_h(vec2, 4);  // R -> 4 bits.
    reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 4);
    reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 8);
    reg1 |= const_0xF000;
    reg0 |= reg2;
    dst0 = (v16u8)(reg1 | reg0);  // halfword = B | G<<4 | R<<8 | 0xF000
    ST_UB(dst0, dst_argb4444);
    src_y += 8;
    src_u += 4;
    src_v += 4;
    dst_argb4444 += 16;  // 8 pixels * 2 bytes.
  }
}
600
// Converts I422 to ARGB1555 (16 bits/pixel: A in bit 15, R in 14-10,
// G in 9-5, B in 4-0; alpha fixed at 1), 8 pixels per iteration.
void I422ToARGB1555Row_MSA(const uint8_t* src_y,
                           const uint8_t* src_u,
                           const uint8_t* src_v,
                           uint8_t* dst_argb1555,
                           const struct YuvConstants* yuvconstants,
                           int width) {
  int x;
  v16u8 src0, src1, src2, dst0;
  v8i16 vec0, vec1, vec2;
  v8u16 reg0, reg1, reg2;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;
  v8u16 const_0x8000 = (v8u16)__msa_fill_h(0x8000);  // Opaque 1-bit alpha.

  // Broadcast conversion constants and pair them for YUVTORGB.
  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 8) {
    READYUV422(src_y, src_u, src_v, src0, src1, src2);
    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec1, vec2);
    reg0 = (v8u16)__msa_srai_h(vec0, 3);  // B -> 5 bits.
    reg1 = (v8u16)__msa_srai_h(vec1, 3);  // G -> 5 bits.
    reg2 = (v8u16)__msa_srai_h(vec2, 3);  // R -> 5 bits.
    reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 5);
    reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 10);
    reg1 |= const_0x8000;
    reg0 |= reg2;
    dst0 = (v16u8)(reg1 | reg0);  // halfword = B | G<<5 | R<<10 | 0x8000
    ST_UB(dst0, dst_argb1555);
    src_y += 8;
    src_u += 4;
    src_v += 4;
    dst_argb1555 += 16;  // 8 pixels * 2 bytes.
  }
}
640
// Extracts the luma plane from packed YUY2 (Y is every even byte).
// Produces 32 Y bytes from 64 source bytes per iteration.
void YUY2ToYRow_MSA(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
  int i;
  v16u8 in0, in1, in2, in3, y_lo, y_hi;

  for (i = 0; i < width; i += 32) {
    LD_UB4(src_yuy2, 16, in0, in1, in2, in3);
    y_lo = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0);
    y_hi = (v16u8)__msa_pckev_b((v16i8)in3, (v16i8)in2);
    ST_UB2(y_lo, y_hi, dst_y, 16);
    src_yuy2 += 64;
    dst_y += 32;
  }
}
654
// Extracts U and V planes from two adjacent YUY2 rows, averaging the rows
// vertically.  Emits 16 U and 16 V bytes per iteration.
void YUY2ToUVRow_MSA(const uint8_t* src_yuy2,
                     int src_stride_yuy2,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  const uint8_t* next_row = src_yuy2 + src_stride_yuy2;
  int i;
  v16u8 row0_0, row0_1, row0_2, row0_3;
  v16u8 row1_0, row1_1, row1_2, row1_3;
  v16u8 uv0, uv1, uv2, uv3, avg0, avg1, out_u, out_v;

  for (i = 0; i < width; i += 32) {
    LD_UB4(src_yuy2, 16, row0_0, row0_1, row0_2, row0_3);
    LD_UB4(next_row, 16, row1_0, row1_1, row1_2, row1_3);
    // Odd bytes of YUY2 carry the interleaved U/V samples.
    uv0 = (v16u8)__msa_pckod_b((v16i8)row0_1, (v16i8)row0_0);
    uv1 = (v16u8)__msa_pckod_b((v16i8)row0_3, (v16i8)row0_2);
    uv2 = (v16u8)__msa_pckod_b((v16i8)row1_1, (v16i8)row1_0);
    uv3 = (v16u8)__msa_pckod_b((v16i8)row1_3, (v16i8)row1_2);
    // Average the two rows, then split U (even) from V (odd).
    avg0 = __msa_aver_u_b(uv0, uv2);
    avg1 = __msa_aver_u_b(uv1, uv3);
    out_u = (v16u8)__msa_pckev_b((v16i8)avg1, (v16i8)avg0);
    out_v = (v16u8)__msa_pckod_b((v16i8)avg1, (v16i8)avg0);
    ST_UB(out_u, dst_u);
    ST_UB(out_v, dst_v);
    src_yuy2 += 64;
    next_row += 64;
    dst_u += 16;
    dst_v += 16;
  }
}
684
// Extracts U and V planes from a single YUY2 row (no vertical averaging).
// Emits 16 U and 16 V bytes per iteration.
void YUY2ToUV422Row_MSA(const uint8_t* src_yuy2,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
                        int width) {
  int i;
  v16u8 in0, in1, in2, in3, uv_lo, uv_hi, out_u, out_v;

  for (i = 0; i < width; i += 32) {
    LD_UB4(src_yuy2, 16, in0, in1, in2, in3);
    // Odd bytes of YUY2 carry the interleaved U/V samples.
    uv_lo = (v16u8)__msa_pckod_b((v16i8)in1, (v16i8)in0);
    uv_hi = (v16u8)__msa_pckod_b((v16i8)in3, (v16i8)in2);
    out_u = (v16u8)__msa_pckev_b((v16i8)uv_hi, (v16i8)uv_lo);
    out_v = (v16u8)__msa_pckod_b((v16i8)uv_hi, (v16i8)uv_lo);
    ST_UB(out_u, dst_u);
    ST_UB(out_v, dst_v);
    src_yuy2 += 64;
    dst_u += 16;
    dst_v += 16;
  }
}
705
// Extracts the luma plane from packed UYVY (Y is every odd byte).
// Produces 32 Y bytes from 64 source bytes per iteration.
void UYVYToYRow_MSA(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
  int i;
  v16u8 in0, in1, in2, in3, y_lo, y_hi;

  for (i = 0; i < width; i += 32) {
    LD_UB4(src_uyvy, 16, in0, in1, in2, in3);
    y_lo = (v16u8)__msa_pckod_b((v16i8)in1, (v16i8)in0);
    y_hi = (v16u8)__msa_pckod_b((v16i8)in3, (v16i8)in2);
    ST_UB2(y_lo, y_hi, dst_y, 16);
    src_uyvy += 64;
    dst_y += 32;
  }
}
719
// Extracts U and V planes from two adjacent UYVY rows, averaging the rows
// vertically.  Emits 16 U and 16 V bytes per iteration.
void UYVYToUVRow_MSA(const uint8_t* src_uyvy,
                     int src_stride_uyvy,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  const uint8_t* next_row = src_uyvy + src_stride_uyvy;
  int i;
  v16u8 row0_0, row0_1, row0_2, row0_3;
  v16u8 row1_0, row1_1, row1_2, row1_3;
  v16u8 uv0, uv1, uv2, uv3, avg0, avg1, out_u, out_v;

  for (i = 0; i < width; i += 32) {
    LD_UB4(src_uyvy, 16, row0_0, row0_1, row0_2, row0_3);
    LD_UB4(next_row, 16, row1_0, row1_1, row1_2, row1_3);
    // Even bytes of UYVY carry the interleaved U/V samples.
    uv0 = (v16u8)__msa_pckev_b((v16i8)row0_1, (v16i8)row0_0);
    uv1 = (v16u8)__msa_pckev_b((v16i8)row0_3, (v16i8)row0_2);
    uv2 = (v16u8)__msa_pckev_b((v16i8)row1_1, (v16i8)row1_0);
    uv3 = (v16u8)__msa_pckev_b((v16i8)row1_3, (v16i8)row1_2);
    // Average the two rows, then split U (even) from V (odd).
    avg0 = __msa_aver_u_b(uv0, uv2);
    avg1 = __msa_aver_u_b(uv1, uv3);
    out_u = (v16u8)__msa_pckev_b((v16i8)avg1, (v16i8)avg0);
    out_v = (v16u8)__msa_pckod_b((v16i8)avg1, (v16i8)avg0);
    ST_UB(out_u, dst_u);
    ST_UB(out_v, dst_v);
    src_uyvy += 64;
    next_row += 64;
    dst_u += 16;
    dst_v += 16;
  }
}
749
// Extracts U and V planes from a single UYVY row (no vertical averaging).
// Emits 16 U and 16 V bytes per iteration.
void UYVYToUV422Row_MSA(const uint8_t* src_uyvy,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
                        int width) {
  int i;
  v16u8 in0, in1, in2, in3, uv_lo, uv_hi, out_u, out_v;

  for (i = 0; i < width; i += 32) {
    LD_UB4(src_uyvy, 16, in0, in1, in2, in3);
    // Even bytes of UYVY carry the interleaved U/V samples.
    uv_lo = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0);
    uv_hi = (v16u8)__msa_pckev_b((v16i8)in3, (v16i8)in2);
    out_u = (v16u8)__msa_pckev_b((v16i8)uv_hi, (v16i8)uv_lo);
    out_v = (v16u8)__msa_pckod_b((v16i8)uv_hi, (v16i8)uv_lo);
    ST_UB(out_u, dst_u);
    ST_UB(out_v, dst_v);
    src_uyvy += 64;
    dst_u += 16;
    dst_v += 16;
  }
}
770
// Converts ARGB to luma, 16 pixels (64 bytes) per iteration, as
// Y = (0x42*R + 0x81*G + 0x19*B + 0x1080) >> 8 in 8.8 fixed point
// (the 0x1080 bias includes the rounding term and the +16 luma offset).
void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
  int x;
  v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0;
  v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
  v16i8 zero = {0};
  v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19);      // B weight.
  v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81);      // G weight.
  v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42);      // R weight.
  v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); // Rounding + offset.

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0);
    src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16);
    src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32);
    src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48);
    // Pixel bytes are B,G,R,A: pckev gathers B,R pairs, pckod gathers G,A.
    vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
    vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
    vec2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
    vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
    // Zero-extend: reg0/reg1 = B, reg2/reg3 = G, reg4/reg5 = R.
    reg0 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec0);
    reg1 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec1);
    reg2 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec2);
    reg3 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec3);
    reg4 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec0);
    reg5 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec1);
    reg0 *= const_0x19;
    reg1 *= const_0x19;
    reg2 *= const_0x81;
    reg3 *= const_0x81;
    reg4 *= const_0x42;
    reg5 *= const_0x42;
    reg0 += reg2;
    reg1 += reg3;
    reg0 += reg4;
    reg1 += reg5;
    reg0 += const_0x1080;
    reg1 += const_0x1080;
    reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
    reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8);
    dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
    ST_UB(dst0, dst_y);
    src_argb0 += 64;
    dst_y += 16;
  }
}
816
// Computes 2x2-subsampled chroma for 32 source pixels per iteration:
// each of 16 U/V outputs is taken from the average of a 2x2 pixel block
// spanning two rows (src_argb0 and src_argb0 + src_stride_argb).
//   U = (112*Bavg - 74*Gavg - 38*Ravg + 0x8080) >> 8
//   V = (112*Ravg - 94*Gavg - 18*Bavg + 0x8080) >> 8
// 0x8080 = (128 << 8) + 128: the +128 chroma offset plus rounding.
void ARGBToUVRow_MSA(const uint8_t* src_argb0,
                     int src_stride_argb,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  int x;
  const uint8_t* src_argb0_next = src_argb0 + src_stride_argb;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
  v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9;
  v16u8 dst0, dst1;
  v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);  // 112
  v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);  // 74
  v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);  // 38
  v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);  // 94
  v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);  // 18
  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);

  for (x = 0; x < width; x += 32) {
    // First row: de-interleave 32 ARGB pixels into per-channel vectors.
    src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0);
    src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16);
    src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32);
    src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48);
    src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 64);
    src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 80);
    src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 96);
    src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 112);
    // Even bytes = B,R pairs; odd bytes = G,A pairs.
    vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
    vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
    vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
    vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6);
    vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
    vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
    vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4);
    vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6);
    // Second pack: vec8/vec9 = B, vec4/vec5 = G, vec0/vec1 = R.
    vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
    vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
    vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
    vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
    vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2);
    // Horizontal add pairs adjacent pixels: reg0/1 = B sums, reg2/3 = G
    // sums, reg4/5 = R sums for the first row.
    reg0 = __msa_hadd_u_h(vec8, vec8);
    reg1 = __msa_hadd_u_h(vec9, vec9);
    reg2 = __msa_hadd_u_h(vec4, vec4);
    reg3 = __msa_hadd_u_h(vec5, vec5);
    reg4 = __msa_hadd_u_h(vec0, vec0);
    reg5 = __msa_hadd_u_h(vec1, vec1);
    // Second row: same de-interleave, accumulated into reg0..reg5.
    src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 0);
    src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 16);
    src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 32);
    src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 48);
    src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 64);
    src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 80);
    src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 96);
    src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 112);
    vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
    vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
    vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
    vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6);
    vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
    vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
    vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4);
    vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6);
    vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
    vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
    vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
    vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
    vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2);
    reg0 += __msa_hadd_u_h(vec8, vec8);
    reg1 += __msa_hadd_u_h(vec9, vec9);
    reg2 += __msa_hadd_u_h(vec4, vec4);
    reg3 += __msa_hadd_u_h(vec5, vec5);
    reg4 += __msa_hadd_u_h(vec0, vec0);
    reg5 += __msa_hadd_u_h(vec1, vec1);
    // Divide the 4-pixel sums by 4 to get the 2x2 averages.
    reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 2);
    reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 2);
    reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 2);
    reg3 = (v8u16)__msa_srai_h((v8i16)reg3, 2);
    reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 2);
    reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 2);
    // U in reg6/reg7: 112*B + 0x8080 - (74*G + 38*R).
    reg6 = reg0 * const_0x70;
    reg7 = reg1 * const_0x70;
    reg8 = reg2 * const_0x4A;
    reg9 = reg3 * const_0x4A;
    reg6 += const_0x8080;
    reg7 += const_0x8080;
    reg8 += reg4 * const_0x26;
    reg9 += reg5 * const_0x26;
    // V in reg4/reg5: 112*R + 0x8080 - (94*G + 18*B).
    reg0 *= const_0x12;
    reg1 *= const_0x12;
    reg2 *= const_0x5E;
    reg3 *= const_0x5E;
    reg4 *= const_0x70;
    reg5 *= const_0x70;
    reg2 += reg0;
    reg3 += reg1;
    reg4 += const_0x8080;
    reg5 += const_0x8080;
    reg6 -= reg8;
    reg7 -= reg9;
    reg4 -= reg2;
    reg5 -= reg3;
    reg6 = (v8u16)__msa_srai_h((v8i16)reg6, 8);
    reg7 = (v8u16)__msa_srai_h((v8i16)reg7, 8);
    reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 8);
    reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 8);
    dst0 = (v16u8)__msa_pckev_b((v16i8)reg7, (v16i8)reg6);
    dst1 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4);
    ST_UB(dst0, dst_u);
    ST_UB(dst1, dst_v);
    src_argb0 += 128;
    src_argb0_next += 128;
    dst_u += 16;
    dst_v += 16;
  }
}
934
// Drops the alpha byte from 16 ARGB pixels per iteration, writing 48
// bytes of packed B,G,R (RGB24). The three shuffle tables pick bytes
// 0,1,2 (skipping every 4th byte, the alpha) across consecutive
// register pairs; indices >= 16 select from the second source operand.
void ARGBToRGB24Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
  int x;
  v16u8 src0, src1, src2, src3, dst0, dst1, dst2;
  v16i8 shuffler0 = {0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20};
  v16i8 shuffler1 = {5, 6, 8, 9, 10, 12, 13, 14,
                     16, 17, 18, 20, 21, 22, 24, 25};
  v16i8 shuffler2 = {10, 12, 13, 14, 16, 17, 18, 20,
                     21, 22, 24, 25, 26, 28, 29, 30};

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
    src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
    src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
    src3 = (v16u8)__msa_ld_b((void*)src_argb, 48);
    dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0);
    dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1);
    dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2);
    ST_UB2(dst0, dst1, dst_rgb, 16);
    ST_UB(dst2, (dst_rgb + 32));
    src_argb += 64;
    dst_rgb += 48;
  }
}
958
// Like ARGBToRGB24Row_MSA but with R and B swapped: the shuffle tables
// pick bytes 2,1,0 of each pixel (R,G,B order in memory), dropping the
// alpha byte. 16 pixels in, 48 bytes out per iteration.
void ARGBToRAWRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
  int x;
  v16u8 src0, src1, src2, src3, dst0, dst1, dst2;
  v16i8 shuffler0 = {2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 18, 17, 16, 22};
  v16i8 shuffler1 = {5, 4, 10, 9, 8, 14, 13, 12,
                     18, 17, 16, 22, 21, 20, 26, 25};
  v16i8 shuffler2 = {8, 14, 13, 12, 18, 17, 16, 22,
                     21, 20, 26, 25, 24, 30, 29, 28};

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
    src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
    src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
    src3 = (v16u8)__msa_ld_b((void*)src_argb, 48);
    dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0);
    dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1);
    dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2);
    ST_UB2(dst0, dst1, dst_rgb, 16);
    ST_UB(dst2, (dst_rgb + 32));
    src_argb += 64;
    dst_rgb += 48;
  }
}
982
// Packs 8 ARGB pixels per iteration into RGB565 (B:5, G:6, R:5 per
// 16-bit output). Channel bits are aligned with byte shifts, pulled to
// the pixel's low byte lanes with sldi (vector byte-shift), and merged
// with binsli (bit-insert-left), then the two result bytes of each
// pixel are interleaved and packed into 16 output bytes.
void ARGBToRGB565Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
  int x;
  v16u8 src0, src1, dst0;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v16i8 zero = {0};

  for (x = 0; x < width; x += 8) {
    src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
    src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
    vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3);  // B >> 3 (5 bits)
    vec1 = (v16u8)__msa_slli_b((v16i8)src0, 3);
    vec2 = (v16u8)__msa_srai_b((v16i8)src0, 5);
    vec4 = (v16u8)__msa_srai_b((v16i8)src1, 3);
    vec5 = (v16u8)__msa_slli_b((v16i8)src1, 3);
    vec6 = (v16u8)__msa_srai_b((v16i8)src1, 5);
    // Shift by 1/2 bytes so the G and R contributions line up with the
    // B-lane bytes before the bit inserts below.
    vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1);
    vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1);
    vec5 = (v16u8)__msa_sldi_b(zero, (v16i8)vec5, 1);
    vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1);
    vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 2);
    vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 2);
    vec0 = __msa_binsli_b(vec0, vec1, 2);  // low byte:  G[2:0] | B[4:0]
    vec1 = __msa_binsli_b(vec2, vec3, 4);  // high byte: R[4:0] | G[5:3]
    vec4 = __msa_binsli_b(vec4, vec5, 2);
    vec5 = __msa_binsli_b(vec6, vec7, 4);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
    vec4 = (v16u8)__msa_ilvev_b((v16i8)vec5, (v16i8)vec4);
    dst0 = (v16u8)__msa_pckev_h((v8i16)vec4, (v8i16)vec0);
    ST_UB(dst0, dst_rgb);
    src_argb += 32;
    dst_rgb += 16;
  }
}
1016
// Packs 8 ARGB pixels per iteration into ARGB1555 (A:1, R:5, G:5, B:5
// per 16-bit output). Same technique as ARGBToRGB565Row_MSA: per-byte
// shifts align each channel's top bits, sldi moves them onto the low
// byte lanes, and binsli assembles the two output bytes, which are
// then interleaved and packed.
void ARGBToARGB1555Row_MSA(const uint8_t* src_argb,
                           uint8_t* dst_rgb,
                           int width) {
  int x;
  v16u8 src0, src1, dst0;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
  v16i8 zero = {0};

  for (x = 0; x < width; x += 8) {
    src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
    src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
    vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3);  // B >> 3 (5 bits)
    vec1 = (v16u8)__msa_slli_b((v16i8)src0, 2);
    vec2 = (v16u8)__msa_srai_b((v16i8)vec0, 3);
    vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1);
    vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1);
    vec3 = (v16u8)__msa_srai_b((v16i8)src0, 1);
    vec5 = (v16u8)__msa_srai_b((v16i8)src1, 3);
    vec6 = (v16u8)__msa_slli_b((v16i8)src1, 2);
    vec7 = (v16u8)__msa_srai_b((v16i8)vec5, 3);
    vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1);
    vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)vec7, 1);
    vec8 = (v16u8)__msa_srai_b((v16i8)src1, 1);
    vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)vec3, 2);
    vec8 = (v16u8)__msa_sldi_b(zero, (v16i8)vec8, 2);
    vec4 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 3);
    vec9 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 3);
    vec0 = __msa_binsli_b(vec0, vec1, 2);  // low byte:  G[2:0] | B[4:0]
    vec5 = __msa_binsli_b(vec5, vec6, 2);
    vec1 = __msa_binsli_b(vec2, vec3, 5);  // high byte: R[4:1] | G[4:3]
    vec6 = __msa_binsli_b(vec7, vec8, 5);
    vec1 = __msa_binsli_b(vec1, vec4, 0);  // insert alpha top bit
    vec6 = __msa_binsli_b(vec6, vec9, 0);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
    vec1 = (v16u8)__msa_ilvev_b((v16i8)vec6, (v16i8)vec5);
    dst0 = (v16u8)__msa_pckev_h((v8i16)vec1, (v8i16)vec0);
    ST_UB(dst0, dst_rgb);
    src_argb += 32;
    dst_rgb += 16;
  }
}
1058
// Packs 8 ARGB pixels per iteration into ARGB4444: the top 4 bits of
// each channel byte become one output nibble. srai keeps each byte's
// high nibble in the low position; sldi shifts the vector one byte so
// the neighboring channel's high nibble can be bit-inserted (binsli)
// into the upper half; pckev_b then keeps one packed byte per channel
// pair.
void ARGBToARGB4444Row_MSA(const uint8_t* src_argb,
                           uint8_t* dst_rgb,
                           int width) {
  int x;
  v16u8 src0, src1;
  v16u8 vec0, vec1;
  v16u8 dst0;
  v16i8 zero = {0};

  for (x = 0; x < width; x += 8) {
    src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
    src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
    vec0 = (v16u8)__msa_srai_b((v16i8)src0, 4);
    vec1 = (v16u8)__msa_srai_b((v16i8)src1, 4);
    src0 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 1);
    src1 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 1);
    vec0 = __msa_binsli_b(vec0, src0, 3);
    vec1 = __msa_binsli_b(vec1, src1, 3);
    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    ST_UB(dst0, dst_rgb);
    src_argb += 32;
    dst_rgb += 16;
  }
}
1083
// Full-resolution (4:4:4) chroma: one U and one V per source pixel,
// 16 pixels per iteration, no subsampling or averaging:
//   U = (112*B - 74*G - 38*R + 32896) >> 8
//   V = (112*R - 94*G - 18*B + 32896) >> 8
// 32896 = 0x8080: the +128 chroma offset plus rounding.
void ARGBToUV444Row_MSA(const uint8_t* src_argb,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
                        int32_t width) {
  int32_t x;
  v16u8 src0, src1, src2, src3, reg0, reg1, reg2, reg3, dst0, dst1;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 vec8, vec9, vec10, vec11;
  v8u16 const_112 = (v8u16)__msa_ldi_h(112);
  v8u16 const_74 = (v8u16)__msa_ldi_h(74);
  v8u16 const_38 = (v8u16)__msa_ldi_h(38);
  v8u16 const_94 = (v8u16)__msa_ldi_h(94);
  v8u16 const_18 = (v8u16)__msa_ldi_h(18);
  v8u16 const_32896 = (v8u16)__msa_fill_h(32896);
  v16i8 zero = {0};

  for (x = width; x > 0; x -= 16) {
    // Two-stage pack splits the channels: src0 = B, src1 = G, src2 = R.
    src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
    src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
    src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
    src3 = (v16u8)__msa_ld_b((void*)src_argb, 48);
    reg0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
    reg1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
    reg2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
    reg3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
    src0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
    src1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2);
    src2 = (v16u8)__msa_pckod_b((v16i8)reg1, (v16i8)reg0);
    // Zero-extend to 16 bits: vec0/1 = B, vec2/3 = G, vec4/5 = R.
    vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0);
    vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0);
    vec2 = (v8u16)__msa_ilvr_b(zero, (v16i8)src1);
    vec3 = (v8u16)__msa_ilvl_b(zero, (v16i8)src1);
    vec4 = (v8u16)__msa_ilvr_b(zero, (v16i8)src2);
    vec5 = (v8u16)__msa_ilvl_b(zero, (v16i8)src2);
    // V terms: vec6/7 = 112*R + 32896, vec8/9 = 94*G + 18*B.
    vec10 = vec0 * const_18;
    vec11 = vec1 * const_18;
    vec8 = vec2 * const_94;
    vec9 = vec3 * const_94;
    vec6 = vec4 * const_112;
    vec7 = vec5 * const_112;
    // U terms: vec0/1 = 112*B + 32896, vec2/3 = 74*G + 38*R.
    vec0 *= const_112;
    vec1 *= const_112;
    vec2 *= const_74;
    vec3 *= const_74;
    vec4 *= const_38;
    vec5 *= const_38;
    vec8 += vec10;
    vec9 += vec11;
    vec6 += const_32896;
    vec7 += const_32896;
    vec0 += const_32896;
    vec1 += const_32896;
    vec2 += vec4;
    vec3 += vec5;
    vec0 -= vec2;
    vec1 -= vec3;
    vec6 -= vec8;
    vec7 -= vec9;
    vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8);
    vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8);
    vec6 = (v8u16)__msa_srai_h((v8i16)vec6, 8);
    vec7 = (v8u16)__msa_srai_h((v8i16)vec7, 8);
    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    dst1 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
    ST_UB(dst0, dst_u);
    ST_UB(dst1, dst_v);
    src_argb += 64;
    dst_u += 16;
    dst_v += 16;
  }
}
1155
// Component-wise multiply of two ARGB rows, 4 pixels per iteration.
// Each src0 byte v is duplicated into a halfword (v * 0x101) via
// ilvr/ilvl with itself, src1 bytes are zero-extended, and the 32-bit
// product is shifted right by 16 — approximating (a * b) / 255 per
// channel.
void ARGBMultiplyRow_MSA(const uint8_t* src_argb0,
                         const uint8_t* src_argb1,
                         uint8_t* dst_argb,
                         int width) {
  int x;
  v16u8 src0, src1, dst0;
  v8u16 vec0, vec1, vec2, vec3;
  v4u32 reg0, reg1, reg2, reg3;
  v8i16 zero = {0};

  for (x = 0; x < width; x += 4) {
    src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
    src1 = (v16u8)__msa_ld_b((void*)src_argb1, 0);
    vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);  // v -> v*0x101
    vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
    vec2 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);  // zero-extend
    vec3 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src1);
    reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0);
    reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0);
    reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1);
    reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1);
    reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2);
    reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2);
    reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3);
    reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3);
    reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 16);
    reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 16);
    reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 16);
    reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 16);
    vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    ST_UB(dst0, dst_argb);
    src_argb0 += 16;
    src_argb1 += 16;
    dst_argb += 16;
  }
}
1194
// Byte-wise saturating add of two ARGB rows: dst = min(a + b, 255)
// for every channel, processing 8 pixels (two 16-byte vectors) per
// iteration.
void ARGBAddRow_MSA(const uint8_t* src_argb0,
                    const uint8_t* src_argb1,
                    uint8_t* dst_argb,
                    int width) {
  int i;
  v16u8 lhs0, lhs1, rhs0, rhs1, sum0, sum1;

  for (i = 0; i < width; i += 8) {
    lhs0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
    rhs0 = (v16u8)__msa_ld_b((void*)src_argb1, 0);
    lhs1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
    rhs1 = (v16u8)__msa_ld_b((void*)src_argb1, 16);
    sum0 = __msa_adds_u_b(lhs0, rhs0);
    sum1 = __msa_adds_u_b(lhs1, rhs1);
    ST_UB2(sum0, sum1, dst_argb, 16);
    dst_argb += 32;
    src_argb0 += 32;
    src_argb1 += 32;
  }
}
1215
// Byte-wise saturating subtract of two ARGB rows: dst = max(a - b, 0)
// for every channel, processing 8 pixels (two 16-byte vectors) per
// iteration.
void ARGBSubtractRow_MSA(const uint8_t* src_argb0,
                         const uint8_t* src_argb1,
                         uint8_t* dst_argb,
                         int width) {
  int i;
  v16u8 lhs0, lhs1, rhs0, rhs1, diff0, diff1;

  for (i = 0; i < width; i += 8) {
    lhs0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
    rhs0 = (v16u8)__msa_ld_b((void*)src_argb1, 0);
    lhs1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
    rhs1 = (v16u8)__msa_ld_b((void*)src_argb1, 16);
    diff0 = __msa_subs_u_b(lhs0, rhs0);
    diff1 = __msa_subs_u_b(lhs1, rhs1);
    ST_UB2(diff0, diff1, dst_argb, 16);
    dst_argb += 32;
    src_argb0 += 32;
    src_argb1 += 32;
  }
}
1236
// Attenuates (premultiplies) 8 ARGB pixels per iteration: each B,G,R
// byte v is replaced with ((v*0x101) * (a*0x101)) >> 24, which
// approximates (v * a) / 255, where a is the pixel's own alpha. The
// final bmnz with `mask` restores the original alpha bytes untouched.
void ARGBAttenuateRow_MSA(const uint8_t* src_argb,
                          uint8_t* dst_argb,
                          int width) {
  int x;
  v16u8 src0, src1, dst0, dst1;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
  v4u32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
  v8i16 zero = {0};
  // 255 at each alpha byte position selects src alpha in bmnz below.
  v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255};

  for (x = 0; x < width; x += 8) {
    src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
    src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
    // Duplicate each byte into a halfword: v -> v*0x101.
    vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
    vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
    vec2 = (v8u16)__msa_ilvr_b((v16i8)src1, (v16i8)src1);
    vec3 = (v8u16)__msa_ilvl_b((v16i8)src1, (v16i8)src1);
    // Broadcast each pixel's (duplicated) alpha halfword, then gather
    // two pixels' worth per vector with pckev_d.
    vec4 = (v8u16)__msa_fill_h(vec0[3]);
    vec5 = (v8u16)__msa_fill_h(vec0[7]);
    vec6 = (v8u16)__msa_fill_h(vec1[3]);
    vec7 = (v8u16)__msa_fill_h(vec1[7]);
    vec4 = (v8u16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4);
    vec5 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
    vec6 = (v8u16)__msa_fill_h(vec2[3]);
    vec7 = (v8u16)__msa_fill_h(vec2[7]);
    vec8 = (v8u16)__msa_fill_h(vec3[3]);
    vec9 = (v8u16)__msa_fill_h(vec3[7]);
    vec6 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
    vec7 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8);
    // Widen alpha to 32 bits and multiply by the widened channels.
    reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec4);
    reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec4);
    reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec5);
    reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec5);
    reg4 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec6);
    reg5 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec6);
    reg6 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec7);
    reg7 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec7);
    reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec0);
    reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec0);
    reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec1);
    reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec1);
    reg4 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2);
    reg5 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2);
    reg6 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3);
    reg7 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3);
    reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24);
    reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24);
    reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24);
    reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24);
    reg4 = (v4u32)__msa_srai_w((v4i32)reg4, 24);
    reg5 = (v4u32)__msa_srai_w((v4i32)reg5, 24);
    reg6 = (v4u32)__msa_srai_w((v4i32)reg6, 24);
    reg7 = (v4u32)__msa_srai_w((v4i32)reg7, 24);
    vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
    vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4);
    vec3 = (v8u16)__msa_pckev_h((v8i16)reg7, (v8i16)reg6);
    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
    // Keep the original alpha bytes where mask == 255.
    dst0 = __msa_bmnz_v(dst0, src0, mask);
    dst1 = __msa_bmnz_v(dst1, src1, mask);
    ST_UB2(dst0, dst1, dst_argb, 16);
    src_argb += 32;
    dst_argb += 32;
  }
}
1303
// Like ARGBToRGB565Row but adds a 4-byte dither pattern (one byte per
// pixel, replicated across the row) to each B,G,R channel, clamps the
// sum to [0, 255], then packs to RGB565 (R<<11 | G<<5 | B), 8 pixels
// per iteration.
void ARGBToRGB565DitherRow_MSA(const uint8_t* src_argb,
                               uint8_t* dst_rgb,
                               uint32_t dither4,
                               int width) {
  int x;
  v16u8 src0, src1, dst0, vec0, vec1;
  v8i16 vec_d0;
  v8i16 reg0, reg1, reg2;
  v16i8 zero = {0};
  v8i16 max = __msa_ldi_h(0xFF);

  // Expand the 4 dither bytes to 8 halfwords (pattern repeated).
  vec_d0 = (v8i16)__msa_fill_w(dither4);
  vec_d0 = (v8i16)__msa_ilvr_b(zero, (v16i8)vec_d0);

  for (x = 0; x < width; x += 8) {
    src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
    src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
    vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);  // B,R bytes
    vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);  // G,A bytes
    reg0 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec0);  // B
    reg1 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec1);  // G
    reg2 = (v8i16)__msa_ilvod_b(zero, (v16i8)vec0);  // R
    reg0 += vec_d0;
    reg1 += vec_d0;
    reg2 += vec_d0;
    // Clamp dithered values to [0, 255].
    reg0 = __msa_maxi_s_h((v8i16)reg0, 0);
    reg1 = __msa_maxi_s_h((v8i16)reg1, 0);
    reg2 = __msa_maxi_s_h((v8i16)reg2, 0);
    reg0 = __msa_min_s_h((v8i16)max, (v8i16)reg0);
    reg1 = __msa_min_s_h((v8i16)max, (v8i16)reg1);
    reg2 = __msa_min_s_h((v8i16)max, (v8i16)reg2);
    // Reduce to 5/6/5 bits and position within the 16-bit output.
    reg0 = __msa_srai_h(reg0, 3);
    reg2 = __msa_srai_h(reg2, 3);
    reg1 = __msa_srai_h(reg1, 2);
    reg2 = __msa_slli_h(reg2, 11);
    reg1 = __msa_slli_h(reg1, 5);
    reg0 |= reg1;
    dst0 = (v16u8)(reg0 | reg2);
    ST_UB(dst0, dst_rgb);
    src_argb += 32;
    dst_rgb += 16;
  }
}
1347
// Reorders the 4 bytes of every ARGB pixel according to `shuffler`
// (4 byte indices in [0,3]), 8 pixels per iteration. The 4-byte
// pattern is splatted to all pixels and offset by each pixel's base
// byte position (0, 4, 8, 12) to build a full vshf control vector.
void ARGBShuffleRow_MSA(const uint8_t* src_argb,
                        uint8_t* dst_argb,
                        const uint8_t* shuffler,
                        int width) {
  int x;
  v16u8 src0, src1, dst0, dst1;
  v16i8 vec0;
  v16i8 shuffler_vec = {0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
  int32_t val = LW((int32_t*)shuffler);

  vec0 = (v16i8)__msa_fill_w(val);
  shuffler_vec += vec0;

  for (x = 0; x < width; x += 8) {
    src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
    src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16);
    dst0 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src0, (v16i8)src0);
    dst1 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src1, (v16i8)src1);
    ST_UB2(dst0, dst1, dst_argb, 16);
    src_argb += 32;
    dst_argb += 32;
  }
}
1371
// Scales each channel of 4 ARGB pixels per iteration by the matching
// byte of `value` (packed B,G,R,A scale factors): with both the pixel
// byte v and its scale byte s duplicated to v*0x101 and s*0x101, the
// result is ((v*0x101) * (s*0x101)) >> 24 per channel.
void ARGBShadeRow_MSA(const uint8_t* src_argb,
                      uint8_t* dst_argb,
                      int width,
                      uint32_t value) {
  int x;
  v16u8 src0, dst0;
  v8u16 vec0, vec1;
  v4u32 reg0, reg1, reg2, reg3, rgba_scale;
  v8i16 zero = {0};

  // Expand value's 4 scale bytes to one 32-bit lane each (s*0x101).
  rgba_scale[0] = value;
  rgba_scale = (v4u32)__msa_ilvr_b((v16i8)rgba_scale, (v16i8)rgba_scale);
  rgba_scale = (v4u32)__msa_ilvr_h(zero, (v8i16)rgba_scale);

  for (x = 0; x < width; x += 4) {
    src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
    vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);  // v -> v*0x101
    vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
    reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0);
    reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0);
    reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1);
    reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1);
    reg0 *= rgba_scale;
    reg1 *= rgba_scale;
    reg2 *= rgba_scale;
    reg3 *= rgba_scale;
    reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24);
    reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24);
    reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24);
    reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24);
    vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    ST_UB(dst0, dst_argb);
    src_argb += 16;
    dst_argb += 16;
  }
}
1410
// Converts 8 ARGB pixels per iteration to grayscale, preserving alpha:
//   gray = (15*B + 75*G + 38*R + 64) >> 7
// const_0x4B0F holds byte pair (0x0F, 0x4B) = (15, 75) multiplying the
// (B, G) halfword pairs; const_0x26 holds (0x26, 0x00) = (38, 0)
// multiplying the (R, A) pairs, so alpha does not contribute.
void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  int x;
  v16u8 src0, src1, vec0, vec1, dst0, dst1;
  v8u16 reg0;
  v16u8 const_0x26 = (v16u8)__msa_ldi_h(0x26);
  v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F);

  for (x = 0; x < width; x += 8) {
    src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
    src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16);
    vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0);  // (B,G) pairs
    vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0);  // (R,A) pairs
    reg0 = __msa_dotp_u_h(vec0, const_0x4B0F);              // 15B + 75G
    reg0 = __msa_dpadd_u_h(reg0, vec1, const_0x26);         // + 38R
    reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 7);            // rounded >> 7
    // Rebuild pixels as (gray, gray, gray, A).
    vec0 = (v16u8)__msa_ilvev_b((v16i8)reg0, (v16i8)reg0);
    vec1 = (v16u8)__msa_ilvod_b((v16i8)vec1, (v16i8)vec0);
    dst0 = (v16u8)__msa_ilvr_b((v16i8)vec1, (v16i8)vec0);
    dst1 = (v16u8)__msa_ilvl_b((v16i8)vec1, (v16i8)vec0);
    ST_UB2(dst0, dst1, dst_argb, 16);
    src_argb += 32;
    dst_argb += 32;
  }
}
1435
// In-place sepia filter over 8 ARGB pixels per iteration, preserving
// alpha. Each fill_h constant encodes a (B, G) byte coefficient pair
// and each ldi_h constant the R coefficient:
//   newB = (17*B + 68*G + 35*R) >> 7   (0x4411 -> 17,68; 0x23 -> 35)
//   newG = (22*B + 88*G + 45*R) >> 7   (0x5816 -> 22,88; 0x2D -> 45)
//   newR = (24*B + 98*G + 50*R) >> 7   (0x6218 -> 24,98; 0x32 -> 50)
void ARGBSepiaRow_MSA(uint8_t* dst_argb, int width) {
  int x;
  v16u8 src0, src1, dst0, dst1, vec0, vec1, vec2, vec3, vec4, vec5;
  v8u16 reg0, reg1, reg2;
  v16u8 const_0x4411 = (v16u8)__msa_fill_h(0x4411);
  v16u8 const_0x23 = (v16u8)__msa_ldi_h(0x23);
  v16u8 const_0x5816 = (v16u8)__msa_fill_h(0x5816);
  v16u8 const_0x2D = (v16u8)__msa_ldi_h(0x2D);
  v16u8 const_0x6218 = (v16u8)__msa_fill_h(0x6218);
  v16u8 const_0x32 = (v16u8)__msa_ldi_h(0x32);
  v8u16 const_0xFF = (v8u16)__msa_ldi_h(0xFF);

  for (x = 0; x < width; x += 8) {
    src0 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 0);
    src1 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 16);
    vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0);  // (B,G) pairs
    vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0);  // (R,A) pairs
    vec3 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec1);  // alpha bytes
    reg0 = (v8u16)__msa_dotp_u_h(vec0, const_0x4411);
    reg1 = (v8u16)__msa_dotp_u_h(vec0, const_0x5816);
    reg2 = (v8u16)__msa_dotp_u_h(vec0, const_0x6218);
    reg0 = (v8u16)__msa_dpadd_u_h(reg0, vec1, const_0x23);
    reg1 = (v8u16)__msa_dpadd_u_h(reg1, vec1, const_0x2D);
    reg2 = (v8u16)__msa_dpadd_u_h(reg2, vec1, const_0x32);
    reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 7);
    reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 7);
    reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 7);
    // Only G and R can exceed 255 (their coefficient sums are > 128);
    // the B sum cannot, so it needs no clamp.
    reg1 = (v8u16)__msa_min_u_h((v8u16)reg1, const_0xFF);
    reg2 = (v8u16)__msa_min_u_h((v8u16)reg2, const_0xFF);
    vec0 = (v16u8)__msa_pckev_b((v16i8)reg0, (v16i8)reg0);
    vec1 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg1);
    vec2 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg2);
    // Re-interleave to B,G,R,A byte order.
    vec4 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0);
    vec5 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1);
    dst0 = (v16u8)__msa_ilvr_b((v16i8)vec5, (v16i8)vec4);
    dst1 = (v16u8)__msa_ilvl_b((v16i8)vec5, (v16i8)vec4);
    ST_UB2(dst0, dst1, dst_argb, 16);
    dst_argb += 32;
  }
}
1476
// Expands 16 ARGB4444 pixels (32 bytes) per iteration to ARGB8888
// (64 bytes). Each 4-bit channel nibble n becomes the byte (n << 4) | n
// so that 0x0 -> 0x00 and 0xF -> 0xFF.
void ARGB4444ToARGBRow_MSA(const uint8_t* src_argb4444,
                           uint8_t* dst_argb,
                           int width) {
  int x;
  v16u8 src0, src1;
  v8u16 vec0, vec1, vec2, vec3;
  v16u8 dst0, dst1, dst2, dst3;

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((v16u8*)src_argb4444, 0);
    src1 = (v16u8)__msa_ld_b((v16u8*)src_argb4444, 16);
    vec0 = (v8u16)__msa_andi_b(src0, 0x0F);  // low nibbles  (B, R)
    vec1 = (v8u16)__msa_andi_b(src1, 0x0F);
    vec2 = (v8u16)__msa_andi_b(src0, 0xF0);  // high nibbles (G, A)
    vec3 = (v8u16)__msa_andi_b(src1, 0xF0);
    // Replicate each nibble into both halves of its byte.
    vec0 |= (v8u16)__msa_slli_b((v16i8)vec0, 4);
    vec1 |= (v8u16)__msa_slli_b((v16i8)vec1, 4);
    vec2 |= (v8u16)__msa_srli_b((v16i8)vec2, 4);
    vec3 |= (v8u16)__msa_srli_b((v16i8)vec3, 4);
    dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0);
    dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0);
    dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1);
    dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1);
    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
    src_argb4444 += 32;
    dst_argb += 64;
  }
}
1505
// Expands 16 ARGB1555 pixels (32 bytes) per iteration to ARGB8888.
// Each 5-bit channel c becomes (c << 3) | (c >> 2); the single alpha
// bit a becomes 0x00 or 0xFF via negation of 0/1 bytes.
void ARGB1555ToARGBRow_MSA(const uint8_t* src_argb1555,
                           uint8_t* dst_argb,
                           int width) {
  int x;
  v8u16 src0, src1;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5;
  v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6;
  v16u8 dst0, dst1, dst2, dst3;
  v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);

  for (x = 0; x < width; x += 16) {
    src0 = (v8u16)__msa_ld_h((void*)src_argb1555, 0);
    src1 = (v8u16)__msa_ld_h((void*)src_argb1555, 16);
    // Peel off the 5-bit fields: vec0/1 = B, vec2/3 = G, vec4/5 = R,
    // and after the final shift src0/src1 hold the alpha bit.
    vec0 = src0 & const_0x1F;
    vec1 = src1 & const_0x1F;
    src0 = (v8u16)__msa_srli_h((v8i16)src0, 5);
    src1 = (v8u16)__msa_srli_h((v8i16)src1, 5);
    vec2 = src0 & const_0x1F;
    vec3 = src1 & const_0x1F;
    src0 = (v8u16)__msa_srli_h((v8i16)src0, 5);
    src1 = (v8u16)__msa_srli_h((v8i16)src1, 5);
    vec4 = src0 & const_0x1F;
    vec5 = src1 & const_0x1F;
    src0 = (v8u16)__msa_srli_h((v8i16)src0, 5);
    src1 = (v8u16)__msa_srli_h((v8i16)src1, 5);
    reg0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    reg1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
    reg2 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
    reg3 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
    // 5 -> 8 bit expansion: (c << 3) | (c >> 2).
    reg4 = (v16u8)__msa_slli_b((v16i8)reg0, 3);
    reg5 = (v16u8)__msa_slli_b((v16i8)reg1, 3);
    reg6 = (v16u8)__msa_slli_b((v16i8)reg2, 3);
    reg4 |= (v16u8)__msa_srai_b((v16i8)reg0, 2);
    reg5 |= (v16u8)__msa_srai_b((v16i8)reg1, 2);
    reg6 |= (v16u8)__msa_srai_b((v16i8)reg2, 2);
    reg3 = -reg3;  // alpha: 0 -> 0x00, 1 -> 0xFF
    // Interleave back to B,G,R,A byte order.
    reg0 = (v16u8)__msa_ilvr_b((v16i8)reg6, (v16i8)reg4);
    reg1 = (v16u8)__msa_ilvl_b((v16i8)reg6, (v16i8)reg4);
    reg2 = (v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg5);
    reg3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg5);
    dst0 = (v16u8)__msa_ilvr_b((v16i8)reg2, (v16i8)reg0);
    dst1 = (v16u8)__msa_ilvl_b((v16i8)reg2, (v16i8)reg0);
    dst2 = (v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg1);
    dst3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg1);
    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
    src_argb1555 += 32;
    dst_argb += 64;
  }
}
1555
// Expands 16 RGB565 pixels (32 bytes) per iteration to ARGB8888 with
// opaque alpha (ALPHA_VAL). 5-bit channels become (c << 3) | (c >> 2),
// the 6-bit G becomes (g << 2) | (g >> 4); the shift amounts below are
// relative to each field's position inside the 16-bit word.
void RGB565ToARGBRow_MSA(const uint8_t* src_rgb565,
                         uint8_t* dst_argb,
                         int width) {
  int x;
  v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5;
  v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
  v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3;
  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
  v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);      // B mask (bits 0-4)
  v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0);   // G mask (bits 5-10)
  v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800);  // R mask (bits 11-15)

  for (x = 0; x < width; x += 16) {
    src0 = (v8u16)__msa_ld_h((void*)src_rgb565, 0);
    src1 = (v8u16)__msa_ld_h((void*)src_rgb565, 16);
    vec0 = src0 & const_0x1F;
    vec1 = src0 & const_0x7E0;
    vec2 = src0 & const_0xF800;
    vec3 = src1 & const_0x1F;
    vec4 = src1 & const_0x7E0;
    vec5 = src1 & const_0xF800;
    reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3);
    reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3);
    reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8);
    reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3);
    reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3);
    reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8);
    // OR in the replicated top bits of each field.
    reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2);
    reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9);
    reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13);
    reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2);
    reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9);
    reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13);
    // Interleave to B,G,R,A byte order with constant alpha.
    res0 = (v16u8)__msa_ilvev_b((v16i8)reg2, (v16i8)reg0);
    res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg1);
    res2 = (v16u8)__msa_ilvev_b((v16i8)reg5, (v16i8)reg3);
    res3 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg4);
    dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
    dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0);
    dst2 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res2);
    dst3 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res2);
    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
    src_rgb565 += 32;
    dst_argb += 64;
  }
}
1602
// Converts 16 RGB24 pixels (3 bytes/pixel) to ARGB (4 bytes/pixel) per
// loop iteration by inserting a constant alpha byte (0xFF) after every
// third source byte; the channel byte order is preserved.
void RGB24ToARGBRow_MSA(const uint8_t* src_rgb24,
                        uint8_t* dst_argb,
                        int width) {
  int x;
  v16u8 src0, src1, src2;
  v16u8 vec0, vec1, vec2;
  v16u8 dst0, dst1, dst2, dst3;
  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
  // Shuffle control: three pixel bytes, then one alpha byte (indices
  // >= 16 select from the alpha operand of vshf).
  v16i8 shuffler = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((void*)src_rgb24, 0);
    src1 = (v16u8)__msa_ld_b((void*)src_rgb24, 16);
    src2 = (v16u8)__msa_ld_b((void*)src_rgb24, 32);
    // Realign so each vector begins on a 12-byte (4 pixel) boundary.
    vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12);
    vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8);
    vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4);
    dst0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)src0);
    dst1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec0);
    dst2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec1);
    dst3 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec2);
    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
    src_rgb24 += 48;  // 16 pixels * 3 bytes in.
    dst_argb += 64;   // 16 pixels * 4 bytes out.
  }
}
1629
// Converts 16 RAW pixels (3 bytes/pixel) to ARGB per loop iteration.
// Identical structure to RGB24ToARGBRow_MSA, but the mask indices
// {2,1,0,...} reverse each 3-byte triplet, swapping the first and third
// channel while inserting the constant alpha (0xFF) byte.
void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
  int x;
  v16u8 src0, src1, src2;
  v16u8 vec0, vec1, vec2;
  v16u8 dst0, dst1, dst2, dst3;
  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
  // Reversed triplet, then one alpha byte (indices >= 16 select alpha).
  v16i8 mask = {2, 1, 0, 16, 5, 4, 3, 17, 8, 7, 6, 18, 11, 10, 9, 19};

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((void*)src_raw, 0);
    src1 = (v16u8)__msa_ld_b((void*)src_raw, 16);
    src2 = (v16u8)__msa_ld_b((void*)src_raw, 32);
    // Realign so each vector begins on a 12-byte (4 pixel) boundary.
    vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12);
    vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8);
    vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4);
    dst0 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)src0);
    dst1 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec0);
    dst2 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec1);
    dst3 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec2);
    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
    src_raw += 48;   // 16 pixels * 3 bytes in.
    dst_argb += 64;  // 16 pixels * 4 bytes out.
  }
}
1654
// Converts 16 ARGB1555 pixels to 16 luma (Y) bytes per loop iteration.
// Each 5-bit channel is widened to 8 bits, then combined as
// Y = (25*B + 129*G + 66*R + 0x1080) >> 8, where 0x1080 folds in the
// +16 luma offset (16 << 8) and +0.5 rounding (0x80); the alpha bit is
// discarded by the final 5-bit extraction.
void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555,
                        uint8_t* dst_y,
                        int width) {
  int x;
  v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5;
  v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
  v16u8 dst0;
  v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19);
  v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81);
  v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42);
  v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
  v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);

  for (x = 0; x < width; x += 16) {
    src0 = (v8u16)__msa_ld_b((void*)src_argb1555, 0);
    src1 = (v8u16)__msa_ld_b((void*)src_argb1555, 16);
    // Peel off the three 5-bit channels by masking then shifting right 5.
    vec0 = src0 & const_0x1F;
    vec1 = src1 & const_0x1F;
    src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
    src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
    vec2 = src0 & const_0x1F;
    vec3 = src1 & const_0x1F;
    src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
    src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
    vec4 = src0 & const_0x1F;
    vec5 = src1 & const_0x1F;
    // Widen 5-bit values to 8 bits: (c << 3) | (c >> 2).
    reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3);
    reg1 = (v8u16)__msa_slli_h((v8i16)vec1, 3);
    reg0 |= (v8u16)__msa_srai_h((v8i16)vec0, 2);
    reg1 |= (v8u16)__msa_srai_h((v8i16)vec1, 2);
    reg2 = (v8u16)__msa_slli_h((v8i16)vec2, 3);
    reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3);
    reg2 |= (v8u16)__msa_srai_h((v8i16)vec2, 2);
    reg3 |= (v8u16)__msa_srai_h((v8i16)vec3, 2);
    reg4 = (v8u16)__msa_slli_h((v8i16)vec4, 3);
    reg5 = (v8u16)__msa_slli_h((v8i16)vec5, 3);
    reg4 |= (v8u16)__msa_srai_h((v8i16)vec4, 2);
    reg5 |= (v8u16)__msa_srai_h((v8i16)vec5, 2);
    // Weighted sum of the channels (low field * 25, mid * 129, high * 66).
    reg0 *= const_0x19;
    reg1 *= const_0x19;
    reg2 *= const_0x81;
    reg3 *= const_0x81;
    reg4 *= const_0x42;
    reg5 *= const_0x42;
    reg0 += reg2;
    reg1 += reg3;
    reg0 += reg4;
    reg1 += reg5;
    reg0 += const_0x1080;
    reg1 += const_0x1080;
    reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
    reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8);
    dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
    ST_UB(dst0, dst_y);
    src_argb1555 += 32;  // 16 pixels * 2 bytes in.
    dst_y += 16;         // 16 luma bytes out.
  }
}
1713
// Converts 16 RGB565 pixels to 16 luma (Y) bytes per loop iteration.
// Fields are widened to 8 bits, then dot-products compute
// Y = (25*B + 129*G + 66*R + 0x1080) >> 8 (the 0x810019 / 0x010042
// words pair the halfword coefficients for the dotp/dpadd steps, and
// 0x1080 carries the +16 offset and rounding).
void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
  int x;
  v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
  v4u32 res0, res1, res2, res3;
  v16u8 dst0;
  // Coefficient pairs: {0x19, 0x81} = {25, 129} and {0x42, 0x01} = {66, 1};
  // the "1" multiplies the interleaved 0x1080 bias term.
  v4u32 const_0x810019 = (v4u32)__msa_fill_w(0x810019);
  v4u32 const_0x010042 = (v4u32)__msa_fill_w(0x010042);
  v8i16 const_0x1080 = __msa_fill_h(0x1080);
  v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
  v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0);
  v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800);

  for (x = 0; x < width; x += 16) {
    src0 = (v8u16)__msa_ld_b((void*)src_rgb565, 0);
    src1 = (v8u16)__msa_ld_b((void*)src_rgb565, 16);
    // Isolate the three bit-fields of each halfword.
    vec0 = src0 & const_0x1F;
    vec1 = src0 & const_0x7E0;
    vec2 = src0 & const_0xF800;
    vec3 = src1 & const_0x1F;
    vec4 = src1 & const_0x7E0;
    vec5 = src1 & const_0xF800;
    // Widen each field to 8 bits by top-bit replication (shift counts
    // include the field position within the halfword).
    reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3);
    reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3);
    reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8);
    reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3);
    reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3);
    reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8);
    reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2);
    reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9);
    reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13);
    reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2);
    reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9);
    reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13);
    // Interleave (B,G) pairs and (R,bias) pairs for the dot products.
    vec0 = (v8u16)__msa_ilvr_h((v8i16)reg1, (v8i16)reg0);
    vec1 = (v8u16)__msa_ilvl_h((v8i16)reg1, (v8i16)reg0);
    vec2 = (v8u16)__msa_ilvr_h((v8i16)reg4, (v8i16)reg3);
    vec3 = (v8u16)__msa_ilvl_h((v8i16)reg4, (v8i16)reg3);
    vec4 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg2);
    vec5 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg2);
    vec6 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg5);
    vec7 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg5);
    // res = 25*B + 129*G, then += 66*R + 1*0x1080.
    res0 = __msa_dotp_u_w(vec0, (v8u16)const_0x810019);
    res1 = __msa_dotp_u_w(vec1, (v8u16)const_0x810019);
    res2 = __msa_dotp_u_w(vec2, (v8u16)const_0x810019);
    res3 = __msa_dotp_u_w(vec3, (v8u16)const_0x810019);
    res0 = __msa_dpadd_u_w(res0, vec4, (v8u16)const_0x010042);
    res1 = __msa_dpadd_u_w(res1, vec5, (v8u16)const_0x010042);
    res2 = __msa_dpadd_u_w(res2, vec6, (v8u16)const_0x010042);
    res3 = __msa_dpadd_u_w(res3, vec7, (v8u16)const_0x010042);
    res0 = (v4u32)__msa_srai_w((v4i32)res0, 8);
    res1 = (v4u32)__msa_srai_w((v4i32)res1, 8);
    res2 = (v4u32)__msa_srai_w((v4i32)res2, 8);
    res3 = (v4u32)__msa_srai_w((v4i32)res3, 8);
    // Narrow the 32-bit results down to 16 luma bytes.
    vec0 = (v8u16)__msa_pckev_h((v8i16)res1, (v8i16)res0);
    vec1 = (v8u16)__msa_pckev_h((v8i16)res3, (v8i16)res2);
    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    ST_UB(dst0, dst_y);
    src_rgb565 += 32;  // 16 pixels * 2 bytes in.
    dst_y += 16;       // 16 luma bytes out.
  }
}
1776
// Converts 16 RGB24 pixels (3 bytes/pixel) to 16 luma (Y) bytes per loop.
// The masks expand each 3-byte pixel into a 4-byte lane so byte dot
// products can compute Y = (25*B + 129*G + 66*R + 0x1080) >> 8
// (const_0x8119 packs the byte pair {0x19, 0x81} = {25, 129}).
void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
  int x;
  v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0;
  v8u16 vec0, vec1, vec2, vec3;
  v8u16 const_0x8119 = (v8u16)__msa_fill_h(0x8119);
  v8u16 const_0x42 = (v8u16)__msa_fill_h(0x42);
  v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
  // Each mask regroups 3-byte pixels into overlapping 4-byte lanes.
  v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12};
  v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18,
                 18, 19, 20, 21, 21, 22, 23, 24};
  v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20};
  v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16};
  v16i8 zero = {0};

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
    src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
    src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
    // Regroup 48 input bytes into four vectors of 4-byte pixel lanes.
    reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0);
    reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
    reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1);
    reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2);
    // Split the lanes: even halfwords hold (B,G), odd hold (R, next B).
    vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
    vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0);
    vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2);
    // 25*B + 129*G, then += 66*R (the {0x42, 0} pair ignores the 2nd byte).
    vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8119);
    vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8119);
    vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x42);
    vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x42);
    vec0 += const_0x1080;  // +16 luma offset (<<8) plus rounding.
    vec1 += const_0x1080;
    vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8);
    vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8);
    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    ST_UB(dst0, dst_y);
    src_argb0 += 48;  // 16 pixels * 3 bytes in.
    dst_y += 16;      // 16 luma bytes out.
  }
}
1817
// Converts 16 RAW pixels (3 bytes/pixel, channels reversed relative to
// RGB24) to 16 luma (Y) bytes per loop.  Same structure as
// RGB24ToYRow_MSA but with swapped coefficient pairing:
// const_0x8142 packs {0x42, 0x81} = {66, 129} for the first two bytes
// and const_0x19 = 25 weights the third, so the same
// Y = (25*B + 129*G + 66*R + 0x1080) >> 8 formula results.
void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
  int x;
  v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0;
  v8u16 vec0, vec1, vec2, vec3;
  v8u16 const_0x8142 = (v8u16)__msa_fill_h(0x8142);
  v8u16 const_0x19 = (v8u16)__msa_fill_h(0x19);
  v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
  // Each mask regroups 3-byte pixels into overlapping 4-byte lanes.
  v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12};
  v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18,
                 18, 19, 20, 21, 21, 22, 23, 24};
  v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20};
  v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16};
  v16i8 zero = {0};

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
    src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
    src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
    // Regroup 48 input bytes into four vectors of 4-byte pixel lanes.
    reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0);
    reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
    reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1);
    reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2);
    // Even halfwords hold the first two channels, odd the third.
    vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
    vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0);
    vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2);
    vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8142);
    vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8142);
    vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x19);
    vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x19);
    vec0 += const_0x1080;  // +16 luma offset (<<8) plus rounding.
    vec1 += const_0x1080;
    vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8);
    vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8);
    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    ST_UB(dst0, dst_y);
    src_argb0 += 48;  // 16 pixels * 3 bytes in.
    dst_y += 16;      // 16 luma bytes out.
  }
}
1858
// Computes subsampled U and V from two rows of ARGB1555 pixels:
// per iteration 16 pixels from each row produce 8 U and 8 V bytes
// (2x2 box average).  Chroma formula (BT.601 style coefficients):
//   U = (112*B -  74*G - 38*R + 0x8080) >> 8
//   V = (112*R -  94*G - 18*B + 0x8080) >> 8
// where 0x8080 = (128 << 8) + 128 supplies the chroma offset + rounding.
void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555,
                         int src_stride_argb1555,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  int x;
  const uint16_t* s = (const uint16_t*)src_argb1555;
  const uint16_t* t = (const uint16_t*)(src_argb1555 + src_stride_argb1555);
  int64_t res0, res1;
  v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6;
  v16u8 dst0;
  v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
  v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
  v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
  v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
  v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
  v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);

  for (x = 0; x < width; x += 16) {
    src0 = (v8u16)__msa_ld_b((void*)s, 0);
    src1 = (v8u16)__msa_ld_b((void*)s, 16);
    src2 = (v8u16)__msa_ld_b((void*)t, 0);
    src3 = (v8u16)__msa_ld_b((void*)t, 16);
    // Extract each 5-bit channel and sum the two rows vertically; the
    // per-pixel sums (<= 62) fit in the low byte kept by pckev_b.
    vec0 = src0 & const_0x1F;
    vec1 = src1 & const_0x1F;
    vec0 += src2 & const_0x1F;
    vec1 += src3 & const_0x1F;
    vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
    src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
    src2 = (v8u16)__msa_srai_h((v8i16)src2, 5);
    src3 = (v8u16)__msa_srai_h((v8i16)src3, 5);
    vec2 = src0 & const_0x1F;
    vec3 = src1 & const_0x1F;
    vec2 += src2 & const_0x1F;
    vec3 += src3 & const_0x1F;
    vec2 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
    src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
    src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
    src2 = (v8u16)__msa_srai_h((v8i16)src2, 5);
    src3 = (v8u16)__msa_srai_h((v8i16)src3, 5);
    vec4 = src0 & const_0x1F;
    vec5 = src1 & const_0x1F;
    vec4 += src2 & const_0x1F;
    vec5 += src3 & const_0x1F;
    vec4 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
    // Horizontal pair add -> sum of each 2x2 block per channel.
    vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
    vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
    vec4 = __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
    // (sum << 1) | (sum >> 6): scales the 7-bit 2x2 sum of 5-bit samples
    // to an 8-bit average, ((sum/4) << 3) | ((sum/4) >> 2), in one step.
    vec6 = (v8u16)__msa_slli_h((v8i16)vec0, 1);
    vec6 |= (v8u16)__msa_srai_h((v8i16)vec0, 6);
    vec0 = (v8u16)__msa_slli_h((v8i16)vec2, 1);
    vec0 |= (v8u16)__msa_srai_h((v8i16)vec2, 6);
    vec2 = (v8u16)__msa_slli_h((v8i16)vec4, 1);
    vec2 |= (v8u16)__msa_srai_h((v8i16)vec4, 6);
    // vec6 = B, vec0 = G, vec2 = R averages.  Note the arithmetic below
    // relies on unsigned wraparound before the final shift.
    reg0 = vec6 * const_0x70;
    reg1 = vec0 * const_0x4A;
    reg2 = vec2 * const_0x70;
    reg3 = vec0 * const_0x5E;
    reg0 += const_0x8080;
    reg1 += vec2 * const_0x26;
    reg2 += const_0x8080;
    reg3 += vec6 * const_0x12;
    reg0 -= reg1;  // U numerator.
    reg2 -= reg3;  // V numerator.
    reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
    reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8);
    dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
    res0 = __msa_copy_u_d((v2i64)dst0, 0);
    res1 = __msa_copy_u_d((v2i64)dst0, 1);
    SD(res0, dst_u);
    SD(res1, dst_v);
    s += 16;
    t += 16;
    dst_u += 8;
    dst_v += 8;
  }
}
1939
// Computes subsampled U and V from two rows of RGB565 pixels:
// per iteration 16 pixels from each row produce 8 U and 8 V bytes
// (2x2 box average).  Chroma formula (BT.601 style coefficients):
//   U = (112*B -  74*G - 38*R + 0x8080) >> 8
//   V = (112*R -  94*G - 18*B + 0x8080) >> 8
// The 5-bit B/R averages are widened to 8 bits; the 6-bit green 2x2 sum
// is used directly (it already approximates the 8-bit average).
void RGB565ToUVRow_MSA(const uint8_t* src_rgb565,
                       int src_stride_rgb565,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  int x;
  const uint16_t* s = (const uint16_t*)src_rgb565;
  const uint16_t* t = (const uint16_t*)(src_rgb565 + src_stride_rgb565);
  int64_t res0, res1;
  v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5;
  v16u8 dst0;
  v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
  v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
  v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
  v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
  v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
  // Chroma offset + rounding: (128 << 8) + 128.
  v8u16 const_32896 = (v8u16)__msa_fill_h(0x8080);
  v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
  v8u16 const_0x3F = (v8u16)__msa_fill_h(0x3F);

  for (x = 0; x < width; x += 16) {
    src0 = (v8u16)__msa_ld_b((void*)s, 0);
    src1 = (v8u16)__msa_ld_b((void*)s, 16);
    src2 = (v8u16)__msa_ld_b((void*)t, 0);
    src3 = (v8u16)__msa_ld_b((void*)t, 16);
    // Extract blue (5 bits), sum the two rows, pack the byte-sized sums.
    vec0 = src0 & const_0x1F;
    vec1 = src1 & const_0x1F;
    vec0 += src2 & const_0x1F;
    vec1 += src3 & const_0x1F;
    vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
    src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
    src2 = (v8u16)__msa_srai_h((v8i16)src2, 5);
    src3 = (v8u16)__msa_srai_h((v8i16)src3, 5);
    // Green (6 bits).
    vec2 = src0 & const_0x3F;
    vec3 = src1 & const_0x3F;
    vec2 += src2 & const_0x3F;
    vec3 += src3 & const_0x3F;
    vec1 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
    src0 = (v8u16)__msa_srai_h((v8i16)src0, 6);
    src1 = (v8u16)__msa_srai_h((v8i16)src1, 6);
    src2 = (v8u16)__msa_srai_h((v8i16)src2, 6);
    src3 = (v8u16)__msa_srai_h((v8i16)src3, 6);
    // Red (5 bits).
    vec4 = src0 & const_0x1F;
    vec5 = src1 & const_0x1F;
    vec4 += src2 & const_0x1F;
    vec5 += src3 & const_0x1F;
    vec2 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
    // Horizontal pair add -> 2x2 sums per channel.
    vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
    vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
    vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
    // Scale the 5-bit 2x2 sums to 8-bit averages: (sum << 1) | (sum >> 6).
    vec3 = (v8u16)__msa_slli_h((v8i16)vec0, 1);
    vec3 |= (v8u16)__msa_srai_h((v8i16)vec0, 6);
    vec4 = (v8u16)__msa_slli_h((v8i16)vec2, 1);
    vec4 |= (v8u16)__msa_srai_h((v8i16)vec2, 6);
    // vec3 = B, vec1 = G, vec4 = R.  Unsigned wraparound is intentional.
    reg0 = vec3 * const_0x70;
    reg1 = vec1 * const_0x4A;
    reg2 = vec4 * const_0x70;
    reg3 = vec1 * const_0x5E;
    reg0 += const_32896;
    reg1 += vec4 * const_0x26;
    reg2 += const_32896;
    reg3 += vec3 * const_0x12;
    reg0 -= reg1;  // U numerator.
    reg2 -= reg3;  // V numerator.
    reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
    reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8);
    dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
    res0 = __msa_copy_u_d((v2i64)dst0, 0);
    res1 = __msa_copy_u_d((v2i64)dst0, 1);
    SD(res0, dst_u);
    SD(res1, dst_v);
    s += 16;
    t += 16;
    dst_u += 8;
    dst_v += 8;
  }
}
2019
// Computes subsampled U and V from two rows of RGB24 pixels:
// per iteration 16 pixels from each row produce 8 U and 8 V bytes
// (2x2 box average).  Chroma formula (BT.601 style coefficients):
//   U = (112*B -  74*G - 38*R + 0x8080) >> 8
//   V = (112*R -  94*G - 18*B + 0x8080) >> 8
void RGB24ToUVRow_MSA(const uint8_t* src_rgb0,
                      int src_stride_rgb,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  int x;
  const uint8_t* s = src_rgb0;
  const uint8_t* t = src_rgb0 + src_stride_rgb;
  int64_t res0, res1;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 inp0, inp1, inp2, inp3, inp4, inp5;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8i16 reg0, reg1, reg2, reg3;
  v16u8 dst0;
  v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70);
  v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A);
  v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26);
  v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E);
  v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12);
  // Chroma offset + rounding: (128 << 8) + 128.
  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
  // Expands 3-byte pixels to 4-byte lanes; indices >= 16 pull zero bytes.
  v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};
  v16i8 zero = {0};

  for (x = 0; x < width; x += 16) {
    inp0 = (v16u8)__msa_ld_b((void*)s, 0);
    inp1 = (v16u8)__msa_ld_b((void*)s, 16);
    inp2 = (v16u8)__msa_ld_b((void*)s, 32);
    inp3 = (v16u8)__msa_ld_b((void*)t, 0);
    inp4 = (v16u8)__msa_ld_b((void*)t, 16);
    inp5 = (v16u8)__msa_ld_b((void*)t, 32);
    // Realign to 12-byte (4 pixel) boundaries for both rows.
    src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12);
    src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12);
    src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8);
    src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8);
    src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4);
    src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4);
    // Expand each 3-byte pixel to a B,G,R,0 4-byte lane.
    src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0);
    src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1);
    src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2);
    src3 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3);
    src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3);
    src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5);
    src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6);
    src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7);
    // Interleave row 0 with row 1, then pair-add to get vertical sums.
    vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0);
    vec1 = (v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0);
    vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1);
    vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1);
    vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2);
    vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2);
    vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3);
    vec7 = (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3);
    vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
    vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
    vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
    vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
    vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
    vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5);
    vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6);
    vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7);
    // Add horizontally adjacent pixels (even + odd doublewords), then
    // >> 2 turns each 2x2 sum into an average.
    reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0);
    reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2);
    reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4);
    reg3 = (v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
    reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0);
    reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2);
    reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4);
    reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6);
    reg0 = __msa_srai_h((v8i16)reg0, 2);
    reg1 = __msa_srai_h((v8i16)reg1, 2);
    reg2 = __msa_srai_h((v8i16)reg2, 2);
    reg3 = __msa_srai_h((v8i16)reg3, 2);
    // De-interleave channel planes: vec0 = B, vec1 = G, vec2 = R.
    vec4 = (v8u16)__msa_pckev_h(reg1, reg0);
    vec5 = (v8u16)__msa_pckev_h(reg3, reg2);
    vec6 = (v8u16)__msa_pckod_h(reg1, reg0);
    vec7 = (v8u16)__msa_pckod_h(reg3, reg2);
    vec0 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4);
    vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6);
    vec2 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4);
    vec3 = vec0 * const_0x70;
    vec4 = vec1 * const_0x4A;
    vec5 = vec2 * const_0x26;
    vec2 *= const_0x70;
    vec1 *= const_0x5E;
    vec0 *= const_0x12;
    // U = 112B + (0x8080 - 74G - 38R); V = 112R + (0x8080 - 94G - 18B).
    reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4);
    reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5);
    reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1);
    reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0);
    reg0 += reg1;
    reg2 += reg3;
    reg0 = __msa_srai_h(reg0, 8);
    reg2 = __msa_srai_h(reg2, 8);
    dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
    res0 = __msa_copy_u_d((v2i64)dst0, 0);
    res1 = __msa_copy_u_d((v2i64)dst0, 1);
    SD(res0, dst_u);
    SD(res1, dst_v);
    t += 48;
    s += 48;
    dst_u += 8;
    dst_v += 8;
  }
}
2124
// Computes subsampled U and V from two rows of RAW pixels (channels
// reversed relative to RGB24): per iteration 16 pixels from each row
// produce 8 U and 8 V bytes (2x2 box average).  Same pipeline as
// RGB24ToUVRow_MSA; only the final plane de-interleave (pckod vs pckev)
// differs to account for the reversed channel order, yielding
//   U = (112*B - 74*G - 38*R + 0x8080) >> 8
//   V = (112*R - 94*G - 18*B + 0x8080) >> 8
void RAWToUVRow_MSA(const uint8_t* src_rgb0,
                    int src_stride_rgb,
                    uint8_t* dst_u,
                    uint8_t* dst_v,
                    int width) {
  int x;
  const uint8_t* s = src_rgb0;
  const uint8_t* t = src_rgb0 + src_stride_rgb;
  int64_t res0, res1;
  v16u8 inp0, inp1, inp2, inp3, inp4, inp5;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8i16 reg0, reg1, reg2, reg3;
  v16u8 dst0;
  v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70);
  v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A);
  v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26);
  v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E);
  v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12);
  // Chroma offset + rounding: (128 << 8) + 128.
  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
  // Expands 3-byte pixels to 4-byte lanes; indices >= 16 pull zero bytes.
  v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};
  v16i8 zero = {0};

  for (x = 0; x < width; x += 16) {
    inp0 = (v16u8)__msa_ld_b((void*)s, 0);
    inp1 = (v16u8)__msa_ld_b((void*)s, 16);
    inp2 = (v16u8)__msa_ld_b((void*)s, 32);
    inp3 = (v16u8)__msa_ld_b((void*)t, 0);
    inp4 = (v16u8)__msa_ld_b((void*)t, 16);
    inp5 = (v16u8)__msa_ld_b((void*)t, 32);
    // Realign to 12-byte (4 pixel) boundaries for both rows.
    src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12);
    src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12);
    src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8);
    src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8);
    src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4);
    src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4);
    // Expand each 3-byte pixel to a 4-byte lane with a zero pad byte.
    src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0);
    src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1);
    src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2);
    src3 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3);
    src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3);
    src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5);
    src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6);
    src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7);
    // Interleave row 0 with row 1, then pair-add to get vertical sums.
    vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0);
    vec1 = (v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0);
    vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1);
    vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1);
    vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2);
    vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2);
    vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3);
    vec7 = (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3);
    vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
    vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
    vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
    vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
    vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
    vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5);
    vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6);
    vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7);
    // Add horizontally adjacent pixels (even + odd doublewords), then
    // >> 2 turns each 2x2 sum into an average.
    reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0);
    reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2);
    reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4);
    reg3 = (v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
    reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0);
    reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2);
    reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4);
    reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6);
    reg0 = __msa_srai_h(reg0, 2);
    reg1 = __msa_srai_h(reg1, 2);
    reg2 = __msa_srai_h(reg2, 2);
    reg3 = __msa_srai_h(reg3, 2);
    // De-interleave channel planes: vec0 = B, vec1 = G, vec2 = R
    // (pck even/odd selection mirrors RGB24's to undo the R,G,B order).
    vec4 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
    vec5 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
    vec6 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0);
    vec7 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2);
    vec0 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4);
    vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6);
    vec2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4);
    vec3 = vec0 * const_0x70;
    vec4 = vec1 * const_0x4A;
    vec5 = vec2 * const_0x26;
    vec2 *= const_0x70;
    vec1 *= const_0x5E;
    vec0 *= const_0x12;
    // U = 112B + (0x8080 - 74G - 38R); V = 112R + (0x8080 - 94G - 18B).
    reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4);
    reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5);
    reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1);
    reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0);
    reg0 += reg1;
    reg2 += reg3;
    reg0 = __msa_srai_h(reg0, 8);
    reg2 = __msa_srai_h(reg2, 8);
    dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
    res0 = __msa_copy_u_d((v2i64)dst0, 0);
    res1 = __msa_copy_u_d((v2i64)dst0, 1);
    SD(res0, dst_u);
    SD(res1, dst_v);
    t += 48;
    s += 48;
    dst_u += 8;
    dst_v += 8;
  }
}
2229
// Converts 8 NV12 pixels (planar Y plus interleaved U,V at half
// horizontal resolution) to ARGB per loop iteration, using the
// YUVTORGB_SETUP / YUVTORGB macros with the caller-supplied conversion
// constants.  Alpha is forced to 0xFF.
void NV12ToARGBRow_MSA(const uint8_t* src_y,
                       const uint8_t* src_uv,
                       uint8_t* dst_argb,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  int x;
  uint64_t val0, val1;
  v16u8 src0, src1, res0, res1, dst0, dst1;
  v8i16 vec0, vec1, vec2;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;
  v16u8 zero = {0};
  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  // Pre-interleave the U/V coefficients to match the interleaved UV input.
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 8) {
    // 8 Y bytes and 8 interleaved UV bytes (4 UV pairs) per iteration.
    val0 = LD(src_y);
    val1 = LD(src_uv);
    src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
    src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec1, vec2);
    // Pack vec0/vec1/vec2 plus constant alpha into B,G,R,A byte order.
    res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0);
    res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1);
    dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
    dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0);
    ST_UB2(dst0, dst1, dst_argb, 16);
    src_y += 8;
    src_uv += 8;
    dst_argb += 32;  // 8 pixels * 4 bytes out.
  }
}
2266
// Converts 8 NV12 pixels to RGB565 per loop iteration.  After YUVTORGB
// produces per-channel 8-bit values, each is truncated to its field
// width and shifted into place: vec0 -> bits 0-4, vec1 -> bits 5-10,
// vec2 -> bits 11-15.
void NV12ToRGB565Row_MSA(const uint8_t* src_y,
                         const uint8_t* src_uv,
                         uint8_t* dst_rgb565,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  int x;
  uint64_t val0, val1;
  v16u8 src0, src1, dst0;
  v8i16 vec0, vec1, vec2;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;
  v16u8 zero = {0};

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  // Pre-interleave the U/V coefficients to match the interleaved UV input.
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 8) {
    val0 = LD(src_y);
    val1 = LD(src_uv);
    src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
    src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec1, vec2);
    // Drop low bits and position each field within the 16-bit pixel.
    vec0 = vec0 >> 3;
    vec1 = (vec1 >> 2) << 5;
    vec2 = (vec2 >> 3) << 11;
    dst0 = (v16u8)(vec0 | vec1 | vec2);
    ST_UB(dst0, dst_rgb565);
    src_y += 8;
    src_uv += 8;
    dst_rgb565 += 16;  // 8 pixels * 2 bytes out.
  }
}
2302
// Converts 8 NV21 pixels to ARGB per loop iteration.  Identical to
// NV12ToARGBRow_MSA except the chroma plane is stored V,U interleaved,
// so a byte shuffle swaps each pair to U,V before YUVTORGB.
void NV21ToARGBRow_MSA(const uint8_t* src_y,
                       const uint8_t* src_vu,
                       uint8_t* dst_argb,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  int x;
  uint64_t val0, val1;
  v16u8 src0, src1, res0, res1, dst0, dst1;
  v8i16 vec0, vec1, vec2;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;
  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
  v16u8 zero = {0};
  // Swaps adjacent bytes: V,U pairs become U,V pairs.
  v16i8 shuffler = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  // Pre-interleave the U/V coefficients to match the interleaved UV input.
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 8) {
    val0 = LD(src_y);
    val1 = LD(src_vu);
    src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
    src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
    src1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1);
    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec1, vec2);
    // Pack vec0/vec1/vec2 plus constant alpha into B,G,R,A byte order.
    res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0);
    res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1);
    dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
    dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0);
    ST_UB2(dst0, dst1, dst_argb, 16);
    src_y += 8;
    src_vu += 8;
    dst_argb += 32;  // 8 pixels * 4 bytes out.
  }
}
2341
// Combines Sobel X and Y gradient planes into gray ARGB: each output pixel
// is (s, s, s, 255) where s is the saturating byte sum of the corresponding
// x and y gradients.  16 pixels per iteration.
void SobelRow_MSA(const uint8_t* src_sobelx,
                  const uint8_t* src_sobely,
                  uint8_t* dst_argb,
                  int width) {
  int x;
  v16u8 src0, src1, vec0, dst0, dst1, dst2, dst3;
  // vshf masks: indices 0-15 pick sum bytes from vec0, indices >= 16 pick
  // 0xFF from `alpha`, yielding s,s,s,255 for each pixel.
  v16i8 mask0 = {0, 0, 0, 16, 1, 1, 1, 16, 2, 2, 2, 16, 3, 3, 3, 16};
  v16i8 const_0x4 = __msa_ldi_b(0x4);
  v16i8 mask1 = mask0 + const_0x4;  // masks for pixels 4-7, 8-11, 12-15
  v16i8 mask2 = mask1 + const_0x4;
  v16i8 mask3 = mask2 + const_0x4;
  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((void*)src_sobelx, 0);
    src1 = (v16u8)__msa_ld_b((void*)src_sobely, 0);
    vec0 = __msa_adds_u_b(src0, src1);  // saturating sum of gradients
    dst0 = (v16u8)__msa_vshf_b(mask0, (v16i8)alpha, (v16i8)vec0);
    dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)alpha, (v16i8)vec0);
    dst2 = (v16u8)__msa_vshf_b(mask2, (v16i8)alpha, (v16i8)vec0);
    dst3 = (v16u8)__msa_vshf_b(mask3, (v16i8)alpha, (v16i8)vec0);
    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
    src_sobelx += 16;
    src_sobely += 16;
    dst_argb += 64;  // 16 pixels * 4 bytes
  }
}
2369
SobelToPlaneRow_MSA(const uint8_t * src_sobelx,const uint8_t * src_sobely,uint8_t * dst_y,int width)2370 void SobelToPlaneRow_MSA(const uint8_t* src_sobelx,
2371 const uint8_t* src_sobely,
2372 uint8_t* dst_y,
2373 int width) {
2374 int x;
2375 v16u8 src0, src1, src2, src3, dst0, dst1;
2376
2377 for (x = 0; x < width; x += 32) {
2378 src0 = (v16u8)__msa_ld_b((void*)src_sobelx, 0);
2379 src1 = (v16u8)__msa_ld_b((void*)src_sobelx, 16);
2380 src2 = (v16u8)__msa_ld_b((void*)src_sobely, 0);
2381 src3 = (v16u8)__msa_ld_b((void*)src_sobely, 16);
2382 dst0 = __msa_adds_u_b(src0, src2);
2383 dst1 = __msa_adds_u_b(src1, src3);
2384 ST_UB2(dst0, dst1, dst_y, 16);
2385 src_sobelx += 32;
2386 src_sobely += 32;
2387 dst_y += 32;
2388 }
2389 }
2390
// Combines Sobel X and Y planes into ARGB where, per output pixel,
// B = sobel_y, G = saturating sum, R = sobel_x, A = 255 (byte order
// follows the interleave pattern below).  16 pixels per iteration.
void SobelXYRow_MSA(const uint8_t* src_sobelx,
                    const uint8_t* src_sobely,
                    uint8_t* dst_argb,
                    int width) {
  int x;
  v16u8 src0, src1, vec0, vec1, vec2;
  v16u8 reg0, reg1, dst0, dst1, dst2, dst3;
  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((void*)src_sobelx, 0);
    src1 = (v16u8)__msa_ld_b((void*)src_sobely, 0);
    vec0 = __msa_adds_u_b(src0, src1);  // combined gradient (saturating)
    // (y, x) byte pairs.
    vec1 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src1);
    vec2 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src1);
    // (sum, 255) byte pairs.
    reg0 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)vec0);
    reg1 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)vec0);
    // Interleave pairs into y,sum,x,255 per pixel.
    dst0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)vec1);
    dst1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)vec1);
    dst2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)vec2);
    dst3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)vec2);
    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
    src_sobelx += 16;
    src_sobely += 16;
    dst_argb += 64;
  }
}
2418
// ARGB to full-range (JPEG) luma, 16 pixels per iteration.
// Coefficients are packed byte pairs consumed by the ARGBTOY macro:
// 0x4B0F = (75 << 8) | 15 and 0x26 = 38, with rounding bias 64 and a
// 7-bit shift -- i.e. Y = (15*B + 75*G + 38*R + 64) >> 7, the BT.601
// full-swing weights (exact lane mapping depends on ARGBTOY -- see macro).
void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
  int x;
  v16u8 src0, src1, src2, src3, dst0;
  v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F);
  v16u8 const_0x26 = (v16u8)__msa_fill_h(0x26);
  v8u16 const_0x40 = (v8u16)__msa_fill_h(0x40);

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
    src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
    src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
    src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48);
    ARGBTOY(src0, src1, src2, src3, const_0x4B0F, const_0x26, const_0x40, 7,
            dst0);
    ST_UB(dst0, dst_y);
    src_argb0 += 64;  // 16 pixels * 4 bytes
    dst_y += 16;
  }
}
2438
// BGRA to studio-range luma, 16 pixels per iteration.  Packed byte-pair
// coefficients 0x4200 (66, 0) and 0x1981 (25, 129) carry the BT.601
// weights 66/129/25 arranged for BGRA byte order, with bias 0x1080
// (16.5 in 8-bit fixed point) and an 8-bit shift inside ARGBTOY.
void BGRAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
  int x;
  v16u8 src0, src1, src2, src3, dst0;
  v16u8 const_0x4200 = (v16u8)__msa_fill_h(0x4200);
  v16u8 const_0x1981 = (v16u8)__msa_fill_h(0x1981);
  v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
    src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
    src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
    src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48);
    ARGBTOY(src0, src1, src2, src3, const_0x4200, const_0x1981, const_0x1080, 8,
            dst0);
    ST_UB(dst0, dst_y);
    src_argb0 += 64;
    dst_y += 16;
  }
}
2458
// ABGR to studio-range luma, 16 pixels per iteration.  Same BT.601
// weights (66/129/25) as the other ToY rows, repacked as 0x8142
// (129, 66) and 0x19 (25) for ABGR byte order; bias 0x1080, shift 8.
void ABGRToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
  int x;
  v16u8 src0, src1, src2, src3, dst0;
  v16u8 const_0x8142 = (v16u8)__msa_fill_h(0x8142);
  v16u8 const_0x19 = (v16u8)__msa_fill_h(0x19);
  v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
    src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
    src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
    src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48);
    ARGBTOY(src0, src1, src2, src3, const_0x8142, const_0x19, const_0x1080, 8,
            dst0);
    ST_UB(dst0, dst_y);
    src_argb0 += 64;
    dst_y += 16;
  }
}
2478
// RGBA to studio-range luma, 16 pixels per iteration.  BT.601 weights
// (66/129/25) repacked as 0x1900 (25, 0) and 0x4281 (66, 129) for RGBA
// byte order; bias 0x1080, shift 8 inside ARGBTOY.
void RGBAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
  int x;
  v16u8 src0, src1, src2, src3, dst0;
  v16u8 const_0x1900 = (v16u8)__msa_fill_h(0x1900);
  v16u8 const_0x4281 = (v16u8)__msa_fill_h(0x4281);
  v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
    src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
    src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
    src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48);
    ARGBTOY(src0, src1, src2, src3, const_0x1900, const_0x4281, const_0x1080, 8,
            dst0);
    ST_UB(dst0, dst_y);
    src_argb0 += 64;
    dst_y += 16;
  }
}
2498
// ARGB to full-range (JPEG) U/V with 2x2 subsampling: pixels are averaged
// with the row below (src_stride_rgb apart) and horizontally pairwise,
// then converted via ARGBTOUV.  Produces 16 U and 16 V bytes (32 source
// pixels) per iteration.  Packed coefficients: 0x7F = 127, 0x6B14 =
// (107, 20), 0x2B54 = (43, 84) -- the JPEG chroma weights -- with bias
// 0x8080 (128.5 in 8-bit fixed point).
void ARGBToUVJRow_MSA(const uint8_t* src_rgb0,
                      int src_stride_rgb,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  int x;
  const uint8_t* s = src_rgb0;                   // current row
  const uint8_t* t = src_rgb0 + src_stride_rgb;  // next row
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 vec0, vec1, vec2, vec3;
  v16u8 dst0, dst1;
  // Channel-gathering shuffles consumed by ARGBTOUV.
  v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
  v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
                     18, 19, 22, 23, 26, 27, 30, 31};
  v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
  v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30};
  v16u8 const_0x7F = (v16u8)__msa_fill_h(0x7F);
  v16u8 const_0x6B14 = (v16u8)__msa_fill_h(0x6B14);
  v16u8 const_0x2B54 = (v16u8)__msa_fill_h(0x2B54);
  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);

  for (x = 0; x < width; x += 32) {
    // First 16 pixels: average vertically, then average even/odd pixel
    // columns to complete the 2x2 box filter.
    src0 = (v16u8)__msa_ld_b((void*)s, 0);
    src1 = (v16u8)__msa_ld_b((void*)s, 16);
    src2 = (v16u8)__msa_ld_b((void*)s, 32);
    src3 = (v16u8)__msa_ld_b((void*)s, 48);
    src4 = (v16u8)__msa_ld_b((void*)t, 0);
    src5 = (v16u8)__msa_ld_b((void*)t, 16);
    src6 = (v16u8)__msa_ld_b((void*)t, 32);
    src7 = (v16u8)__msa_ld_b((void*)t, 48);
    src0 = __msa_aver_u_b(src0, src4);
    src1 = __msa_aver_u_b(src1, src5);
    src2 = __msa_aver_u_b(src2, src6);
    src3 = __msa_aver_u_b(src3, src7);
    // Split even/odd pixels (32-bit lanes) and average them.
    src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);
    src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2);
    src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
    src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2);
    vec0 = __msa_aver_u_b(src4, src6);
    vec1 = __msa_aver_u_b(src5, src7);
    // Second 16 pixels, same box filter.
    src0 = (v16u8)__msa_ld_b((void*)s, 64);
    src1 = (v16u8)__msa_ld_b((void*)s, 80);
    src2 = (v16u8)__msa_ld_b((void*)s, 96);
    src3 = (v16u8)__msa_ld_b((void*)s, 112);
    src4 = (v16u8)__msa_ld_b((void*)t, 64);
    src5 = (v16u8)__msa_ld_b((void*)t, 80);
    src6 = (v16u8)__msa_ld_b((void*)t, 96);
    src7 = (v16u8)__msa_ld_b((void*)t, 112);
    src0 = __msa_aver_u_b(src0, src4);
    src1 = __msa_aver_u_b(src1, src5);
    src2 = __msa_aver_u_b(src2, src6);
    src3 = __msa_aver_u_b(src3, src7);
    src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);
    src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2);
    src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
    src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2);
    vec2 = __msa_aver_u_b(src4, src6);
    vec3 = __msa_aver_u_b(src5, src7);
    // Note the shuffler1/shuffler0 argument order here differs from the
    // BGRA/RGBA rows; dst0 feeds V and dst1 feeds U.
    ARGBTOUV(vec0, vec1, vec2, vec3, const_0x6B14, const_0x7F, const_0x2B54,
             const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0,
             dst1);
    ST_UB(dst0, dst_v);
    ST_UB(dst1, dst_u);
    s += 128;
    t += 128;
    dst_v += 16;
    dst_u += 16;
  }
}
2568
// BGRA to studio-range U/V with 2x2 subsampling (READ_ARGB performs the
// box averaging of this row with the row src_stride_rgb below).  Produces
// 16 U and 16 V bytes (32 source pixels) per iteration.  Packed BT.601
// chroma coefficients: 0x7000 = 112, 0x264A = (38, 74), 0x125E = (18, 94),
// with bias 0x8080 (128.5 in 8-bit fixed point).
void BGRAToUVRow_MSA(const uint8_t* src_rgb0,
                     int src_stride_rgb,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  int x;
  const uint8_t* s = src_rgb0;                   // current row
  const uint8_t* t = src_rgb0 + src_stride_rgb;  // next row
  v16u8 dst0, dst1, vec0, vec1, vec2, vec3;
  // Channel-gathering shuffles consumed by ARGBTOUV (arranged for BGRA
  // byte order).
  v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
  v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
                     18, 19, 22, 23, 26, 27, 30, 31};
  v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
  v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29};
  v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E);
  v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000);
  v16u8 const_0x264A = (v16u8)__msa_fill_h(0x264A);
  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);

  for (x = 0; x < width; x += 32) {
    READ_ARGB(s, t, vec0, vec1, vec2, vec3);
    ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A,
             const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0,
             dst1);
    // For this channel order dst0 is V and dst1 is U.
    ST_UB(dst0, dst_v);
    ST_UB(dst1, dst_u);
    s += 128;
    t += 128;
    dst_v += 16;
    dst_u += 16;
  }
}
2601
// ABGR to studio-range U/V with 2x2 subsampling (READ_ARGB box-averages
// this row with the row src_stride_rgb below).  Produces 16 U and 16 V
// bytes (32 source pixels) per iteration.  Packed BT.601 chroma
// coefficients: 0x4A26 = (74, 38), 0x0070 = 112, 0x125E = (18, 94),
// bias 0x8080.
void ABGRToUVRow_MSA(const uint8_t* src_rgb0,
                     int src_stride_rgb,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  int x;
  const uint8_t* s = src_rgb0;                   // current row
  const uint8_t* t = src_rgb0 + src_stride_rgb;  // next row
  v16u8 src0, src1, src2, src3;
  v16u8 dst0, dst1;
  // Channel-gathering shuffles consumed by ARGBTOUV.
  v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
  v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
                     18, 19, 22, 23, 26, 27, 30, 31};
  v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
  v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30};
  v16u8 const_0x4A26 = (v16u8)__msa_fill_h(0x4A26);
  v16u8 const_0x0070 = (v16u8)__msa_fill_h(0x0070);
  v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E);
  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);

  for (x = 0; x < width; x += 32) {
    READ_ARGB(s, t, src0, src1, src2, src3);
    // Note shuffler1/shuffler0 order differs from the BGRA row to match
    // ABGR channel placement; here dst0 is U and dst1 is V.
    ARGBTOUV(src0, src1, src2, src3, const_0x4A26, const_0x0070, const_0x125E,
             const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0,
             dst1);
    ST_UB(dst0, dst_u);
    ST_UB(dst1, dst_v);
    s += 128;
    t += 128;
    dst_u += 16;
    dst_v += 16;
  }
}
2635
RGBAToUVRow_MSA(const uint8_t * src_rgb0,int src_stride_rgb,uint8_t * dst_u,uint8_t * dst_v,int width)2636 void RGBAToUVRow_MSA(const uint8_t* src_rgb0,
2637 int src_stride_rgb,
2638 uint8_t* dst_u,
2639 uint8_t* dst_v,
2640 int width) {
2641 int x;
2642 const uint8_t* s = src_rgb0;
2643 const uint8_t* t = src_rgb0 + src_stride_rgb;
2644 v16u8 dst0, dst1, vec0, vec1, vec2, vec3;
2645 v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
2646 v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
2647 18, 19, 22, 23, 26, 27, 30, 31};
2648 v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
2649 v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29};
2650 v16u8 const_0x125E = (v16u8)__msa_fill_h(0x264A);
2651 v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000);
2652 v16u8 const_0x264A = (v16u8)__msa_fill_h(0x125E);
2653 v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
2654
2655 for (x = 0; x < width; x += 32) {
2656 READ_ARGB(s, t, vec0, vec1, vec2, vec3);
2657 ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A,
2658 const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0,
2659 dst1);
2660 ST_UB(dst0, dst_u);
2661 ST_UB(dst1, dst_v);
2662 s += 128;
2663 t += 128;
2664 dst_u += 16;
2665 dst_v += 16;
2666 }
2667 }
2668
// I444 (full-resolution Y, U, V planes) to ARGB, 8 pixels per iteration.
// Unlike the 422 rows this expands the YUVTORGB math inline because every
// pixel has its own U and V sample.
void I444ToARGBRow_MSA(const uint8_t* src_y,
                       const uint8_t* src_u,
                       const uint8_t* src_v,
                       uint8_t* dst_argb,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  int x;
  v16u8 src0, src1, src2, dst0, dst1;
  v8u16 vec0, vec1, vec2;
  v4i32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);  // 0xFF in every lane
  v8i16 zero = {0};

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);

  for (x = 0; x < width; x += 8) {
    READI444(src_y, src_u, src_v, src0, src1, src2);
    // Duplicate each Y byte into both bytes of a halfword (Y * 257), so
    // the multiply by vec_yg followed by >> 16 applies the luma gain.
    vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
    reg0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
    reg1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
    reg0 *= vec_yg;
    reg1 *= vec_yg;
    reg0 = __msa_srai_w(reg0, 16);
    reg1 = __msa_srai_w(reg1, 16);
    // Add the per-channel biases: reg4/5 -> R, reg2/3 -> G, reg0/1 -> B.
    reg4 = reg0 + vec_br;
    reg5 = reg1 + vec_br;
    reg2 = reg0 + vec_bg;
    reg3 = reg1 + vec_bg;
    reg0 += vec_bb;
    reg1 += vec_bb;
    // Zero-extend the U (src1) and V (src2) samples to 32 bits.
    vec0 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
    vec1 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src2);
    reg6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
    reg7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
    reg8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1);
    reg9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1);
    // Subtract the chroma contributions (coefficients are stored so that
    // the bias already accounts for the sign).
    reg0 -= reg6 * vec_ub;
    reg1 -= reg7 * vec_ub;
    reg2 -= reg6 * vec_ug;
    reg3 -= reg7 * vec_ug;
    reg4 -= reg8 * vec_vr;
    reg5 -= reg9 * vec_vr;
    reg2 -= reg8 * vec_vg;
    reg3 -= reg9 * vec_vg;
    // Drop the 6 fractional bits, clamp to [0, 255].
    reg0 = __msa_srai_w(reg0, 6);
    reg1 = __msa_srai_w(reg1, 6);
    reg2 = __msa_srai_w(reg2, 6);
    reg3 = __msa_srai_w(reg3, 6);
    reg4 = __msa_srai_w(reg4, 6);
    reg5 = __msa_srai_w(reg5, 6);
    CLIP_0TO255(reg0, reg1, reg2, reg3, reg4, reg5);
    // Pack to bytes and interleave into B,G,R,A order.
    vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
    vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4);
    vec0 = (v8u16)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
    vec1 = (v8u16)__msa_ilvev_b((v16i8)alpha, (v16i8)vec2);
    dst0 = (v16u8)__msa_ilvr_h((v8i16)vec1, (v8i16)vec0);
    dst1 = (v16u8)__msa_ilvl_h((v8i16)vec1, (v8i16)vec0);
    ST_UB2(dst0, dst1, dst_argb, 16);
    src_y += 8;
    src_u += 8;
    src_v += 8;
    dst_argb += 32;
  }
}
2736
// I400 (luma-only) to ARGB with BT.601 range expansion, 16 pixels per
// iteration.  Each Y byte is duplicated into both halves of a halfword
// (Y * 257), so multiplying by 0x4A35 and shifting right 16 approximates
// 1.164 * 64 * Y; adding vec_ygb (0xFB78 = -1160 as signed) and shifting
// right 6 completes Y' ~= 1.164 * (Y - 16).  The result is replicated
// into B, G and R with alpha = 255.
void I400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  int x;
  v16u8 src0, res0, res1, res2, res3, res4, dst0, dst1, dst2, dst3;
  v8i16 vec0, vec1;
  v4i32 reg0, reg1, reg2, reg3;
  v4i32 vec_yg = __msa_fill_w(0x4A35);   // luma gain (16-bit fixed point)
  v8i16 vec_ygb = __msa_fill_h(0xFB78);  // -1160: offset for the 16 black level
  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
  v8i16 max = __msa_ldi_h(0xFF);
  v8i16 zero = {0};

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((void*)src_y, 0);
    // Duplicate bytes: each halfword holds Y * 257.
    vec0 = (v8i16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
    vec1 = (v8i16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
    reg0 = (v4i32)__msa_ilvr_h(zero, vec0);
    reg1 = (v4i32)__msa_ilvl_h(zero, vec0);
    reg2 = (v4i32)__msa_ilvr_h(zero, vec1);
    reg3 = (v4i32)__msa_ilvl_h(zero, vec1);
    reg0 *= vec_yg;
    reg1 *= vec_yg;
    reg2 *= vec_yg;
    reg3 *= vec_yg;
    reg0 = __msa_srai_w(reg0, 16);
    reg1 = __msa_srai_w(reg1, 16);
    reg2 = __msa_srai_w(reg2, 16);
    reg3 = __msa_srai_w(reg3, 16);
    vec0 = (v8i16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
    vec1 = (v8i16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
    vec0 += vec_ygb;
    vec1 += vec_ygb;
    // Drop 6 fractional bits, then clamp to [0, 255].
    vec0 = __msa_srai_h(vec0, 6);
    vec1 = __msa_srai_h(vec1, 6);
    vec0 = __msa_maxi_s_h(vec0, 0);
    vec1 = __msa_maxi_s_h(vec1, 0);
    vec0 = __msa_min_s_h(max, vec0);
    vec1 = __msa_min_s_h(max, vec1);
    // Replicate Y' into B,G,R and interleave with alpha into ARGB.
    res0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    res1 = (v16u8)__msa_ilvr_b((v16i8)res0, (v16i8)res0);
    res2 = (v16u8)__msa_ilvl_b((v16i8)res0, (v16i8)res0);
    res3 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)res0);
    res4 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)res0);
    dst0 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res1);
    dst1 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res1);
    dst2 = (v16u8)__msa_ilvr_b((v16i8)res4, (v16i8)res2);
    dst3 = (v16u8)__msa_ilvl_b((v16i8)res4, (v16i8)res2);
    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
    src_y += 16;
    dst_argb += 64;
  }
}
2788
J400ToARGBRow_MSA(const uint8_t * src_y,uint8_t * dst_argb,int width)2789 void J400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width) {
2790 int x;
2791 v16u8 src0, vec0, vec1, vec2, vec3, dst0, dst1, dst2, dst3;
2792 v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
2793
2794 for (x = 0; x < width; x += 16) {
2795 src0 = (v16u8)__msa_ld_b((void*)src_y, 0);
2796 vec0 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
2797 vec1 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
2798 vec2 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)src0);
2799 vec3 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)src0);
2800 dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0);
2801 dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0);
2802 dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1);
2803 dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1);
2804 ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
2805 src_y += 16;
2806 dst_argb += 64;
2807 }
2808 }
2809
// YUY2 (packed Y0 U0 Y1 V0 ...) to ARGB, 8 pixels per iteration.
void YUY2ToARGBRow_MSA(const uint8_t* src_yuy2,
                       uint8_t* dst_argb,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  int x;
  v16u8 src0, src1, src2;
  v8i16 vec0, vec1, vec2;  // B, G, R channel results from YUVTORGB
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;
  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);

  // Broadcast coefficients and pair U/V coefficients for interleaved UV.
  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 8) {
    src0 = (v16u8)__msa_ld_b((void*)src_yuy2, 0);
    // Even bytes are Y, odd bytes alternate U,V.
    src1 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0);
    src2 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0);
    YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec1, vec2);
    STOREARGB(vec0, vec1, vec2, alpha, dst_argb);
    src_yuy2 += 16;  // 8 pixels * 2 bytes
    dst_argb += 32;
  }
}
2837
// UYVY (packed U0 Y0 V0 Y1 ...) to ARGB, 8 pixels per iteration.
// Same as the YUY2 row with the even/odd byte roles reversed.
void UYVYToARGBRow_MSA(const uint8_t* src_uyvy,
                       uint8_t* dst_argb,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  int x;
  v16u8 src0, src1, src2;
  v8i16 vec0, vec1, vec2;  // B, G, R channel results from YUVTORGB
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;
  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 8) {
    src0 = (v16u8)__msa_ld_b((void*)src_uyvy, 0);
    // Odd bytes are Y, even bytes alternate U,V.
    src1 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0);
    src2 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0);
    YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec1, vec2);
    STOREARGB(vec0, vec1, vec2, alpha, dst_argb);
    src_uyvy += 16;
    dst_argb += 32;
  }
}
2865
// Bilinear row interpolation: dst = src * (256 - f) / 256 + next_row * f /
// 256, where f = source_y_fraction and next_row is src_stride bytes ahead.
// Fast paths handle f == 0 (plain copy) and f == 128 (rounded average);
// the general path processes 32 bytes per iteration using a byte-pair dot
// product with the two weights packed into one halfword.
void InterpolateRow_MSA(uint8_t* dst_ptr,
                        const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        int width,
                        int32_t source_y_fraction) {
  int32_t y1_fraction = source_y_fraction;
  int32_t y0_fraction = 256 - y1_fraction;
  uint16_t y_fractions;
  const uint8_t* s = src_ptr;                // current row
  const uint8_t* t = src_ptr + src_stride;   // next row
  int x;
  v16u8 src0, src1, src2, src3, dst0, dst1;
  v8u16 vec0, vec1, vec2, vec3, y_frac;

  // f == 0: output equals the source row.
  if (0 == y1_fraction) {
    memcpy(dst_ptr, src_ptr, width);
    return;
  }

  // f == 128: exact midpoint; use the rounding average instruction.
  if (128 == y1_fraction) {
    for (x = 0; x < width; x += 32) {
      src0 = (v16u8)__msa_ld_b((void*)s, 0);
      src1 = (v16u8)__msa_ld_b((void*)s, 16);
      src2 = (v16u8)__msa_ld_b((void*)t, 0);
      src3 = (v16u8)__msa_ld_b((void*)t, 16);
      dst0 = __msa_aver_u_b(src0, src2);
      dst1 = __msa_aver_u_b(src1, src3);
      ST_UB2(dst0, dst1, dst_ptr, 16);
      s += 32;
      t += 32;
      dst_ptr += 32;
    }
    return;
  }

  // Pack (y0, y1) into one halfword: interleaved (s, t) byte pairs dotted
  // with it give s*y0 + t*y1 in 16 bits.
  y_fractions = (uint16_t)(y0_fraction + (y1_fraction << 8));
  y_frac = (v8u16)__msa_fill_h(y_fractions);

  for (x = 0; x < width; x += 32) {
    src0 = (v16u8)__msa_ld_b((void*)s, 0);
    src1 = (v16u8)__msa_ld_b((void*)s, 16);
    src2 = (v16u8)__msa_ld_b((void*)t, 0);
    src3 = (v16u8)__msa_ld_b((void*)t, 16);
    vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
    vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
    vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
    vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
    vec0 = (v8u16)__msa_dotp_u_h((v16u8)vec0, (v16u8)y_frac);
    vec1 = (v8u16)__msa_dotp_u_h((v16u8)vec1, (v16u8)y_frac);
    vec2 = (v8u16)__msa_dotp_u_h((v16u8)vec2, (v16u8)y_frac);
    vec3 = (v8u16)__msa_dotp_u_h((v16u8)vec3, (v16u8)y_frac);
    // Rounding shift by 8 divides out the 256 weight scale.
    vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 8);
    vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 8);
    vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 8);
    vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 8);
    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
    ST_UB2(dst0, dst1, dst_ptr, 16);
    s += 32;
    t += 32;
    dst_ptr += 32;
  }
}
2929
ARGBSetRow_MSA(uint8_t * dst_argb,uint32_t v32,int width)2930 void ARGBSetRow_MSA(uint8_t* dst_argb, uint32_t v32, int width) {
2931 int x;
2932 v4i32 dst0 = __builtin_msa_fill_w(v32);
2933
2934 for (x = 0; x < width; x += 4) {
2935 ST_UB(dst0, dst_argb);
2936 dst_argb += 16;
2937 }
2938 }
2939
// RAW to RGB24: reverses the first and third byte of every 3-byte pixel
// (channel-order swap), 16 pixels (48 bytes) per iteration.
void RAWToRGB24Row_MSA(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
  int x;
  v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2;
  // Each shuffler reverses consecutive 3-byte triplets; the byte-shifted
  // copies (src3/src4) keep triplets from straddling a vector boundary.
  v16i8 shuffler0 = {2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, 17};
  v16i8 shuffler1 = {8, 7, 12, 11, 10, 15, 14, 13,
                     18, 17, 16, 21, 20, 19, 24, 23};
  v16i8 shuffler2 = {14, 19, 18, 17, 22, 21, 20, 25,
                     24, 23, 28, 27, 26, 31, 30, 29};

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((void*)src_raw, 0);
    src1 = (v16u8)__msa_ld_b((void*)src_raw, 16);
    src2 = (v16u8)__msa_ld_b((void*)src_raw, 32);
    src3 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 8);
    src4 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8);
    dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0);
    dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src4, (v16i8)src3);
    dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src2, (v16i8)src1);
    ST_UB2(dst0, dst1, dst_rgb24, 16);
    ST_UB(dst2, (dst_rgb24 + 32));
    src_raw += 48;
    dst_rgb24 += 48;
  }
}
2964
MergeUVRow_MSA(const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_uv,int width)2965 void MergeUVRow_MSA(const uint8_t* src_u,
2966 const uint8_t* src_v,
2967 uint8_t* dst_uv,
2968 int width) {
2969 int x;
2970 v16u8 src0, src1, dst0, dst1;
2971
2972 for (x = 0; x < width; x += 16) {
2973 src0 = (v16u8)__msa_ld_b((void*)src_u, 0);
2974 src1 = (v16u8)__msa_ld_b((void*)src_v, 0);
2975 dst0 = (v16u8)__msa_ilvr_b((v16i8)src1, (v16i8)src0);
2976 dst1 = (v16u8)__msa_ilvl_b((v16i8)src1, (v16i8)src0);
2977 ST_UB2(dst0, dst1, dst_uv, 16);
2978 src_u += 16;
2979 src_v += 16;
2980 dst_uv += 32;
2981 }
2982 }
2983
ARGBExtractAlphaRow_MSA(const uint8_t * src_argb,uint8_t * dst_a,int width)2984 void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb,
2985 uint8_t* dst_a,
2986 int width) {
2987 int i;
2988 v16u8 src0, src1, src2, src3, vec0, vec1, dst0;
2989
2990 for (i = 0; i < width; i += 16) {
2991 src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
2992 src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
2993 src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
2994 src3 = (v16u8)__msa_ld_b((void*)src_argb, 48);
2995 vec0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
2996 vec1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
2997 dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
2998 ST_UB(dst0, dst_a);
2999 src_argb += 64;
3000 dst_a += 16;
3001 }
3002 }
3003
// Alpha-blends src_argb0 over src_argb1 using src_argb0's alpha:
// per channel dst = fg + ((256 - fg_alpha) * bg >> 8), with the output
// alpha forced to 255.  Processes 8 pixels per iteration.
// NOTE(review): there is no saturation after the add, so this presumably
// expects premultiplied (or in-range) foreground values -- confirm against
// the scalar ARGBBlendRow.
void ARGBBlendRow_MSA(const uint8_t* src_argb0,
                      const uint8_t* src_argb1,
                      uint8_t* dst_argb,
                      int width) {
  int x;
  v16u8 src0, src1, src2, src3, dst0, dst1;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 vec8, vec9, vec10, vec11, vec12, vec13;
  v8u16 const_256 = (v8u16)__msa_ldi_h(256);
  v16u8 const_255 = (v16u8)__msa_ldi_b(255);
  // Selects the alpha byte of each 4-byte pixel.
  v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255};
  v16i8 zero = {0};

  for (x = 0; x < width; x += 8) {
    src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
    src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
    src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0);
    src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16);
    // Widen foreground (vec0-3) and background (vec4-7) bytes to 16 bits.
    vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0);
    vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0);
    vec2 = (v8u16)__msa_ilvr_b(zero, (v16i8)src1);
    vec3 = (v8u16)__msa_ilvl_b(zero, (v16i8)src1);
    vec4 = (v8u16)__msa_ilvr_b(zero, (v16i8)src2);
    vec5 = (v8u16)__msa_ilvl_b(zero, (v16i8)src2);
    vec6 = (v8u16)__msa_ilvr_b(zero, (v16i8)src3);
    vec7 = (v8u16)__msa_ilvl_b(zero, (v16i8)src3);
    // Broadcast each foreground pixel's alpha (lane 3 / 7 of each widened
    // half) across a half-vector, then pair the halves back up.
    vec8 = (v8u16)__msa_fill_h(vec0[3]);
    vec9 = (v8u16)__msa_fill_h(vec0[7]);
    vec10 = (v8u16)__msa_fill_h(vec1[3]);
    vec11 = (v8u16)__msa_fill_h(vec1[7]);
    vec8 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8);
    vec9 = (v8u16)__msa_pckev_d((v2i64)vec11, (v2i64)vec10);
    vec10 = (v8u16)__msa_fill_h(vec2[3]);
    vec11 = (v8u16)__msa_fill_h(vec2[7]);
    vec12 = (v8u16)__msa_fill_h(vec3[3]);
    vec13 = (v8u16)__msa_fill_h(vec3[7]);
    vec10 = (v8u16)__msa_pckev_d((v2i64)vec11, (v2i64)vec10);
    vec11 = (v8u16)__msa_pckev_d((v2i64)vec13, (v2i64)vec12);
    // Inverse alpha: 256 - a.
    vec8 = const_256 - vec8;
    vec9 = const_256 - vec9;
    vec10 = const_256 - vec10;
    vec11 = const_256 - vec11;
    // Scale the background by the inverse alpha and add the foreground.
    vec8 *= vec4;
    vec9 *= vec5;
    vec10 *= vec6;
    vec11 *= vec7;
    vec8 = (v8u16)__msa_srai_h((v8i16)vec8, 8);
    vec9 = (v8u16)__msa_srai_h((v8i16)vec9, 8);
    vec10 = (v8u16)__msa_srai_h((v8i16)vec10, 8);
    vec11 = (v8u16)__msa_srai_h((v8i16)vec11, 8);
    vec0 += vec8;
    vec1 += vec9;
    vec2 += vec10;
    vec3 += vec11;
    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
    // Force the alpha byte of every output pixel to 255.
    dst0 = __msa_bmnz_v(dst0, const_255, mask);
    dst1 = __msa_bmnz_v(dst1, const_255, mask);
    ST_UB2(dst0, dst1, dst_argb, 16);
    src_argb0 += 32;
    src_argb1 += 32;
    dst_argb += 32;
  }
}
3068
// Quantizes 16 ARGB pixels (64 bytes) per loop iteration, in place.
// Each B/G/R byte b is mapped to:
//   ((b * scale) >> 16) * interval_size + interval_offset
// The alpha byte of every pixel is preserved from the source.
void ARGBQuantizeRow_MSA(uint8_t* dst_argb,
                         int scale,
                         int interval_size,
                         int interval_offset,
                         int width) {
  int x;
  v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  v4i32 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
  v4i32 vec_scale = __msa_fill_w(scale);
  v16u8 vec_int_sz = (v16u8)__msa_fill_b(interval_size);
  v16u8 vec_int_ofst = (v16u8)__msa_fill_b(interval_offset);
  // vshf selector: indices 0-15 pick quantized B/G/R bytes from the result;
  // indices 19/23/27/31 pick the four original alpha bytes from the source.
  v16i8 mask = {0, 1, 2, 19, 4, 5, 6, 23, 8, 9, 10, 27, 12, 13, 14, 31};
  v16i8 zero = {0};

  // BUG FIX: each iteration loads and stores 64 bytes (16 ARGB pixels), so
  // the pixel counter must advance by 16.  The previous `x += 8` made the
  // loop run twice as many iterations as needed, reading and writing 4*width
  // bytes past the end of rows whose stride equals their width.
  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((void*)dst_argb, 0);
    src1 = (v16u8)__msa_ld_b((void*)dst_argb, 16);
    src2 = (v16u8)__msa_ld_b((void*)dst_argb, 32);
    src3 = (v16u8)__msa_ld_b((void*)dst_argb, 48);
    // Widen each byte to 16 bits...
    vec0 = (v8i16)__msa_ilvr_b(zero, (v16i8)src0);
    vec1 = (v8i16)__msa_ilvl_b(zero, (v16i8)src0);
    vec2 = (v8i16)__msa_ilvr_b(zero, (v16i8)src1);
    vec3 = (v8i16)__msa_ilvl_b(zero, (v16i8)src1);
    vec4 = (v8i16)__msa_ilvr_b(zero, (v16i8)src2);
    vec5 = (v8i16)__msa_ilvl_b(zero, (v16i8)src2);
    vec6 = (v8i16)__msa_ilvr_b(zero, (v16i8)src3);
    vec7 = (v8i16)__msa_ilvl_b(zero, (v16i8)src3);
    // ...and again to 32 bits so the scale multiply cannot overflow.
    tmp0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
    tmp1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
    tmp2 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1);
    tmp3 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1);
    tmp4 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec2);
    tmp5 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec2);
    tmp6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec3);
    tmp7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec3);
    tmp8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec4);
    tmp9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec4);
    tmp10 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec5);
    tmp11 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec5);
    tmp12 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec6);
    tmp13 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec6);
    tmp14 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec7);
    tmp15 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec7);
    // (b * scale) >> 16 — 16.16 fixed-point scaling per channel.
    tmp0 *= vec_scale;
    tmp1 *= vec_scale;
    tmp2 *= vec_scale;
    tmp3 *= vec_scale;
    tmp4 *= vec_scale;
    tmp5 *= vec_scale;
    tmp6 *= vec_scale;
    tmp7 *= vec_scale;
    tmp8 *= vec_scale;
    tmp9 *= vec_scale;
    tmp10 *= vec_scale;
    tmp11 *= vec_scale;
    tmp12 *= vec_scale;
    tmp13 *= vec_scale;
    tmp14 *= vec_scale;
    tmp15 *= vec_scale;
    tmp0 >>= 16;
    tmp1 >>= 16;
    tmp2 >>= 16;
    tmp3 >>= 16;
    tmp4 >>= 16;
    tmp5 >>= 16;
    tmp6 >>= 16;
    tmp7 >>= 16;
    tmp8 >>= 16;
    tmp9 >>= 16;
    tmp10 >>= 16;
    tmp11 >>= 16;
    tmp12 >>= 16;
    tmp13 >>= 16;
    tmp14 >>= 16;
    tmp15 >>= 16;
    // Narrow back to bytes.
    vec0 = (v8i16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
    vec1 = (v8i16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
    vec2 = (v8i16)__msa_pckev_h((v8i16)tmp5, (v8i16)tmp4);
    vec3 = (v8i16)__msa_pckev_h((v8i16)tmp7, (v8i16)tmp6);
    vec4 = (v8i16)__msa_pckev_h((v8i16)tmp9, (v8i16)tmp8);
    vec5 = (v8i16)__msa_pckev_h((v8i16)tmp11, (v8i16)tmp10);
    vec6 = (v8i16)__msa_pckev_h((v8i16)tmp13, (v8i16)tmp12);
    vec7 = (v8i16)__msa_pckev_h((v8i16)tmp15, (v8i16)tmp14);
    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
    dst2 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
    dst3 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
    // * interval_size + interval_offset (8-bit lanes, wrap-around like the
    // scalar C reference).
    dst0 *= vec_int_sz;
    dst1 *= vec_int_sz;
    dst2 *= vec_int_sz;
    dst3 *= vec_int_sz;
    dst0 += vec_int_ofst;
    dst1 += vec_int_ofst;
    dst2 += vec_int_ofst;
    dst3 += vec_int_ofst;
    // Restore the original alpha bytes.
    dst0 = (v16u8)__msa_vshf_b(mask, (v16i8)src0, (v16i8)dst0);
    dst1 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)dst1);
    dst2 = (v16u8)__msa_vshf_b(mask, (v16i8)src2, (v16i8)dst2);
    dst3 = (v16u8)__msa_vshf_b(mask, (v16i8)src3, (v16i8)dst3);
    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
    dst_argb += 64;
  }
}
3174
// Applies a 4x4 signed fixed-point color matrix (16 int8 coefficients in
// matrix_argb) to 8 ARGB pixels per iteration: each output channel is the
// dot product of the pixel's 4 channels with one matrix row, shifted right
// by 6 and clamped to [0, 255].
void ARGBColorMatrixRow_MSA(const uint8_t* src_argb,
                            uint8_t* dst_argb,
                            const int8_t* matrix_argb,
                            int width) {
  int32_t x;
  v16i8 src0;
  v16u8 src1, src2, dst0, dst1;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
  v8i16 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
  v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  v4i32 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
  v16i8 zero = {0};
  v8i16 max = __msa_ldi_h(255);

  // Widen the 16 matrix coefficients to 16-bit lanes once, outside the loop.
  // vec0 holds the low 8 coefficients, vec1 the high 8.
  src0 = __msa_ld_b((void*)matrix_argb, 0);
  vec0 = (v8i16)__msa_ilvr_b(zero, src0);
  vec1 = (v8i16)__msa_ilvl_b(zero, src0);

  for (x = 0; x < width; x += 8) {
    src1 = (v16u8)__msa_ld_b((void*)src_argb, 0);
    src2 = (v16u8)__msa_ld_b((void*)src_argb, 16);
    // Widen the 8 pixels to 16-bit lanes (two pixels per vector half).
    vec2 = (v8i16)__msa_ilvr_b(zero, (v16i8)src1);
    vec3 = (v8i16)__msa_ilvl_b(zero, (v16i8)src1);
    vec4 = (v8i16)__msa_ilvr_b(zero, (v16i8)src2);
    vec5 = (v8i16)__msa_ilvl_b(zero, (v16i8)src2);
    // Duplicate the odd pixel of each pair (pckod) and the even pixel
    // (pckev) across the whole vector so each can meet all 16 coefficients.
    vec6 = (v8i16)__msa_pckod_d((v2i64)vec2, (v2i64)vec2);
    vec7 = (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec3);
    vec8 = (v8i16)__msa_pckod_d((v2i64)vec4, (v2i64)vec4);
    vec9 = (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec5);
    vec2 = (v8i16)__msa_pckev_d((v2i64)vec2, (v2i64)vec2);
    vec3 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec3);
    vec4 = (v8i16)__msa_pckev_d((v2i64)vec4, (v2i64)vec4);
    vec5 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec5);
    // Per-lane products for pixels 0 and 1; hadd pairs them into partial
    // 32-bit dot products.
    vec10 = vec2 * vec0;
    vec11 = vec2 * vec1;
    vec12 = vec6 * vec0;
    vec13 = vec6 * vec1;
    tmp0 = __msa_hadd_s_w(vec10, vec10);
    tmp1 = __msa_hadd_s_w(vec11, vec11);
    tmp2 = __msa_hadd_s_w(vec12, vec12);
    tmp3 = __msa_hadd_s_w(vec13, vec13);
    // Same for pixels 2 and 3.
    vec14 = vec3 * vec0;
    vec15 = vec3 * vec1;
    vec16 = vec7 * vec0;
    vec17 = vec7 * vec1;
    tmp4 = __msa_hadd_s_w(vec14, vec14);
    tmp5 = __msa_hadd_s_w(vec15, vec15);
    tmp6 = __msa_hadd_s_w(vec16, vec16);
    tmp7 = __msa_hadd_s_w(vec17, vec17);
    // Second hadd pass completes the 4-term dot products, then >> 6.
    vec10 = __msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
    vec11 = __msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
    vec12 = __msa_pckev_h((v8i16)tmp5, (v8i16)tmp4);
    vec13 = __msa_pckev_h((v8i16)tmp7, (v8i16)tmp6);
    tmp0 = __msa_hadd_s_w(vec10, vec10);
    tmp1 = __msa_hadd_s_w(vec11, vec11);
    tmp2 = __msa_hadd_s_w(vec12, vec12);
    tmp3 = __msa_hadd_s_w(vec13, vec13);
    tmp0 = __msa_srai_w(tmp0, 6);
    tmp1 = __msa_srai_w(tmp1, 6);
    tmp2 = __msa_srai_w(tmp2, 6);
    tmp3 = __msa_srai_w(tmp3, 6);
    // Pixels 4..7: identical pipeline, reusing the vec registers.
    vec2 = vec4 * vec0;
    vec6 = vec4 * vec1;
    vec3 = vec8 * vec0;
    vec7 = vec8 * vec1;
    tmp8 = __msa_hadd_s_w(vec2, vec2);
    tmp9 = __msa_hadd_s_w(vec6, vec6);
    tmp10 = __msa_hadd_s_w(vec3, vec3);
    tmp11 = __msa_hadd_s_w(vec7, vec7);
    vec4 = vec5 * vec0;
    vec8 = vec5 * vec1;
    vec5 = vec9 * vec0;
    vec9 = vec9 * vec1;
    tmp12 = __msa_hadd_s_w(vec4, vec4);
    tmp13 = __msa_hadd_s_w(vec8, vec8);
    tmp14 = __msa_hadd_s_w(vec5, vec5);
    tmp15 = __msa_hadd_s_w(vec9, vec9);
    vec14 = __msa_pckev_h((v8i16)tmp9, (v8i16)tmp8);
    vec15 = __msa_pckev_h((v8i16)tmp11, (v8i16)tmp10);
    vec16 = __msa_pckev_h((v8i16)tmp13, (v8i16)tmp12);
    vec17 = __msa_pckev_h((v8i16)tmp15, (v8i16)tmp14);
    tmp4 = __msa_hadd_s_w(vec14, vec14);
    tmp5 = __msa_hadd_s_w(vec15, vec15);
    tmp6 = __msa_hadd_s_w(vec16, vec16);
    tmp7 = __msa_hadd_s_w(vec17, vec17);
    tmp4 = __msa_srai_w(tmp4, 6);
    tmp5 = __msa_srai_w(tmp5, 6);
    tmp6 = __msa_srai_w(tmp6, 6);
    tmp7 = __msa_srai_w(tmp7, 6);
    // Narrow to 16-bit, clamp to [0, 255], pack to bytes and store.
    vec10 = __msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
    vec11 = __msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
    vec12 = __msa_pckev_h((v8i16)tmp5, (v8i16)tmp4);
    vec13 = __msa_pckev_h((v8i16)tmp7, (v8i16)tmp6);
    vec10 = __msa_maxi_s_h(vec10, 0);
    vec11 = __msa_maxi_s_h(vec11, 0);
    vec12 = __msa_maxi_s_h(vec12, 0);
    vec13 = __msa_maxi_s_h(vec13, 0);
    vec10 = __msa_min_s_h(vec10, max);
    vec11 = __msa_min_s_h(vec11, max);
    vec12 = __msa_min_s_h(vec12, max);
    vec13 = __msa_min_s_h(vec13, max);
    dst0 = (v16u8)__msa_pckev_b((v16i8)vec11, (v16i8)vec10);
    dst1 = (v16u8)__msa_pckev_b((v16i8)vec13, (v16i8)vec12);
    ST_UB2(dst0, dst1, dst_argb, 16);
    src_argb += 32;
    dst_argb += 32;
  }
}
3283
// De-interleaves a row of UV pixels: even source bytes go to dst_u, odd
// source bytes to dst_v.  Handles 32 UV pairs (64 source bytes) per
// iteration; width is measured in UV pairs.
void SplitUVRow_MSA(const uint8_t* src_uv,
                    uint8_t* dst_u,
                    uint8_t* dst_v,
                    int width) {
  int i;
  v16u8 in0, in1, in2, in3;
  v16u8 out_u0, out_u1, out_v0, out_v1;

  for (i = 0; i < width; i += 32) {
    // Load 64 interleaved bytes (U0 V0 U1 V1 ...).
    in0 = (v16u8)__msa_ld_b((void*)src_uv, 0);
    in1 = (v16u8)__msa_ld_b((void*)src_uv, 16);
    in2 = (v16u8)__msa_ld_b((void*)src_uv, 32);
    in3 = (v16u8)__msa_ld_b((void*)src_uv, 48);
    // pckev gathers the even (U) bytes, pckod the odd (V) bytes.
    out_u0 = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0);
    out_u1 = (v16u8)__msa_pckev_b((v16i8)in3, (v16i8)in2);
    out_v0 = (v16u8)__msa_pckod_b((v16i8)in1, (v16i8)in0);
    out_v1 = (v16u8)__msa_pckod_b((v16i8)in3, (v16i8)in2);
    ST_UB2(out_u0, out_u1, dst_u, 16);
    ST_UB2(out_v0, out_v1, dst_v, 16);
    src_uv += 64;
    dst_u += 32;
    dst_v += 32;
  }
}
3307
// Fills 'width' bytes of dst with the byte value v8, 16 bytes per store.
void SetRow_MSA(uint8_t* dst, uint8_t v8, int width) {
  int i;
  v16u8 fill = (v16u8)__msa_fill_b(v8);

  for (i = 0; i < width; i += 16, dst += 16) {
    ST_UB(fill, dst);
  }
}
3317
// Copies a row of interleaved UV pixels in reverse pixel order while
// splitting it: U bytes go to dst_u, V bytes to dst_v.  Handles 32 UV pairs
// (64 source bytes) per iteration, walking the source from its end.
void MirrorUVRow_MSA(const uint8_t* src_uv,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  int i;
  v16u8 lo0, lo1, hi0, hi1;
  v16u8 u0, u1, v0, v1;
  // Reversing selectors over a 32-byte (two-vector) window: even source
  // bytes (U) and odd source bytes (V), read back-to-front.
  v16i8 rev_even = {30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0};
  v16i8 rev_odd = {31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1};

  src_uv += (2 * width);  // Start one past the last UV pair.

  for (i = 0; i < width; i += 32) {
    src_uv -= 64;
    lo0 = (v16u8)__msa_ld_b((void*)src_uv, 0);
    lo1 = (v16u8)__msa_ld_b((void*)src_uv, 16);
    hi0 = (v16u8)__msa_ld_b((void*)src_uv, 32);
    hi1 = (v16u8)__msa_ld_b((void*)src_uv, 48);
    // The bytes nearest the end of the source row are emitted first.
    v0 = (v16u8)__msa_vshf_b(rev_odd, (v16i8)hi1, (v16i8)hi0);
    v1 = (v16u8)__msa_vshf_b(rev_odd, (v16i8)lo1, (v16i8)lo0);
    u0 = (v16u8)__msa_vshf_b(rev_even, (v16i8)hi1, (v16i8)hi0);
    u1 = (v16u8)__msa_vshf_b(rev_even, (v16i8)lo1, (v16i8)lo0);
    ST_UB2(v0, v1, dst_v, 16);
    ST_UB2(u0, u1, dst_u, 16);
    dst_u += 32;
    dst_v += 32;
  }
}
3346
// Sobel X filter over three consecutive input rows.  Each output byte is
// the absolute value of the weighted horizontal differences
// row0 + 2*row1 + row2, clamped to [0, 255].  Produces 16 output pixels
// per iteration.
void SobelXRow_MSA(const uint8_t* src_y0,
                   const uint8_t* src_y1,
                   const uint8_t* src_y2,
                   uint8_t* dst_sobelx,
                   int32_t width) {
  int x;
  v16u8 src0, src1, src2, src3, src4, src5, dst0;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
  // Pairs byte i with byte i+2 so the following hsub yields the
  // horizontal difference two pixels apart.
  v16i8 mask0 = {0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9};
  v16i8 tmp = __msa_ldi_b(8);
  v16i8 mask1 = mask0 + tmp;  // Same pairing shifted to bytes 8..17.
  v8i16 zero = {0};
  v8i16 max = __msa_ldi_h(255);

  for (x = 0; x < width; x += 16) {
    // Each row needs 18 pixels of context, hence the second 16-byte load.
    src0 = (v16u8)__msa_ld_b((void*)src_y0, 0);
    src1 = (v16u8)__msa_ld_b((void*)src_y0, 16);
    src2 = (v16u8)__msa_ld_b((void*)src_y1, 0);
    src3 = (v16u8)__msa_ld_b((void*)src_y1, 16);
    src4 = (v16u8)__msa_ld_b((void*)src_y2, 0);
    src5 = (v16u8)__msa_ld_b((void*)src_y2, 16);
    // Arrange (p[i], p[i+2]) byte pairs for each row.
    vec0 = (v8i16)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0);
    vec1 = (v8i16)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
    vec2 = (v8i16)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2);
    vec3 = (v8i16)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2);
    vec4 = (v8i16)__msa_vshf_b(mask0, (v16i8)src5, (v16i8)src4);
    vec5 = (v8i16)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4);
    // Horizontal difference within each byte pair, per row.
    vec0 = (v8i16)__msa_hsub_u_h((v16u8)vec0, (v16u8)vec0);
    vec1 = (v8i16)__msa_hsub_u_h((v16u8)vec1, (v16u8)vec1);
    vec2 = (v8i16)__msa_hsub_u_h((v16u8)vec2, (v16u8)vec2);
    vec3 = (v8i16)__msa_hsub_u_h((v16u8)vec3, (v16u8)vec3);
    vec4 = (v8i16)__msa_hsub_u_h((v16u8)vec4, (v16u8)vec4);
    vec5 = (v8i16)__msa_hsub_u_h((v16u8)vec5, (v16u8)vec5);
    // Accumulate row0 + 2*row1 + row2 (the middle row is added twice).
    vec0 += vec2;
    vec1 += vec3;
    vec4 += vec2;
    vec5 += vec3;
    vec0 += vec4;
    vec1 += vec5;
    // Absolute value (add_a with zero), clamp to [0, 255], pack to bytes.
    vec0 = __msa_add_a_h(zero, vec0);
    vec1 = __msa_add_a_h(zero, vec1);
    vec0 = __msa_maxi_s_h(vec0, 0);
    vec1 = __msa_maxi_s_h(vec1, 0);
    vec0 = __msa_min_s_h(max, vec0);
    vec1 = __msa_min_s_h(max, vec1);
    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    ST_UB(dst0, dst_sobelx);
    src_y0 += 16;
    src_y1 += 16;
    src_y2 += 16;
    dst_sobelx += 16;
  }
}
3400
// Sobel Y filter over two consecutive input rows.  The vertical difference
// d[i] = row0[i] - row1[i] is smoothed horizontally with weights 1,2,1
// (d[i] + 2*d[i+1] + d[i+2]); the absolute value is clamped to [0, 255].
// Produces 16 output pixels per iteration.
void SobelYRow_MSA(const uint8_t* src_y0,
                   const uint8_t* src_y1,
                   uint8_t* dst_sobely,
                   int32_t width) {
  int x;
  v16u8 src0, src1, dst0;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6;
  v8i16 zero = {0};
  v8i16 max = __msa_ldi_h(255);

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((void*)src_y0, 0);
    src1 = (v16u8)__msa_ld_b((void*)src_y1, 0);
    // Widen both rows to 16-bit lanes and form the vertical differences.
    vec0 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src0);
    vec1 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src0);
    vec2 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
    vec3 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src1);
    vec0 -= vec2;
    vec1 -= vec3;
    // The sliding window needs two extra differences past the 16 loaded
    // pixels; compute them scalar into the low lanes of vec6 (only
    // elements 0 and 1 are consumed by the sldi shifts below).
    vec6[0] = src_y0[16] - src_y1[16];
    vec6[1] = src_y0[17] - src_y1[17];
    // vec2/vec3 = differences shifted by 1 pixel, vec4/vec5 by 2 pixels
    // (sldi shifts by 2 and 4 bytes over 16-bit lanes).
    vec2 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 2);
    vec3 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 2);
    vec4 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 4);
    vec5 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 4);
    // Accumulate d[i] + 2*d[i+1] + d[i+2] (middle tap added twice).
    vec0 += vec2;
    vec1 += vec3;
    vec4 += vec2;
    vec5 += vec3;
    vec0 += vec4;
    vec1 += vec5;
    // Absolute value (add_a with zero), clamp to [0, 255], pack to bytes.
    vec0 = __msa_add_a_h(zero, vec0);
    vec1 = __msa_add_a_h(zero, vec1);
    vec0 = __msa_maxi_s_h(vec0, 0);
    vec1 = __msa_maxi_s_h(vec1, 0);
    vec0 = __msa_min_s_h(max, vec0);
    vec1 = __msa_min_s_h(max, vec1);
    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    ST_UB(dst0, dst_sobely);
    src_y0 += 16;
    src_y1 += 16;
    dst_sobely += 16;
  }
}
3445
// Converts 32 uint16 values per iteration to scaled 16-bit half floats.
// Trick: 1.9259299444e-34f is 2^-112, so multiplying the float value by
// (scale * 2^-112) rebias its exponent such that shifting the raw
// single-precision bits right by 13 leaves the half-float bit pattern
// (truncating rounding) — presumably valid only while scale * src stays in
// the normal half range; TODO confirm against the scalar HalfFloatRow_C.
void HalfFloatRow_MSA(const uint16_t* src,
                      uint16_t* dst,
                      float scale,
                      int width) {
  int i;
  v8u16 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
  v4u32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v4f32 fvec0, fvec1, fvec2, fvec3, fvec4, fvec5, fvec6, fvec7;
  v4f32 mult_vec;
  v8i16 zero = {0};
  // Compute the combined multiplier once and broadcast it to all 4 lanes.
  mult_vec[0] = 1.9259299444e-34f * scale;
  mult_vec = (v4f32)__msa_splati_w((v4i32)mult_vec, 0);

  for (i = 0; i < width; i += 32) {
    src0 = (v8u16)__msa_ld_h((void*)src, 0);
    src1 = (v8u16)__msa_ld_h((void*)src, 16);
    src2 = (v8u16)__msa_ld_h((void*)src, 32);
    src3 = (v8u16)__msa_ld_h((void*)src, 48);
    // Zero-extend the 16-bit inputs to 32-bit lanes.
    vec0 = (v4u32)__msa_ilvr_h(zero, (v8i16)src0);
    vec1 = (v4u32)__msa_ilvl_h(zero, (v8i16)src0);
    vec2 = (v4u32)__msa_ilvr_h(zero, (v8i16)src1);
    vec3 = (v4u32)__msa_ilvl_h(zero, (v8i16)src1);
    vec4 = (v4u32)__msa_ilvr_h(zero, (v8i16)src2);
    vec5 = (v4u32)__msa_ilvl_h(zero, (v8i16)src2);
    vec6 = (v4u32)__msa_ilvr_h(zero, (v8i16)src3);
    vec7 = (v4u32)__msa_ilvl_h(zero, (v8i16)src3);
    // Convert to float and apply the combined scale/bias multiplier.
    fvec0 = __msa_ffint_u_w(vec0);
    fvec1 = __msa_ffint_u_w(vec1);
    fvec2 = __msa_ffint_u_w(vec2);
    fvec3 = __msa_ffint_u_w(vec3);
    fvec4 = __msa_ffint_u_w(vec4);
    fvec5 = __msa_ffint_u_w(vec5);
    fvec6 = __msa_ffint_u_w(vec6);
    fvec7 = __msa_ffint_u_w(vec7);
    fvec0 *= mult_vec;
    fvec1 *= mult_vec;
    fvec2 *= mult_vec;
    fvec3 *= mult_vec;
    fvec4 *= mult_vec;
    fvec5 *= mult_vec;
    fvec6 *= mult_vec;
    fvec7 *= mult_vec;
    // Reinterpret the float bits and shift so the half-float pattern lands
    // in the low 16 bits of each lane.
    vec0 = ((v4u32)fvec0) >> 13;
    vec1 = ((v4u32)fvec1) >> 13;
    vec2 = ((v4u32)fvec2) >> 13;
    vec3 = ((v4u32)fvec3) >> 13;
    vec4 = ((v4u32)fvec4) >> 13;
    vec5 = ((v4u32)fvec5) >> 13;
    vec6 = ((v4u32)fvec6) >> 13;
    vec7 = ((v4u32)fvec7) >> 13;
    // Keep the even (low) halfwords of each lane and store 32 results.
    dst0 = (v8u16)__msa_pckev_h((v8i16)vec1, (v8i16)vec0);
    dst1 = (v8u16)__msa_pckev_h((v8i16)vec3, (v8i16)vec2);
    dst2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4);
    dst3 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6);
    ST_UH2(dst0, dst1, dst, 8);
    ST_UH2(dst2, dst3, dst + 16, 8);
    src += 32;
    dst += 32;
  }
}
3506
3507 #ifdef __cplusplus
3508 } // extern "C"
3509 } // namespace libyuv
3510 #endif
3511
3512 #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
3513