1 /*
2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #ifndef VPX_DSP_MIPS_MACROS_MSA_H_
12 #define VPX_DSP_MIPS_MACROS_MSA_H_
13 
14 #include <msa.h>
15 
16 #include "./vpx_config.h"
17 #include "vpx/vpx_integer.h"
18 
/* Description : Load one value of type 'RTYPE' from memory
   Arguments   : Inputs - RTYPE, psrc
                 Return Type - as per RTYPE
   Details     : 'psrc' is cast to 'const RTYPE *' and dereferenced.
                 The whole expansion is parenthesized so that the result can
                 also be used as the operand of a postfix operator such as
                 '[]' or '.'
                 LD_UB / LD_SB / LD_UH / LD_SH / LD_SW fix RTYPE to the
                 corresponding vector type.
*/
#define LD_B(RTYPE, psrc) (*((const RTYPE *)(psrc)))
#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
#define LD_SB(...) LD_B(v16i8, __VA_ARGS__)

#define LD_H(RTYPE, psrc) (*((const RTYPE *)(psrc)))
#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)

#define LD_W(RTYPE, psrc) (*((const RTYPE *)(psrc)))
#define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
29 
/* Description : Store one value of type 'RTYPE' to memory
   Arguments   : Inputs - RTYPE, in, pdst
   Details     : 'pdst' is cast to 'RTYPE *' and 'in' is assigned through it.
                 The assignment is parenthesized so that the macro remains a
                 single expression when it is used inside a larger one.
                 ST_UB / ST_SB / ST_SH / ST_SW fix RTYPE to the corresponding
                 vector type.
*/
#define ST_B(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)

#define ST_H(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)

#define ST_W(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
39 
40 #if (__mips_isa_rev >= 6)
/* Description : Load a halfword from memory (branch for __mips_isa_rev >= 6)
   Arguments   : Input  - psrc
                 Return - uint16_t
   Details     : 'psrc' is viewed as a byte pointer and given to one 'lh'
                 instruction as its memory operand. The loaded value is the
                 result of the statement expression.
                 NOTE(review): the memory operand is typed as a single
                 uint8_t although a halfword is loaded - confirm the
                 compiler cannot reorder neighbouring accesses around it.
*/
#define LH(psrc) \
  ({ \
    const uint8_t *psrc_m = (const uint8_t *)(psrc); \
    uint16_t val_m; \
    __asm__ __volatile__("lh  %[val_m],  %[psrc_m]  \n\t" \
                         : [val_m] "=r"(val_m) \
                         : [psrc_m] "m"(*psrc_m)); \
    val_m; \
  })
54 
/* Description : Load a word from memory (branch for __mips_isa_rev >= 6)
   Arguments   : Input  - psrc
                 Return - uint32_t
   Details     : 'psrc' is viewed as a byte pointer and given to one 'lw'
                 instruction as its memory operand. The loaded value is the
                 result of the statement expression.
                 NOTE(review): the memory operand is typed as a single
                 uint8_t although a word is loaded - confirm the compiler
                 cannot reorder neighbouring accesses around it.
*/
#define LW(psrc) \
  ({ \
    const uint8_t *psrc_m = (const uint8_t *)(psrc); \
    uint32_t val_m; \
    __asm__ __volatile__("lw  %[val_m],  %[psrc_m]  \n\t" \
                         : [val_m] "=r"(val_m) \
                         : [psrc_m] "m"(*psrc_m)); \
    val_m; \
  })
68 
/* Description : Load a double word from memory (branch for
                 __mips_isa_rev >= 6)
   Arguments   : Input  - psrc
                 Return - uint64_t
   Details     : When __mips == 64 a single 'ld' instruction is used.
                 Otherwise two words are loaded with LW from (psrc) and
                 (psrc + 4); the first becomes bits 0..31 and the second
                 bits 32..63 of the result.
                 The local pointer of the two-word variant is named
                 'psrc_m1' on purpose: LW declares its own 'psrc_m', so
                 passing a variable of that same name would make LW's
                 pointer initialize from itself.
*/
#if (__mips == 64)
#define LD(psrc) \
  ({ \
    const uint8_t *psrc_m = (const uint8_t *)(psrc); \
    uint64_t val_m = 0; \
    __asm__ __volatile__("ld  %[val_m],  %[psrc_m]  \n\t" \
                         : [val_m] "=r"(val_m) \
                         : [psrc_m] "m"(*psrc_m)); \
    val_m; \
  })
#else  // !(__mips == 64)
#define LD(psrc) \
  ({ \
    const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \
    uint32_t val0_m, val1_m; \
    uint64_t val_m = 0; \
    val0_m = LW(psrc_m1); \
    val1_m = LW(psrc_m1 + 4); \
    val_m = (uint64_t)(val1_m); \
    val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \
    val_m = (uint64_t)(val_m | (uint64_t)val0_m); \
    val_m; \
  })
#endif  // (__mips == 64)
99 
/* Description : Store a halfword to memory (branch for __mips_isa_rev >= 6)
   Arguments   : Inputs - val, pdst
   Details     : 'val' is converted to uint16_t and written with one 'sh'
                 instruction whose memory operand is the byte at 'pdst'.
                 NOTE(review): the output operand is typed as a single
                 uint8_t although a halfword is stored - confirm the
                 compiler cannot reorder neighbouring accesses around it.
*/
#define SH(val, pdst) \
  { \
    uint8_t *pdst_m = (uint8_t *)(pdst); \
    const uint16_t val_m = (val); \
    __asm__ __volatile__("sh  %[val_m],  %[pdst_m]  \n\t" \
                         : [pdst_m] "=m"(*pdst_m) \
                         : [val_m] "r"(val_m)); \
  }
111 
/* Description : Store a word to memory (branch for __mips_isa_rev >= 6)
   Arguments   : Inputs - val, pdst
   Details     : 'val' is converted to uint32_t and written with one 'sw'
                 instruction whose memory operand is the byte at 'pdst'.
                 NOTE(review): the output operand is typed as a single
                 uint8_t although a word is stored - confirm the compiler
                 cannot reorder neighbouring accesses around it.
*/
#define SW(val, pdst) \
  { \
    uint8_t *pdst_m = (uint8_t *)(pdst); \
    const uint32_t val_m = (val); \
    __asm__ __volatile__("sw  %[val_m],  %[pdst_m]  \n\t" \
                         : [pdst_m] "=m"(*pdst_m) \
                         : [val_m] "r"(val_m)); \
  }
123 
/* Description : Store a double word to memory (branch for
                 __mips_isa_rev >= 6)
   Arguments   : Inputs - val, pdst
   Details     : When __mips == 64 the value is written with one 'sd'
                 instruction. Otherwise it is split into two 32-bit halves
                 that are written with SW: bits 0..31 to (pdst) and bits
                 32..63 to (pdst + 4). This mirrors the split that the LD
                 macro of this branch applies to loads.
                 The locals of the split variant avoid the names 'pdst_m'
                 and 'val_m' because SW declares those itself.
*/
#if (__mips == 64)
#define SD(val, pdst) { \
  uint8_t *pdst_m = (uint8_t *)(pdst); \
  const uint64_t val_m = (val); \
  __asm__ __volatile__ ( \
      "sd  %[val_m],  %[pdst_m]  \n\t" \
      : [pdst_m] "=m" (*pdst_m) \
      : [val_m] "r" (val_m) \
  ); \
}
#else  // !(__mips == 64)
#define SD(val, pdst) { \
  uint8_t *pdst_m1 = (uint8_t *)(pdst); \
  const uint64_t val_sd_m = (val); \
  const uint32_t val0_m = (uint32_t)(val_sd_m & 0x00000000FFFFFFFF); \
  const uint32_t val1_m = (uint32_t)((val_sd_m >> 32) & 0x00000000FFFFFFFF); \
  SW(val0_m, pdst_m1); \
  SW(val1_m, pdst_m1 + 4); \
}
#endif  // (__mips == 64)
135 #else  // !(__mips_isa_rev >= 6)
/* Description : Load a halfword from memory (branch for __mips_isa_rev < 6)
   Arguments   : Input  - psrc
                 Return - uint16_t
   Details     : 'psrc' is viewed as a byte pointer and given to the 'ulh'
                 mnemonic as its memory operand (presumably the assembler's
                 unaligned-load form - confirm). The loaded value is the
                 result of the statement expression.
*/
#define LH(psrc) \
  ({ \
    const uint8_t *psrc_m = (const uint8_t *)(psrc); \
    uint16_t val_m; \
    __asm__ __volatile__("ulh  %[val_m],  %[psrc_m]  \n\t" \
                         : [val_m] "=r"(val_m) \
                         : [psrc_m] "m"(*psrc_m)); \
    val_m; \
  })
149 
/* Description : Load a word from memory (branch for __mips_isa_rev < 6)
   Arguments   : Input  - psrc
                 Return - uint32_t
   Details     : 'psrc' is viewed as a byte pointer and given to the 'ulw'
                 mnemonic as its memory operand (presumably the assembler's
                 unaligned-load form - confirm). The loaded value is the
                 result of the statement expression.
*/
#define LW(psrc) \
  ({ \
    const uint8_t *psrc_m = (const uint8_t *)(psrc); \
    uint32_t val_m; \
    __asm__ __volatile__("ulw  %[val_m],  %[psrc_m]  \n\t" \
                         : [val_m] "=r"(val_m) \
                         : [psrc_m] "m"(*psrc_m)); \
    val_m; \
  })
163 
/* Description : Load a double word from memory (branch for
                 __mips_isa_rev < 6)
   Arguments   : Input  - psrc
                 Return - uint64_t
   Details     : When __mips == 64 the 'uld' mnemonic is used. Otherwise
                 two words are loaded with LW from (psrc) and (psrc + 4);
                 the first becomes bits 0..31 and the second bits 32..63
                 of the result.
                 The two-word variant keeps its pointer in 'psrc_m1' so it
                 does not clash with the 'psrc_m' that LW declares.
*/
#if (__mips == 64)
#define LD(psrc) \
  ({ \
    const uint8_t *psrc_m = (const uint8_t *)(psrc); \
    uint64_t val_m = 0; \
    __asm__ __volatile__("uld  %[val_m],  %[psrc_m]  \n\t" \
                         : [val_m] "=r"(val_m) \
                         : [psrc_m] "m"(*psrc_m)); \
    val_m; \
  })
#else  // !(__mips == 64)
#define LD(psrc) \
  ({ \
    const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \
    uint32_t val0_m, val1_m; \
    uint64_t val_m = 0; \
    val0_m = LW(psrc_m1); \
    val1_m = LW(psrc_m1 + 4); \
    val_m = ((uint64_t)val1_m << 32) | (uint64_t)val0_m; \
    val_m; \
  })
#endif  // (__mips == 64)
194 
/* Description : Store a halfword to memory (branch for __mips_isa_rev < 6)
   Arguments   : Inputs - val, pdst
   Details     : 'val' is converted to uint16_t and written with the 'ush'
                 mnemonic (presumably the assembler's unaligned-store form -
                 confirm) whose memory operand is the byte at 'pdst'.
*/
#define SH(val, pdst) \
  { \
    uint8_t *pdst_m = (uint8_t *)(pdst); \
    const uint16_t val_m = (val); \
    __asm__ __volatile__("ush  %[val_m],  %[pdst_m]  \n\t" \
                         : [pdst_m] "=m"(*pdst_m) \
                         : [val_m] "r"(val_m)); \
  }
206 
/* Description : Store a word to memory (branch for __mips_isa_rev < 6)
   Arguments   : Inputs - val, pdst
   Details     : 'val' is converted to uint32_t and written with the 'usw'
                 mnemonic (presumably the assembler's unaligned-store form -
                 confirm) whose memory operand is the byte at 'pdst'.
*/
#define SW(val, pdst) \
  { \
    uint8_t *pdst_m = (uint8_t *)(pdst); \
    const uint32_t val_m = (val); \
    __asm__ __volatile__("usw  %[val_m],  %[pdst_m]  \n\t" \
                         : [pdst_m] "=m"(*pdst_m) \
                         : [val_m] "r"(val_m)); \
  }
218 
/* Description : Store a double word to memory (branch for
                 __mips_isa_rev < 6)
   Arguments   : Inputs - val, pdst
   Details     : 'val' is evaluated once and widened to uint64_t, so an
                 argument with side effects or of a narrower type is
                 handled safely. Bits 0..31 are stored with SW to (pdst)
                 and bits 32..63 to (pdst + 4).
                 The locals avoid the names 'pdst_m' and 'val_m' because
                 SW declares those itself.
*/
#define SD(val, pdst) { \
  uint8_t *pdst_m1 = (uint8_t *)(pdst); \
  const uint64_t val_sd_m = (val); \
  uint32_t val0_m, val1_m; \
  val0_m = (uint32_t)(val_sd_m & 0x00000000FFFFFFFF); \
  val1_m = (uint32_t)((val_sd_m >> 32) & 0x00000000FFFFFFFF); \
  SW(val0_m, pdst_m1); \
  SW(val1_m, pdst_m1 + 4); \
}
229 #endif  // (__mips_isa_rev >= 6)
230 
/* Description : Load 4 words with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1, out2, out3
   Details     : Load word in 'out0' from (psrc)
                 Load word in 'out1' from (psrc + stride)
                 Load word in 'out2' from (psrc + 2 * stride)
                 Load word in 'out3' from (psrc + 3 * stride)
                 'stride' is parenthesized in the expansion so that an
                 expression argument is scaled as a whole.
*/
#define LW4(psrc, stride, out0, out1, out2, out3) { \
  out0 = LW((psrc)); \
  out1 = LW((psrc) + (stride)); \
  out2 = LW((psrc) + 2 * (stride)); \
  out3 = LW((psrc) + 3 * (stride)); \
}
245 
/* Description : Load double words with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1 (LD4: out0, out1, out2, out3)
   Details     : Load double word in 'out0' from (psrc)
                 Load double word in 'out1' from (psrc + stride)
                 LD4 additionally loads 'out2' from (psrc + 2 * stride)
                 and 'out3' from (psrc + 3 * stride)
                 'stride' is parenthesized in the expansion so that an
                 expression argument is scaled as a whole.
*/
#define LD2(psrc, stride, out0, out1) { \
  out0 = LD((psrc)); \
  out1 = LD((psrc) + (stride)); \
}
#define LD4(psrc, stride, out0, out1, out2, out3) { \
  LD2((psrc), (stride), out0, out1); \
  LD2((psrc) + 2 * (stride), (stride), out2, out3); \
}
260 
/* Description : Store 4 words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Store word from 'in0' to (pdst)
                 Store word from 'in1' to (pdst + stride)
                 Store word from 'in2' to (pdst + 2 * stride)
                 Store word from 'in3' to (pdst + 3 * stride)
                 'stride' is parenthesized in the expansion so that an
                 expression argument is scaled as a whole.
*/
#define SW4(in0, in1, in2, in3, pdst, stride) { \
  SW(in0, (pdst)); \
  SW(in1, (pdst) + (stride)); \
  SW(in2, (pdst) + 2 * (stride)); \
  SW(in3, (pdst) + 3 * (stride)); \
}
274 
/* Description : Store 4 double words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Store double word from 'in0' to (pdst)
                 Store double word from 'in1' to (pdst + stride)
                 Store double word from 'in2' to (pdst + 2 * stride)
                 Store double word from 'in3' to (pdst + 3 * stride)
                 'stride' is parenthesized in the expansion so that an
                 expression argument is scaled as a whole.
*/
#define SD4(in0, in1, in2, in3, pdst, stride) { \
  SD(in0, (pdst)); \
  SD(in1, (pdst) + (stride)); \
  SD(in2, (pdst) + 2 * (stride)); \
  SD(in3, (pdst) + 3 * (stride)); \
}
288 
/* Description : Load vectors with 16 byte elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1 (and further outputs for the wider
                           variants)
                 Return Type - as per RTYPE
   Details     : Load 16 byte elements in 'out0' from (psrc)
                 Load 16 byte elements in 'out1' from (psrc + stride)
                 LD_B3 .. LD_B8 continue with 'outK' from
                 (psrc + K * stride)
                 'stride' is parenthesized in every expansion so that an
                 expression argument is scaled as a whole.
*/
#define LD_B2(RTYPE, psrc, stride, out0, out1) { \
  out0 = LD_B(RTYPE, (psrc)); \
  out1 = LD_B(RTYPE, (psrc) + (stride)); \
}
#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)

#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) { \
  LD_B2(RTYPE, (psrc), (stride), out0, out1); \
  out2 = LD_B(RTYPE, (psrc) + 2 * (stride)); \
}
#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)

#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) { \
  LD_B2(RTYPE, (psrc), (stride), out0, out1); \
  LD_B2(RTYPE, (psrc) + 2 * (stride), (stride), out2, out3); \
}
#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)

#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) { \
  LD_B4(RTYPE, (psrc), (stride), out0, out1, out2, out3); \
  out4 = LD_B(RTYPE, (psrc) + 4 * (stride)); \
}
#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)

#define LD_B7(RTYPE, psrc, stride, \
              out0, out1, out2, out3, out4, out5, out6) { \
  LD_B5(RTYPE, (psrc), (stride), out0, out1, out2, out3, out4); \
  LD_B2(RTYPE, (psrc) + 5 * (stride), (stride), out5, out6); \
}
#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__)

#define LD_B8(RTYPE, psrc, stride, \
              out0, out1, out2, out3, out4, out5, out6, out7) { \
  LD_B4(RTYPE, (psrc), (stride), out0, out1, out2, out3); \
  LD_B4(RTYPE, (psrc) + 4 * (stride), (stride), out4, out5, out6, out7); \
}
#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
337 
/* Description : Load vectors with 8 halfword elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1 (and further outputs for the wider
                           variants)
   Details     : Load 8 halfword elements in 'out0' from (psrc)
                 Load 8 halfword elements in 'out1' from (psrc + stride)
                 LD_H4 / LD_H8 / LD_H16 continue with 'outK' from
                 (psrc + K * stride)
                 'stride' is parenthesized in every expansion so that an
                 expression argument is scaled as a whole.
*/
#define LD_H2(RTYPE, psrc, stride, out0, out1) { \
  out0 = LD_H(RTYPE, (psrc)); \
  out1 = LD_H(RTYPE, (psrc) + (stride)); \
}
#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)

#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) { \
  LD_H2(RTYPE, (psrc), (stride), out0, out1); \
  LD_H2(RTYPE, (psrc) + 2 * (stride), (stride), out2, out3); \
}
#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)

#define LD_H8(RTYPE, psrc, stride, \
              out0, out1, out2, out3, out4, out5, out6, out7) { \
  LD_H4(RTYPE, (psrc), (stride), out0, out1, out2, out3); \
  LD_H4(RTYPE, (psrc) + 4 * (stride), (stride), out4, out5, out6, out7); \
}
#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__)

#define LD_H16(RTYPE, psrc, stride, \
               out0, out1, out2, out3, out4, out5, out6, out7, \
               out8, out9, out10, out11, out12, out13, out14, out15) { \
  LD_H8(RTYPE, (psrc), (stride), \
        out0, out1, out2, out3, out4, out5, out6, out7); \
  LD_H8(RTYPE, (psrc) + 8 * (stride), (stride), \
        out8, out9, out10, out11, out12, out13, out14, out15); \
}
#define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__)
372 
/* Description : Load 4x4 block of signed halfword elements from 1D source
                 data into 4 vectors (Each vector with 4 signed halfwords)
   Arguments   : Input   - psrc
                 Outputs - out0, out1, out2, out3
   Details     : 'out0' is loaded from (psrc) and 'out2' from (psrc + 8).
                 'out1' and 'out3' are produced by applying __msa_ilvl_d
                 to 'out0' and 'out2' respectively, each paired with itself.
                 'psrc' is parenthesized before the offset is added so that
                 an expression argument is offset as a whole.
*/
#define LD4x4_SH(psrc, out0, out1, out2, out3) { \
  out0 = LD_SH((psrc)); \
  out2 = LD_SH((psrc) + 8); \
  out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \
  out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \
}
384 
/* Description : Load 2 vectors of signed word elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1
                 Return Type - signed word
   Details     : Load 'out0' from (psrc) and 'out1' from (psrc + stride)
                 'stride' is parenthesized in the expansion so that an
                 expression argument is added as a whole.
*/
#define LD_SW2(psrc, stride, out0, out1) { \
  out0 = LD_SW((psrc)); \
  out1 = LD_SW((psrc) + (stride)); \
}
394 
/* Description : Store vectors of 16 byte elements with stride
   Arguments   : Inputs - in0, in1, pdst, stride (more inputs for the
                          wider variants)
   Details     : Store 16 byte elements from 'in0' to (pdst)
                 Store 16 byte elements from 'in1' to (pdst + stride)
                 ST_B4 / ST_B8 continue with 'inK' to (pdst + K * stride)
                 'pdst' and 'stride' are parenthesized in every expansion
                 so that expression arguments are used as a whole.
*/
#define ST_B2(RTYPE, in0, in1, pdst, stride) { \
  ST_B(RTYPE, in0, (pdst)); \
  ST_B(RTYPE, in1, (pdst) + (stride)); \
}
#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)

#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) { \
  ST_B2(RTYPE, in0, in1, (pdst), (stride)); \
  ST_B2(RTYPE, in2, in3, (pdst) + 2 * (stride), (stride)); \
}
#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)

#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
              pdst, stride) { \
  ST_B4(RTYPE, in0, in1, in2, in3, (pdst), (stride)); \
  ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * (stride), (stride)); \
}
#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
418 
/* Description : Store vectors of 8 halfword elements with stride
   Arguments   : Inputs - in0, in1, pdst, stride (more inputs for the
                          wider variants)
   Details     : Store 8 halfword elements from 'in0' to (pdst)
                 Store 8 halfword elements from 'in1' to (pdst + stride)
                 ST_H4 / ST_H8 continue with 'inK' to (pdst + K * stride)
                 'stride' is parenthesized in every expansion so that an
                 expression argument is scaled as a whole.
*/
#define ST_H2(RTYPE, in0, in1, pdst, stride) { \
  ST_H(RTYPE, in0, (pdst)); \
  ST_H(RTYPE, in1, (pdst) + (stride)); \
}
#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)

#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) { \
  ST_H2(RTYPE, in0, in1, (pdst), (stride)); \
  ST_H2(RTYPE, in2, in3, (pdst) + 2 * (stride), (stride)); \
}
#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__)

#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) { \
  ST_H4(RTYPE, in0, in1, in2, in3, (pdst), (stride)); \
  ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * (stride), (stride)); \
}
#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)
441 
/* Description : Store vectors of word elements with stride
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Store 4 word elements from 'in0' to (pdst)
                 Store 4 word elements from 'in1' to (pdst + stride)
                 'stride' is parenthesized in the expansion so that an
                 expression argument is added as a whole.
*/
#define ST_SW2(in0, in1, pdst, stride) { \
  ST_SW(in0, (pdst)); \
  ST_SW(in1, (pdst) + (stride)); \
}
451 
/* Description : Store 2x4 byte block to destination memory from input vector
   Arguments   : Inputs - in, stidx, pdst, stride
   Details     : Index 'stidx' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst)
                 Index 'stidx+1' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst + stride)
                 Index 'stidx+2' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst + 2 * stride)
                 Index 'stidx+3' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst + 3 * stride)
                 'in', 'stidx' and 'stride' are parenthesized in the
                 expansion so that expression arguments are used as a whole.
*/
#define ST2x4_UB(in, stidx, pdst, stride) { \
  uint16_t out0_m, out1_m, out2_m, out3_m; \
  uint8_t *pblk_2x4_m = (uint8_t *)(pdst); \
  out0_m = __msa_copy_u_h((v8i16)(in), (stidx)); \
  out1_m = __msa_copy_u_h((v8i16)(in), ((stidx) + 1)); \
  out2_m = __msa_copy_u_h((v8i16)(in), ((stidx) + 2)); \
  out3_m = __msa_copy_u_h((v8i16)(in), ((stidx) + 3)); \
  SH(out0_m, pblk_2x4_m); \
  SH(out1_m, pblk_2x4_m + (stride)); \
  SH(out2_m, pblk_2x4_m + 2 * (stride)); \
  SH(out3_m, pblk_2x4_m + 3 * (stride)); \
}
477 
/* Description : Store 4x2 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst, stride
   Details     : Index 0 word element from 'in' vector is copied to the GP
                 register and stored to (pdst)
                 Index 1 word element from 'in' vector is copied to the GP
                 register and stored to (pdst + stride)
                 'in' and 'stride' are parenthesized in the expansion so
                 that expression arguments are used as a whole.
*/
#define ST4x2_UB(in, pdst, stride) { \
  uint32_t out0_m, out1_m; \
  uint8_t *pblk_4x2_m = (uint8_t *)(pdst); \
  out0_m = __msa_copy_u_w((v4i32)(in), 0); \
  out1_m = __msa_copy_u_w((v4i32)(in), 1); \
  SW(out0_m, pblk_4x2_m); \
  SW(out1_m, pblk_4x2_m + (stride)); \
}
495 
/* Description : Store 4x4 byte block to destination memory from input vectors
   Arguments   : Inputs - in0, in1, idx0, idx1, idx2, idx3, pdst, stride
   Details     : 'idx0' word element from input vector 'in0' is copied to the
                 GP register and stored to (pdst)
                 'idx1' word element from input vector 'in0' is copied to the
                 GP register and stored to (pdst + stride)
                 'idx2' word element from input vector 'in1' is copied to the
                 GP register and stored to (pdst + 2 * stride)
                 'idx3' word element from input vector 'in1' is copied to the
                 GP register and stored to (pdst + 3 * stride)
                 ST4x8_UB stores word elements 0..3 of 'in0' to rows 0..3
                 and word elements 0..3 of 'in1' to rows 4..7 by invoking
                 ST4x4_UB twice.
                 Vector, index and stride arguments are parenthesized in
                 the expansion so that expression arguments are used as a
                 whole.
*/
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) { \
  uint32_t out0_m, out1_m, out2_m, out3_m; \
  uint8_t *pblk_4x4_m = (uint8_t *)(pdst); \
  out0_m = __msa_copy_u_w((v4i32)(in0), (idx0)); \
  out1_m = __msa_copy_u_w((v4i32)(in0), (idx1)); \
  out2_m = __msa_copy_u_w((v4i32)(in1), (idx2)); \
  out3_m = __msa_copy_u_w((v4i32)(in1), (idx3)); \
  SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, (stride)); \
}
#define ST4x8_UB(in0, in1, pdst, stride) { \
  uint8_t *pblk_4x8 = (uint8_t *)(pdst); \
  ST4x4_UB((in0), (in0), 0, 1, 2, 3, pblk_4x8, (stride)); \
  ST4x4_UB((in1), (in1), 0, 1, 2, 3, pblk_4x8 + 4 * (stride), (stride)); \
}
524 
/* Description : Store 8x1 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst
   Details     : Index 0 double word element from 'in' vector is copied to the
                 GP register and stored to (pdst)
                 'in' is parenthesized under the cast so that an expression
                 argument is converted as a whole.
*/
#define ST8x1_UB(in, pdst) { \
  uint64_t out0_m; \
  out0_m = __msa_copy_u_d((v2i64)(in), 0); \
  SD(out0_m, (pdst)); \
}
536 
/* Description : Store 8x2 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst, stride
   Details     : Index 0 double word element from 'in' vector is copied to the
                 GP register and stored to (pdst)
                 Index 1 double word element from 'in' vector is copied to the
                 GP register and stored to (pdst + stride)
                 'in' and 'stride' are parenthesized in the expansion so
                 that expression arguments are used as a whole.
*/
#define ST8x2_UB(in, pdst, stride) { \
  uint64_t out0_m, out1_m; \
  uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \
  out0_m = __msa_copy_u_d((v2i64)(in), 0); \
  out1_m = __msa_copy_u_d((v2i64)(in), 1); \
  SD(out0_m, pblk_8x2_m); \
  SD(out1_m, pblk_8x2_m + (stride)); \
}
554 
/* Description : Store 8x4 byte block to destination memory from input
                 vectors
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Index 0 double word element from 'in0' vector is copied to the
                 GP register and stored to (pdst)
                 Index 1 double word element from 'in0' vector is copied to the
                 GP register and stored to (pdst + stride)
                 Index 0 double word element from 'in1' vector is copied to the
                 GP register and stored to (pdst + 2 * stride)
                 Index 1 double word element from 'in1' vector is copied to the
                 GP register and stored to (pdst + 3 * stride)
                 'in0', 'in1' and 'stride' are parenthesized in the
                 expansion so that expression arguments are used as a whole.
*/
#define ST8x4_UB(in0, in1, pdst, stride) { \
  uint64_t out0_m, out1_m, out2_m, out3_m; \
  uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \
  out0_m = __msa_copy_u_d((v2i64)(in0), 0); \
  out1_m = __msa_copy_u_d((v2i64)(in0), 1); \
  out2_m = __msa_copy_u_d((v2i64)(in1), 0); \
  out3_m = __msa_copy_u_d((v2i64)(in1), 1); \
  SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, (stride)); \
}
578 
/* Description : average with rounding (in0 + in1 + 1) / 2.
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned byte element from 'in0' vector is added with
                 each unsigned byte element from 'in1' vector. Then the average
                 with rounding is calculated and written to 'out0'
                 'out1' is produced the same way from 'in2' and 'in3'.
                 AVER_UB4 applies AVER_UB2 to (in0..in3) and (in4..in7).
                 Inputs are parenthesized under the casts so that
                 expression arguments are converted as a whole.
*/
#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) { \
  out0 = (RTYPE)__msa_aver_u_b((v16u8)(in0), (v16u8)(in1)); \
  out1 = (RTYPE)__msa_aver_u_b((v16u8)(in2), (v16u8)(in3)); \
}
#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)

#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3) { \
  AVER_UB2(RTYPE, (in0), (in1), (in2), (in3), out0, out1); \
  AVER_UB2(RTYPE, (in4), (in5), (in6), (in7), out2, out3); \
}
#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
599 
/* Description : Immediate number of elements to slide with zero
   Arguments   : Inputs  - in0, in1, slide_val
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Byte elements from 'zero_m' vector are slid into 'in0' by
                 value specified in the 'slide_val'
                 'out1' is produced the same way from 'in1'.
                 SLDI_B4_0 applies SLDI_B2_0 to (in0, in1) and (in2, in3).
                 Inputs are parenthesized in the expansion so that
                 expression arguments are used as a whole.
*/
#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) { \
  v16i8 zero_m = { 0 }; \
  out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)(in0), (slide_val)); \
  out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)(in1), (slide_val)); \
}
#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__)

#define SLDI_B4_0(RTYPE, in0, in1, in2, in3, \
                  out0, out1, out2, out3, slide_val) { \
  SLDI_B2_0(RTYPE, (in0), (in1), out0, out1, (slide_val)); \
  SLDI_B2_0(RTYPE, (in2), (in3), out2, out3, (slide_val)); \
}
#define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__)
620 
/* Description : Immediate number of elements to slide
   Arguments   : Inputs  - in0_0, in0_1, in1_0, in1_1, slide_val
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Byte elements from 'in0_0' vector are slid into 'in1_0' by
                 value specified in the 'slide_val'
                 'out1' is produced the same way from 'in0_1' and 'in1_1';
                 SLDI_B3 adds 'out2' from 'in0_2' and 'in1_2'.
                 Inputs are parenthesized in the expansion so that
                 expression arguments are used as a whole.
*/
#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) { \
  out0 = (RTYPE)__msa_sldi_b((v16i8)(in0_0), (v16i8)(in1_0), (slide_val)); \
  out1 = (RTYPE)__msa_sldi_b((v16i8)(in0_1), (v16i8)(in1_1), (slide_val)); \
}
#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)

#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, \
                out0, out1, out2, slide_val) { \
  SLDI_B2(RTYPE, (in0_0), (in0_1), (in1_0), (in1_1), out0, out1, \
          (slide_val)); \
  out2 = (RTYPE)__msa_sldi_b((v16i8)(in0_2), (v16i8)(in1_2), (slide_val)); \
}
#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
642 
/* Description : Shuffle byte vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Byte elements from 'in0' & 'in1' are copied selectively to
                 'out0' as per control vector 'mask0'
                 'out1' is produced the same way from 'in2' & 'in3' with
                 'mask1'.
                 VSHF_B4 shuffles the same pair 'in0' & 'in1' with four
                 masks 'mask0' .. 'mask3' into 'out0' .. 'out3'.
                 Inputs are parenthesized under the casts so that
                 expression arguments are converted as a whole.
*/
#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) { \
  out0 = (RTYPE)__msa_vshf_b((v16i8)(mask0), (v16i8)(in1), (v16i8)(in0)); \
  out1 = (RTYPE)__msa_vshf_b((v16i8)(mask1), (v16i8)(in3), (v16i8)(in2)); \
}
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)

#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, \
                out0, out1, out2, out3) { \
  VSHF_B2(RTYPE, (in0), (in1), (in0), (in1), (mask0), (mask1), out0, out1); \
  VSHF_B2(RTYPE, (in0), (in1), (in0), (in1), (mask2), (mask3), out2, out3); \
}
#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)
665 
/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Unsigned byte elements from 'mult0' are multiplied with
                 unsigned byte elements from 'cnst0' producing a result
                 twice the size of input i.e. unsigned halfword.
                 The multiplication result of adjacent odd-even elements
                 are added together and written to the 'out0' vector.
                 'out1' is computed the same way from 'mult1' and 'cnst1'.
                 Inputs are parenthesized before the (v16u8) cast so an
                 expression argument is converted as a whole.
*/
#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
  { \
    out0 = (RTYPE)__msa_dotp_u_h((v16u8)(mult0), (v16u8)(cnst0)); \
    out1 = (RTYPE)__msa_dotp_u_h((v16u8)(mult1), (v16u8)(cnst1)); \
  }
#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)
681 
/* Four-pair variant of DOTP_UB2: (mult0, cnst0) -> out0 ... (mult3, cnst3)
   -> out3, issued as two DOTP_UB2 invocations. */
#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
                 cnst3, out0, out1, out2, out3) \
  { \
    DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
    DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
  }
#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
689 
/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed byte elements from 'mult0' are multiplied with
                 signed byte elements from 'cnst0' producing a result
                 twice the size of input i.e. signed halfword.
                 The multiplication result of adjacent odd-even elements
                 are added together and written to the 'out0' vector.
                 'out1' is computed the same way from 'mult1' and 'cnst1'.
                 Inputs are parenthesized before the (v16i8) cast so an
                 expression argument is converted as a whole.
*/
#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
  { \
    out0 = (RTYPE)__msa_dotp_s_h((v16i8)(mult0), (v16i8)(cnst0)); \
    out1 = (RTYPE)__msa_dotp_s_h((v16i8)(mult1), (v16i8)(cnst1)); \
  }
#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
705 
/* Four-pair variant of DOTP_SB2: (mult0, cnst0) -> out0 ... (mult3, cnst3)
   -> out3, issued as two DOTP_SB2 invocations. */
#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
                 cnst3, out0, out1, out2, out3) \
  { \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
    DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
  }
#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
712 
/* Description : Dot product of halfword vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'mult0' are multiplied with
                 signed halfword elements from 'cnst0' producing a result
                 twice the size of input i.e. signed word.
                 The multiplication result of adjacent odd-even elements
                 are added together and written to the 'out0' vector.
                 'out1' is computed the same way from 'mult1' and 'cnst1'.
                 Inputs are parenthesized before the (v8i16) cast so an
                 expression argument is converted as a whole.
*/
#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
  { \
    out0 = (RTYPE)__msa_dotp_s_w((v8i16)(mult0), (v8i16)(cnst0)); \
    out1 = (RTYPE)__msa_dotp_s_w((v8i16)(mult1), (v8i16)(cnst1)); \
  }
#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)
728 
/* Four-pair variant of DOTP_SH2: (mult0, cnst0) -> out0 ... (mult3, cnst3)
   -> out3, issued as two DOTP_SH2 invocations. */
#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
                 cnst3, out0, out1, out2, out3) \
  { \
    DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
    DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
  }
#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
736 
/* Description : Dot product of word vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed word elements from 'mult0' are multiplied with
                 signed word elements from 'cnst0' producing a result
                 twice the size of input i.e. signed double word.
                 The multiplication result of adjacent odd-even elements
                 are added together and written to the 'out0' vector.
                 'out1' is computed the same way from 'mult1' and 'cnst1'.
                 Inputs are parenthesized before the (v4i32) cast so an
                 expression argument is converted as a whole.
*/
#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
  { \
    out0 = (RTYPE)__msa_dotp_s_d((v4i32)(mult0), (v4i32)(cnst0)); \
    out1 = (RTYPE)__msa_dotp_s_d((v4i32)(mult1), (v4i32)(cnst1)); \
  }
#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__)
752 
/* Description : Dot product & addition of byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1 (also read as the accumulators)
                 Return Type - as per RTYPE
   Details     : Signed byte elements from 'mult0' are multiplied with
                 signed byte elements from 'cnst0' producing a result
                 twice the size of input i.e. signed halfword.
                 The multiplication result of adjacent odd-even elements
                 are added to the 'out0' vector.
                 'out1' is accumulated the same way from 'mult1' and 'cnst1'.
                 Inputs are parenthesized before being cast so an expression
                 argument is converted as a whole.
*/
#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
  { \
    out0 = (RTYPE)__msa_dpadd_s_h((v8i16)(out0), (v16i8)(mult0), \
                                  (v16i8)(cnst0)); \
    out1 = (RTYPE)__msa_dpadd_s_h((v8i16)(out1), (v16i8)(mult1), \
                                  (v16i8)(cnst1)); \
  }
#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)
768 
/* Four-pair variant of DPADD_SB2: each (multN, cnstN) product is accumulated
   into outN, issued as two DPADD_SB2 invocations. */
#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
                  cnst3, out0, out1, out2, out3) \
  { \
    DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
    DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
  }
#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
775 
/* Description : Dot product & addition of halfword vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1 (also read as the accumulators)
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'mult0' are multiplied with
                 signed halfword elements from 'cnst0' producing a result
                 twice the size of input i.e. signed word.
                 The multiplication result of adjacent odd-even elements
                 are added to the 'out0' vector.
                 'out1' is accumulated the same way from 'mult1' and 'cnst1'.
                 Inputs are parenthesized before being cast so an expression
                 argument is converted as a whole.
*/
#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
  { \
    out0 = (RTYPE)__msa_dpadd_s_w((v4i32)(out0), (v8i16)(mult0), \
                                  (v8i16)(cnst0)); \
    out1 = (RTYPE)__msa_dpadd_s_w((v4i32)(out1), (v8i16)(mult1), \
                                  (v8i16)(cnst1)); \
  }
#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
791 
/* Description : Dot product & addition of word vector elements, accumulated
                 into double word elements
   Arguments   : Inputs  - mult0, mult1
                 Outputs - out0, out1 (also read as the accumulators)
                 Return Type - as per RTYPE
   Details     : Each signed word element from 'mult0' is multiplied with itself
                 producing an intermediate result twice the size of input
                 i.e. signed double word
                 The multiplication result of adjacent odd-even elements
                 are added to the 'out0' vector.
                 'out1' is accumulated the same way from 'mult1'.
                 Inputs are parenthesized before being cast so an expression
                 argument is converted as a whole.
*/
#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) \
  { \
    out0 = (RTYPE)__msa_dpadd_s_d((v2i64)(out0), (v4i32)(mult0), \
                                  (v4i32)(mult0)); \
    out1 = (RTYPE)__msa_dpadd_s_d((v2i64)(out1), (v4i32)(mult1), \
                                  (v4i32)(mult1)); \
  }
#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__)
807 
/* Description : Element-wise unsigned halfword minimum against a common
                 bound vector
   Arguments   : Inputs  - in0, in1, min_vec
                 Outputs - in place operation ('in0' and 'in1' are rewritten)
                 Return Type - as per RTYPE
   Details     : Minimum of unsigned halfword element values from 'in0' and
                 'min_vec' are written back to 'in0'; 'in1' is treated the
                 same way. 'min_vec' is passed to the intrinsic without a
                 cast.
*/
#define MIN_UH2(RTYPE, in0, in1, min_vec) \
  { \
    in0 = (RTYPE)__msa_min_u_h((v8u16)(in0), min_vec); \
    in1 = (RTYPE)__msa_min_u_h((v8u16)(in1), min_vec); \
  }
#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)
821 
/* Four-vector variant of MIN_UH2: in0..in3 are each limited in place by the
   same 'min_vec' (two MIN_UH2 invocations). */
#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) \
  { \
    MIN_UH2(RTYPE, in0, in1, min_vec); \
    MIN_UH2(RTYPE, in2, in3, min_vec); \
  }
#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)
827 
/* Description : Clips all signed halfword elements of input vector
                 between 0 & 255
   Arguments   : Input  - in
                 Output - out_m
                 Return Type - signed halfword
   Details     : The lower bound is applied first with an immediate signed
                 max against 0, then the upper bound with a signed min
                 against a vector of 255s. 'in' is parenthesized before the
                 (v8i16) cast so an expression argument is converted as a
                 whole.
*/
#define CLIP_SH_0_255(in) \
  ({ \
    v8i16 max_m = __msa_ldi_h(255); \
    v8i16 out_m = __msa_maxi_s_h((v8i16)(in), 0); \
    \
    out_m = __msa_min_s_h(max_m, out_m); \
    out_m; \
  })
/* In-place two-vector form of CLIP_SH_0_255: 'in0' and 'in1' are each
   replaced by their clipped value. */
#define CLIP_SH2_0_255(in0, in1) \
  { \
    in0 = CLIP_SH_0_255(in0); \
    in1 = CLIP_SH_0_255(in1); \
  }
/* In-place four-vector form of CLIP_SH_0_255, built from two CLIP_SH2_0_255
   invocations. */
#define CLIP_SH4_0_255(in0, in1, in2, in3) \
  { \
    CLIP_SH2_0_255(in0, in1); \
    CLIP_SH2_0_255(in2, in3); \
  }
850 
/* Description : Horizontal addition of 4 signed word elements of input vector
   Arguments   : Input  - in       (signed word vector)
                 Output - sum_m    (i32 sum)
                 Return Type - signed word (GP)
   Details     : 4 signed word elements of 'in' vector are added together and
                 the resulting integer sum is returned.
                 Word pairs are first added into two double words, the two
                 double words are then added, and word element 0 of that
                 result is copied out as the int32_t return value.
                 'in' is parenthesized before the (v4i32) cast so an
                 expression argument is converted as a whole.
*/
#define HADD_SW_S32(in) \
  ({ \
    v2i64 res0_m, res1_m; \
    int32_t sum_m; \
    \
    res0_m = __msa_hadd_s_d((v4i32)(in), (v4i32)(in)); \
    res1_m = __msa_splati_d(res0_m, 1); \
    res0_m += res1_m; \
    sum_m = __msa_copy_s_w((v4i32)res0_m, 0); \
    sum_m; \
  })
868 
/* Description : Horizontal addition of 8 unsigned halfword elements
   Arguments   : Inputs  - in       (unsigned halfword vector)
                 Outputs - sum_m    (u32 sum)
                 Return Type - unsigned word
   Details     : 8 unsigned halfword elements of input vector are added
                 together and the resulting integer sum is returned.
                 The reduction goes halfword pairs -> words -> double words,
                 the two double words are added, and word element 0 of that
                 result is copied out as the uint32_t return value.
                 'in' is parenthesized before the (v8u16) cast so an
                 expression argument is converted as a whole.
*/
#define HADD_UH_U32(in) \
  ({ \
    v4u32 res_m; \
    v2u64 res0_m, res1_m; \
    uint32_t sum_m; \
    \
    res_m = __msa_hadd_u_w((v8u16)(in), (v8u16)(in)); \
    res0_m = __msa_hadd_u_d(res_m, res_m); \
    res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \
    res0_m += res1_m; \
    sum_m = __msa_copy_u_w((v4i32)res0_m, 0); \
    sum_m; \
  })
888 
/* Description : Horizontal addition of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned odd byte element from 'in0' is added to
                 even unsigned byte element from 'in0' (pairwise) and the
                 halfword result is written to 'out0'.
                 'out1' is computed the same way from 'in1'.
                 Inputs are parenthesized before the (v16u8) cast so an
                 expression argument is converted as a whole.
*/
#define HADD_UB2(RTYPE, in0, in1, out0, out1) \
  { \
    out0 = (RTYPE)__msa_hadd_u_h((v16u8)(in0), (v16u8)(in0)); \
    out1 = (RTYPE)__msa_hadd_u_h((v16u8)(in1), (v16u8)(in1)); \
  }
#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)
902 
/* Four-vector variant of HADD_UB2: inN -> outN for N = 0..3, issued as two
   HADD_UB2 invocations. */
#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \
  { \
    HADD_UB2(RTYPE, in0, in1, out0, out1); \
    HADD_UB2(RTYPE, in2, in3, out2, out3); \
  }
#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
908 
/* Description : Horizontal subtraction of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : For each adjacent byte pair of 'in0', the even unsigned byte
                 element is subtracted from the odd unsigned byte element
                 (odd - even, following the MSA HSUB_U definition) and the
                 halfword result is written to 'out0'.
                 'out1' is computed the same way from 'in1'.
                 Inputs are parenthesized before the (v16u8) cast so an
                 expression argument is converted as a whole.
*/
#define HSUB_UB2(RTYPE, in0, in1, out0, out1) \
  { \
    out0 = (RTYPE)__msa_hsub_u_h((v16u8)(in0), (v16u8)(in0)); \
    out1 = (RTYPE)__msa_hsub_u_h((v16u8)(in1), (v16u8)(in1)); \
  }
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
922 
/* Description : SAD (Sum of Absolute Difference)
   Arguments   : Inputs  - in0, in1, ref0, ref1
                 Outputs - sad_m                 (halfword vector)
                 Return Type - unsigned halfword
   Details     : Absolute difference of all the byte elements from 'in0' with
                 'ref0' is calculated and preserved in 'diff0_m' (likewise
                 'in1'/'ref1' in 'diff1_m'). Then even-odd pairs of each
                 difference vector are added together to generate 8 halfword
                 results, and the two halfword vectors are summed.
                 Inputs are parenthesized before the (v16u8) cast so an
                 expression argument is converted as a whole.
*/
#define SAD_UB2_UH(in0, in1, ref0, ref1) \
  ({ \
    v16u8 diff0_m, diff1_m; \
    v8u16 sad_m; \
    \
    diff0_m = __msa_asub_u_b((v16u8)(in0), (v16u8)(ref0)); \
    diff1_m = __msa_asub_u_b((v16u8)(in1), (v16u8)(ref1)); \
    \
    sad_m = __msa_hadd_u_h(diff0_m, diff0_m); \
    sad_m += __msa_hadd_u_h(diff1_m, diff1_m); \
    \
    sad_m; \
  })
943 
/* Description : Horizontal subtraction of signed halfword vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : For each adjacent halfword pair of 'in0', the even signed
                 halfword element is subtracted from the odd signed halfword
                 element (odd - even, following the MSA HSUB_S definition)
                 and the word result is written to 'out0'.
                 'out1' is computed the same way from 'in1'.
                 Note that, despite the UH in the macro name, the inputs are
                 cast to v8i16 and the signed intrinsic is used.
                 Inputs are parenthesized before the cast so an expression
                 argument is converted as a whole.
*/
#define HSUB_UH2(RTYPE, in0, in1, out0, out1) \
  { \
    out0 = (RTYPE)__msa_hsub_s_w((v8i16)(in0), (v8i16)(in0)); \
    out1 = (RTYPE)__msa_hsub_s_w((v8i16)(in1), (v8i16)(in1)); \
  }
#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__)
957 
/* Description : Set word elements of a vector to GPR values
   Arguments   : Inputs - in0, in1
                 Output - out (read and rewritten in place)
                 Return Type - as per RTYPE
   Details     : Word element 0 of 'out' is set to 'in0', then word element 1
                 is set to 'in1'. The current value of 'out' is what gets
                 passed to each insert, so it must already hold a value.
*/
#define INSERT_W2(RTYPE, in0, in1, out) \
  { \
    out = (RTYPE)__msa_insert_w((v4i32)(out), 0, in0); \
    out = (RTYPE)__msa_insert_w((v4i32)(out), 1, in1); \
  }
#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)
969 
/* Sets word elements 0..3 of 'out' to 'in0'..'in3' respectively, one insert
   at a time in ascending element order. */
#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) \
  { \
    out = (RTYPE)__msa_insert_w((v4i32)(out), 0, in0); \
    out = (RTYPE)__msa_insert_w((v4i32)(out), 1, in1); \
    out = (RTYPE)__msa_insert_w((v4i32)(out), 2, in2); \
    out = (RTYPE)__msa_insert_w((v4i32)(out), 3, in3); \
  }
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
978 
/* Sets double word element 0 of 'out' to 'in0' and element 1 to 'in1', in
   that order. */
#define INSERT_D2(RTYPE, in0, in1, out) \
  { \
    out = (RTYPE)__msa_insert_d((v2i64)(out), 0, in0); \
    out = (RTYPE)__msa_insert_d((v2i64)(out), 1, in1); \
  }
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
985 
/* Description : Interleave even byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; 'in2' and 'in3' likewise produce
                 'out1'.
                 Inputs are parenthesized before the (v16i8) cast so an
                 expression argument is converted as a whole.
*/
#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
  { \
    out0 = (RTYPE)__msa_ilvev_b((v16i8)(in1), (v16i8)(in0)); \
    out1 = (RTYPE)__msa_ilvev_b((v16i8)(in3), (v16i8)(in2)); \
  }
#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
999 
/* Description : Interleave even halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; 'in2' and 'in3' likewise produce
                 'out1'.
                 Inputs are parenthesized before the (v8i16) cast so an
                 expression argument is converted as a whole.
*/
#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
  { \
    out0 = (RTYPE)__msa_ilvev_h((v8i16)(in1), (v8i16)(in0)); \
    out1 = (RTYPE)__msa_ilvev_h((v8i16)(in3), (v8i16)(in2)); \
  }
#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
1014 
/* Description : Interleave even word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even word elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; 'in2' and 'in3' likewise produce
                 'out1'.
                 Inputs are parenthesized before the (v4i32) cast so an
                 expression argument is converted as a whole.
*/
#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
  { \
    out0 = (RTYPE)__msa_ilvev_w((v4i32)(in1), (v4i32)(in0)); \
    out1 = (RTYPE)__msa_ilvev_w((v4i32)(in3), (v4i32)(in2)); \
  }
#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
1027 
/* Description : Interleave even double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double word elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; 'in2' and 'in3' likewise produce
                 'out1'.
                 Inputs are parenthesized before the (v2i64) cast so an
                 expression argument is converted as a whole.
*/
#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
  { \
    out0 = (RTYPE)__msa_ilvev_d((v2i64)(in1), (v2i64)(in0)); \
    out1 = (RTYPE)__msa_ilvev_d((v2i64)(in3), (v2i64)(in2)); \
  }
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
1040 
/* Description : Interleave left half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of byte elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; 'in2' and 'in3' likewise produce
                 'out1'.
                 Inputs are parenthesized before the (v16i8) cast so an
                 expression argument is converted as a whole.
*/
#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
  { \
    out0 = (RTYPE)__msa_ilvl_b((v16i8)(in0), (v16i8)(in1)); \
    out1 = (RTYPE)__msa_ilvl_b((v16i8)(in2), (v16i8)(in3)); \
  }
#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
1056 
/* Four-output variant of ILVL_B2: input pairs (in0, in1) .. (in6, in7) yield
   out0 .. out3, issued as two ILVL_B2 invocations. */
#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3) \
  { \
    ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
    ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
  }
#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
1064 
/* Description : Interleave left half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of halfword elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'; 'in2' and 'in3' likewise
                 produce 'out1'.
                 Inputs are parenthesized before the (v8i16) cast so an
                 expression argument is converted as a whole.
*/
#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
  { \
    out0 = (RTYPE)__msa_ilvl_h((v8i16)(in0), (v8i16)(in1)); \
    out1 = (RTYPE)__msa_ilvl_h((v8i16)(in2), (v8i16)(in3)); \
  }
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
1077 
/* Description : Interleave left half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of word elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; 'in2' and 'in3' likewise produce
                 'out1'.
                 Inputs are parenthesized before the (v4i32) cast so an
                 expression argument is converted as a whole.
*/
#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
  { \
    out0 = (RTYPE)__msa_ilvl_w((v4i32)(in0), (v4i32)(in1)); \
    out1 = (RTYPE)__msa_ilvl_w((v4i32)(in2), (v4i32)(in3)); \
  }
#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
1091 
/* Description : Interleave right half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; 'in2' and 'in3' likewise produce
                 'out1'.
                 Inputs are parenthesized before the (v16i8) cast so an
                 expression argument is converted as a whole.
*/
#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
  { \
    out0 = (RTYPE)__msa_ilvr_b((v16i8)(in0), (v16i8)(in1)); \
    out1 = (RTYPE)__msa_ilvr_b((v16i8)(in2), (v16i8)(in3)); \
  }
#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
1107 
/* Four-output variant of ILVR_B2: input pairs (in0, in1) .. (in6, in7) yield
   out0 .. out3, issued as two ILVR_B2 invocations. */
#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3) \
  { \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
    ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
  }
#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
1117 
/* Eight-output variant of ILVR_B2: input pairs (in0, in1) .. (in14, in15)
   yield out0 .. out7, written here as four ILVR_B2 invocations in the same
   order the nested ILVR_B4 form evaluates them. */
#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \
                in10, in11, in12, in13, in14, in15, out0, out1, out2, out3, \
                out4, out5, out6, out7) \
  { \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
    ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
    ILVR_B2(RTYPE, in8, in9, in10, in11, out4, out5); \
    ILVR_B2(RTYPE, in12, in13, in14, in15, out6, out7); \
  }
#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
1127 
/* Description : Interleave right half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of halfword elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'; 'in2' and 'in3' likewise
                 produce 'out1'.
                 Inputs are parenthesized before the (v8i16) cast so an
                 expression argument is converted as a whole.
*/
#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
  { \
    out0 = (RTYPE)__msa_ilvr_h((v8i16)(in0), (v8i16)(in1)); \
    out1 = (RTYPE)__msa_ilvr_h((v8i16)(in2), (v8i16)(in3)); \
  }
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
1140 
/* Four-output variant of ILVR_H2: input pairs (in0, in1) .. (in6, in7) yield
   out0 .. out3, issued as two ILVR_H2 invocations. */
#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3) \
  { \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
    ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
  }
#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
1147 
/* Word-element counterpart of ILVR_H2: 'out0' is __msa_ilvr_w of 'in0' and
   'in1', 'out1' is __msa_ilvr_w of 'in2' and 'in3'. Inputs are parenthesized
   before the (v4i32) cast so an expression argument is converted as a
   whole. */
#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
  { \
    out0 = (RTYPE)__msa_ilvr_w((v4i32)(in0), (v4i32)(in1)); \
    out1 = (RTYPE)__msa_ilvr_w((v4i32)(in2), (v4i32)(in3)); \
  }
#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)
1154 
/* Four-output variant of ILVR_W2: input pairs (in0, in1) .. (in6, in7) yield
   out0 .. out3, issued as two ILVR_W2 invocations. */
#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3) \
  { \
    ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1); \
    ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3); \
  }
#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)
1161 
/* Description : Interleave right half of double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of double word elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'; 'in2' and 'in3' likewise
                 produce 'out1'.
*/
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
  { \
    out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \
    out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \
  }
#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
1176 
/* Three-output variant of ILVR_D2: the first two pairs go through ILVR_D2 and
   the third pair (in4, in5) is interleaved directly into 'out2'. */
#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
  { \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
    out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5)); \
  }
#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)
1182 
/* Four-output variant of ILVR_D2: input pairs (in0, in1) .. (in6, in7) yield
   out0 .. out3, issued as two ILVR_D2 invocations. */
#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3) \
  { \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
    ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
  }
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
1190 
/* Description : Interleave both left and right half of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements from 'in0' and 'in1' are
                 interleaved and written to 'out0'; the left half of byte
                 elements from the same two vectors are interleaved and
                 written to 'out1'.
                 Inputs are parenthesized before the (v16i8) cast so an
                 expression argument is converted as a whole.
*/
#define ILVRL_B2(RTYPE, in0, in1, out0, out1) \
  { \
    out0 = (RTYPE)__msa_ilvr_b((v16i8)(in0), (v16i8)(in1)); \
    out1 = (RTYPE)__msa_ilvl_b((v16i8)(in0), (v16i8)(in1)); \
  }
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
1206 
/* Halfword counterpart of ILVRL_B2: 'out0' is __msa_ilvr_h and 'out1' is
   __msa_ilvl_h of the same 'in0'/'in1' pair. Inputs are parenthesized before
   the (v8i16) cast so an expression argument is converted as a whole. */
#define ILVRL_H2(RTYPE, in0, in1, out0, out1) \
  { \
    out0 = (RTYPE)__msa_ilvr_h((v8i16)(in0), (v8i16)(in1)); \
    out1 = (RTYPE)__msa_ilvl_h((v8i16)(in0), (v8i16)(in1)); \
  }
#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
1213 
/* Word counterpart of ILVRL_B2: 'out0' is __msa_ilvr_w and 'out1' is
   __msa_ilvl_w of the same 'in0'/'in1' pair. Inputs are parenthesized before
   the (v4i32) cast so an expression argument is converted as a whole. */
#define ILVRL_W2(RTYPE, in0, in1, out0, out1) \
  { \
    out0 = (RTYPE)__msa_ilvr_w((v4i32)(in0), (v4i32)(in1)); \
    out1 = (RTYPE)__msa_ilvl_w((v4i32)(in0), (v4i32)(in1)); \
  }
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
1220 
/* Description : Saturate the halfword element values to the max
                 unsigned value of (sat_val + 1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in place operation ('in0' and 'in1' are rewritten)
                 Return Type - as per RTYPE
   Details     : Each unsigned halfword element from 'in0' is saturated to the
                 value generated with (sat_val + 1) bit range, and 'in1' is
                 treated the same way.
                 The results are written in place
*/
#define SAT_UH2(RTYPE, in0, in1, sat_val) \
  { \
    in0 = (RTYPE)__msa_sat_u_h((v8u16)(in0), sat_val); \
    in1 = (RTYPE)__msa_sat_u_h((v8u16)(in1), sat_val); \
  }
#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
1236 
/* Four-vector variant of SAT_UH2: in0..in3 are each saturated in place with
   the same 'sat_val' (two SAT_UH2 invocations, both terminated with ';' like
   the other paired macros in this file). */
#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \
  { \
    SAT_UH2(RTYPE, in0, in1, sat_val); \
    SAT_UH2(RTYPE, in2, in3, sat_val); \
  }
#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
1242 
/* Description : Saturate the halfword element values to the signed range
                 representable in (sat_val + 1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in place operation ('in0' and 'in1' are rewritten)
                 Return Type - as per RTYPE
   Details     : Each signed halfword element from 'in0' is saturated to the
                 value generated with (sat_val + 1) bit range, using the
                 signed saturate intrinsic; 'in1' is treated the same way.
                 The results are written in place
*/
#define SAT_SH2(RTYPE, in0, in1, sat_val) \
  { \
    in0 = (RTYPE)__msa_sat_s_h((v8i16)(in0), sat_val); \
    in1 = (RTYPE)__msa_sat_s_h((v8i16)(in1), sat_val); \
  }
#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)
1258 
/* Four-vector variant of SAT_SH2: in0..in3 are each saturated in place with
   the same 'sat_val' (two SAT_SH2 invocations). */
#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \
  { \
    SAT_SH2(RTYPE, in0, in1, sat_val); \
    SAT_SH2(RTYPE, in2, in3, sat_val); \
  }
#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
1264 
/* Description : Indexed halfword element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, idx0, idx1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'idx0' element value from 'in' vector is replicated to all
                 elements in 'out0' vector; 'idx1' element value is
                 replicated to all elements in 'out1' vector.
                 Valid index range for halfword operation is 0-7
                 'in' is parenthesized before the (v8i16) cast so an
                 expression argument is converted as a whole.
*/
#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \
  { \
    out0 = (RTYPE)__msa_splati_h((v8i16)(in), idx0); \
    out1 = (RTYPE)__msa_splati_h((v8i16)(in), idx1); \
  }
#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)
1279 
/* Four-output variant of SPLATI_H2: elements idx0..idx3 of the single input
   'in' are replicated into out0..out3 (two SPLATI_H2 invocations). */
#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, out0, out1, out2, out3) \
  { \
    SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \
    SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \
  }
#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
1287 
/* Description : Pack even byte elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' are copied to the left half of
                 'out0' & even byte elements of 'in1' are copied to the right
                 half of 'out0'. 'out1' is packed the same way from 'in2'
                 and 'in3'.
                 Inputs are parenthesized before the (v16i8) cast so an
                 expression argument is converted as a whole.
*/
#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
  { \
    out0 = (RTYPE)__msa_pckev_b((v16i8)(in0), (v16i8)(in1)); \
    out1 = (RTYPE)__msa_pckev_b((v16i8)(in2), (v16i8)(in3)); \
  }
#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
1303 
1304 #define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
1305                  out0, out1, out2, out3) {                       \
1306   PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);               \
1307   PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);               \
1308 }
1309 #define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
1310 #define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
1311 #define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
1312 
/* Description : Pack the even halfword elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'out0' is built from the pair (in0, in1): even halfword
                 elements of 'in0' go to its left half and even halfword
                 elements of 'in1' go to its right half. 'out1' is built the
                 same way from the pair (in2, in3).
*/
#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
  { \
    out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \
    out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \
  }
#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)

/* Four-pair form: out<k> packs the pair (in<2k>, in<2k+1>) for k = 0..3 */
#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3) \
  { \
    out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \
    out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \
    out2 = (RTYPE)__msa_pckev_h((v8i16)in4, (v8i16)in5); \
    out3 = (RTYPE)__msa_pckev_h((v8i16)in6, (v8i16)in7); \
  }
#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
1334 
/* Description : Pack the even double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'out0' is built from the pair (in0, in1): the even double
                 word element of 'in0' goes to its left half and the even
                 double word element of 'in1' goes to its right half. 'out1'
                 is built the same way from the pair (in2, in3).
*/
#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
  { \
    out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1); \
    out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \
  }
#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)

/* Four-pair form: out<k> packs the pair (in<2k>, in<2k+1>) for k = 0..3 */
#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3) \
  { \
    out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1); \
    out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \
    out2 = (RTYPE)__msa_pckev_d((v2i64)in4, (v2i64)in5); \
    out3 = (RTYPE)__msa_pckev_d((v2i64)in6, (v2i64)in7); \
  }
#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
1356 
/* Description : XOR every byte element with the immediate value 128
   Arguments   : Inputs  - in0, in1
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each input vector is viewed as unsigned bytes, every byte is
                 xor'ed with 128 and the result is written back to the same
                 vector. The 3, 4 and 7 vector forms below do the same for
                 more operands, in argument order.
*/
#define XORI_B2_128(RTYPE, in0, in1) \
  { \
    in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \
    in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \
  }
#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)

#define XORI_B3_128(RTYPE, in0, in1, in2) \
  { \
    in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \
    in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \
    in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \
  }
#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)

#define XORI_B4_128(RTYPE, in0, in1, in2, in3) \
  { \
    in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \
    in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \
    in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \
    in3 = (RTYPE)__msa_xori_b((v16u8)in3, 128); \
  }
#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)

#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) \
  { \
    in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \
    in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \
    in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \
    in3 = (RTYPE)__msa_xori_b((v16u8)in3, 128); \
    in4 = (RTYPE)__msa_xori_b((v16u8)in4, 128); \
    in5 = (RTYPE)__msa_xori_b((v16u8)in5, 128); \
    in6 = (RTYPE)__msa_xori_b((v16u8)in6, 128); \
  }
#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)
1389 
/* Description : Average of signed halfword elements -> (a + b) / 2
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : For each pair (in0, in1), (in2, in3), (in4, in5), (in6, in7)
                 the signed halfword elements are added with full precision
                 (one extra bit), the sum is divided by 2 and the result is
                 written to out0, out1, out2 and out3 respectively.
*/
#define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                out0, out1, out2, out3) \
  { \
    out0 = (RTYPE)__msa_ave_s_h((v8i16)in0, (v8i16)in1); \
    out1 = (RTYPE)__msa_ave_s_h((v8i16)in2, (v8i16)in3); \
    out2 = (RTYPE)__msa_ave_s_h((v8i16)in4, (v8i16)in5); \
    out3 = (RTYPE)__msa_ave_s_h((v8i16)in6, (v8i16)in7); \
  }
#define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__)
1407 
/* Description : Saturating addition of signed halfword elements
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements of 'in0' and 'in1' are added and the
                 sum is signed saturated to the halfword range before being
                 written to 'out0'. 'out1' is produced the same way from
                 'in2' and 'in3'.
*/
#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) \
  { \
    out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1); \
    out1 = (RTYPE)__msa_adds_s_h((v8i16)in2, (v8i16)in3); \
  }
#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)

/* Four-pair form: out<k> = adds(in<2k>, in<2k+1>) for k = 0..3 */
#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3) \
  { \
    out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1); \
    out1 = (RTYPE)__msa_adds_s_h((v8i16)in2, (v8i16)in3); \
    out2 = (RTYPE)__msa_adds_s_h((v8i16)in4, (v8i16)in5); \
    out3 = (RTYPE)__msa_adds_s_h((v8i16)in6, (v8i16)in7); \
  }
#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
1428 
/* Description : Shift left all elements of vector (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in place operation
                 Return Type - as per input vector RTYPE
   Details     : Each of 'in0'..'in3' is left shifted by 'shift' and the
                 result is written back in-place, so the four vector
                 arguments must be assignable.
                 The operands are parenthesised so that an expression such as
                 'a & b' may be passed for 'shift'. 'shift' is expanded four
                 times; do not pass an expression with side effects.
*/
#define SLLI_4V(in0, in1, in2, in3, shift) \
  { \
    in0 = (in0) << (shift); \
    in1 = (in1) << (shift); \
    in2 = (in2) << (shift); \
    in3 = (in3) << (shift); \
  }
1442 
/* Description : Arithmetic shift right all elements of vector
                 (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in place operation
                 Return Type - as per input vector RTYPE
   Details     : Each of 'in0'..'in3' is right shifted by 'shift' and the
                 result is written back in-place, so the four vector
                 arguments must be assignable. 'shift' is a GP variable.
                 The operands are parenthesised so that an expression such as
                 'a & b' may be passed for 'shift'. 'shift' is expanded four
                 times; do not pass an expression with side effects.
*/
#define SRA_4V(in0, in1, in2, in3, shift) \
  { \
    in0 = (in0) >> (shift); \
    in1 = (in1) >> (shift); \
    in2 = (in2) >> (shift); \
    in3 = (in3) >> (shift); \
  }
1457 
/* Description : Shift right arithmetic rounded words
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Every word element of 'in0' is arithmetically shifted right
                 by the bit count held in the matching element of the vector
                 'shift'. The last bit shifted out is added back to the result
                 for rounding, and the rounded value replaces 'in0'. 'in1' is
                 processed the same way.
                 'shift' is a vector.
*/
#define SRAR_W2(RTYPE, in0, in1, shift) \
  { \
    in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift); \
    in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift); \
  }

/* Four-vector form: in0..in3 are each shifted by the same 'shift' vector */
#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \
  { \
    in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift); \
    in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift); \
    in2 = (RTYPE)__msa_srar_w((v4i32)in2, (v4i32)shift); \
    in3 = (RTYPE)__msa_srar_w((v4i32)in3, (v4i32)shift); \
  }
#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
1478 
/* Description : Shift right arithmetic rounded (immediate)
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Every element of each input vector is arithmetically shifted
                 right by 'shift'. The last bit shifted out is added back to
                 the result for rounding, and the rounded value replaces the
                 input vector. The _H forms work on halfword elements, the _W
                 forms on word elements.
                 'shift' is an immediate value.
*/
#define SRARI_H2(RTYPE, in0, in1, shift) \
  { \
    in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \
    in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift); \
  }
#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)

#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \
  { \
    in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \
    in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift); \
    in2 = (RTYPE)__msa_srari_h((v8i16)in2, shift); \
    in3 = (RTYPE)__msa_srari_h((v8i16)in3, shift); \
  }
#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)

#define SRARI_W2(RTYPE, in0, in1, shift) \
  { \
    in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \
    in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \
  }
#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)

#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \
  { \
    in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \
    in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \
    in2 = (RTYPE)__msa_srari_w((v4i32)in2, shift); \
    in3 = (RTYPE)__msa_srari_w((v4i32)in3, shift); \
  }
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
1513 
/* Description : Logical shift right all halfword elements of vector
                 (immediate)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Every halfword element of 'in0' is right shifted by 'shift'
                 and the result is written to 'out0'; likewise in1 -> out1,
                 in2 -> out2 and in3 -> out3.
                 'shift' is an immediate value.
*/
#define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) \
  { \
    out0 = (RTYPE)__msa_srli_h((v8i16)in0, shift); \
    out1 = (RTYPE)__msa_srli_h((v8i16)in1, shift); \
    out2 = (RTYPE)__msa_srli_h((v8i16)in2, shift); \
    out3 = (RTYPE)__msa_srli_h((v8i16)in3, shift); \
  }
#define SRLI_H4_SH(...) SRLI_H4(v8i16, __VA_ARGS__)
1528 
/* Description : Multiplication of pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : 'out0' is assigned in0 * in1 and 'out1' is assigned
                 in2 * in3, using the '*' operator of the operand type.
                 Every input is parenthesised, so expressions such as 'a + b'
                 may be passed as operands without changing the result.
*/
#define MUL2(in0, in1, in2, in3, out0, out1) \
  { \
    out0 = (in0) * (in1); \
    out1 = (in2) * (in3); \
  }
/* Four-pair form: out<k> = in<2k> * in<2k+1> for k = 0..3 */
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, \
             out0, out1, out2, out3) \
  { \
    MUL2(in0, in1, in2, in3, out0, out1); \
    MUL2(in4, in5, in6, in7, out2, out3); \
  }
1544 
/* Description : Addition of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : 'out0' is assigned in0 + in1 and 'out1' is assigned
                 in2 + in3, using the '+' operator of the operand type.
                 Every input is parenthesised, so lower-precedence
                 expressions such as 'a << b' may be passed as operands
                 without changing the result.
*/
#define ADD2(in0, in1, in2, in3, out0, out1) \
  { \
    out0 = (in0) + (in1); \
    out1 = (in2) + (in3); \
  }
/* Four-pair form: out<k> = in<2k> + in<2k+1> for k = 0..3 */
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, \
             out0, out1, out2, out3) \
  { \
    ADD2(in0, in1, in2, in3, out0, out1); \
    ADD2(in4, in5, in6, in7, out2, out3); \
  }
1560 
/* Description : Subtraction of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : 'out0' is assigned in0 - in1 and 'out1' is assigned
                 in2 - in3, using the '-' operator of the operand type.
                 Every input is parenthesised, so an expression such as
                 'a - b' may be passed as the subtrahend without changing
                 the result.
*/
#define SUB2(in0, in1, in2, in3, out0, out1) \
  { \
    out0 = (in0) - (in1); \
    out1 = (in2) - (in3); \
  }
/* Four-pair form: out<k> = in<2k> - in<2k+1> for k = 0..3 */
#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, \
             out0, out1, out2, out3) \
  { \
    SUB2(in0, in1, in2, in3, out0, out1); \
    SUB2(in4, in5, in6, in7, out2, out3); \
  }
1578 
/* Description : Sign extend halfword elements from right half of the vector
   Arguments   : Input  - in    (halfword vector)
                 Output - out   (sign extended word vector)
                 Return Type - signed word
   Details     : A mask vector is obtained by comparing each halfword element
                 of 'in' against 0 (less-than). That mask is then
                 right-interleaved with 'in' itself, producing 4 word
                 elements in 'out' with the sign kept intact.
*/
#define UNPCK_R_SH_SW(in, out) \
  { \
    v8i16 sign_m = __msa_clti_s_h((v8i16)in, 0); \
    \
    out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \
  }
1593 
/* Description : Zero extend unsigned byte elements to halfword elements
   Arguments   : Input   - in          (unsigned byte vector)
                 Outputs - out0, out1  (unsigned halfword vectors)
                 Return Type - signed halfword
   Details     : 'in' is interleaved with an all-zero byte vector through
                 ILVRL_B2_SH. The zero extended right half of 'in' is
                 returned in 'out0' and the zero extended left half is
                 returned in 'out1'.
*/
#define UNPCK_UB_SH(in, out0, out1) \
  { \
    v16i8 zero_m = { 0 }; \
    \
    ILVRL_B2_SH(zero_m, in, out0, out1); \
  }
1606 
/* Description : Sign extend halfword elements from input vector and return
                 the result in pair of vectors
   Arguments   : Input   - in            (halfword vector)
                 Outputs - out0, out1   (sign extended word vectors)
                 Return Type - signed word
   Details     : A mask vector is obtained by comparing each halfword element
                 of 'in' against 0 (less-than). ILVRL_H2_SW then interleaves
                 that mask with 'in': the right interleave yields the 4
                 signed word elements of 'out0' and the left interleave
                 yields the 4 signed word elements of 'out1'.
*/
#define UNPCK_SH_SW(in, out0, out1) \
  { \
    v8i16 tmp_m = __msa_clti_s_h((v8i16)in, 0); \
    \
    ILVRL_H2_SW(tmp_m, in, out0, out1); \
  }
1624 
/* Description : Butterfly of 4 input vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
   Details     : Butterfly operation:
                   out0 = in0 + in3,  out1 = in1 + in2,
                   out2 = in1 - in2,  out3 = in0 - in3.
                 The assignments are performed in the order out0, out1, out2,
                 out3, so an output may only name an input that is not read
                 by a later assignment.
                 Every input is parenthesised so that expressions may be
                 passed as operands; each input is expanded twice.
*/
#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \
  { \
    out0 = (in0) + (in3); \
    out1 = (in1) + (in2); \
    \
    out2 = (in1) - (in2); \
    out3 = (in0) - (in3); \
  }
1637 
/* Description : Butterfly of 8 input vectors
   Arguments   : Inputs  - in0 ...  in7
                 Outputs - out0 .. out7
   Details     : Butterfly operation:
                   out<k>     = in<k> + in<7 - k>   for k = 0..3
                   out<7 - k> = in<k> - in<7 - k>   for k = 0..3
                 The sums out0..out3 are assigned first, followed by the
                 differences out4..out7, so an output may only name an input
                 that is not read by a later assignment.
                 Every input is parenthesised so that expressions may be
                 passed as operands; each input is expanded twice.
*/
#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, \
                    out0, out1, out2, out3, out4, out5, out6, out7) \
  { \
    out0 = (in0) + (in7); \
    out1 = (in1) + (in6); \
    out2 = (in2) + (in5); \
    out3 = (in3) + (in4); \
    \
    out4 = (in3) - (in4); \
    out5 = (in2) - (in5); \
    out6 = (in1) - (in6); \
    out7 = (in0) - (in7); \
  }
1655 
/* Description : Butterfly of 16 input vectors
   Arguments   : Inputs  - in0 ...  in15
                 Outputs - out0 .. out15
   Details     : Butterfly operation:
                   out<k>      = in<k> + in<15 - k>   for k = 0..7
                   out<15 - k> = in<k> - in<15 - k>   for k = 0..7
                 The sums out0..out7 are assigned first, followed by the
                 differences out8..out15, so an output may only name an input
                 that is not read by a later assignment.
                 Every input is parenthesised so that expressions may be
                 passed as operands; each input is expanded twice.
*/
#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, \
                     in8, in9, in10, in11, in12, in13, in14, in15, \
                     out0, out1, out2, out3, out4, out5, out6, out7, \
                     out8, out9, out10, out11, out12, out13, out14, out15) \
  { \
    out0 = (in0) + (in15); \
    out1 = (in1) + (in14); \
    out2 = (in2) + (in13); \
    out3 = (in3) + (in12); \
    out4 = (in4) + (in11); \
    out5 = (in5) + (in10); \
    out6 = (in6) + (in9); \
    out7 = (in7) + (in8); \
    \
    out8 = (in7) - (in8); \
    out9 = (in6) - (in9); \
    out10 = (in5) - (in10); \
    out11 = (in4) - (in11); \
    out12 = (in3) - (in12); \
    out13 = (in2) - (in13); \
    out14 = (in1) - (in14); \
    out15 = (in0) - (in15); \
  }
1683 
/* Description : Transpose input 8x8 byte block
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - as per RTYPE
   Details     : All eight inputs are consumed by the first byte interleave
                 into local temporaries. Two further interleave stages (byte,
                 then word) produce the even outputs out0, out2, out4, out6;
                 the odd outputs out1, out3, out5, out7 are then derived from
                 the even ones through SLDI_B2_0 with a shift of 8.
*/
#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                        out0, out1, out2, out3, out4, out5, out6, out7) \
  { \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m, tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
    \
    /* stage 1: byte interleave of rows two apart */ \
    ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, \
               tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
    /* stage 2: byte interleave of the stage 1 results */ \
    ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \
    ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \
    /* stage 3: word interleave gives the even output vectors */ \
    ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \
    ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \
    /* odd output vectors come from the even ones */ \
    SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8); \
    SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8); \
  }
#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
1704 
/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - unsigned byte
   Details     : The inputs are read only by the four leading ILVEV_D2_UB
                 calls, which already write to out7..out0. The outputs are
                 then reused as scratch storage for the byte, halfword and
                 word interleave stages, so an output must not name an input
                 that is still to be read, and the statement order below
                 must be preserved.
*/
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
                            in8, in9, in10, in11, in12, in13, in14, in15, \
                            out0, out1, out2, out3, out4, out5, out6, out7) \
  { \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
    \
    /* pair row k with row k + 8 (double word interleave) */ \
    ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \
    ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \
    ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \
    ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \
    \
    /* byte interleave: even bytes -> tmp0/tmp1/out5/out7, odd -> tmp4..7 */ \
    tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7); \
    tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7); \
    tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5); \
    tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5); \
    out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3); \
    tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3); \
    out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1); \
    tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1); \
    \
    ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \
    out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
    out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
    \
    tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \
    tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5); \
    out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
    out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
    \
    ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \
    out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
    out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
    \
    /* each temporary is computed once; repeating the call has no effect */ \
    tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \
    tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \
    out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
    out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
  }
1751 
/* Description : Transpose 4x4 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
   Details     : The right halfwords of (in1, in0) and (in3, in2) are
                 interleaved, then the two results are word interleaved into
                 'out0' and 'out2'. 'out1' and 'out3' are formed afterwards
                 with __msa_ilvl_d from (out0, out0) and (out0, out2).
*/
#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
  { \
    v8i16 s0_m, s1_m; \
    \
    ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m); \
    ILVRL_W2_SH(s1_m, s0_m, out0, out2); \
    /* odd outputs are taken from the already written even outputs */ \
    out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \
    out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2); \
  }
1765 
/* Description : Transpose 4x8 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - signed halfword
   Details     : Only the right halfwords of the eight inputs are used. They
                 are interleaved at halfword, word and double word level to
                 form 'out0'..'out3'. 'out4'..'out7' are set to all zeros.
*/
#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, \
                           out0, out1, out2, out3, out4, out5, out6, out7) \
  { \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
    v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n; \
    v8i16 zero_m = { 0 }; \
    \
    /* halfword interleave of neighbouring rows */ \
    ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, \
               tmp0_n, tmp1_n, tmp2_n, tmp3_n); \
    /* word interleave */ \
    ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m); \
    ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m); \
    \
    /* double word interleave yields the four populated outputs */ \
    out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m); \
    out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m); \
    out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m); \
    out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m); \
    \
    /* remaining outputs carry no data */ \
    out4 = zero_m; \
    out5 = zero_m; \
    out6 = zero_m; \
    out7 = zero_m; \
  }
1792 
/* Description : Transpose 8x4 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
   Details     : The right and the left halfwords of (in1, in0) and
                 (in3, in2) are interleaved into temporaries, which are then
                 word interleaved: the right interleave gives 'out0' and
                 'out2', the left interleave gives 'out1' and 'out3'.
*/
#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
  { \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
    \
    /* halfword interleave, right halves then left halves */ \
    ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m); \
    ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m); \
    /* word interleave of the temporaries */ \
    ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \
    ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \
  }
1806 
/* Description : Transpose 8x8 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - as per RTYPE
   Details     : Rows 4..7 and rows 0..3 are each halfword interleaved twice
                 into tmp0_m..tmp3_m and tmp4_m..tmp7_m. Even outputs are the
                 even double words of (tmp<k>, tmp<k + 4>) and odd outputs
                 are the odd double words of the same pairs. All inputs are
                 read before the first output is written.
*/
#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                       out0, out1, out2, out3, out4, out5, out6, out7) \
  { \
    v8i16 s0_m, s1_m; \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m, tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
    \
    /* rows 4..7 */ \
    ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \
    ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m); \
    ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \
    ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m); \
    /* rows 0..3 */ \
    ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
    ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m); \
    ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
    ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \
    /* even outputs: even double words of each temporary pair */ \
    out0 = (RTYPE)__msa_pckev_d((v2i64)tmp0_m, (v2i64)tmp4_m); \
    out2 = (RTYPE)__msa_pckev_d((v2i64)tmp1_m, (v2i64)tmp5_m); \
    out4 = (RTYPE)__msa_pckev_d((v2i64)tmp2_m, (v2i64)tmp6_m); \
    out6 = (RTYPE)__msa_pckev_d((v2i64)tmp3_m, (v2i64)tmp7_m); \
    /* odd outputs: odd double words of the same pairs */ \
    out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \
    out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \
    out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \
    out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \
  }
#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
1834 
/* Description : Transpose 4x4 block with word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed word
   Details     : (in1, in0) and (in3, in2) are word interleaved into four
                 temporaries; the temporaries are then double word
                 interleaved (right for out0/out2, left for out1/out3).
                 All inputs are read before the first output is written.
*/
#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \
  { \
    v4i32 s0_m, s1_m, s2_m, s3_m; \
    \
    /* word interleave of row pairs */ \
    ILVRL_W2_SW(in1, in0, s0_m, s1_m); \
    ILVRL_W2_SW(in3, in2, s2_m, s3_m); \
    \
    /* double word interleave of the partial results */ \
    out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m); \
    out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m); \
    out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m); \
    out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m); \
  }
1851 
/* Description : Add block 4x4
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Least significant 4 bytes from each input vector are added to
                 the destination bytes, clipped between 0-255 and stored.
                 Four words are loaded from 'pdst' with LW4 using 'stride',
                 widened to halfwords by interleaving with zero, summed with
                 the inputs, clipped by CLIP_SH2_0_255, packed back to bytes
                 and written to 'pdst' with ST4x4_UB using the same 'stride'.
*/
#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) \
  { \
    uint32_t src0_m, src1_m, src2_m, src3_m; \
    v8i16 inp0_m, inp1_m, res0_m, res1_m; \
    v16i8 dst0_m = { 0 }; \
    v16i8 dst1_m = { 0 }; \
    v16i8 zero_m = { 0 }; \
    \
    /* gather the residual rows two per vector */ \
    ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m); \
    /* fetch the four destination rows */ \
    LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \
    INSERT_W2_SB(src0_m, src1_m, dst0_m); \
    INSERT_W2_SB(src2_m, src3_m, dst1_m); \
    /* widen destination bytes to halfwords and add the residual */ \
    ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \
    res0_m += inp0_m; \
    res1_m += inp1_m; \
    CLIP_SH2_0_255(res0_m, res1_m); \
    /* narrow back to bytes and store */ \
    PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \
    ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride); \
  }
1874 
/* Description : Pack even elements of input vectors & xor with 128
   Arguments   : Inputs - in0, in1
                 Output - out_m
                 Return Type - unsigned byte
   Details     : Signed byte even elements from 'in0' and 'in1' are packed
                 together in one vector and the resulting vector is xor'ed with
                 128 to shift the range from signed to unsigned byte.
                 Each argument is parenthesized before it is cast, so an
                 expression argument is converted as a whole.
*/
#define PCKEV_XORI128_UB(in0, in1) ({                        \
  v16u8 out_m;                                               \
                                                             \
  out_m = (v16u8)__msa_pckev_b((v16i8)(in1), (v16i8)(in0));  \
  out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128);            \
  out_m;                                                     \
})
1890 
/* Description : Convert inputs to unsigned bytes, average them with the
                 destination rows and store the result as an 8x4 unsigned
                 byte block
   Arguments   : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                          pdst, stride
   Details     : (in0, in1) and (in2, in3) are each packed and range-shifted
                 with PCKEV_XORI128_UB. dst0..dst3 are combined pairwise with
                 ILVR_D2_UB. The two sets are averaged with AVER_UB2_UB and
                 the result is written through ST8x4_UB.
                 'pdst' is evaluated once and held in a local pointer.
*/
#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3,                      \
                                dst0, dst1, dst2, dst3, pdst, stride) {  \
  uint8_t *dst_ptr_m = (uint8_t *)(pdst);                                \
  v16u8 pix01_m, pix23_m;                                                \
  v16u8 ref01_m, ref23_m;                                                \
                                                                         \
  pix01_m = PCKEV_XORI128_UB(in0, in1);                                  \
  pix23_m = PCKEV_XORI128_UB(in2, in3);                                  \
  ILVR_D2_UB(dst1, dst0, dst3, dst2, ref01_m, ref23_m);                  \
  AVER_UB2_UB(pix01_m, ref01_m, pix23_m, ref23_m, pix01_m, pix23_m);     \
  ST8x4_UB(pix01_m, pix23_m, dst_ptr_m, stride);                         \
}
1907 
/* Description : Pack even byte elements and store byte vector in destination
                 memory
   Arguments   : Inputs - in0, in1, pdst
   Details     : The even byte elements of 'in1' and 'in0' are packed into one
                 signed byte vector with __msa_pckev_b, which is then written
                 to 'pdst' with ST_SB.
                 Each argument is parenthesized before use, so an expression
                 argument is converted as a whole.
*/
#define PCKEV_ST_SB(in0, in1, pdst) {                 \
  v16i8 tmp_m;                                        \
                                                      \
  tmp_m = __msa_pckev_b((v16i8)(in1), (v16i8)(in0));  \
  ST_SB(tmp_m, (pdst));                               \
}
1918 
/* Description : Horizontal 2 tap filter kernel code
   Arguments   : Inputs - in0, in1, mask, coeff, shift
                 Return Type - unsigned halfword
   Details     : Bytes from 'in0' and 'in1' are shuffled according to 'mask'
                 (__msa_vshf_b). The unsigned dot product of the shuffled
                 bytes with 'coeff' gives halfword sums (__msa_dotp_u_h),
                 which are shifted right with rounding by 'shift'
                 (__msa_srari_h) and returned as the macro's value.
                 'shift' is passed straight to __msa_srari_h. The vector
                 arguments are parenthesized before being cast, so an
                 expression argument is converted as a whole.
*/
#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) ({          \
  v16i8 tmp0_m;                                                      \
  v8u16 tmp1_m;                                                      \
                                                                     \
  tmp0_m = __msa_vshf_b((v16i8)(mask), (v16i8)(in1), (v16i8)(in0));  \
  tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)(coeff));            \
  tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift);               \
                                                                     \
  tmp1_m;                                                            \
})
1932 #endif  /* VPX_DSP_MIPS_MACROS_MSA_H_ */
1933