1 /*
2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #ifndef VPX_DSP_MIPS_MACROS_MSA_H_
12 #define VPX_DSP_MIPS_MACROS_MSA_H_
13 
14 #include <msa.h>
15 
16 #include "./vpx_config.h"
17 #include "vpx/vpx_integer.h"
18 
/* Description : Load one value of type RTYPE from memory
   Arguments   : Inputs - RTYPE (type to read), psrc (source address)
                 Return - the RTYPE object found at 'psrc'
   Details     : 'psrc' is cast to 'const RTYPE *' and dereferenced.
                 LD_B, LD_H and LD_W expand to the same text; the typed
                 aliases below only fix RTYPE.
                 The whole expansion is parenthesized so that it can be
                 combined with other operators at the call site.
*/
#define LD_B(RTYPE, psrc) (*((const RTYPE *)(psrc)))
#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
#define LD_SB(...) LD_B(v16i8, __VA_ARGS__)

#define LD_H(RTYPE, psrc) (*((const RTYPE *)(psrc)))
#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)

#define LD_W(RTYPE, psrc) (*((const RTYPE *)(psrc)))
#define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
29 
/* Description : Store one value of type RTYPE to memory
   Arguments   : Inputs - RTYPE (type to write), in (value), pdst (destination)
   Details     : 'pdst' is cast to 'RTYPE *' and 'in' is assigned through it.
                 ST_B, ST_H and ST_W expand to the same text; the typed
                 aliases below only fix RTYPE.
                 The assignment is parenthesized as a whole so the expansion
                 stays a single expression wherever it is used.
*/
#define ST_B(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)

#define ST_H(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)

#define ST_W(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
39 
40 #if (__mips_isa_rev >= 6)
/* Description : Load a halfword (LH) or a word (LW) from memory
   Arguments   : Input  - psrc (source address, cast to 'const uint8_t *')
                 Return - uint16_t for LH, uint32_t for LW
   Details     : Variant used when __mips_isa_rev >= 6. Each macro is a GNU
                 statement expression that issues a plain 'lh' / 'lw'
                 instruction through inline assembly and yields the loaded
                 value.
   Note        : The expansions declare locals named 'psrc_m' and 'val_m'. An
                 argument expression that itself mentions one of those names
                 binds to the macro's own, not yet initialized, local.
                 NOTE(review): the "m" operand is typed as a single byte
                 (*psrc_m) although the instruction reads 2 / 4 bytes -
                 confirm this is sufficient for the compilers in use.
*/
#define LH(psrc)                                          \
  ({                                                      \
    const uint8_t *psrc_m = (const uint8_t *)(psrc);      \
    uint16_t val_m;                                       \
                                                          \
    __asm__ __volatile__("lh  %[val_m],  %[psrc_m]  \n\t" \
                                                          \
                         : [val_m] "=r"(val_m)            \
                         : [psrc_m] "m"(*psrc_m));        \
                                                          \
    val_m;                                                \
  })

#define LW(psrc)                                          \
  ({                                                      \
    const uint8_t *psrc_m = (const uint8_t *)(psrc);      \
    uint32_t val_m;                                       \
                                                          \
    __asm__ __volatile__("lw  %[val_m],  %[psrc_m]  \n\t" \
                                                          \
                         : [val_m] "=r"(val_m)            \
                         : [psrc_m] "m"(*psrc_m));        \
                                                          \
    val_m;                                                \
  })
66 
67 #if (__mips == 64)
/* Description : Load a double word from memory
   Arguments   : Input  - psrc (source address, cast to 'const uint8_t *')
                 Return - uint64_t
   Details     : Variant used when __mips_isa_rev >= 6 and __mips == 64.
                 A GNU statement expression that issues a single 'ld'
                 instruction through inline assembly and yields the value.
   Note        : Declares locals named 'psrc_m' and 'val_m'; arguments must
                 not refer to identifiers with those names.
*/
#define LD(psrc)                                          \
  ({                                                      \
    const uint8_t *psrc_m = (const uint8_t *)(psrc);      \
    uint64_t val_m = 0;                                   \
                                                          \
    __asm__ __volatile__("ld  %[val_m],  %[psrc_m]  \n\t" \
                                                          \
                         : [val_m] "=r"(val_m)            \
                         : [psrc_m] "m"(*psrc_m));        \
                                                          \
    val_m;                                                \
  })
80 #else  // !(__mips == 64)
/* Description : Load a double word from memory as two word loads
   Arguments   : Input  - psrc (source address, cast to 'const uint8_t *')
                 Return - uint64_t
   Details     : Variant used when __mips_isa_rev >= 6 and __mips != 64.
                 The word at (psrc) becomes bits 0..31 and the word at
                 (psrc + 4) becomes bits 32..63 of the result.
   Note        : The locals carry an '_ld_m' suffix on purpose. LW declares
                 its own 'psrc_m' and 'val_m'; passing a local with the same
                 name into LW would make LW initialize its pointer from
                 itself.
*/
#define LD(psrc)                                                  \
  ({                                                              \
    const uint8_t *psrc_ld_m = (const uint8_t *)(psrc);           \
    uint32_t val0_ld_m, val1_ld_m;                                \
    uint64_t val_ld_m = 0;                                        \
                                                                  \
    val0_ld_m = LW(psrc_ld_m);                                    \
    val1_ld_m = LW(psrc_ld_m + 4);                                \
                                                                  \
    val_ld_m = (uint64_t)(val1_ld_m);                             \
    val_ld_m = (uint64_t)((val_ld_m << 32) & 0xFFFFFFFF00000000); \
    val_ld_m = (uint64_t)(val_ld_m | (uint64_t)val0_ld_m);        \
                                                                  \
    val_ld_m;                                                     \
  })
96 #endif  // (__mips == 64)
97 
/* Description : Store a halfword (SH) or a word (SW) to memory
   Arguments   : Inputs - val (value, converted to uint16_t / uint32_t),
                          pdst (destination address, cast to 'uint8_t *')
   Details     : Variant used when __mips_isa_rev >= 6. Each macro is a brace
                 block that issues a plain 'sh' / 'sw' instruction through
                 inline assembly.
   Note        : The expansions declare locals named 'pdst_m' and 'val_m';
                 arguments must not refer to identifiers with those names.
                 Because the expansion is a brace block, a trailing ';' at
                 the call site is optional.
*/
#define SH(val, pdst)                                     \
  {                                                       \
    uint8_t *pdst_m = (uint8_t *)(pdst);                  \
    const uint16_t val_m = (val);                         \
                                                          \
    __asm__ __volatile__("sh  %[val_m],  %[pdst_m]  \n\t" \
                                                          \
                         : [pdst_m] "=m"(*pdst_m)         \
                         : [val_m] "r"(val_m));           \
  }

#define SW(val, pdst)                                     \
  {                                                       \
    uint8_t *pdst_m = (uint8_t *)(pdst);                  \
    const uint32_t val_m = (val);                         \
                                                          \
    __asm__ __volatile__("sw  %[val_m],  %[pdst_m]  \n\t" \
                                                          \
                         : [pdst_m] "=m"(*pdst_m)         \
                         : [val_m] "r"(val_m));           \
  }
119 
/* Description : Store a double word to memory
   Arguments   : Inputs - val (value, converted to uint64_t),
                          pdst (destination address, cast to 'uint8_t *')
   Details     : Variant used when __mips_isa_rev >= 6.
                 When __mips == 64 a single 'sd' instruction is issued through
                 inline assembly. Otherwise the value is split into two words
                 stored with SW: bits 0..31 at (pdst) and bits 32..63 at
                 (pdst + 4), which is the layout the 32-bit LD of this branch
                 reads back.
                 'val' and 'pdst' are each evaluated exactly once.
*/
#if (__mips == 64)
#define SD(val, pdst)                                     \
  {                                                       \
    uint8_t *pdst_m = (uint8_t *)(pdst);                  \
    const uint64_t val_m = (val);                         \
                                                          \
    __asm__ __volatile__("sd  %[val_m],  %[pdst_m]  \n\t" \
                                                          \
                         : [pdst_m] "=m"(*pdst_m)         \
                         : [val_m] "r"(val_m));           \
  }
#else  // !(__mips == 64)
#define SD(val, pdst)                                                     \
  {                                                                       \
    uint8_t *pdst_sd_m = (uint8_t *)(pdst);                               \
    const uint64_t val_sd_m = (uint64_t)(val);                            \
    const uint32_t val0_sd_m = (uint32_t)(val_sd_m & 0x00000000FFFFFFFF); \
    const uint32_t val1_sd_m = (uint32_t)(val_sd_m >> 32);                \
                                                                          \
    SW(val0_sd_m, pdst_sd_m);                                             \
    SW(val1_sd_m, pdst_sd_m + 4);                                         \
  }
#endif  // (__mips == 64)
130 #else  // !(__mips_isa_rev >= 6)
/* Description : Load a halfword (LH) or a word (LW) from memory
   Arguments   : Input  - psrc (source address, cast to 'const uint8_t *')
                 Return - uint16_t for LH, uint32_t for LW
   Details     : Variant used when __mips_isa_rev < 6. Each macro is a GNU
                 statement expression that issues 'ulh' / 'ulw' through
                 inline assembly and yields the loaded value.
                 NOTE(review): 'ulh' / 'ulw' are presumably the assembler's
                 unaligned-load macros - confirm against the toolchain docs.
   Note        : The expansions declare locals named 'psrc_m' and 'val_m'. An
                 argument expression that itself mentions one of those names
                 binds to the macro's own, not yet initialized, local.
*/
#define LH(psrc)                                           \
  ({                                                       \
    const uint8_t *psrc_m = (const uint8_t *)(psrc);       \
    uint16_t val_m;                                        \
                                                           \
    __asm__ __volatile__("ulh  %[val_m],  %[psrc_m]  \n\t" \
                                                           \
                         : [val_m] "=r"(val_m)             \
                         : [psrc_m] "m"(*psrc_m));         \
                                                           \
    val_m;                                                 \
  })

#define LW(psrc)                                           \
  ({                                                       \
    const uint8_t *psrc_m = (const uint8_t *)(psrc);       \
    uint32_t val_m;                                        \
                                                           \
    __asm__ __volatile__("ulw  %[val_m],  %[psrc_m]  \n\t" \
                                                           \
                         : [val_m] "=r"(val_m)             \
                         : [psrc_m] "m"(*psrc_m));         \
                                                           \
    val_m;                                                 \
  })
156 
157 #if (__mips == 64)
/* Description : Load a double word from memory
   Arguments   : Input  - psrc (source address, cast to 'const uint8_t *')
                 Return - uint64_t
   Details     : Variant used when __mips_isa_rev < 6 and __mips == 64.
                 A GNU statement expression that issues 'uld' through inline
                 assembly and yields the loaded value.
   Note        : Declares locals named 'psrc_m' and 'val_m'; arguments must
                 not refer to identifiers with those names.
*/
#define LD(psrc)                                           \
  ({                                                       \
    const uint8_t *psrc_m = (const uint8_t *)(psrc);       \
    uint64_t val_m = 0;                                    \
                                                           \
    __asm__ __volatile__("uld  %[val_m],  %[psrc_m]  \n\t" \
                                                           \
                         : [val_m] "=r"(val_m)             \
                         : [psrc_m] "m"(*psrc_m));         \
                                                           \
    val_m;                                                 \
  })
170 #else  // !(__mips == 64)
/* Description : Load a double word from memory as two word loads
   Arguments   : Input  - psrc (source address, cast to 'const uint8_t *')
                 Return - uint64_t
   Details     : Variant used when __mips_isa_rev < 6 and __mips != 64.
                 The word at (psrc) is read first and supplies bits 0..31;
                 the word at (psrc + 4) supplies bits 32..63.
                 Local names are kept distinct from the 'psrc_m' / 'val_m'
                 locals that LW declares.
*/
#define LD(psrc)                                                \
  ({                                                            \
    const uint8_t *const base_ld_m = (const uint8_t *)(psrc);   \
    const uint32_t lo_ld_m = LW(base_ld_m);                     \
    const uint32_t hi_ld_m = LW(base_ld_m + 4);                 \
                                                                \
    (uint64_t)(((uint64_t)hi_ld_m << 32) | (uint64_t)lo_ld_m);  \
  })
186 #endif  // (__mips == 64)
187 
/* Description : Store a halfword (SH) or a word (SW) to memory
   Arguments   : Inputs - val (value, converted to uint16_t / uint32_t),
                          pdst (destination address, cast to 'uint8_t *')
   Details     : Variant used when __mips_isa_rev < 6. Each macro is a brace
                 block that issues 'ush' / 'usw' through inline assembly.
                 NOTE(review): 'ush' / 'usw' are presumably the assembler's
                 unaligned-store macros - confirm against the toolchain docs.
   Note        : The expansions declare locals named 'pdst_m' and 'val_m';
                 arguments must not refer to identifiers with those names.
                 Because the expansion is a brace block, a trailing ';' at
                 the call site is optional.
*/
#define SH(val, pdst)                                      \
  {                                                        \
    uint8_t *pdst_m = (uint8_t *)(pdst);                   \
    const uint16_t val_m = (val);                          \
                                                           \
    __asm__ __volatile__("ush  %[val_m],  %[pdst_m]  \n\t" \
                                                           \
                         : [pdst_m] "=m"(*pdst_m)          \
                         : [val_m] "r"(val_m));            \
  }

#define SW(val, pdst)                                      \
  {                                                        \
    uint8_t *pdst_m = (uint8_t *)(pdst);                   \
    const uint32_t val_m = (val);                          \
                                                           \
    __asm__ __volatile__("usw  %[val_m],  %[pdst_m]  \n\t" \
                                                           \
                         : [pdst_m] "=m"(*pdst_m)          \
                         : [val_m] "r"(val_m));            \
  }
209 
/* Description : Store a double word to memory as two word stores
   Arguments   : Inputs - val (value, converted to uint64_t),
                          pdst (destination address, cast to 'uint8_t *')
   Details     : Variant used when __mips_isa_rev < 6.
                 Bits 0..31 of 'val' are stored with SW at (pdst) and bits
                 32..63 at (pdst + 4).
                 'val' is captured once in a local, so an argument with side
                 effects (e.g. '*p++') is evaluated a single time.
*/
#define SD(val, pdst)                                           \
  {                                                             \
    uint8_t *pdst_m1 = (uint8_t *)(pdst);                       \
    const uint64_t val_sd_m = (uint64_t)(val);                  \
    uint32_t val0_m, val1_m;                                    \
                                                                \
    val0_m = (uint32_t)(val_sd_m & 0x00000000FFFFFFFF);         \
    val1_m = (uint32_t)((val_sd_m >> 32) & 0x00000000FFFFFFFF); \
                                                                \
    SW(val0_m, pdst_m1);                                        \
    SW(val1_m, pdst_m1 + 4);                                    \
  }
221 #endif  // (__mips_isa_rev >= 6)
222 
/* Description : Load 4 words with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1, out2, out3
   Details     : Load word in 'out0' from (psrc)
                 Load word in 'out1' from (psrc + stride)
                 Load word in 'out2' from (psrc + 2 * stride)
                 Load word in 'out3' from (psrc + 3 * stride)
                 'stride' is parenthesized in the expansion, so an expression
                 such as 'a + b' is scaled as a whole.
*/
#define LW4(psrc, stride, out0, out1, out2, out3) \
  {                                               \
    out0 = LW((psrc));                            \
    out1 = LW((psrc) + (stride));                 \
    out2 = LW((psrc) + 2 * (stride));             \
    out3 = LW((psrc) + 3 * (stride));             \
  }
238 
/* Description : Load double words with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1 (LD2); out0, out1, out2, out3 (LD4)
   Details     : Load double word in 'out0' from (psrc)
                 Load double word in 'out1' from (psrc + stride)
                 LD4 continues with 'out2' from (psrc + 2 * stride) and
                 'out3' from (psrc + 3 * stride)
                 'stride' is parenthesized in the expansion, so an expression
                 such as 'a + b' is scaled as a whole.
*/
#define LD2(psrc, stride, out0, out1) \
  {                                   \
    out0 = LD((psrc));                \
    out1 = LD((psrc) + (stride));     \
  }
#define LD4(psrc, stride, out0, out1, out2, out3)     \
  {                                                   \
    LD2((psrc), (stride), out0, out1);                \
    LD2((psrc) + 2 * (stride), (stride), out2, out3); \
  }
255 
/* Description : Store 4 words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Store word from 'in0' to (pdst)
                 Store word from 'in1' to (pdst + stride)
                 Store word from 'in2' to (pdst + 2 * stride)
                 Store word from 'in3' to (pdst + 3 * stride)
                 'stride' is parenthesized in the expansion, and every SW
                 call is terminated with ';' so the macro does not depend on
                 SW expanding to a brace block.
*/
#define SW4(in0, in1, in2, in3, pdst, stride) \
  {                                           \
    SW(in0, (pdst));                          \
    SW(in1, (pdst) + (stride));               \
    SW(in2, (pdst) + 2 * (stride));           \
    SW(in3, (pdst) + 3 * (stride));           \
  }
270 
/* Description : Store 4 double words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Store double word from 'in0' to (pdst)
                 Store double word from 'in1' to (pdst + stride)
                 Store double word from 'in2' to (pdst + 2 * stride)
                 Store double word from 'in3' to (pdst + 3 * stride)
                 'stride' is parenthesized in the expansion, and every SD
                 call is terminated with ';' so the macro does not depend on
                 SD expanding to a brace block.
*/
#define SD4(in0, in1, in2, in3, pdst, stride) \
  {                                           \
    SD(in0, (pdst));                          \
    SD(in1, (pdst) + (stride));               \
    SD(in2, (pdst) + 2 * (stride));           \
    SD(in3, (pdst) + 3 * (stride));           \
  }
285 
/* Description : Load vectors with 16 byte elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1 (and out2 .. out7 for the wider forms)
                 Return Type - as per RTYPE
   Details     : Load 16 byte elements in 'out0' from (psrc)
                 Load 16 byte elements in 'out1' from (psrc + stride)
                 LD_B3 .. LD_B8 continue the same pattern: 'outN' is loaded
                 from (psrc + N * stride).
                 'stride' is parenthesized in every expansion, so an
                 expression such as 'a + b' is scaled as a whole.
*/
#define LD_B2(RTYPE, psrc, stride, out0, out1) \
  {                                            \
    out0 = LD_B(RTYPE, (psrc));                \
    out1 = LD_B(RTYPE, (psrc) + (stride));     \
  }
#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)

#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \
  {                                                  \
    LD_B2(RTYPE, (psrc), (stride), out0, out1);      \
    out2 = LD_B(RTYPE, (psrc) + 2 * (stride));       \
  }
#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)

#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3)     \
  {                                                            \
    LD_B2(RTYPE, (psrc), (stride), out0, out1);                \
    LD_B2(RTYPE, (psrc) + 2 * (stride), (stride), out2, out3); \
  }
#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)

#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \
  {                                                              \
    LD_B4(RTYPE, (psrc), (stride), out0, out1, out2, out3);      \
    out4 = LD_B(RTYPE, (psrc) + 4 * (stride));                   \
  }
#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)

#define LD_B7(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6) \
  {                                                                          \
    LD_B5(RTYPE, (psrc), (stride), out0, out1, out2, out3, out4);            \
    LD_B2(RTYPE, (psrc) + 5 * (stride), (stride), out5, out6);               \
  }
#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__)

#define LD_B8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6,   \
              out7)                                                            \
  {                                                                            \
    LD_B4(RTYPE, (psrc), (stride), out0, out1, out2, out3);                    \
    LD_B4(RTYPE, (psrc) + 4 * (stride), (stride), out4, out5, out6, out7);     \
  }
#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
339 
/* Description : Load vectors with 8 halfword elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1 (and out2 .. out15 for the wider forms)
                 Return Type - as per RTYPE
   Details     : Load 8 halfword elements in 'out0' from (psrc)
                 Load 8 halfword elements in 'out1' from (psrc + stride)
                 LD_H4, LD_H8 and LD_H16 continue the same pattern: 'outN' is
                 loaded from (psrc + N * stride).
                 'stride' is parenthesized in every expansion, so an
                 expression such as 'a + b' is scaled as a whole.
*/
#define LD_H2(RTYPE, psrc, stride, out0, out1) \
  {                                            \
    out0 = LD_H(RTYPE, (psrc));                \
    out1 = LD_H(RTYPE, (psrc) + (stride));     \
  }
#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)

#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3)     \
  {                                                            \
    LD_H2(RTYPE, (psrc), (stride), out0, out1);                \
    LD_H2(RTYPE, (psrc) + 2 * (stride), (stride), out2, out3); \
  }
#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)

#define LD_H8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6,   \
              out7)                                                            \
  {                                                                            \
    LD_H4(RTYPE, (psrc), (stride), out0, out1, out2, out3);                    \
    LD_H4(RTYPE, (psrc) + 4 * (stride), (stride), out4, out5, out6, out7);     \
  }
#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__)

#define LD_H16(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6,  \
               out7, out8, out9, out10, out11, out12, out13, out14, out15)     \
  {                                                                            \
    LD_H8(RTYPE, (psrc), (stride), out0, out1, out2, out3, out4, out5, out6,   \
          out7);                                                               \
    LD_H8(RTYPE, (psrc) + 8 * (stride), (stride), out8, out9, out10, out11,    \
          out12, out13, out14, out15);                                         \
  }
#define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__)
377 
/* Description : Load 4x4 block of signed halfword elements from 1D source
                 data into 4 vectors (Each vector with 4 signed halfwords)
   Arguments   : Input   - psrc
                 Outputs - out0, out1, out2, out3
   Details     : 'out0' is loaded from (psrc) and 'out2' from (psrc + 8)
                 with LD_SH. 'out1' and 'out3' are produced by
                 __msa_ilvl_d applied to 'out0' and 'out2' with themselves.
                 'psrc' is parenthesized so pointer expressions can be passed
                 directly.
*/
#define LD4x4_SH(psrc, out0, out1, out2, out3)                \
  {                                                           \
    out0 = LD_SH((psrc));                                     \
    out2 = LD_SH((psrc) + 8);                                 \
    out1 = (v8i16)__msa_ilvl_d((v2i64)(out0), (v2i64)(out0)); \
    out3 = (v8i16)__msa_ilvl_d((v2i64)(out2), (v2i64)(out2)); \
  }
390 
/* Description : Load 2 vectors of signed word elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1
                 Return Type - signed word
   Details     : Load 'out0' from (psrc) and 'out1' from (psrc + stride)
                 with LD_SW. 'stride' is parenthesized in the expansion.
*/
#define LD_SW2(psrc, stride, out0, out1) \
  {                                      \
    out0 = LD_SW((psrc));                \
    out1 = LD_SW((psrc) + (stride));     \
  }
401 
/* Description : Store vectors of 16 byte elements with stride
   Arguments   : Inputs - in0, in1 (.. in7 for the wider forms), pdst, stride
   Details     : Store 16 byte elements from 'in0' to (pdst)
                 Store 16 byte elements from 'in1' to (pdst + stride)
                 ST_B4 and ST_B8 continue the same pattern: 'inN' is stored
                 to (pdst + N * stride).
                 'pdst' and 'stride' are parenthesized in every expansion, so
                 an expression such as 'a + b' is scaled as a whole.
*/
#define ST_B2(RTYPE, in0, in1, pdst, stride) \
  {                                          \
    ST_B(RTYPE, in0, (pdst));                \
    ST_B(RTYPE, in1, (pdst) + (stride));     \
  }
#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)

#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride)       \
  {                                                          \
    ST_B2(RTYPE, in0, in1, (pdst), (stride));                \
    ST_B2(RTYPE, in2, in3, (pdst) + 2 * (stride), (stride)); \
  }
#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)

#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
  {                                                                        \
    ST_B4(RTYPE, in0, in1, in2, in3, (pdst), (stride));                    \
    ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * (stride), (stride));     \
  }
#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
427 
/* Description : Store vectors of 8 halfword elements with stride
   Arguments   : Inputs - in0, in1 (.. in7 for the wider forms), pdst, stride
   Details     : Store 8 halfword elements from 'in0' to (pdst)
                 Store 8 halfword elements from 'in1' to (pdst + stride)
                 ST_H4 and ST_H8 continue the same pattern: 'inN' is stored
                 to (pdst + N * stride).
                 'stride' is parenthesized in every expansion, so an
                 expression such as 'a + b' is scaled as a whole.
*/
#define ST_H2(RTYPE, in0, in1, pdst, stride) \
  {                                          \
    ST_H(RTYPE, in0, (pdst));                \
    ST_H(RTYPE, in1, (pdst) + (stride));     \
  }
#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)

#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride)       \
  {                                                          \
    ST_H2(RTYPE, in0, in1, (pdst), (stride));                \
    ST_H2(RTYPE, in2, in3, (pdst) + 2 * (stride), (stride)); \
  }
#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__)

#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
  {                                                                        \
    ST_H4(RTYPE, in0, in1, in2, in3, (pdst), (stride));                    \
    ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * (stride), (stride));     \
  }
#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)
453 
/* Description : Store vectors of word elements with stride
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Store 4 word elements from 'in0' to (pdst)
                 Store 4 word elements from 'in1' to (pdst + stride)
                 'stride' is parenthesized in the expansion.
*/
#define ST_SW2(in0, in1, pdst, stride) \
  {                                    \
    ST_SW(in0, (pdst));                \
    ST_SW(in1, (pdst) + (stride));     \
  }
464 
/* Description : Store 2x4 byte block to destination memory from input vector
   Arguments   : Inputs - in, stidx, pdst, stride
   Details     : Index 'stidx' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst)
                 Index 'stidx+1' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst + stride)
                 Index 'stidx+2' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst + 2 * stride)
                 Index 'stidx+3' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst + 3 * stride)
                 'in', 'stidx' and 'stride' are parenthesized in the
                 expansion so that expression arguments keep their meaning.
*/
#define ST2x4_UB(in, stidx, pdst, stride)              \
  {                                                    \
    uint16_t out0_m, out1_m, out2_m, out3_m;           \
    uint8_t *pblk_2x4_m = (uint8_t *)(pdst);           \
                                                       \
    out0_m = __msa_copy_u_h((v8i16)(in), (stidx));     \
    out1_m = __msa_copy_u_h((v8i16)(in), (stidx) + 1); \
    out2_m = __msa_copy_u_h((v8i16)(in), (stidx) + 2); \
    out3_m = __msa_copy_u_h((v8i16)(in), (stidx) + 3); \
                                                       \
    SH(out0_m, pblk_2x4_m);                            \
    SH(out1_m, pblk_2x4_m + (stride));                 \
    SH(out2_m, pblk_2x4_m + 2 * (stride));             \
    SH(out3_m, pblk_2x4_m + 3 * (stride));             \
  }
491 
/* Description : Store 4x2 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst, stride
   Details     : Index 0 word element from 'in' vector is copied to the GP
                 register and stored to (pdst)
                 Index 1 word element from 'in' vector is copied to the GP
                 register and stored to (pdst + stride)
                 'in' and 'stride' are parenthesized in the expansion so that
                 expression arguments keep their meaning.
*/
#define ST4x2_UB(in, pdst, stride)           \
  {                                          \
    uint32_t out0_m, out1_m;                 \
    uint8_t *pblk_4x2_m = (uint8_t *)(pdst); \
                                             \
    out0_m = __msa_copy_u_w((v4i32)(in), 0); \
    out1_m = __msa_copy_u_w((v4i32)(in), 1); \
                                             \
    SW(out0_m, pblk_4x2_m);                  \
    SW(out1_m, pblk_4x2_m + (stride));       \
  }
510 
/* Description : Store 4x4 byte block to destination memory from input vectors
   Arguments   : Inputs - in0, in1, idx0, idx1, idx2, idx3, pdst, stride
   Details     : 'idx0' word element from input vector 'in0' is copied to the
                 GP register and stored to (pdst)
                 'idx1' word element from input vector 'in0' is copied to the
                 GP register and stored to (pdst + stride)
                 'idx2' word element from input vector 'in1' is copied to the
                 GP register and stored to (pdst + 2 * stride)
                 'idx3' word element from input vector 'in1' is copied to the
                 GP register and stored to (pdst + 3 * stride)
                 ST4x8_UB stores word elements 0..3 of 'in0' to rows 0..3 and
                 word elements 0..3 of 'in1' to rows 4..7.
                 Vector, index and stride arguments are parenthesized in the
                 expansion so that expression arguments keep their meaning.
*/
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \
  {                                                              \
    uint32_t out0_m, out1_m, out2_m, out3_m;                     \
    uint8_t *pblk_4x4_m = (uint8_t *)(pdst);                     \
                                                                 \
    out0_m = __msa_copy_u_w((v4i32)(in0), (idx0));               \
    out1_m = __msa_copy_u_w((v4i32)(in0), (idx1));               \
    out2_m = __msa_copy_u_w((v4i32)(in1), (idx2));               \
    out3_m = __msa_copy_u_w((v4i32)(in1), (idx3));               \
                                                                 \
    SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, (stride));   \
  }
#define ST4x8_UB(in0, in1, pdst, stride)                                 \
  {                                                                      \
    uint8_t *pblk_4x8 = (uint8_t *)(pdst);                               \
                                                                         \
    ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, (stride));                  \
    ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * (stride), (stride));   \
  }
541 
/* Description : Store 8x1 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst
   Details     : Index 0 double word element from 'in' vector is copied to the
                 GP register and stored to (pdst)
                 'in' and 'pdst' are parenthesized in the expansion so that
                 expression arguments keep their meaning.
*/
#define ST8x1_UB(in, pdst)                   \
  {                                          \
    uint64_t out0_m;                         \
                                             \
    out0_m = __msa_copy_u_d((v2i64)(in), 0); \
    SD(out0_m, (pdst));                      \
  }
554 
/* Description : Store 8x2 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst, stride
   Details     : Index 0 double word element from 'in' vector is copied to the
                 GP register and stored to (pdst)
                 Index 1 double word element from 'in' vector is copied to the
                 GP register and stored to (pdst + stride)
                 'in' and 'stride' are parenthesized in the expansion so that
                 expression arguments keep their meaning.
*/
#define ST8x2_UB(in, pdst, stride)           \
  {                                          \
    uint64_t out0_m, out1_m;                 \
    uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \
                                             \
    out0_m = __msa_copy_u_d((v2i64)(in), 0); \
    out1_m = __msa_copy_u_d((v2i64)(in), 1); \
                                             \
    SD(out0_m, pblk_8x2_m);                  \
    SD(out1_m, pblk_8x2_m + (stride));       \
  }
573 
/* Description : Store 8x4 byte block to destination memory from input
                 vectors
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Index 0 double word element from 'in0' vector is copied to the
                 GP register and stored to (pdst)
                 Index 1 double word element from 'in0' vector is copied to the
                 GP register and stored to (pdst + stride)
                 Index 0 double word element from 'in1' vector is copied to the
                 GP register and stored to (pdst + 2 * stride)
                 Index 1 double word element from 'in1' vector is copied to the
                 GP register and stored to (pdst + 3 * stride)
                 'in0', 'in1' and 'stride' are parenthesized in the expansion
                 so that expression arguments keep their meaning.
*/
#define ST8x4_UB(in0, in1, pdst, stride)                       \
  {                                                            \
    uint64_t out0_m, out1_m, out2_m, out3_m;                   \
    uint8_t *pblk_8x4_m = (uint8_t *)(pdst);                   \
                                                               \
    out0_m = __msa_copy_u_d((v2i64)(in0), 0);                  \
    out1_m = __msa_copy_u_d((v2i64)(in0), 1);                  \
    out2_m = __msa_copy_u_d((v2i64)(in1), 0);                  \
    out3_m = __msa_copy_u_d((v2i64)(in1), 1);                  \
                                                               \
    SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, (stride)); \
  }
598 
/* Description : Average with rounding (in0 + in1 + 1) / 2.
   Arguments   : Inputs  - in0, in1, in2, in3 (.. in7 for AVER_UB4)
                 Outputs - out0, out1 (.. out3 for AVER_UB4)
                 Return Type - as per RTYPE
   Details     : Each unsigned byte element from 'in0' vector is added with
                 each unsigned byte element from 'in1' vector. Then the average
                 with rounding is calculated and written to 'out0'.
                 'out1' is produced the same way from 'in2' and 'in3';
                 AVER_UB4 additionally pairs (in4, in5) -> 'out2' and
                 (in6, in7) -> 'out3'.
                 Inputs are parenthesized before the cast so that expression
                 arguments are converted as a whole.
*/
#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1)       \
  {                                                           \
    out0 = (RTYPE)__msa_aver_u_b((v16u8)(in0), (v16u8)(in1)); \
    out1 = (RTYPE)__msa_aver_u_b((v16u8)(in2), (v16u8)(in3)); \
  }
#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)

#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                 out2, out3)                                                \
  {                                                                         \
    AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
621 
/* Description : Slide by an immediate number of byte elements, with zero
   Arguments   : Inputs  - in0, in1, slide_val
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : A local all-zero vector 'zero_m' and 'in0' are passed to
                 __msa_sldi_b together with 'slide_val'; the result, cast to
                 RTYPE, is written to 'out0'. 'in1' / 'out1' are handled the
                 same way. Inputs are parenthesized before the cast.
*/
#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val)               \
  {                                                                     \
    v16i8 zero_m = { 0 };                                               \
    out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)(in0), slide_val); \
    out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)(in1), slide_val); \
  }
#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__)
636 
/* Description : Four-vector form of SLDI_B2_0
   Arguments   : Inputs  - in0, in1, in2, in3, slide_val
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Expands SLDI_B2_0 once for (in0, in1) -> (out0, out1) and
                 once for (in2, in3) -> (out2, out3), using the same
                 'slide_val' for both.
*/
#define SLDI_B4_0(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, \
                  slide_val)                                         \
  {                                                                  \
    SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val);               \
    SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val);               \
  }
#define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__)
644 
/* Description : Slide by an immediate number of byte elements
   Arguments   : Inputs  - in0_0, in0_1, in1_0, in1_1, slide_val
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'out0' is __msa_sldi_b('in0_0', 'in1_0', slide_val) and
                 'out1' is __msa_sldi_b('in0_1', 'in1_1', slide_val), i.e.
                 byte elements from 'in0_x' are slid into 'in1_x'.
                 Inputs are parenthesized before the cast.
*/
#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val)  \
  {                                                                        \
    out0 = (RTYPE)__msa_sldi_b((v16i8)(in0_0), (v16i8)(in1_0), slide_val); \
    out1 = (RTYPE)__msa_sldi_b((v16i8)(in0_1), (v16i8)(in1_1), slide_val); \
  }
#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)
659 
/* Description : Three-output form of SLDI_B2
   Arguments   : Inputs  - in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, slide_val
                 Outputs - out0, out1, out2
                 Return Type - as per RTYPE
   Details     : The first two outputs come from SLDI_B2; 'out2' is
                 __msa_sldi_b('in0_2', 'in1_2', slide_val) cast to RTYPE.
                 The nested SLDI_B2 invocation is terminated with ';' and
                 the directly cast inputs are parenthesized.
*/
#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, out0, out1, \
                out2, slide_val)                                             \
  {                                                                          \
    SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val);       \
    out2 = (RTYPE)__msa_sldi_b((v16i8)(in0_2), (v16i8)(in1_2), slide_val);   \
  }
#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
668 
/* Description : Shuffle byte vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'out0' is __msa_vshf_b('mask0', 'in1', 'in0'): byte elements
                 of 'in0' & 'in1' are copied selectively as per control
                 vector 'mask0'. 'out1' is built the same way from 'mask1',
                 'in3' and 'in2'. Note the intrinsic receives the second
                 input of each pair first. Inputs are parenthesized before
                 the cast.
*/
#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)        \
  {                                                                         \
    out0 = (RTYPE)__msa_vshf_b((v16i8)(mask0), (v16i8)(in1), (v16i8)(in0)); \
    out1 = (RTYPE)__msa_vshf_b((v16i8)(mask1), (v16i8)(in3), (v16i8)(in2)); \
  }
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
684 
/* Description : Shuffle one pair of byte vectors with four masks
   Arguments   : Inputs  - in0, in1, mask0, mask1, mask2, mask3
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Expands VSHF_B2 twice, each time with the same source pair
                 ('in0', 'in1'): masks 0/1 produce 'out0'/'out1' and masks
                 2/3 produce 'out2'/'out3'.
*/
#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, out0, out1, out2, \
                out3)                                                          \
  {                                                                            \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1);              \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3);              \
  }
#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)
693 
/* Description : Dot product of unsigned byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Unsigned byte elements from 'mult0' are multiplied with
                 unsigned byte elements from 'cnst0', producing results
                 twice the size of the input, i.e. unsigned halfword.
                 The products of adjacent odd-even elements are added
                 together and written to 'out0' (__msa_dotp_u_h).
                 'mult1' / 'cnst1' produce 'out1' the same way.
                 Inputs are parenthesized before the cast.
*/
#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
  {                                                               \
    out0 = (RTYPE)__msa_dotp_u_h((v16u8)(mult0), (v16u8)(cnst0)); \
    out1 = (RTYPE)__msa_dotp_u_h((v16u8)(mult1), (v16u8)(cnst1)); \
  }
#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)
710 
/* Description : Four-output form of DOTP_UB2
   Arguments   : Inputs  - mult0, mult1, mult2, mult3,
                           cnst0, cnst1, cnst2, cnst3
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Expands DOTP_UB2 for (mult0, mult1, cnst0, cnst1) and then
                 for (mult2, mult3, cnst2, cnst3).
*/
#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
                 cnst3, out0, out1, out2, out3)                          \
  {                                                                      \
    DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
    DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
  }
#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
718 
/* Description : Dot product of signed byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed byte elements from 'mult0' are multiplied with
                 signed byte elements from 'cnst0', producing results
                 twice the size of the input, i.e. signed halfword.
                 The products of adjacent odd-even elements are added
                 together and written to 'out0' (__msa_dotp_s_h).
                 'mult1' / 'cnst1' produce 'out1' the same way.
                 Inputs are parenthesized before the cast.
*/
#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
  {                                                               \
    out0 = (RTYPE)__msa_dotp_s_h((v16i8)(mult0), (v16i8)(cnst0)); \
    out1 = (RTYPE)__msa_dotp_s_h((v16i8)(mult1), (v16i8)(cnst1)); \
  }
#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
735 
/* Description : Four-output form of DOTP_SB2
   Arguments   : Inputs  - mult0, mult1, mult2, mult3,
                           cnst0, cnst1, cnst2, cnst3
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Expands DOTP_SB2 for (mult0, mult1, cnst0, cnst1) and then
                 for (mult2, mult3, cnst2, cnst3).
*/
#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
                 cnst3, out0, out1, out2, out3)                          \
  {                                                                      \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
    DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
  }
#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
743 
/* Description : Dot product of signed halfword vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'mult0' are multiplied with
                 signed halfword elements from 'cnst0', producing results
                 twice the size of the input, i.e. signed word.
                 The products of adjacent odd-even elements are added
                 together and written to 'out0' (__msa_dotp_s_w).
                 'mult1' / 'cnst1' produce 'out1' the same way.
                 Inputs are parenthesized before the cast.
*/
#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
  {                                                               \
    out0 = (RTYPE)__msa_dotp_s_w((v8i16)(mult0), (v8i16)(cnst0)); \
    out1 = (RTYPE)__msa_dotp_s_w((v8i16)(mult1), (v8i16)(cnst1)); \
  }
#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)
760 
/* Description : Four-output form of DOTP_SH2
   Arguments   : Inputs  - mult0, mult1, mult2, mult3,
                           cnst0, cnst1, cnst2, cnst3
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Expands DOTP_SH2 for (mult0, mult1, cnst0, cnst1) and then
                 for (mult2, mult3, cnst2, cnst3).
*/
#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
                 cnst3, out0, out1, out2, out3)                          \
  {                                                                      \
    DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
    DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
  }
#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
768 
/* Description : Dot product of signed word vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed word elements from 'mult0' are multiplied with
                 signed word elements from 'cnst0', producing results
                 twice the size of the input, i.e. signed double word.
                 The products of adjacent odd-even elements are added
                 together and written to 'out0' (__msa_dotp_s_d).
                 'mult1' / 'cnst1' produce 'out1' the same way.
                 Inputs are parenthesized before the cast.
*/
#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
  {                                                               \
    out0 = (RTYPE)__msa_dotp_s_d((v4i32)(mult0), (v4i32)(cnst0)); \
    out1 = (RTYPE)__msa_dotp_s_d((v4i32)(mult1), (v4i32)(cnst1)); \
  }
#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__)
785 
/* Description : Dot product & addition of signed byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1 (also read as accumulators)
                 Return Type - as per RTYPE
   Details     : Signed byte elements from 'mult0' are multiplied with
                 signed byte elements from 'cnst0', producing results
                 twice the size of the input, i.e. signed halfword.
                 The products of adjacent odd-even elements are added to
                 the current value of 'out0' (__msa_dpadd_s_h).
                 'mult1' / 'cnst1' accumulate into 'out1' the same way.
                 Arguments are parenthesized before the cast.
*/
#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
  {                                                              \
    out0 = (RTYPE)__msa_dpadd_s_h((v8i16)(out0), (v16i8)(mult0), \
                                  (v16i8)(cnst0));               \
    out1 = (RTYPE)__msa_dpadd_s_h((v8i16)(out1), (v16i8)(mult1), \
                                  (v16i8)(cnst1));               \
  }
#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)
802 
/* Description : Four-accumulator form of DPADD_SB2
   Arguments   : Inputs  - mult0, mult1, mult2, mult3,
                           cnst0, cnst1, cnst2, cnst3
                 Outputs - out0, out1, out2, out3 (also read as accumulators)
                 Return Type - as per RTYPE
   Details     : Expands DPADD_SB2 for (mult0, mult1, cnst0, cnst1) and then
                 for (mult2, mult3, cnst2, cnst3).
*/
#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
                  cnst3, out0, out1, out2, out3)                          \
  {                                                                       \
    DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
    DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
  }
#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
810 
/* Description : Dot product & addition of signed halfword vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1 (also read as accumulators)
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'mult0' are multiplied with
                 signed halfword elements from 'cnst0', producing results
                 twice the size of the input, i.e. signed word.
                 The products of adjacent odd-even elements are added to
                 the current value of 'out0' (__msa_dpadd_s_w).
                 'mult1' / 'cnst1' accumulate into 'out1' the same way.
                 Arguments are parenthesized before the cast.
*/
#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
  {                                                              \
    out0 = (RTYPE)__msa_dpadd_s_w((v4i32)(out0), (v8i16)(mult0), \
                                  (v8i16)(cnst0));               \
    out1 = (RTYPE)__msa_dpadd_s_w((v4i32)(out1), (v8i16)(mult1), \
                                  (v8i16)(cnst1));               \
  }
#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
827 
/* Description : Dot product of a signed word vector with itself, accumulated
                 into double word elements
   Arguments   : Inputs  - mult0, mult1
                 Outputs - out0, out1 (also read as accumulators)
                 Return Type - as per RTYPE
   Details     : Each signed word element from 'mult0' is multiplied with
                 itself, producing an intermediate result twice the size of
                 the input, i.e. signed double word.
                 The products of adjacent odd-even elements are added to
                 the current value of 'out0' (__msa_dpadd_s_d).
                 'mult1' accumulates into 'out1' the same way.
                 Each 'multN' argument is expanded twice, so it should be
                 free of side effects. Arguments are parenthesized before
                 the cast.
*/
#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1)               \
  {                                                              \
    out0 = (RTYPE)__msa_dpadd_s_d((v2i64)(out0), (v4i32)(mult0), \
                                  (v4i32)(mult0));               \
    out1 = (RTYPE)__msa_dpadd_s_d((v2i64)(out1), (v4i32)(mult1), \
                                  (v4i32)(mult1));               \
  }
#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__)
844 
/* Description : Element-wise minimum of unsigned halfword vectors against a
                 common bound, updating the vectors in place
   Arguments   : Inputs  - in0, in1, min_vec
                 Outputs - in0, in1 (in place operation)
                 Return Type - as per RTYPE
   Details     : 'in0' is replaced by __msa_min_u_h('in0', 'min_vec') cast
                 to RTYPE; 'in1' is then updated the same way. 'min_vec' is
                 passed to the intrinsic without a cast.
*/
#define MIN_UH2(RTYPE, in0, in1, min_vec)            \
  {                                                  \
    in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec); \
    in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec); \
  }
#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)
859 
/* Description : Four-vector form of MIN_UH2
   Arguments   : Inputs  - in0, in1, in2, in3, min_vec
                 Outputs - in0, in1, in2, in3 (in place operation)
                 Return Type - as per RTYPE
   Details     : Expands MIN_UH2 for (in0, in1) and then for (in2, in3),
                 both against the same 'min_vec'.
*/
#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) \
  {                                                 \
    MIN_UH2(RTYPE, in0, in1, min_vec);              \
    MIN_UH2(RTYPE, in2, in3, min_vec);              \
  }
#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)
866 
/* Description : Clips all signed halfword elements of input vector
                 between 0 & 255
   Arguments   : Input  - in
                 Output - out_m
                 Return Type - signed halfword
   Details     : Statement expression. 'in' is first limited from below
                 with __msa_maxi_s_h(in, 0) and then from above with
                 __msa_min_s_h against a vector of 255s; the clipped vector
                 is the value of the expression. 'in' is parenthesized
                 before the cast and is evaluated once.
*/
#define CLIP_SH_0_255(in)                              \
  ({                                                   \
    v8i16 max_m = __msa_ldi_h(255);                    \
    v8i16 out_m;                                       \
                                                       \
    out_m = __msa_maxi_s_h((v8i16)(in), 0);            \
    out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \
    out_m;                                             \
  })
/* Description : Clip two signed halfword vectors to 0..255 in place
   Arguments   : Inputs  - in0, in1
                 Outputs - in0, in1 (in place operation)
   Details     : Each argument is overwritten with CLIP_SH_0_255 of itself.
*/
#define CLIP_SH2_0_255(in0, in1) \
  {                              \
    in0 = CLIP_SH_0_255(in0);    \
    in1 = CLIP_SH_0_255(in1);    \
  }
/* Description : Clip four signed halfword vectors to 0..255 in place
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - in0, in1, in2, in3 (in place operation)
   Details     : Expands CLIP_SH2_0_255 for (in0, in1) and then (in2, in3).
*/
#define CLIP_SH4_0_255(in0, in1, in2, in3) \
  {                                        \
    CLIP_SH2_0_255(in0, in1);              \
    CLIP_SH2_0_255(in2, in3);              \
  }
892 
/* Description : Horizontal addition of 4 signed word elements of input vector
   Arguments   : Input  - in       (signed word vector)
                 Output - sum_m    (i32 sum)
                 Return Type - signed word (GP)
   Details     : Statement expression. __msa_hadd_s_d adds the word elements
                 pairwise into two double words, the upper double word is
                 replicated with __msa_splati_d and added to the lower one,
                 and word 0 of that sum is returned as an int32_t.
                 'in' is captured once in a local, so an argument with side
                 effects is evaluated a single time.
*/
#define HADD_SW_S32(in)                          \
  ({                                             \
    v4i32 src_sw_m = (v4i32)(in);                \
    v2i64 res0_m, res1_m;                        \
    int32_t sum_m;                               \
                                                 \
    res0_m = __msa_hadd_s_d(src_sw_m, src_sw_m); \
    res1_m = __msa_splati_d(res0_m, 1);          \
    res0_m = res0_m + res1_m;                    \
    sum_m = __msa_copy_s_w((v4i32)res0_m, 0);    \
    sum_m;                                       \
  })
911 
/* Description : Horizontal addition of 4 unsigned word elements
   Arguments   : Input  - in       (unsigned word vector)
                 Output - sum_m    (u32 sum)
                 Return Type - unsigned word (GP)
   Details     : Statement expression. __msa_hadd_u_d adds the word elements
                 pairwise into two double words, the upper double word is
                 replicated with __msa_splati_d and added to the lower one,
                 and word 0 of that sum is returned as a uint32_t.
                 'in' is captured once in a local, so an argument with side
                 effects is evaluated a single time.
*/
#define HADD_UW_U32(in)                               \
  ({                                                  \
    v4u32 src_uw_m = (v4u32)(in);                     \
    v2u64 res0_m, res1_m;                             \
    uint32_t sum_m;                                   \
                                                      \
    res0_m = __msa_hadd_u_d(src_uw_m, src_uw_m);      \
    res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \
    res0_m += res1_m;                                 \
    sum_m = __msa_copy_u_w((v4i32)res0_m, 0);         \
    sum_m;                                            \
  })
930 
/* Description : Horizontal addition of 8 unsigned halfword elements
   Arguments   : Input  - in       (unsigned halfword vector)
                 Output - sum_m    (u32 sum)
                 Return Type - unsigned word
   Details     : Statement expression. __msa_hadd_u_w adds the halfword
                 elements pairwise into four words; those are reduced with
                 the same steps HADD_UW_U32 uses (pairwise __msa_hadd_u_d,
                 add upper double word to lower, take word 0).
                 The word reduction is written out here instead of nesting
                 HADD_UW_U32, so no inner 'sum_m' shadows this one, and 'in'
                 is captured once in a local.
*/
#define HADD_UH_U32(in)                               \
  ({                                                  \
    v8u16 src_uh_m = (v8u16)(in);                     \
    v4u32 res_m;                                      \
    v2u64 res0_m, res1_m;                             \
    uint32_t sum_m;                                   \
                                                      \
    res_m = __msa_hadd_u_w(src_uh_m, src_uh_m);       \
    res0_m = __msa_hadd_u_d(res_m, res_m);            \
    res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \
    res0_m += res1_m;                                 \
    sum_m = __msa_copy_u_w((v4i32)res0_m, 0);         \
    sum_m;                                            \
  })
947 
/* Description : Horizontal addition of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned odd byte element from 'in0' is added to the
                 even unsigned byte element from 'in0' (pairwise) and the
                 halfword result is written to 'out0' (__msa_hadd_u_h with
                 'in0' as both operands). 'in1' produces 'out1' likewise.
                 Each input is expanded twice, so it should be free of side
                 effects. Inputs are parenthesized before the cast.
*/
#define HADD_UB2(RTYPE, in0, in1, out0, out1)                 \
  {                                                           \
    out0 = (RTYPE)__msa_hadd_u_h((v16u8)(in0), (v16u8)(in0)); \
    out1 = (RTYPE)__msa_hadd_u_h((v16u8)(in1), (v16u8)(in1)); \
  }
#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)
962 
/* Description : Four-vector form of HADD_UB2
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Expands HADD_UB2 for (in0, in1) -> (out0, out1) and then
                 for (in2, in3) -> (out2, out3).
*/
#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                                 \
    HADD_UB2(RTYPE, in0, in1, out0, out1);                          \
    HADD_UB2(RTYPE, in2, in3, out2, out3);                          \
  }
#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
969 
/* Description : Horizontal subtraction of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Adjacent odd/even unsigned byte elements of 'in0' are
                 subtracted pairwise by __msa_hsub_u_h (called with 'in0' as
                 both operands) and the halfword result is written to
                 'out0'. 'in1' produces 'out1' likewise.
                 Each input is expanded twice, so it should be free of side
                 effects. Inputs are parenthesized before the cast.
*/
#define HSUB_UB2(RTYPE, in0, in1, out0, out1)                 \
  {                                                           \
    out0 = (RTYPE)__msa_hsub_u_h((v16u8)(in0), (v16u8)(in0)); \
    out1 = (RTYPE)__msa_hsub_u_h((v16u8)(in1), (v16u8)(in1)); \
  }
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
984 
/* Description : SAD (Sum of Absolute Difference)
   Arguments   : Inputs  - in0, in1, ref0, ref1
                 Outputs - sad_m                 (halfword vector)
                 Return Type - unsigned halfword
   Details     : Statement expression. The absolute difference of all byte
                 elements of 'in0' and 'ref0' is kept in 'diff0_m' (and of
                 'in1' and 'ref1' in 'diff1_m'). For each difference vector,
                 even-odd byte pairs are added together into 8 halfword
                 results, and both sets of results are accumulated into
                 'sad_m', which starts at zero and is the returned value.
                 Inputs are parenthesized before the cast.
*/
#define SAD_UB2_UH(in0, in1, ref0, ref1)                     \
  ({                                                         \
    v16u8 diff0_m, diff1_m;                                  \
    v8u16 sad_m = { 0 };                                     \
                                                             \
    diff0_m = __msa_asub_u_b((v16u8)(in0), (v16u8)(ref0));   \
    diff1_m = __msa_asub_u_b((v16u8)(in1), (v16u8)(ref1));   \
                                                             \
    sad_m += __msa_hadd_u_h((v16u8)diff0_m, (v16u8)diff0_m); \
    sad_m += __msa_hadd_u_h((v16u8)diff1_m, (v16u8)diff1_m); \
                                                             \
    sad_m;                                                   \
  })
1006 
/* Description : Horizontal subtraction of signed halfword vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Adjacent odd/even signed halfword elements of 'in0' are
                 subtracted pairwise by __msa_hsub_s_w (called with 'in0' as
                 both operands) and the word result is written to 'out0'.
                 'in1' produces 'out1' likewise. Note that, despite the
                 "UH" in the name, the signed intrinsic is used.
                 Each input is expanded twice, so it should be free of side
                 effects. Inputs are parenthesized before the cast.
*/
#define HSUB_UH2(RTYPE, in0, in1, out0, out1)                 \
  {                                                           \
    out0 = (RTYPE)__msa_hsub_s_w((v8i16)(in0), (v8i16)(in0)); \
    out1 = (RTYPE)__msa_hsub_s_w((v8i16)(in1), (v8i16)(in1)); \
  }
#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__)
1021 
/* Description : Set word elements 0 and 1 of a vector from GPR values
   Arguments   : Inputs - in0, in1
                 Output - out (read and updated in place)
                 Return Type - as per RTYPE
   Details     : Word element 0 of 'out' is set to 'in0', then word
                 element 1 is set to 'in1', each with __msa_insert_w.
                 The remaining elements of 'out' keep their prior contents.
*/
#define INSERT_W2(RTYPE, in0, in1, out)              \
  {                                                  \
    out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \
    out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \
  }
#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)
1034 
/* Description : Set all four word elements of a vector from GPR values
   Arguments   : Inputs - in0, in1, in2, in3
                 Output - out (read and updated in place)
                 Return Type - as per RTYPE
   Details     : Word elements 0..3 of 'out' are set, in that order, to
                 'in0'..'in3' using __msa_insert_w.
*/
#define INSERT_W4(RTYPE, in0, in1, in2, in3, out)    \
  {                                                  \
    out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \
    out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \
    out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2); \
    out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3); \
  }
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
1044 
/* Description : Set both double word elements of a vector from GPR values
   Arguments   : Inputs - in0, in1
                 Output - out (read and updated in place)
                 Return Type - as per RTYPE
   Details     : Double word element 0 of 'out' is set to 'in0', then
                 element 1 is set to 'in1', each with __msa_insert_d.
*/
#define INSERT_D2(RTYPE, in0, in1, out)              \
  {                                                  \
    out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \
    out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \
  }
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
#define INSERT_D2_SH(...) INSERT_D2(v8i16, __VA_ARGS__)
1053 
/* Description : Interleave even byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' and 'in1' are interleaved
                 and written to 'out0' (__msa_ilvev_b is called with 'in1'
                 first, then 'in0'). 'in2' / 'in3' produce 'out1' likewise.
                 Inputs are parenthesized before the cast.
*/
#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                          \
    out0 = (RTYPE)__msa_ilvev_b((v16i8)(in1), (v16i8)(in0)); \
    out1 = (RTYPE)__msa_ilvev_b((v16i8)(in3), (v16i8)(in2)); \
  }
#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
1068 
/* Description : Interleave even halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' and 'in1' are interleaved
                 and written to 'out0' (__msa_ilvev_h is called with 'in1'
                 first, then 'in0'). 'in2' / 'in3' produce 'out1' likewise.
                 Inputs are parenthesized before the cast.
*/
#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                          \
    out0 = (RTYPE)__msa_ilvev_h((v8i16)(in1), (v8i16)(in0)); \
    out1 = (RTYPE)__msa_ilvev_h((v8i16)(in3), (v8i16)(in2)); \
  }
#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
1084 
/* Description : Interleave even word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even word elements of 'in0' and 'in1' are interleaved
                 and written to 'out0' (__msa_ilvev_w is called with 'in1'
                 first, then 'in0'). 'in2' / 'in3' produce 'out1' likewise.
                 Inputs are parenthesized before the cast.
*/
#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                          \
    out0 = (RTYPE)__msa_ilvev_w((v4i32)(in1), (v4i32)(in0)); \
    out1 = (RTYPE)__msa_ilvev_w((v4i32)(in3), (v4i32)(in2)); \
  }
#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
1098 
/* Description : Interleave even double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double word elements of 'in0' and 'in1' are
                 interleaved and written to 'out0' (__msa_ilvev_d is called
                 with 'in1' first, then 'in0'). 'in2' / 'in3' produce 'out1'
                 likewise. Inputs are parenthesized before the cast.
*/
#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                          \
    out0 = (RTYPE)__msa_ilvev_d((v2i64)(in1), (v2i64)(in0)); \
    out1 = (RTYPE)__msa_ilvev_d((v2i64)(in3), (v2i64)(in2)); \
  }
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
1112 
/* Description : Interleave left half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of byte elements of 'in0' and 'in1' are
                 interleaved and written to 'out0' (__msa_ilvl_b called with
                 'in0', 'in1'). 'in2' / 'in3' produce 'out1' likewise.
                 Inputs are parenthesized before the cast.
*/
#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                         \
    out0 = (RTYPE)__msa_ilvl_b((v16i8)(in0), (v16i8)(in1)); \
    out1 = (RTYPE)__msa_ilvl_b((v16i8)(in2), (v16i8)(in3)); \
  }
#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
1129 
/* Description : Four-output form of ILVL_B2
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Expands ILVL_B2 for (in0..in3) -> (out0, out1) and then
                 for (in4..in7) -> (out2, out3).
*/
#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
1139 
/* Description : Interleave left half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of halfword elements of 'in0' and 'in1' are
                 interleaved and written to 'out0' (__msa_ilvl_h called with
                 'in0', 'in1'). 'in2' / 'in3' produce 'out1' likewise.
                 Inputs are parenthesized before the cast.
*/
#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                         \
    out0 = (RTYPE)__msa_ilvl_h((v8i16)(in0), (v8i16)(in1)); \
    out1 = (RTYPE)__msa_ilvl_h((v8i16)(in2), (v8i16)(in3)); \
  }
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)
1154 
/* Description : Interleave left half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of word elements of 'in0' and 'in1' are
                 interleaved and written to 'out0' (__msa_ilvl_w called with
                 'in0', 'in1'). 'in2' / 'in3' produce 'out1' likewise.
                 Inputs are parenthesized before the cast.
*/
#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                         \
    out0 = (RTYPE)__msa_ilvl_w((v4i32)(in0), (v4i32)(in1)); \
    out1 = (RTYPE)__msa_ilvl_w((v4i32)(in2), (v4i32)(in3)); \
  }
#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
1169 
/* Description : Interleave right half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements of 'in0' and 'in1' are
                 interleaved and written to 'out0' (__msa_ilvr_b called with
                 'in0', 'in1'). 'in2' / 'in3' produce 'out1' likewise.
                 Inputs are parenthesized before the cast.
*/
#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                         \
    out0 = (RTYPE)__msa_ilvr_b((v16i8)(in0), (v16i8)(in1)); \
    out1 = (RTYPE)__msa_ilvr_b((v16i8)(in2), (v16i8)(in3)); \
  }
#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
1186 
/* Description : Four-output form of ILVR_B2
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Expands ILVR_B2 for (in0..in3) -> (out0, out1) and then
                 for (in4..in7) -> (out2, out3).
*/
#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
1197 
/* Description : Eight-output form of ILVR_B2
   Arguments   : Inputs  - in0 .. in15
                 Outputs - out0 .. out7
                 Return Type - as per RTYPE
   Details     : Expands ILVR_B4 for (in0..in7) -> (out0..out3) and then
                 for (in8..in15) -> (out4..out7).
*/
#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \
                in11, in12, in13, in14, in15, out0, out1, out2, out3, out4,    \
                out5, out6, out7)                                              \
  {                                                                            \
    ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2,   \
            out3);                                                             \
    ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, out4, out5,   \
            out6, out7);                                                       \
  }
#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
1208 
/* Description : Interleave right half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of halfword elements of 'in0' and 'in1' are
                 interleaved and written to 'out0' (__msa_ilvr_h called with
                 'in0', 'in1'). 'in2' / 'in3' produce 'out1' likewise.
                 Inputs are parenthesized before the cast.
*/
#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                         \
    out0 = (RTYPE)__msa_ilvr_h((v8i16)(in0), (v8i16)(in1)); \
    out1 = (RTYPE)__msa_ilvr_h((v8i16)(in2), (v8i16)(in3)); \
  }
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)
1223 
/* Description : Four-output form of ILVR_H2
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Expands ILVR_H2 for (in0..in3) -> (out0, out1) and then
                 for (in4..in7) -> (out2, out3).
*/
#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
1231 
/* Description : Interleave right half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of word elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'. 'in2' and 'in3' are
                 processed the same way into 'out1'.
                 Each input is parenthesized before it is cast, so an
                 expression argument is converted as a whole.
*/
#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                         \
    out0 = (RTYPE)__msa_ilvr_w((v4i32)(in0), (v4i32)(in1)); \
    out1 = (RTYPE)__msa_ilvr_w((v4i32)(in2), (v4i32)(in3)); \
  }
#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)

/* Four-pair form: (in0, in1) -> out0, (in2, in3) -> out1,
   (in4, in5) -> out2, (in6, in7) -> out3. */
#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)
1247 
/* Description : Interleave right half of double word elements from vectors
   Arguments   : Inputs  - one pair of vectors per output
                           (in0, in1), (in2, in3), ...
                 Outputs - out0, out1, ...
                 Return Type - as per RTYPE
   Details     : The right double word elements of each input pair are
                 interleaved and the result, cast to RTYPE, is written to the
                 matching output. The _D2, _D3 and _D4 forms handle two, three
                 and four pairs respectively.
*/
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                         \
    out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \
    out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \
  }
#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)

#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
  {                                                                    \
    out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1));            \
    out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3));            \
    out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5));            \
  }
#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)

#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1));                \
    out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3));                \
    out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5));                \
    out3 = (RTYPE)__msa_ilvr_d((v2i64)(in6), (v2i64)(in7));                \
  }
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
1279 
/* Description : Interleave both left and right half of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements from 'in0' and 'in1' are
                 interleaved and written to 'out0'.
                 Left half of byte elements from 'in0' and 'in1' are
                 interleaved and written to 'out1'.
                 'in0' and 'in1' are each expanded twice, so they should not
                 have side effects. They are parenthesized before being cast
                 so that an expression argument is converted as a whole.
*/
#define ILVRL_B2(RTYPE, in0, in1, out0, out1)               \
  {                                                         \
    out0 = (RTYPE)__msa_ilvr_b((v16i8)(in0), (v16i8)(in1)); \
    out1 = (RTYPE)__msa_ilvl_b((v16i8)(in0), (v16i8)(in1)); \
  }
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
1296 
/* Halfword form of the right/left interleave: 'out0' receives the interleaved
   right halves of 'in0' and 'in1', 'out1' the interleaved left halves.
   'in0' and 'in1' are each expanded twice and are parenthesized before the
   cast so that an expression argument is converted as a whole. */
#define ILVRL_H2(RTYPE, in0, in1, out0, out1)               \
  {                                                         \
    out0 = (RTYPE)__msa_ilvr_h((v8i16)(in0), (v8i16)(in1)); \
    out1 = (RTYPE)__msa_ilvl_h((v8i16)(in0), (v8i16)(in1)); \
  }
#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
1304 
/* Word form of the right/left interleave: 'out0' receives the interleaved
   right halves of 'in0' and 'in1', 'out1' the interleaved left halves.
   'in0' and 'in1' are each expanded twice and are parenthesized before the
   cast so that an expression argument is converted as a whole. */
#define ILVRL_W2(RTYPE, in0, in1, out0, out1)               \
  {                                                         \
    out0 = (RTYPE)__msa_ilvr_w((v4i32)(in0), (v4i32)(in1)); \
    out1 = (RTYPE)__msa_ilvl_w((v4i32)(in0), (v4i32)(in1)); \
  }
#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
1313 
/* Description : Saturate the halfword element values to the max
                 unsigned value of (sat_val + 1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each unsigned halfword element from 'in0' is saturated to the
                 value generated with (sat_val + 1) bit range.
                 The results are written in place; 'in1' is handled the same
                 way. 'sat_val' is passed straight to the intrinsic.
*/
#define SAT_UH2(RTYPE, in0, in1, sat_val)              \
  {                                                    \
    in0 = (RTYPE)__msa_sat_u_h((v8u16)(in0), sat_val); \
    in1 = (RTYPE)__msa_sat_u_h((v8u16)(in1), sat_val); \
  }
#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)

/* Four-vector form: saturates 'in0'..'in3' in place. */
#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \
  {                                                 \
    SAT_UH2(RTYPE, in0, in1, sat_val);              \
    SAT_UH2(RTYPE, in2, in3, sat_val);              \
  }
#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
1337 
/* Description : Saturate the halfword element values to the signed
                 range representable in (sat_val + 1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each signed halfword element from 'in0' is saturated to the
                 signed range of (sat_val + 1) bits.
                 The results are written in place; 'in1' is handled the same
                 way. 'sat_val' is passed straight to the intrinsic.
*/
#define SAT_SH2(RTYPE, in0, in1, sat_val)              \
  {                                                    \
    in0 = (RTYPE)__msa_sat_s_h((v8i16)(in0), sat_val); \
    in1 = (RTYPE)__msa_sat_s_h((v8i16)(in1), sat_val); \
  }
#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)

/* Four-vector form: saturates 'in0'..'in3' in place. */
#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \
  {                                                 \
    SAT_SH2(RTYPE, in0, in1, sat_val);              \
    SAT_SH2(RTYPE, in2, in3, sat_val);              \
  }
#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
1361 
/* Description : Indexed halfword element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, idx0, idx1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'idx0' element value from 'in' vector is replicated to all
                 elements in 'out0' vector; 'idx1' likewise into 'out1'.
                 Valid index range for halfword operation is 0-7.
                 'in' is expanded once per output and is parenthesized before
                 the cast so that an expression argument is converted as a
                 whole.
*/
#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \
  {                                                  \
    out0 = (RTYPE)__msa_splati_h((v8i16)(in), idx0); \
    out1 = (RTYPE)__msa_splati_h((v8i16)(in), idx1); \
  }
#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)

/* Four-index form: 'idx0'..'idx3' select the elements of 'in' replicated
   into 'out0'..'out3'. */
#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, out0, out1, out2, out3) \
  {                                                                          \
    SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);                            \
    SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3);                            \
  }
#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
1385 
/* Description : Pack even byte elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' are copied to the left half of
                 'out0' & even byte elements of 'in1' are copied to the right
                 half of 'out0'. 'in2' and 'in3' are packed the same way into
                 'out1'.
                 Each input is parenthesized before it is cast, so an
                 expression argument is converted as a whole.
*/
#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                          \
    out0 = (RTYPE)__msa_pckev_b((v16i8)(in0), (v16i8)(in1)); \
    out1 = (RTYPE)__msa_pckev_b((v16i8)(in2), (v16i8)(in3)); \
  }
#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)

/* Four-pair form: (in0, in1) -> out0, (in2, in3) -> out1,
   (in4, in5) -> out2, (in6, in7) -> out3. */
#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                 out2, out3)                                                \
  {                                                                         \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
1412 
/* Description : Pack even halfword elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' are copied to the left half of
                 'out0' & even halfword elements of 'in1' are copied to the
                 right half of 'out0'. 'in2' and 'in3' are packed the same way
                 into 'out1'.
                 Each input is parenthesized before it is cast, so an
                 expression argument is converted as a whole.
*/
#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                          \
    out0 = (RTYPE)__msa_pckev_h((v8i16)(in0), (v8i16)(in1)); \
    out1 = (RTYPE)__msa_pckev_h((v8i16)(in2), (v8i16)(in3)); \
  }
#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)

/* Four-pair form: (in0, in1) -> out0, (in2, in3) -> out1,
   (in4, in5) -> out2, (in6, in7) -> out3. */
#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                 out2, out3)                                                \
  {                                                                         \
    PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
1436 
/* Description : Pack even double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double elements of 'in0' are copied to the left half of
                 'out0' & even double elements of 'in1' are copied to the right
                 half of 'out0'. 'in2' and 'in3' are packed the same way into
                 'out1'.
                 Each input is parenthesized before it is cast, so an
                 expression argument is converted as a whole.
*/
#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                          \
    out0 = (RTYPE)__msa_pckev_d((v2i64)(in0), (v2i64)(in1)); \
    out1 = (RTYPE)__msa_pckev_d((v2i64)(in2), (v2i64)(in3)); \
  }
#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)

/* Four-pair form: (in0, in1) -> out0, (in2, in3) -> out1,
   (in4, in5) -> out2, (in6, in7) -> out3. */
#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                 out2, out3)                                                \
  {                                                                         \
    PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
1460 
/* Description : Each byte element is logically xor'ed with immediate 128
   Arguments   : Inputs  - in0, in1 (plus in2 ... in6 for the wider forms)
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each unsigned byte element from input vector 'in0' is
                 logically xor'ed with 128 and the result is stored in-place.
                 Every further argument is processed the same way.
                 The _B2, _B3, _B4 and _B7 forms take 2, 3, 4 and 7 vectors.
                 Each input is parenthesized before it is cast, so an
                 expression argument is converted as a whole.
*/
#define XORI_B2_128(RTYPE, in0, in1)              \
  {                                               \
    in0 = (RTYPE)__msa_xori_b((v16u8)(in0), 128); \
    in1 = (RTYPE)__msa_xori_b((v16u8)(in1), 128); \
  }
#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)

#define XORI_B3_128(RTYPE, in0, in1, in2)         \
  {                                               \
    XORI_B2_128(RTYPE, in0, in1);                 \
    in2 = (RTYPE)__msa_xori_b((v16u8)(in2), 128); \
  }
#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)

#define XORI_B4_128(RTYPE, in0, in1, in2, in3) \
  {                                            \
    XORI_B2_128(RTYPE, in0, in1);              \
    XORI_B2_128(RTYPE, in2, in3);              \
  }
#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)

#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) \
  {                                                           \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                   \
    XORI_B3_128(RTYPE, in4, in5, in6);                        \
  }
#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)
1497 
/* Description : Average of signed halfword elements -> (a + b) / 2
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Each signed halfword element from 'in0' is added to each
                 signed halfword element of 'in1' with full precision resulting
                 in one extra bit in the result. The result is then divided by
                 2 and written to 'out0'. The pairs (in2, in3), (in4, in5) and
                 (in6, in7) produce 'out1', 'out2' and 'out3' the same way.
                 Each input is parenthesized before it is cast, so an
                 expression argument is converted as a whole.
*/
#define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    out0 = (RTYPE)__msa_ave_s_h((v8i16)(in0), (v8i16)(in1));               \
    out1 = (RTYPE)__msa_ave_s_h((v8i16)(in2), (v8i16)(in3));               \
    out2 = (RTYPE)__msa_ave_s_h((v8i16)(in4), (v8i16)(in5));               \
    out3 = (RTYPE)__msa_ave_s_h((v8i16)(in6), (v8i16)(in7));               \
  }
#define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__)
1516 
/* Description : Addition of signed halfword elements and signed saturation
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'in0' are added to signed
                 halfword elements of 'in1'. The result is then signed saturated
                 between halfword data type range and written to 'out0'.
                 'in2' and 'in3' produce 'out1' the same way.
                 Each input is parenthesized before it is cast, so an
                 expression argument is converted as a whole.
*/
#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1)       \
  {                                                           \
    out0 = (RTYPE)__msa_adds_s_h((v8i16)(in0), (v8i16)(in1)); \
    out1 = (RTYPE)__msa_adds_s_h((v8i16)(in2), (v8i16)(in3)); \
  }
#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)

/* Four-pair form: (in0, in1) -> out0, (in2, in3) -> out1,
   (in4, in5) -> out2, (in6, in7) -> out3. */
#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                 out2, out3)                                                \
  {                                                                         \
    ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
1539 
/* Description : Shift left all elements of vector (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in place operation
                 Return Type - as per input vector RTYPE
   Details     : Each element of vector 'in0' is left shifted by 'shift' and
                 the result is written in-place; 'in1', 'in2' and 'in3' are
                 handled the same way.
                 'shift' is expanded four times and is parenthesized, so a
                 lower-precedence expression such as 'a & b' is applied as a
                 whole shift amount.
*/
#define SLLI_4V(in0, in1, in2, in3, shift) \
  {                                        \
    in0 = (in0) << (shift);                \
    in1 = (in1) << (shift);                \
    in2 = (in2) << (shift);                \
    in3 = (in3) << (shift);                \
  }
1554 
/* Description : Arithmetic shift right all elements of vector
                 (generic for all data types)
   Arguments   : Inputs  - in0, in1, shift           (SRA_2V)
                           in0, in1, in2, in3, shift (SRA_4V)
                 Outputs - in place operation
                 Return Type - as per input vector RTYPE
   Details     : Each element of vector 'in0' is right shifted by 'shift' and
                 the result is written in-place. 'shift' is a GP variable.
                 The remaining vectors are handled the same way.
                 'shift' is expanded once per vector and is parenthesized, so a
                 lower-precedence expression such as 'a | b' is applied as a
                 whole shift amount.
*/
#define SRA_2V(in0, in1, shift) \
  {                             \
    in0 = (in0) >> (shift);     \
    in1 = (in1) >> (shift);     \
  }

#define SRA_4V(in0, in1, in2, in3, shift) \
  {                                       \
    in0 = (in0) >> (shift);               \
    in1 = (in1) >> (shift);               \
    in2 = (in2) >> (shift);               \
    in3 = (in3) >> (shift);               \
  }
1576 
/* Description : Shift right arithmetic rounded words
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetically by
                 the number of bits in the corresponding element in the vector
                 'shift'. The last discarded bit is added to shifted value for
                 rounding and the result is written in-place.
                 'shift' is a vector. 'in1' is handled the same way.
                 Each input is parenthesized before it is cast, so an
                 expression argument is converted as a whole.
*/
#define SRAR_W2(RTYPE, in0, in1, shift)                      \
  {                                                          \
    in0 = (RTYPE)__msa_srar_w((v4i32)(in0), (v4i32)(shift)); \
    in1 = (RTYPE)__msa_srar_w((v4i32)(in1), (v4i32)(shift)); \
  }

/* Four-vector form: shifts 'in0'..'in3' in place by the same 'shift'. */
#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \
  {                                               \
    SRAR_W2(RTYPE, in0, in1, shift);              \
    SRAR_W2(RTYPE, in2, in3, shift);              \
  }
#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
1599 
/* Description : Shift right arithmetic rounded (immediate)
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetically by
                 the value in 'shift'. The last discarded bit is added to the
                 shifted value for rounding and the result is written in-place.
                 'shift' is an immediate value. 'in1' is handled the same way.
                 Each vector input is parenthesized before it is cast, so an
                 expression argument is converted as a whole.
*/
#define SRARI_H2(RTYPE, in0, in1, shift)             \
  {                                                  \
    in0 = (RTYPE)__msa_srari_h((v8i16)(in0), shift); \
    in1 = (RTYPE)__msa_srari_h((v8i16)(in1), shift); \
  }
#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)

/* Four-vector form: shifts 'in0'..'in3' in place by the same 'shift'. */
#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \
  {                                                \
    SRARI_H2(RTYPE, in0, in1, shift);              \
    SRARI_H2(RTYPE, in2, in3, shift);              \
  }
#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
1624 
/* Word form of the rounded arithmetic right shift by an immediate: 'in0' and
   'in1' are shifted by 'shift' via __msa_srari_w and written back in place.
   Each vector input is parenthesized before it is cast, so an expression
   argument is converted as a whole. */
#define SRARI_W2(RTYPE, in0, in1, shift)             \
  {                                                  \
    in0 = (RTYPE)__msa_srari_w((v4i32)(in0), shift); \
    in1 = (RTYPE)__msa_srari_w((v4i32)(in1), shift); \
  }
#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)

/* Four-vector form: shifts 'in0'..'in3' in place by the same 'shift'. */
#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \
  {                                                \
    SRARI_W2(RTYPE, in0, in1, shift);              \
    SRARI_W2(RTYPE, in2, in3, shift);              \
  }
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
1638 
/* Description : Logical shift right all elements of vector (immediate)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is right shifted by 'shift' and
                 the result is written to 'out0'; 'in1', 'in2' and 'in3'
                 produce 'out1', 'out2' and 'out3'. The inputs are only
                 modified if the caller passes the same vector as input and
                 output. 'shift' is an immediate value.
                 Each vector input is parenthesized before it is cast, so an
                 expression argument is converted as a whole.
*/
#define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) \
  {                                                                       \
    out0 = (RTYPE)__msa_srli_h((v8i16)(in0), shift);                      \
    out1 = (RTYPE)__msa_srli_h((v8i16)(in1), shift);                      \
    out2 = (RTYPE)__msa_srli_h((v8i16)(in2), shift);                      \
    out3 = (RTYPE)__msa_srli_h((v8i16)(in3), shift);                      \
  }
#define SRLI_H4_SH(...) SRLI_H4(v8i16, __VA_ARGS__)
1654 
/* Description : Multiplication of pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from 'in0' is multiplied with elements from 'in1'
                 and the result is written to 'out0'; 'in2' * 'in3' is written
                 to 'out1'.
                 Operands are parenthesized so that an expression argument
                 such as 'a + b' is multiplied as a whole.
*/
#define MUL2(in0, in1, in2, in3, out0, out1) \
  {                                          \
    out0 = (in0) * (in1);                    \
    out1 = (in2) * (in3);                    \
  }
/* Four-pair form: (in0, in1) -> out0, (in2, in3) -> out1,
   (in4, in5) -> out2, (in6, in7) -> out3. */
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
  {                                                                          \
    MUL2(in0, in1, in2, in3, out0, out1);                                    \
    MUL2(in4, in5, in6, in7, out2, out3);                                    \
  }
1671 
/* Description : Addition of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element in 'in0' is added to 'in1' and result is written
                 to 'out0'; 'in2' + 'in3' is written to 'out1'.
                 Operands are parenthesized so that a lower-precedence
                 expression argument such as 'a << 1' is added as a whole.
*/
#define ADD2(in0, in1, in2, in3, out0, out1) \
  {                                          \
    out0 = (in0) + (in1);                    \
    out1 = (in2) + (in3);                    \
  }
/* Four-pair form: (in0, in1) -> out0, (in2, in3) -> out1,
   (in4, in5) -> out2, (in6, in7) -> out3. */
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
  {                                                                          \
    ADD2(in0, in1, in2, in3, out0, out1);                                    \
    ADD2(in4, in5, in6, in7, out2, out3);                                    \
  }
1688 
/* Description : Subtraction of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element in 'in1' is subtracted from 'in0' and result is
                 written to 'out0'; 'in2' - 'in3' is written to 'out1'.
                 Operands are parenthesized so that an expression argument
                 such as 'a - b' is subtracted as a whole.
*/
#define SUB2(in0, in1, in2, in3, out0, out1) \
  {                                          \
    out0 = (in0) - (in1);                    \
    out1 = (in2) - (in3);                    \
  }
/* Four-pair form: (in0, in1) -> out0, (in2, in3) -> out1,
   (in4, in5) -> out2, (in6, in7) -> out3. */
#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
  {                                                                          \
    SUB2(in0, in1, in2, in3, out0, out1);                                    \
    SUB2(in4, in5, in6, in7, out2, out3);                                    \
  }
1707 
/* Description : Sign extend halfword elements from right half of the vector
   Arguments   : Input  - in    (halfword vector)
                 Output - out   (sign extended word vector)
                 Return Type - signed word
   Details     : A per-element "less than zero" mask of the halfword elements
                 of input vector 'in' is computed and interleaved (right half)
                 with the same vector 'in' to generate 4 word elements keeping
                 sign intact.
                 'in' is expanded twice, so it should not have side effects;
                 it is parenthesized before being cast so that an expression
                 argument is converted as a whole.
*/
#define UNPCK_R_SH_SW(in, out)                      \
  {                                                 \
    v8i16 sign_m;                                   \
                                                    \
    sign_m = __msa_clti_s_h((v8i16)(in), 0);        \
    out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)(in)); \
  }
1723 
/* Description : Zero extend unsigned byte elements to halfword elements
   Arguments   : Input   - in          (unsigned byte vector)
                 Outputs - out0, out1  (unsigned  halfword vectors)
                 Return Type - signed halfword
   Details     : Zero extended right half of vector is returned in 'out0'
                 Zero extended left half of vector is returned in 'out1'
                 Implemented by byte-interleaving an all-zero vector with 'in'
                 through ILVRL_B2_SH. 'in' is forwarded in parentheses so that
                 an expression argument is cast as a whole.
*/
#define UNPCK_UB_SH(in, out0, out1)        \
  {                                        \
    v16i8 zero_m = { 0 };                  \
                                           \
    ILVRL_B2_SH(zero_m, (in), out0, out1); \
  }
1737 
/* Description : Sign extend halfword elements from input vector and return
                 the result in pair of vectors
   Arguments   : Input   - in            (halfword vector)
                 Outputs - out0, out1   (sign extended word vectors)
                 Return Type - signed word
   Details     : A per-element "less than zero" mask of the halfword elements
                 of input vector 'in' is computed and interleaved right with
                 the same vector 'in' to generate 4 signed word elements in
                 'out0'.
                 Then interleaved left with same vector 'in' to
                 generate 4 signed word elements in 'out1'
                 'in' is expanded more than once, so it should not have side
                 effects; it is parenthesized so that an expression argument
                 is cast as a whole.
*/
#define UNPCK_SH_SW(in, out0, out1)         \
  {                                         \
    v8i16 tmp_m;                            \
                                            \
    tmp_m = __msa_clti_s_h((v8i16)(in), 0); \
    ILVRL_H2_SW(tmp_m, (in), out0, out1);   \
  }
1756 
/* Description : Butterfly of 4 input vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
   Details     : Butterfly operation:
                   out0 = in0 + in3, out1 = in1 + in2,
                   out2 = in1 - in2, out3 = in0 - in3.
                 Outputs are assigned in the order out0 .. out3, so an output
                 that names an input still read by a later assignment changes
                 that later result.
                 Operands are parenthesized so that an expression argument
                 such as 'a - b' is used as a whole.
*/
#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                             \
    out0 = (in0) + (in3);                                       \
    out1 = (in1) + (in2);                                       \
                                                                \
    out2 = (in1) - (in2);                                       \
    out3 = (in0) - (in3);                                       \
  }
1770 
/* Description : Butterfly of 8 input vectors
   Arguments   : Inputs  - in0 ...  in7
                 Outputs - out0 .. out7
   Details     : Butterfly operation: for k = 0..3,
                   out[k]     = in[k] + in[7 - k]
                   out[7 - k] = in[k] - in[7 - k]
                 All sums are assigned first (out0 .. out3), then the
                 differences (out4 .. out7); an output that names an input
                 still read by a later assignment changes that later result.
                 Operands are parenthesized so that an expression argument
                 such as 'a - b' is used as a whole.
*/
#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
                    out3, out4, out5, out6, out7)                             \
  {                                                                           \
    out0 = (in0) + (in7);                                                     \
    out1 = (in1) + (in6);                                                     \
    out2 = (in2) + (in5);                                                     \
    out3 = (in3) + (in4);                                                     \
                                                                              \
    out4 = (in3) - (in4);                                                     \
    out5 = (in2) - (in5);                                                     \
    out6 = (in1) - (in6);                                                     \
    out7 = (in0) - (in7);                                                     \
  }
1789 
/* Description : Butterfly of 16 input vectors
   Arguments   : Inputs  - in0 ...  in15
                 Outputs - out0 .. out15
   Details     : Butterfly operation: for k = 0..7,
                   out[k]      = in[k] + in[15 - k]
                   out[15 - k] = in[k] - in[15 - k]
                 All sums are assigned first (out0 .. out7), then the
                 differences (out8 .. out15); an output that names an input
                 still read by a later assignment changes that later result.
                 Operands are parenthesized so that an expression argument
                 such as 'a - b' is used as a whole.
*/
#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,  \
                     in11, in12, in13, in14, in15, out0, out1, out2, out3,    \
                     out4, out5, out6, out7, out8, out9, out10, out11, out12, \
                     out13, out14, out15)                                     \
  {                                                                           \
    out0 = (in0) + (in15);                                                    \
    out1 = (in1) + (in14);                                                    \
    out2 = (in2) + (in13);                                                    \
    out3 = (in3) + (in12);                                                    \
    out4 = (in4) + (in11);                                                    \
    out5 = (in5) + (in10);                                                    \
    out6 = (in6) + (in9);                                                     \
    out7 = (in7) + (in8);                                                     \
                                                                              \
    out8 = (in7) - (in8);                                                     \
    out9 = (in6) - (in9);                                                     \
    out10 = (in5) - (in10);                                                   \
    out11 = (in4) - (in11);                                                   \
    out12 = (in3) - (in12);                                                   \
    out13 = (in2) - (in13);                                                   \
    out14 = (in1) - (in14);                                                   \
    out15 = (in0) - (in15);                                                   \
  }
1818 
/* Description : Transpose input 8x8 byte block
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - as per RTYPE
   Details     : Works in four stages, using eight v16i8 temporaries:
                 1. ILVR_B4_SB combines the pairs (in2, in0), (in3, in1),
                    (in6, in4) and (in7, in5) into tmp0_m .. tmp3_m
                    (presumably a right-half byte interleave, going by the
                    macro name - its definition is outside this excerpt).
                 2. ILVRL_B2_SB byte-interleaves the right and left halves of
                    (tmp1_m, tmp0_m) into tmp4_m / tmp5_m and of
                    (tmp3_m, tmp2_m) into tmp6_m / tmp7_m.
                 3. ILVRL_W2 word-interleaves the right and left halves of
                    (tmp6_m, tmp4_m) into out0 / out2 and of
                    (tmp7_m, tmp5_m) into out4 / out6, cast to RTYPE.
                 4. SLDI_B2_0 with a slide value of 8 derives out1 / out3 from
                    out0 / out2 and out5 / out7 from out4 / out6.
                    NOTE(review): SLDI_B2_0 is defined elsewhere in this file;
                    presumably it moves the upper 8 bytes of each source down
                    and zero-fills - confirm against its definition.
                 The statement order matters: stage 4 reads outputs written by
                 stage 3.
*/
#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0,   \
                        out1, out2, out3, out4, out5, out6, out7)              \
  {                                                                            \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                      \
    v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                      \
                                                                               \
    ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, tmp0_m, tmp1_m, tmp2_m, \
               tmp3_m);                                                        \
    ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m);                               \
    ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m);                               \
    ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2);                               \
    ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6);                               \
    SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8);                               \
    SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8);                               \
  }
#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
1840 
/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - unsigned byte
   Details     : out0..out7 double as scratch storage while the result is
                 being built (they are written by the first interleaves and
                 read back by later steps). Because some outputs are written
                 before the later inputs have been read, the output variables
                 must be distinct from all of the inputs.
*/
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \
                            in10, in11, in12, in13, in14, in15, out0, out1,   \
                            out2, out3, out4, out5, out6, out7)               \
  {                                                                           \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                     \
    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                     \
                                                                              \
    /* Pair row k with row k + 8; results parked in the outputs. */           \
    ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                              \
    ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                            \
    ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                            \
    ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                            \
                                                                              \
    /* Byte-level even/odd interleave of neighbouring row pairs. */           \
    tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7);                  \
    tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7);                  \
    tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5);                  \
    tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5);                  \
    out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3);                    \
    tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3);                  \
    out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1);                    \
    tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1);                  \
                                                                              \
    /* out0 / out4 from the even halfwords of the even-byte vectors. */       \
    ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                  \
    out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
    out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
                                                                              \
    /* out2 / out6 from the odd halfwords of the even-byte vectors. */        \
    tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m);              \
    tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5);                  \
    out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
    out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
                                                                              \
    /* out1 / out5 from the even halfwords of the odd-byte vectors. */        \
    ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);              \
    out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
    out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
                                                                              \
    /* out3 / out7 from the odd halfwords of the odd-byte vectors. */         \
    tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m);              \
    tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m);              \
    out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
    out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
  }
1888 
/* Description : Transpose a 4x4 block of halfword elements held in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
   Details     : The inputs are combined with ILVR_H2_SH and ILVRL_W2_SH to
                 produce out0 and out2; out1 and out3 are then formed from
                 out0 and out2 with __msa_ilvl_d. All inputs are read before
                 any output is assigned.
*/
#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                                    \
    v8i16 ra_m, rb_m;                                                  \
                                                                       \
    ILVR_H2_SH(in1, in0, in3, in2, ra_m, rb_m);                        \
    ILVRL_W2_SH(rb_m, ra_m, out0, out2);                               \
                                                                       \
    out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2);              \
    out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0);              \
  }
1903 
/* Description : Transpose a 4x8 block of halfword elements held in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - signed halfword
   Details     : out0..out3 receive the transposed data; out4..out7 are set
                 to all-zero vectors. Every input is read by the first
                 interleave, before any output is assigned.
*/
#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                           out2, out3, out4, out5, out6, out7)                 \
  {                                                                            \
    v8i16 hw01_m, hw23_m, hw45_m, hw67_m;                                      \
    v8i16 wr03_m, wl03_m, wr47_m, wl47_m;                                      \
    v8i16 zvec_m = { 0 };                                                      \
                                                                               \
    ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, hw01_m, hw23_m, hw45_m, \
               hw67_m);                                                        \
    ILVRL_W2_SH(hw67_m, hw45_m, wr47_m, wl47_m);                               \
    ILVRL_W2_SH(hw23_m, hw01_m, wr03_m, wl03_m);                               \
                                                                               \
    out0 = (v8i16)__msa_ilvr_d((v2i64)wr47_m, (v2i64)wr03_m);                  \
    out1 = (v8i16)__msa_ilvl_d((v2i64)wr47_m, (v2i64)wr03_m);                  \
    out2 = (v8i16)__msa_ilvr_d((v2i64)wl47_m, (v2i64)wl03_m);                  \
    out3 = (v8i16)__msa_ilvl_d((v2i64)wl47_m, (v2i64)wl03_m);                  \
                                                                               \
    /* The remaining four outputs carry no data. */                            \
    out7 = zvec_m;                                                             \
    out6 = zvec_m;                                                             \
    out5 = zvec_m;                                                             \
    out4 = zvec_m;                                                             \
  }
1931 
/* Description : Transpose 8x4 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
   Details     : Both halfword interleaves read the inputs into local
                 temporaries before any output is assigned.
*/
#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                                    \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                              \
                                                                       \
    ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m);                    \
    ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m);                    \
    ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2);            \
    ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3);            \
  }
1946 
/* Description : Transpose an 8x8 block of halfword elements held in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - as per RTYPE
   Details     : in0..in3 and in4..in7 are each reduced to four temporaries
                 by two levels of halfword interleaving. The even outputs are
                 then produced by PCKEV_D4 and the odd outputs by
                 __msa_pckod_d from the same temporary pairs. No output is
                 assigned until every input has been read.
*/
#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \
                       out1, out2, out3, out4, out5, out6, out7)            \
  {                                                                         \
    v8i16 ev_m, od_m;                                                       \
    v8i16 top0_m, top1_m, top2_m, top3_m;                                   \
    v8i16 bot0_m, bot1_m, bot2_m, bot3_m;                                   \
                                                                            \
    /* Inputs in0..in3. */                                                  \
    ILVR_H2_SH(in2, in0, in3, in1, ev_m, od_m);                             \
    ILVRL_H2_SH(od_m, ev_m, top0_m, top1_m);                                \
    ILVL_H2_SH(in2, in0, in3, in1, ev_m, od_m);                             \
    ILVRL_H2_SH(od_m, ev_m, top2_m, top3_m);                                \
                                                                            \
    /* Inputs in4..in7. */                                                  \
    ILVR_H2_SH(in6, in4, in7, in5, ev_m, od_m);                             \
    ILVRL_H2_SH(od_m, ev_m, bot0_m, bot1_m);                                \
    ILVL_H2_SH(in6, in4, in7, in5, ev_m, od_m);                             \
    ILVRL_H2_SH(od_m, ev_m, bot2_m, bot3_m);                                \
                                                                            \
    PCKEV_D4(RTYPE, bot0_m, top0_m, bot1_m, top1_m, bot2_m, top2_m, bot3_m, \
             top3_m, out0, out2, out4, out6);                               \
    out1 = (RTYPE)__msa_pckod_d((v2i64)bot0_m, (v2i64)top0_m);              \
    out3 = (RTYPE)__msa_pckod_d((v2i64)bot1_m, (v2i64)top1_m);              \
    out5 = (RTYPE)__msa_pckod_d((v2i64)bot2_m, (v2i64)top2_m);              \
    out7 = (RTYPE)__msa_pckod_d((v2i64)bot3_m, (v2i64)top3_m);              \
  }
#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
1975 
/* Description : Transpose a 4x4 block of word elements held in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed word
   Details     : in0/in1 and in2/in3 are word-interleaved into temporaries,
                 which are then combined at doubleword granularity. The
                 outputs are written only after all inputs have been read.
*/
#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                                    \
    v4i32 ra_m, la_m, rb_m, lb_m;                                      \
                                                                       \
    ILVRL_W2_SW(in3, in2, rb_m, lb_m);                                 \
    ILVRL_W2_SW(in1, in0, ra_m, la_m);                                 \
                                                                       \
    out0 = (v4i32)__msa_ilvr_d((v2i64)rb_m, (v2i64)ra_m);              \
    out2 = (v4i32)__msa_ilvr_d((v2i64)lb_m, (v2i64)la_m);              \
    out1 = (v4i32)__msa_ilvl_d((v2i64)rb_m, (v2i64)ra_m);              \
    out3 = (v4i32)__msa_ilvl_d((v2i64)lb_m, (v2i64)la_m);              \
  }
1993 
/* Description : Add block 4x4
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Least significant 4 bytes from each input vector are added to
                 the destination bytes, clipped between 0-255 and stored.
                 'pdst' and 'stride' are expanded more than once (for the
                 loads and again for the stores), so they should be free of
                 side effects.
*/
#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride)        \
  {                                                              \
    uint32_t src0_m, src1_m, src2_m, src3_m;                     \
    v8i16 inp0_m, inp1_m, res0_m, res1_m;                        \
    v16i8 dst0_m = { 0 };                                        \
    v16i8 dst1_m = { 0 };                                        \
    v16i8 zero_m = { 0 };                                        \
                                                                 \
    /* Gather the four residual rows into two vectors. */        \
    ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m);              \
    /* Load four 32-bit destination rows and widen the bytes. */ \
    LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m);           \
    INSERT_W2_SB(src0_m, src1_m, dst0_m);                        \
    INSERT_W2_SB(src2_m, src3_m, dst1_m);                        \
    ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m);  \
    /* Add, clip to 0..255, narrow to bytes and store back. */   \
    ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m);        \
    CLIP_SH2_0_255(res0_m, res1_m);                              \
    PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \
    ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride);          \
  }
2017 
/* Description : Pack even elements of input vectors & xor with 128
   Arguments   : Inputs - in0, in1
                 Output - out_m
                 Return Type - unsigned byte
   Details     : Signed byte even elements from 'in0' and 'in1' are packed
                 together in one vector and the resulting vector is xor'ed with
                 128 to shift the range from signed to unsigned byte.
                 The macro is a statement expression whose value is the
                 resulting v16u8 vector. Arguments are parenthesized before
                 being cast, so expression arguments are handled correctly.
*/
#define PCKEV_XORI128_UB(in0, in1)                            \
  ({                                                          \
    v16u8 out_m;                                              \
                                                              \
    out_m = (v16u8)__msa_pckev_b((v16i8)(in1), (v16i8)(in0)); \
    out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128);           \
    out_m;                                                    \
  })
2034 
/* Description : Convert inputs to unsigned bytes, average them with the
                 supplied destination rows and store an 8x4 unsigned byte
                 block
   Arguments   : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                          pdst, stride
   Details     : in0/in1 and in2/in3 are each packed and range-shifted with
                 PCKEV_XORI128_UB. dst0/dst1 and dst2/dst3 are paired with
                 ILVR_D2_UB. The two packed vectors are averaged against the
                 two paired vectors with AVER_UB2_UB and the result is
                 written through ST8x4_UB at 'pdst' using 'stride'.
*/
#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, dst2, dst3, \
                                pdst, stride)                               \
  {                                                                         \
    v16u8 pk01_m, pk23_m, rf01_m, rf23_m;                                   \
                                                                            \
    ILVR_D2_UB(dst1, dst0, dst3, dst2, rf01_m, rf23_m);                     \
    pk01_m = PCKEV_XORI128_UB(in0, in1);                                    \
    pk23_m = PCKEV_XORI128_UB(in2, in3);                                    \
                                                                            \
    AVER_UB2_UB(pk01_m, rf01_m, pk23_m, rf23_m, pk01_m, pk23_m);            \
    ST8x4_UB(pk01_m, pk23_m, pdst, stride);                                 \
  }
2051 
/* Description : Pack even byte elements and store byte vector in destination
                 memory
   Arguments   : Inputs - in0, in1, pdst
   Details     : The even byte elements of 'in0' and 'in1' are packed into a
                 single v16i8 vector with __msa_pckev_b and stored at 'pdst'
                 with ST_SB. Arguments are parenthesized before being cast,
                 so expression arguments are handled correctly.
*/
#define PCKEV_ST_SB(in0, in1, pdst)                    \
  {                                                    \
    v16i8 tmp_m;                                       \
                                                       \
    tmp_m = __msa_pckev_b((v16i8)(in1), (v16i8)(in0)); \
    ST_SB(tmp_m, (pdst));                              \
  }
2063 
/* Description : Horizontal 2 tap filter kernel code
   Arguments   : Inputs - in0, in1, mask, coeff, shift
                 Return Type - unsigned halfword (v8u16)
   Details     : Bytes are selected from 'in0' and 'in1' according to 'mask'
                 with __msa_vshf_b, combined with 'coeff' by the unsigned dot
                 product __msa_dotp_u_h, and the halfword sums are shifted
                 right by 'shift' with __msa_srari_h. The macro is a
                 statement expression whose value is the filtered vector.
                 Vector arguments are parenthesized before being cast, so
                 expression arguments are handled correctly.
*/
#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift)              \
  ({                                                                  \
    v16i8 tmp0_m;                                                     \
    v8u16 tmp1_m;                                                     \
                                                                      \
    tmp0_m = __msa_vshf_b((v16i8)(mask), (v16i8)(in1), (v16i8)(in0)); \
    tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)(coeff));           \
    tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift);              \
                                                                      \
    tmp1_m;                                                           \
  })
2078 #endif /* VPX_DSP_MIPS_MACROS_MSA_H_ */
2079