// Copyright 2016 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MSA common macros
//
// Author(s): Prashant Patil (prashant.patil@imgtec.com)

#ifndef WEBP_DSP_MSA_MACRO_H_
#define WEBP_DSP_MSA_MACRO_H_

#include <stdint.h>
#include <msa.h>

// WEBP_INLINE is normally provided by the WebP dsp headers; define a
// fallback here so that the helper functions below are self-contained.
#ifndef WEBP_INLINE
#define WEBP_INLINE inline
#endif

#if defined(__clang__)
#define CLANG_BUILD
#endif

#ifdef CLANG_BUILD
#define ADDVI_H(a, b)  __msa_addvi_h((v8i16)a, b)
#define ADDVI_W(a, b)  __msa_addvi_w((v4i32)a, b)
#define SRAI_B(a, b)  __msa_srai_b((v16i8)a, b)
#define SRAI_H(a, b)  __msa_srai_h((v8i16)a, b)
#define SRAI_W(a, b)  __msa_srai_w((v4i32)a, b)
#define SRLI_H(a, b)  __msa_srli_h((v8i16)a, b)
#define SLLI_B(a, b)  __msa_slli_b((v4i32)a, b)
#define ANDI_B(a, b)  __msa_andi_b((v16u8)a, b)
#define ORI_B(a, b)   __msa_ori_b((v16u8)a, b)
#else
#define ADDVI_H(a, b)  (a + b)
#define ADDVI_W(a, b)  (a + b)
#define SRAI_B(a, b)  (a >> b)
#define SRAI_H(a, b)  (a >> b)
#define SRAI_W(a, b)  (a >> b)
#define SRLI_H(a, b)  (a >> b)
#define SLLI_B(a, b)  (a << b)
#define ANDI_B(a, b)  (a & b)
#define ORI_B(a, b)   (a | b)
#endif

#define LD_B(RTYPE, psrc) *((RTYPE*)(psrc))
#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
#define LD_SB(...) LD_B(v16i8, __VA_ARGS__)

#define LD_H(RTYPE, psrc) *((RTYPE*)(psrc))
#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)

#define LD_W(RTYPE, psrc) *((RTYPE*)(psrc))
#define LD_UW(...) LD_W(v4u32, __VA_ARGS__)
#define LD_SW(...) LD_W(v4i32, __VA_ARGS__)

#define ST_B(RTYPE, in, pdst) *((RTYPE*)(pdst)) = in
#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)

#define ST_H(RTYPE, in, pdst) *((RTYPE*)(pdst)) = in
#define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)

#define ST_W(RTYPE, in, pdst) *((RTYPE*)(pdst)) = in
#define ST_UW(...) ST_W(v4u32, __VA_ARGS__)
#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
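
/* Example (illustrative only; 'src' and 'dst' are hypothetical uint8_t
 * pointers): the plain LD_xx/ST_xx macros above are direct vector-sized
 * memory accesses:
 *   const v16u8 pix = LD_UB(src);  // load 16 unsigned bytes
 *   ST_UB(pix, dst);               // store them to the destination
 */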

#define MSA_LOAD_FUNC(TYPE, INSTR, FUNC_NAME)  \
  static inline TYPE FUNC_NAME(const void* const psrc) {  \
    const uint8_t* const psrc_m = (const uint8_t*)psrc;  \
    TYPE val_m;  \
    asm volatile (  \
      "" #INSTR " %[val_m], %[psrc_m] \n\t"  \
      : [val_m] "=r" (val_m)  \
      : [psrc_m] "m" (*psrc_m));  \
    return val_m;  \
  }

#define MSA_LOAD(psrc, FUNC_NAME) FUNC_NAME(psrc)

#define MSA_STORE_FUNC(TYPE, INSTR, FUNC_NAME)  \
  static inline void FUNC_NAME(TYPE val, void* const pdst) {  \
    uint8_t* const pdst_m = (uint8_t*)pdst;  \
    TYPE val_m = val;  \
    asm volatile (  \
      " " #INSTR " %[val_m], %[pdst_m] \n\t"  \
      : [pdst_m] "=m" (*pdst_m)  \
      : [val_m] "r" (val_m));  \
  }

#define MSA_STORE(val, pdst, FUNC_NAME) FUNC_NAME(val, pdst)

#if (__mips_isa_rev >= 6)
MSA_LOAD_FUNC(uint16_t, lh, msa_lh);
#define LH(psrc) MSA_LOAD(psrc, msa_lh)
MSA_LOAD_FUNC(uint32_t, lw, msa_lw);
#define LW(psrc) MSA_LOAD(psrc, msa_lw)
#if (__mips == 64)
MSA_LOAD_FUNC(uint64_t, ld, msa_ld);
#define LD(psrc) MSA_LOAD(psrc, msa_ld)
#else  // !(__mips == 64)
#define LD(psrc) ((((uint64_t)MSA_LOAD(psrc + 4, msa_lw)) << 32) |  \
                  MSA_LOAD(psrc, msa_lw))
#endif  // (__mips == 64)

MSA_STORE_FUNC(uint16_t, sh, msa_sh);
#define SH(val, pdst) MSA_STORE(val, pdst, msa_sh)
MSA_STORE_FUNC(uint32_t, sw, msa_sw);
#define SW(val, pdst) MSA_STORE(val, pdst, msa_sw)
MSA_STORE_FUNC(uint64_t, sd, msa_sd);
#define SD(val, pdst) MSA_STORE(val, pdst, msa_sd)
#else  // !(__mips_isa_rev >= 6)
MSA_LOAD_FUNC(uint16_t, ulh, msa_ulh);
#define LH(psrc) MSA_LOAD(psrc, msa_ulh)
MSA_LOAD_FUNC(uint32_t, ulw, msa_ulw);
#define LW(psrc) MSA_LOAD(psrc, msa_ulw)
#if (__mips == 64)
MSA_LOAD_FUNC(uint64_t, uld, msa_uld);
#define LD(psrc) MSA_LOAD(psrc, msa_uld)
#else  // !(__mips == 64)
#define LD(psrc) ((((uint64_t)MSA_LOAD(psrc + 4, msa_ulw)) << 32) |  \
                  MSA_LOAD(psrc, msa_ulw))
#endif  // (__mips == 64)

MSA_STORE_FUNC(uint16_t, ush, msa_ush);
#define SH(val, pdst) MSA_STORE(val, pdst, msa_ush)
MSA_STORE_FUNC(uint32_t, usw, msa_usw);
#define SW(val, pdst) MSA_STORE(val, pdst, msa_usw)
#define SD(val, pdst) do {  \
  uint8_t* const pdst_sd_m = (uint8_t*)(pdst);  \
  const uint32_t val0_m = (uint32_t)(val & 0x00000000FFFFFFFF);  \
  const uint32_t val1_m = (uint32_t)((val >> 32) & 0x00000000FFFFFFFF);  \
  SW(val0_m, pdst_sd_m);  \
  SW(val1_m, pdst_sd_m + 4);  \
} while (0)
#endif  // (__mips_isa_rev >= 6)
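
/* Example (illustrative only; 'src', 'dst' and 'x' are hypothetical): the
 * LH/LW/LD and SH/SW/SD helpers move 16/32/64-bit scalars through GP
 * registers and tolerate unaligned addresses:
 *   const uint32_t argb = LW(src + 4 * x);  // read one 32-bit pixel
 *   SW(argb, dst + 4 * x);                  // write it back out
 */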

/* Description : Load 4 words with stride
 * Arguments   : Inputs  - psrc, stride
 *               Outputs - out0, out1, out2, out3
 * Details     : Load word in 'out0' from (psrc)
 *               Load word in 'out1' from (psrc + stride)
 *               Load word in 'out2' from (psrc + 2 * stride)
 *               Load word in 'out3' from (psrc + 3 * stride)
 */
#define LW4(psrc, stride, out0, out1, out2, out3) do {  \
  const uint8_t* ptmp = (const uint8_t*)psrc;  \
  out0 = LW(ptmp);  \
  ptmp += stride;  \
  out1 = LW(ptmp);  \
  ptmp += stride;  \
  out2 = LW(ptmp);  \
  ptmp += stride;  \
  out3 = LW(ptmp);  \
} while (0)

/* Description : Store words with stride
 * Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
 * Details     : Store word from 'in0' to (pdst)
 *               Store word from 'in1' to (pdst + stride)
 *               Store word from 'in2' to (pdst + 2 * stride)
 *               Store word from 'in3' to (pdst + 3 * stride)
 */
#define SW4(in0, in1, in2, in3, pdst, stride) do {  \
  uint8_t* ptmp = (uint8_t*)pdst;  \
  SW(in0, ptmp);  \
  ptmp += stride;  \
  SW(in1, ptmp);  \
  ptmp += stride;  \
  SW(in2, ptmp);  \
  ptmp += stride;  \
  SW(in3, ptmp);  \
} while (0)

#define SW3(in0, in1, in2, pdst, stride) do {  \
  uint8_t* ptmp = (uint8_t*)pdst;  \
  SW(in0, ptmp);  \
  ptmp += stride;  \
  SW(in1, ptmp);  \
  ptmp += stride;  \
  SW(in2, ptmp);  \
} while (0)

#define SW2(in0, in1, pdst, stride) do {  \
  uint8_t* ptmp = (uint8_t*)pdst;  \
  SW(in0, ptmp);  \
  ptmp += stride;  \
  SW(in1, ptmp);  \
} while (0)

/* Description : Store 4 double words with stride
 * Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
 * Details     : Store double word from 'in0' to (pdst)
 *               Store double word from 'in1' to (pdst + stride)
 *               Store double word from 'in2' to (pdst + 2 * stride)
 *               Store double word from 'in3' to (pdst + 3 * stride)
 */
#define SD4(in0, in1, in2, in3, pdst, stride) do {  \
  uint8_t* ptmp = (uint8_t*)pdst;  \
  SD(in0, ptmp);  \
  ptmp += stride;  \
  SD(in1, ptmp);  \
  ptmp += stride;  \
  SD(in2, ptmp);  \
  ptmp += stride;  \
  SD(in3, ptmp);  \
} while (0)

/* Description : Load vectors with 16 byte elements with stride
 * Arguments   : Inputs  - psrc, stride
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Load 16 byte elements in 'out0' from (psrc)
 *               Load 16 byte elements in 'out1' from (psrc + stride)
 */
#define LD_B2(RTYPE, psrc, stride, out0, out1) do {  \
  out0 = LD_B(RTYPE, psrc);  \
  out1 = LD_B(RTYPE, psrc + stride);  \
} while (0)
#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)

#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) do {  \
  LD_B2(RTYPE, psrc, stride, out0, out1);  \
  out2 = LD_B(RTYPE, psrc + 2 * stride);  \
} while (0)
#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)
#define LD_SB3(...) LD_B3(v16i8, __VA_ARGS__)

#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) do {  \
  LD_B2(RTYPE, psrc, stride, out0, out1);  \
  LD_B2(RTYPE, psrc + 2 * stride, stride, out2, out3);  \
} while (0)
#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)

#define LD_B8(RTYPE, psrc, stride,  \
              out0, out1, out2, out3, out4, out5, out6, out7) do {  \
  LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3);  \
  LD_B4(RTYPE, psrc + 4 * stride, stride, out4, out5, out6, out7);  \
} while (0)
#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)

/* Description : Load vectors with 8 halfword elements with stride
 * Arguments   : Inputs  - psrc, stride
 *               Outputs - out0, out1
 * Details     : Load 8 halfword elements in 'out0' from (psrc)
 *               Load 8 halfword elements in 'out1' from (psrc + stride)
 */
#define LD_H2(RTYPE, psrc, stride, out0, out1) do {  \
  out0 = LD_H(RTYPE, psrc);  \
  out1 = LD_H(RTYPE, psrc + stride);  \
} while (0)
#define LD_UH2(...) LD_H2(v8u16, __VA_ARGS__)
#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)
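
/* Example (illustrative only; 'src' and 'stride' are hypothetical): load a
 * 16x4 block of unsigned bytes, one row per vector:
 *   v16u8 r0, r1, r2, r3;
 *   LD_UB4(src, stride, r0, r1, r2, r3);
 */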

/* Description : Load vectors with 4 word elements with stride
 * Arguments   : Inputs  - psrc, stride
 *               Outputs - out0, out1, out2, out3
 * Details     : Load 4 word elements in 'out0' from (psrc + 0 * stride)
 *               Load 4 word elements in 'out1' from (psrc + 1 * stride)
 *               Load 4 word elements in 'out2' from (psrc + 2 * stride)
 *               Load 4 word elements in 'out3' from (psrc + 3 * stride)
 */
#define LD_W2(RTYPE, psrc, stride, out0, out1) do {  \
  out0 = LD_W(RTYPE, psrc);  \
  out1 = LD_W(RTYPE, psrc + stride);  \
} while (0)
#define LD_UW2(...) LD_W2(v4u32, __VA_ARGS__)
#define LD_SW2(...) LD_W2(v4i32, __VA_ARGS__)

#define LD_W3(RTYPE, psrc, stride, out0, out1, out2) do {  \
  LD_W2(RTYPE, psrc, stride, out0, out1);  \
  out2 = LD_W(RTYPE, psrc + 2 * stride);  \
} while (0)
#define LD_UW3(...) LD_W3(v4u32, __VA_ARGS__)
#define LD_SW3(...) LD_W3(v4i32, __VA_ARGS__)

#define LD_W4(RTYPE, psrc, stride, out0, out1, out2, out3) do {  \
  LD_W2(RTYPE, psrc, stride, out0, out1);  \
  LD_W2(RTYPE, psrc + 2 * stride, stride, out2, out3);  \
} while (0)
#define LD_UW4(...) LD_W4(v4u32, __VA_ARGS__)
#define LD_SW4(...) LD_W4(v4i32, __VA_ARGS__)

/* Description : Store vectors of 16 byte elements with stride
 * Arguments   : Inputs - in0, in1, pdst, stride
 * Details     : Store 16 byte elements from 'in0' to (pdst)
 *               Store 16 byte elements from 'in1' to (pdst + stride)
 */
#define ST_B2(RTYPE, in0, in1, pdst, stride) do {  \
  ST_B(RTYPE, in0, pdst);  \
  ST_B(RTYPE, in1, pdst + stride);  \
} while (0)
#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
#define ST_SB2(...) ST_B2(v16i8, __VA_ARGS__)

#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) do {  \
  ST_B2(RTYPE, in0, in1, pdst, stride);  \
  ST_B2(RTYPE, in2, in3, pdst + 2 * stride, stride);  \
} while (0)
#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
#define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__)

#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
              pdst, stride) do {  \
  ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride);  \
  ST_B4(RTYPE, in4, in5, in6, in7, pdst + 4 * stride, stride);  \
} while (0)
#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)

/* Description : Store vectors of 4 word elements with stride
 * Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
 * Details     : Store 4 word elements from 'in0' to (pdst + 0 * stride)
 *               Store 4 word elements from 'in1' to (pdst + 1 * stride)
 *               Store 4 word elements from 'in2' to (pdst + 2 * stride)
 *               Store 4 word elements from 'in3' to (pdst + 3 * stride)
 */
#define ST_W2(RTYPE, in0, in1, pdst, stride) do {  \
  ST_W(RTYPE, in0, pdst);  \
  ST_W(RTYPE, in1, pdst + stride);  \
} while (0)
#define ST_UW2(...) ST_W2(v4u32, __VA_ARGS__)
#define ST_SW2(...) ST_W2(v4i32, __VA_ARGS__)

#define ST_W3(RTYPE, in0, in1, in2, pdst, stride) do {  \
  ST_W2(RTYPE, in0, in1, pdst, stride);  \
  ST_W(RTYPE, in2, pdst + 2 * stride);  \
} while (0)
#define ST_UW3(...) ST_W3(v4u32, __VA_ARGS__)
#define ST_SW3(...) ST_W3(v4i32, __VA_ARGS__)

#define ST_W4(RTYPE, in0, in1, in2, in3, pdst, stride) do {  \
  ST_W2(RTYPE, in0, in1, pdst, stride);  \
  ST_W2(RTYPE, in2, in3, pdst + 2 * stride, stride);  \
} while (0)
#define ST_UW4(...) ST_W4(v4u32, __VA_ARGS__)
#define ST_SW4(...) ST_W4(v4i32, __VA_ARGS__)

/* Description : Store vectors of 8 halfword elements with stride
 * Arguments   : Inputs - in0, in1, pdst, stride
 * Details     : Store 8 halfword elements from 'in0' to (pdst)
 *               Store 8 halfword elements from 'in1' to (pdst + stride)
 */
#define ST_H2(RTYPE, in0, in1, pdst, stride) do {  \
  ST_H(RTYPE, in0, pdst);  \
  ST_H(RTYPE, in1, pdst + stride);  \
} while (0)
#define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__)
#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)

/* Description : Store 2x4 byte block to destination memory from input vector
 * Arguments   : Inputs - in, stidx, pdst, stride
 * Details     : Index 'stidx' halfword element from 'in' vector is copied to
 *               the GP register and stored to (pdst)
 *               Index 'stidx+1' halfword element from 'in' vector is copied to
 *               the GP register and stored to (pdst + stride)
 *               Index 'stidx+2' halfword element from 'in' vector is copied to
 *               the GP register and stored to (pdst + 2 * stride)
 *               Index 'stidx+3' halfword element from 'in' vector is copied to
 *               the GP register and stored to (pdst + 3 * stride)
 */
#define ST2x4_UB(in, stidx, pdst, stride) do {  \
  uint8_t* pblk_2x4_m = (uint8_t*)pdst;  \
  const uint16_t out0_m = __msa_copy_s_h((v8i16)in, stidx);  \
  const uint16_t out1_m = __msa_copy_s_h((v8i16)in, stidx + 1);  \
  const uint16_t out2_m = __msa_copy_s_h((v8i16)in, stidx + 2);  \
  const uint16_t out3_m = __msa_copy_s_h((v8i16)in, stidx + 3);  \
  SH(out0_m, pblk_2x4_m);  \
  pblk_2x4_m += stride;  \
  SH(out1_m, pblk_2x4_m);  \
  pblk_2x4_m += stride;  \
  SH(out2_m, pblk_2x4_m);  \
  pblk_2x4_m += stride;  \
  SH(out3_m, pblk_2x4_m);  \
} while (0)

/* Description : Store 4x4 byte block to destination memory from input vector
 * Arguments   : Inputs - in0, in1, pdst, stride
 * Details     : 'Idx0' word element from input vector 'in0' is copied to the
 *               GP register and stored to (pdst)
 *               'Idx1' word element from input vector 'in0' is copied to the
 *               GP register and stored to (pdst + stride)
 *               'Idx2' word element from input vector 'in1' is copied to the
 *               GP register and stored to (pdst + 2 * stride)
 *               'Idx3' word element from input vector 'in1' is copied to the
 *               GP register and stored to (pdst + 3 * stride)
 */
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) do {  \
  uint8_t* const pblk_4x4_m = (uint8_t*)pdst;  \
  const uint32_t out0_m = __msa_copy_s_w((v4i32)in0, idx0);  \
  const uint32_t out1_m = __msa_copy_s_w((v4i32)in0, idx1);  \
  const uint32_t out2_m = __msa_copy_s_w((v4i32)in1, idx2);  \
  const uint32_t out3_m = __msa_copy_s_w((v4i32)in1, idx3);  \
  SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride);  \
} while (0)

#define ST4x8_UB(in0, in1, pdst, stride) do {  \
  uint8_t* const pblk_4x8 = (uint8_t*)pdst;  \
  ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride);  \
  ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride);  \
} while (0)
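
/* Example (illustrative only; 'res' holds four rows of four packed bytes in
 * its word elements, 'dst' and 'stride' are hypothetical): store a 4x4 block:
 *   ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
 */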

/* Description : Immediate number of elements to slide
 * Arguments   : Inputs  - in0, in1, slide_val
 *               Outputs - out
 *               Return Type - as per RTYPE
 * Details     : Byte elements from 'in1' vector are slid into 'in0' by
 *               value specified in the 'slide_val'
 */
#define SLDI_B(RTYPE, in0, in1, slide_val)  \
  (RTYPE)__msa_sldi_b((v16i8)in0, (v16i8)in1, slide_val)

#define SLDI_UB(...) SLDI_B(v16u8, __VA_ARGS__)
#define SLDI_SB(...) SLDI_B(v16i8, __VA_ARGS__)
#define SLDI_SH(...) SLDI_B(v8i16, __VA_ARGS__)

/* Description : Shuffle byte vector elements as per mask vector
 * Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Byte elements from 'in0' & 'in1' are copied selectively to
 *               'out0' as per control vector 'mask0'
 */
#define VSHF_B(RTYPE, in0, in1, mask)  \
  (RTYPE)__msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0)

#define VSHF_UB(...) VSHF_B(v16u8, __VA_ARGS__)
#define VSHF_SB(...) VSHF_B(v16i8, __VA_ARGS__)
#define VSHF_UH(...) VSHF_B(v8u16, __VA_ARGS__)
#define VSHF_SH(...) VSHF_B(v8i16, __VA_ARGS__)

#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) do {  \
  out0 = VSHF_B(RTYPE, in0, in1, mask0);  \
  out1 = VSHF_B(RTYPE, in2, in3, mask1);  \
} while (0)
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
#define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)

/* Description : Shuffle halfword vector elements as per mask vector
 * Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Halfword elements from 'in0' & 'in1' are copied selectively to
 *               'out0' as per control vector 'mask0'
 */
#define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) do {  \
  out0 = (RTYPE)__msa_vshf_h((v8i16)mask0, (v8i16)in1, (v8i16)in0);  \
  out1 = (RTYPE)__msa_vshf_h((v8i16)mask1, (v8i16)in3, (v8i16)in2);  \
} while (0)
#define VSHF_H2_UH(...) VSHF_H2(v8u16, __VA_ARGS__)
#define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__)

/* Description : Dot product of byte vector elements
 * Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Signed byte elements from 'mult0' are multiplied with
 *               signed byte elements from 'cnst0' producing a result
 *               twice the size of input i.e. signed halfword.
 *               The multiplication results of adjacent odd-even elements
 *               are added together and written to the 'out0' vector
 */
#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) do {  \
  out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0);  \
  out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1);  \
} while (0)
#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)

/* Description : Dot product of halfword vector elements
 * Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Signed halfword elements from 'mult0' are multiplied with
 *               signed halfword elements from 'cnst0' producing a result
 *               twice the size of input i.e. signed word.
 *               The multiplication results of adjacent odd-even elements
 *               are added together and written to the 'out0' vector
 */
#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) do {  \
  out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0);  \
  out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1);  \
} while (0)
#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)
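
/* Example (illustrative only; 'src0', 'src1', 'coeff0', 'coeff1' are
 * hypothetical v16i8 vectors): multiply bytes and sum adjacent even/odd
 * products into signed halfwords:
 *   v8i16 sum0, sum1;
 *   DOTP_SB2_SH(src0, src1, coeff0, coeff1, sum0, sum1);
 */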

/* Description : Dot product of unsigned word vector elements
 * Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Unsigned word elements from 'mult0' are multiplied with
 *               unsigned word elements from 'cnst0' producing a result
 *               twice the size of input i.e. unsigned double word.
 *               The multiplication results of adjacent odd-even elements
 *               are added together and written to the 'out0' vector
 */
#define DOTP_UW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) do {  \
  out0 = (RTYPE)__msa_dotp_u_d((v4u32)mult0, (v4u32)cnst0);  \
  out1 = (RTYPE)__msa_dotp_u_d((v4u32)mult1, (v4u32)cnst1);  \
} while (0)
#define DOTP_UW2_UD(...) DOTP_UW2(v2u64, __VA_ARGS__)

/* Description : Dot product & addition of halfword vector elements
 * Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Signed halfword elements from 'mult0' are multiplied with
 *               signed halfword elements from 'cnst0' producing a result
 *               twice the size of input i.e. signed word.
 *               The multiplication results of adjacent odd-even elements
 *               are added to the 'out0' vector
 */
#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) do {  \
  out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0);  \
  out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1);  \
} while (0)
#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)

/* Description : Clips all signed halfword elements of input vector
 *               between 0 & 255
 * Arguments   : Input/output - val
 *               Return Type - signed halfword
 */
#define CLIP_SH_0_255(val) do {  \
  const v8i16 max_m = __msa_ldi_h(255);  \
  val = __msa_maxi_s_h((v8i16)val, 0);  \
  val = __msa_min_s_h(max_m, (v8i16)val);  \
} while (0)

#define CLIP_SH2_0_255(in0, in1) do {  \
  CLIP_SH_0_255(in0);  \
  CLIP_SH_0_255(in1);  \
} while (0)

#define CLIP_SH4_0_255(in0, in1, in2, in3) do {  \
  CLIP_SH2_0_255(in0, in1);  \
  CLIP_SH2_0_255(in2, in3);  \
} while (0)

/* Description : Clips all unsigned halfword elements of input vector
 *               between 0 & 255
 * Arguments   : Input  - in
 *               Output - out_m
 *               Return Type - unsigned halfword
 */
#define CLIP_UH_0_255(in) do {  \
  const v8u16 max_m = (v8u16)__msa_ldi_h(255);  \
  in = __msa_maxi_u_h((v8u16)in, 0);  \
  in = __msa_min_u_h((v8u16)max_m, (v8u16)in);  \
} while (0)

#define CLIP_UH2_0_255(in0, in1) do {  \
  CLIP_UH_0_255(in0);  \
  CLIP_UH_0_255(in1);  \
} while (0)

/* Description : Clips all signed word elements of input vector
 *               between 0 & 255
 * Arguments   : Input/output - val
 *               Return Type - signed word
 */
#define CLIP_SW_0_255(val) do {  \
  const v4i32 max_m = __msa_ldi_w(255);  \
  val = __msa_maxi_s_w((v4i32)val, 0);  \
  val = __msa_min_s_w(max_m, (v4i32)val);  \
} while (0)

#define CLIP_SW4_0_255(in0, in1, in2, in3) do {  \
  CLIP_SW_0_255(in0);  \
  CLIP_SW_0_255(in1);  \
  CLIP_SW_0_255(in2);  \
  CLIP_SW_0_255(in3);  \
} while (0)
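
/* Example (illustrative only; 'res0' and 'res1' are hypothetical v8i16
 * vectors of reconstructed pixels): clamp values to the 0..255 range before
 * packing them back to bytes:
 *   CLIP_SH2_0_255(res0, res1);
 */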

/* Description : Horizontal addition of 4 signed word elements of input vector
 * Arguments   : Input  - in (signed word vector)
 *               Output - sum_m (i32 sum)
 *               Return Type - signed word (GP)
 * Details     : 4 signed word elements of 'in' vector are added together and
 *               the resulting integer sum is returned
 */
static WEBP_INLINE int32_t func_hadd_sw_s32(v4i32 in) {
  const v2i64 res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in);
  const v2i64 res1_m = __msa_splati_d(res0_m, 1);
  const v2i64 out = res0_m + res1_m;
  int32_t sum_m = __msa_copy_s_w((v4i32)out, 0);
  return sum_m;
}
#define HADD_SW_S32(in) func_hadd_sw_s32(in)

/* Description : Horizontal addition of 8 signed halfword elements
 * Arguments   : Input  - in (signed halfword vector)
 *               Output - sum_m (s32 sum)
 *               Return Type - signed word
 * Details     : 8 signed halfword elements of input vector are added
 *               together and the resulting integer sum is returned
 */
static WEBP_INLINE int32_t func_hadd_sh_s32(v8i16 in) {
  const v4i32 res = __msa_hadd_s_w(in, in);
  const v2i64 res0 = __msa_hadd_s_d(res, res);
  const v2i64 res1 = __msa_splati_d(res0, 1);
  const v2i64 res2 = res0 + res1;
  const int32_t sum_m = __msa_copy_s_w((v4i32)res2, 0);
  return sum_m;
}
#define HADD_SH_S32(in) func_hadd_sh_s32(in)

/* Description : Horizontal addition of 8 unsigned halfword elements
 * Arguments   : Input  - in (unsigned halfword vector)
 *               Output - sum_m (u32 sum)
 *               Return Type - unsigned word
 * Details     : 8 unsigned halfword elements of input vector are added
 *               together and the resulting integer sum is returned
 */
static WEBP_INLINE uint32_t func_hadd_uh_u32(v8u16 in) {
  uint32_t sum_m;
  const v4u32 res_m = __msa_hadd_u_w(in, in);
  v2u64 res0_m = __msa_hadd_u_d(res_m, res_m);
  v2u64 res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1);
  res0_m = res0_m + res1_m;
  sum_m = __msa_copy_s_w((v4i32)res0_m, 0);
  return sum_m;
}
#define HADD_UH_U32(in) func_hadd_uh_u32(in)

/* Description : Horizontal addition of signed halfword vector elements
 * Arguments   : Inputs  - in0, in1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Each signed odd halfword element from 'in0' is added to the
 *               even signed halfword element from 'in0' (pairwise) and the
 *               word result is written to 'out0'
 */
#define HADD_SH2(RTYPE, in0, in1, out0, out1) do {  \
  out0 = (RTYPE)__msa_hadd_s_w((v8i16)in0, (v8i16)in0);  \
  out1 = (RTYPE)__msa_hadd_s_w((v8i16)in1, (v8i16)in1);  \
} while (0)
#define HADD_SH2_SW(...) HADD_SH2(v4i32, __VA_ARGS__)

#define HADD_SH4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) do {  \
  HADD_SH2(RTYPE, in0, in1, out0, out1);  \
  HADD_SH2(RTYPE, in2, in3, out2, out3);  \
} while (0)
#define HADD_SH4_SW(...) HADD_SH4(v4i32, __VA_ARGS__)
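
/* Example (illustrative only; 'a' and 'b' are hypothetical v16u8 pixel rows):
 * sum of absolute differences over 16 bytes, reduced to one integer:
 *   const v16u8 abs_diff = __msa_asub_u_b(a, b);
 *   const v8u16 hsum = __msa_hadd_u_h(abs_diff, abs_diff);
 *   const uint32_t sad = HADD_UH_U32(hsum);
 */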

/* Description : Horizontal subtraction of unsigned byte vector elements
 * Arguments   : Inputs  - in0, in1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Each even unsigned byte element from 'in0' is subtracted from
 *               the odd unsigned byte element from 'in0' (pairwise) and the
 *               halfword result is written to 'out0'
 */
#define HSUB_UB2(RTYPE, in0, in1, out0, out1) do {  \
  out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0);  \
  out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1);  \
} while (0)
#define HSUB_UB2_UH(...) HSUB_UB2(v8u16, __VA_ARGS__)
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
#define HSUB_UB2_SW(...) HSUB_UB2(v4i32, __VA_ARGS__)

/* Description : Set element n of input vector to GPR value
 * Arguments   : Inputs - in0, in1, in2, in3
 *               Output - out
 *               Return Type - as per RTYPE
 * Details     : Set element 0 in vector 'out' to value specified in 'in0'
 */
#define INSERT_W2(RTYPE, in0, in1, out) do {  \
  out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0);  \
  out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1);  \
} while (0)
#define INSERT_W2_UB(...) INSERT_W2(v16u8, __VA_ARGS__)
#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)

#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) do {  \
  out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0);  \
  out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1);  \
  out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2);  \
  out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3);  \
} while (0)
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
#define INSERT_W4_SW(...) INSERT_W4(v4i32, __VA_ARGS__)

/* Description : Set element n of double word input vector to GPR value
 * Arguments   : Inputs - in0, in1
 *               Output - out
 *               Return Type - as per RTYPE
 * Details     : Set element 0 in vector 'out' to GPR value specified in 'in0'
 *               Set element 1 in vector 'out' to GPR value specified in 'in1'
 */
#define INSERT_D2(RTYPE, in0, in1, out) do {  \
  out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0);  \
  out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1);  \
} while (0)
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
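
/* Example (illustrative only; 'src' and 'stride' are hypothetical): gather a
 * 4x4 byte block, one 32-bit word per row, into a single vector register:
 *   uint32_t w0, w1, w2, w3;
 *   v16u8 blk = { 0 };
 *   LW4(src, stride, w0, w1, w2, w3);
 *   INSERT_W4_UB(w0, w1, w2, w3, blk);
 */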

/* Description : Interleave even byte elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even byte elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'
 */
#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0);  \
  out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2);  \
} while (0)
#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
#define ILVEV_B2_SB(...) ILVEV_B2(v16i8, __VA_ARGS__)
#define ILVEV_B2_UH(...) ILVEV_B2(v8u16, __VA_ARGS__)
#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
#define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__)

/* Description : Interleave odd byte elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Odd byte elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'
 */
#define ILVOD_B2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_ilvod_b((v16i8)in1, (v16i8)in0);  \
  out1 = (RTYPE)__msa_ilvod_b((v16i8)in3, (v16i8)in2);  \
} while (0)
#define ILVOD_B2_UB(...) ILVOD_B2(v16u8, __VA_ARGS__)
#define ILVOD_B2_SB(...) ILVOD_B2(v16i8, __VA_ARGS__)
#define ILVOD_B2_UH(...) ILVOD_B2(v8u16, __VA_ARGS__)
#define ILVOD_B2_SH(...) ILVOD_B2(v8i16, __VA_ARGS__)
#define ILVOD_B2_SD(...) ILVOD_B2(v2i64, __VA_ARGS__)

/* Description : Interleave even halfword elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even halfword elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'
 */
#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0);  \
  out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2);  \
} while (0)
#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
#define ILVEV_H2_UH(...) ILVEV_H2(v8u16, __VA_ARGS__)
#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)

/* Description : Interleave odd halfword elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Odd halfword elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'
 */
#define ILVOD_H2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_ilvod_h((v8i16)in1, (v8i16)in0);  \
  out1 = (RTYPE)__msa_ilvod_h((v8i16)in3, (v8i16)in2);  \
} while (0)
#define ILVOD_H2_UB(...) ILVOD_H2(v16u8, __VA_ARGS__)
#define ILVOD_H2_UH(...) ILVOD_H2(v8u16, __VA_ARGS__)
#define ILVOD_H2_SH(...) ILVOD_H2(v8i16, __VA_ARGS__)
#define ILVOD_H2_SW(...) ILVOD_H2(v4i32, __VA_ARGS__)

/* Description : Interleave even word elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even word elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'
 */
#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0);  \
  out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2);  \
} while (0)
#define ILVEV_W2_UB(...) ILVEV_W2(v16u8, __VA_ARGS__)
#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
#define ILVEV_W2_UH(...) ILVEV_W2(v8u16, __VA_ARGS__)
#define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__)

/* Description : Interleave even-odd word elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even word elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'
 *               Odd word elements of 'in2' and 'in3' are interleaved
 *               and written to 'out1'
 */
#define ILVEVOD_W2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0);  \
  out1 = (RTYPE)__msa_ilvod_w((v4i32)in3, (v4i32)in2);  \
} while (0)
#define ILVEVOD_W2_UB(...) ILVEVOD_W2(v16u8, __VA_ARGS__)
#define ILVEVOD_W2_UH(...) ILVEVOD_W2(v8u16, __VA_ARGS__)
#define ILVEVOD_W2_SH(...) ILVEVOD_W2(v8i16, __VA_ARGS__)
#define ILVEVOD_W2_SW(...) ILVEVOD_W2(v4i32, __VA_ARGS__)

/* Description : Interleave even-odd halfword elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even halfword elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'
 *               Odd halfword elements of 'in2' and 'in3' are interleaved
 *               and written to 'out1'
 */
#define ILVEVOD_H2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0);  \
  out1 = (RTYPE)__msa_ilvod_h((v8i16)in3, (v8i16)in2);  \
} while (0)
#define ILVEVOD_H2_UB(...) ILVEVOD_H2(v16u8, __VA_ARGS__)
#define ILVEVOD_H2_UH(...) ILVEVOD_H2(v8u16, __VA_ARGS__)
#define ILVEVOD_H2_SH(...) ILVEVOD_H2(v8i16, __VA_ARGS__)
#define ILVEVOD_H2_SW(...) ILVEVOD_H2(v4i32, __VA_ARGS__)

/* Description : Interleave even double word elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even double word elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'
 */
#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0);  \
  out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2);  \
} while (0)
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
#define ILVEV_D2_SB(...) ILVEV_D2(v16i8, __VA_ARGS__)
#define ILVEV_D2_SW(...) ILVEV_D2(v4i32, __VA_ARGS__)
#define ILVEV_D2_SD(...) ILVEV_D2(v2i64, __VA_ARGS__)

/* Description : Interleave left half of byte elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Left half of byte elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'.
 */
#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1);  \
  out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3);  \
} while (0)
#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
#define ILVL_B2_SW(...) ILVL_B2(v4i32, __VA_ARGS__)

/* Description : Interleave right half of byte elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Right half of byte elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'.
 */
#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1);  \
  out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3);  \
} while (0)
#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
#define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)

#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3) do {  \
  ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);  \
  ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);  \
} while (0)
#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
#define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__)

/* Description : Interleave right half of halfword elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Right half of halfword elements of 'in0' and 'in1' are
 *               interleaved and written to 'out0'.
 */
#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1);  \
  out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3);  \
} while (0)
#define ILVR_H2_UB(...) ILVR_H2(v16u8, __VA_ARGS__)
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)

#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3) do {  \
  ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);  \
  ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);  \
} while (0)
#define ILVR_H4_UB(...) ILVR_H4(v16u8, __VA_ARGS__)
#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
#define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__)

/* Description : Interleave right half of double word elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Right half of double word elements of 'in0' and 'in1' are
 *               interleaved and written to 'out0'.
 */
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_ilvr_d((v2i64)in0, (v2i64)in1);  \
  out1 = (RTYPE)__msa_ilvr_d((v2i64)in2, (v2i64)in3);  \
} while (0)
#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)

#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3) do {  \
  ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);  \
  ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3);  \
} while (0)
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)

/* Description : Interleave both left and right half of input vectors
 * Arguments   : Inputs  - in0, in1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Right half of byte elements from 'in0' and 'in1' are
 *               interleaved and written to 'out0'
 */
#define ILVRL_B2(RTYPE, in0, in1, out0, out1) do {  \
  out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1);  \
  out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1);  \
} while (0)
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
#define ILVRL_B2_SW(...) ILVRL_B2(v4i32, __VA_ARGS__)
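
/* Example (illustrative only; 'pix' is a hypothetical v16u8 vector): widen
 * 16 unsigned bytes to two halfword vectors by interleaving with zero:
 *   const v16i8 zero = { 0 };
 *   v8i16 lo, hi;
 *   ILVRL_B2_SH(zero, pix, lo, hi);  // lo: low 8 bytes, hi: high 8 bytes
 */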

#define ILVRL_H2(RTYPE, in0, in1, out0, out1) do {  \
  out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1);  \
  out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1);  \
} while (0)
#define ILVRL_H2_UB(...) ILVRL_H2(v16u8, __VA_ARGS__)
#define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__)
#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
#define ILVRL_H2_UW(...) ILVRL_H2(v4u32, __VA_ARGS__)

#define ILVRL_W2(RTYPE, in0, in1, out0, out1) do {  \
  out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1);  \
  out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1);  \
} while (0)
#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
#define ILVRL_W2_UW(...) ILVRL_W2(v4u32, __VA_ARGS__)

/* Description : Pack even byte elements of vector pairs
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even byte elements of 'in0' are copied to the left half of
 *               'out0' & even byte elements of 'in1' are copied to the right
 *               half of 'out0'.
 */
#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1);  \
  out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3);  \
} while (0)
#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
#define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)

#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3) do {  \
  PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);  \
  PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);  \
} while (0)
#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
#define PCKEV_B4_SW(...) PCKEV_B4(v4i32, __VA_ARGS__)
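
/* Example (illustrative only; 'res0'..'res3' are hypothetical v8i16 vectors
 * of clipped pixel values): narrow them back to two byte vectors:
 *   v16u8 packed0, packed1;
 *   PCKEV_B2_UB(res1, res0, res3, res2, packed0, packed1);
 */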

/* Description : Pack even halfword elements of vector pairs
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even halfword elements of 'in0' are copied to the left half of
 *               'out0' & even halfword elements of 'in1' are copied to the
 *               right half of 'out0'.
 */
#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1);  \
  out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3);  \
} while (0)
#define PCKEV_H2_UH(...) PCKEV_H2(v8u16, __VA_ARGS__)
#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)
#define PCKEV_H2_UW(...) PCKEV_H2(v4u32, __VA_ARGS__)

/* Description : Pack even word elements of vector pairs
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even word elements of 'in0' are copied to the left half of
 *               'out0' & even word elements of 'in1' are copied to the
 *               right half of 'out0'.
 */
#define PCKEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_pckev_w((v4i32)in0, (v4i32)in1);  \
  out1 = (RTYPE)__msa_pckev_w((v4i32)in2, (v4i32)in3);  \
} while (0)
#define PCKEV_W2_UH(...) PCKEV_W2(v8u16, __VA_ARGS__)
#define PCKEV_W2_SH(...) PCKEV_W2(v8i16, __VA_ARGS__)
#define PCKEV_W2_SW(...) PCKEV_W2(v4i32, __VA_ARGS__)
#define PCKEV_W2_UW(...) PCKEV_W2(v4u32, __VA_ARGS__)

/* Description : Pack odd halfword elements of vector pairs
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Odd halfword elements of 'in0' are copied to the left half of
 *               'out0' & odd halfword elements of 'in1' are copied to the
 *               right half of 'out0'.
 */
#define PCKOD_H2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_pckod_h((v8i16)in0, (v8i16)in1);  \
  out1 = (RTYPE)__msa_pckod_h((v8i16)in2, (v8i16)in3);  \
} while (0)
#define PCKOD_H2_UH(...) PCKOD_H2(v8u16, __VA_ARGS__)
#define PCKOD_H2_SH(...) PCKOD_H2(v8i16, __VA_ARGS__)
#define PCKOD_H2_SW(...) PCKOD_H2(v4i32, __VA_ARGS__)
#define PCKOD_H2_UW(...) PCKOD_H2(v4u32, __VA_ARGS__)

/* Description : Arithmetic immediate shift right all elements of word vector
 * Arguments   : Inputs - in0, in1, shift
 *               Outputs - in place operation
 *               Return Type - as per input vector RTYPE
 * Details     : Each element of vector 'in0' is right shifted by 'shift' and
 *               the result is written in-place. 'shift' is an immediate value.
 */
#define SRAI_W2(RTYPE, in0, in1, shift_val) do {  \
  in0 = (RTYPE)SRAI_W(in0, shift_val);  \
  in1 = (RTYPE)SRAI_W(in1, shift_val);  \
} while (0)
#define SRAI_W2_SW(...) SRAI_W2(v4i32, __VA_ARGS__)
#define SRAI_W2_UW(...) SRAI_W2(v4u32, __VA_ARGS__)

#define SRAI_W4(RTYPE, in0, in1, in2, in3, shift_val) do {  \
  SRAI_W2(RTYPE, in0, in1, shift_val);  \
  SRAI_W2(RTYPE, in2, in3, shift_val);  \
} while (0)
#define SRAI_W4_SW(...) SRAI_W4(v4i32, __VA_ARGS__)
#define SRAI_W4_UW(...) SRAI_W4(v4u32, __VA_ARGS__)

/* Description : Arithmetic shift right all elements of half-word vector
 * Arguments   : Inputs - in0, in1, shift
 *               Outputs - in place operation
 *               Return Type - as per input vector RTYPE
 * Details     : Each element of vector 'in0' is right shifted by 'shift' and
 *               the result is written in-place. 'shift' is an immediate value.
 */
#define SRAI_H2(RTYPE, in0, in1, shift_val) do {  \
  in0 = (RTYPE)SRAI_H(in0, shift_val);  \
  in1 = (RTYPE)SRAI_H(in1, shift_val);  \
} while (0)
#define SRAI_H2_SH(...) SRAI_H2(v8i16, __VA_ARGS__)
#define SRAI_H2_UH(...) SRAI_H2(v8u16, __VA_ARGS__)

/* Description : Arithmetic rounded shift right all elements of word vector
 * Arguments   : Inputs - in0, in1, shift
 *               Outputs - in place operation
 *               Return Type - as per input vector RTYPE
 * Details     : Each element of vector 'in0' is right shifted by 'shift' and
 *               the result is written in-place. 'shift' is an immediate value.
 */
#define SRARI_W2(RTYPE, in0, in1, shift) do {  \
  in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift);  \
  in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift);  \
} while (0)
#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)

#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) do {  \
  SRARI_W2(RTYPE, in0, in1, shift);  \
  SRARI_W2(RTYPE, in2, in3, shift);  \
} while (0)
#define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__)
#define SRARI_W4_UW(...) SRARI_W4(v4u32, __VA_ARGS__)
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
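
/* Example (illustrative only; 'v0'..'v3' are hypothetical v4i32 vectors):
 * descale four vectors of 32-bit fixed-point results by 12 bits with
 * rounding:
 *   SRARI_W4_SW(v0, v1, v2, v3, 12);
 */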

/* Description : Shift right arithmetic rounded double words
 * Arguments   : Inputs - in0, in1, shift
 *               Outputs - in place operation
 *               Return Type - as per RTYPE
 * Details     : Each element of vector 'in0' is shifted right arithmetically
 *               by the number of bits in the corresponding element in the
 *               vector 'shift'. The last discarded bit is added to the shifted
 *               value for rounding and the result is written in-place.
 *               'shift' is a vector.
 */
#define SRAR_D2(RTYPE, in0, in1, shift) do {  \
  in0 = (RTYPE)__msa_srar_d((v2i64)in0, (v2i64)shift);  \
  in1 = (RTYPE)__msa_srar_d((v2i64)in1, (v2i64)shift);  \
} while (0)
#define SRAR_D2_SW(...) SRAR_D2(v4i32, __VA_ARGS__)
#define SRAR_D2_SD(...) SRAR_D2(v2i64, __VA_ARGS__)
#define SRAR_D2_UD(...) SRAR_D2(v2u64, __VA_ARGS__)

#define SRAR_D4(RTYPE, in0, in1, in2, in3, shift) do {  \
  SRAR_D2(RTYPE, in0, in1, shift);  \
  SRAR_D2(RTYPE, in2, in3, shift);  \
} while (0)
#define SRAR_D4_SD(...) SRAR_D4(v2i64, __VA_ARGS__)
#define SRAR_D4_UD(...) SRAR_D4(v2u64, __VA_ARGS__)

/* Description : Addition of 2 pairs of half-word vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 * Details     : Each element in 'in0' is added to 'in1' and result is written
 *               to 'out0'.
 */
#define ADDVI_H2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)ADDVI_H(in0, in1);  \
  out1 = (RTYPE)ADDVI_H(in2, in3);  \
} while (0)
#define ADDVI_H2_SH(...) ADDVI_H2(v8i16, __VA_ARGS__)
#define ADDVI_H2_UH(...) ADDVI_H2(v8u16, __VA_ARGS__)

/* Description : Addition of 2 pairs of word vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 * Details     : Each element in 'in0' is added to 'in1' and result is written
 *               to 'out0'.
 */
#define ADDVI_W2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)ADDVI_W(in0, in1);  \
  out1 = (RTYPE)ADDVI_W(in2, in3);  \
} while (0)
#define ADDVI_W2_SW(...) ADDVI_W2(v4i32, __VA_ARGS__)

/* Description : Fill 2 pairs of word vectors with GP registers
 * Arguments   : Inputs  - in0, in1
 *               Outputs - out0, out1
 * Details     : GP register in0 is replicated in each word element of out0
 *               GP register in1 is replicated in each word element of out1
 */
#define FILL_W2(RTYPE, in0, in1, out0, out1) do {  \
  out0 = (RTYPE)__msa_fill_w(in0);  \
  out1 = (RTYPE)__msa_fill_w(in1);  \
} while (0)
#define FILL_W2_SW(...) FILL_W2(v4i32, __VA_ARGS__)

/* Description : Addition of 2 pairs of vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 * Details     : Each element in 'in0' is added to 'in1' and result is written
 *               to 'out0'.
 */
#define ADD2(in0, in1, in2, in3, out0, out1) do {  \
  out0 = in0 + in1;  \
  out1 = in2 + in3;  \
} while (0)

#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7,  \
             out0, out1, out2, out3) do {  \
  ADD2(in0, in1, in2, in3, out0, out1);  \
  ADD2(in4, in5, in6, in7, out2, out3);  \
} while (0)

/* Description : Subtraction of 2 pairs of vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 * Details     : Each element in 'in1' is subtracted from 'in0' and result is
 *               written to 'out0'.
 */
#define SUB2(in0, in1, in2, in3, out0, out1) do {  \
  out0 = in0 - in1;  \
  out1 = in2 - in3;  \
} while (0)

#define SUB3(in0, in1, in2, in3, in4, in5, out0, out1, out2) do {  \
  out0 = in0 - in1;  \
  out1 = in2 - in3;  \
  out2 = in4 - in5;  \
} while (0)

#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7,  \
             out0, out1, out2, out3) do {  \
  out0 = in0 - in1;  \
  out1 = in2 - in3;  \
  out2 = in4 - in5;  \
  out3 = in6 - in7;  \
} while (0)

/* Description : Addition - Subtraction of input vectors
 * Arguments   : Inputs  - in0, in1
 *               Outputs - out0, out1
 * Details     : Each element in 'in1' is added to 'in0' and result is
 *               written to 'out0'.
 *               Each element in 'in1' is subtracted from 'in0' and result is
 *               written to 'out1'.
 */
#define ADDSUB2(in0, in1, out0, out1) do {  \
  out0 = in0 + in1;  \
  out1 = in0 - in1;  \
} while (0)

/* Description : Multiplication of pairs of vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 * Details     : Each element from 'in0' is multiplied with elements from 'in1'
 *               and the result is written to 'out0'
 */
#define MUL2(in0, in1, in2, in3, out0, out1) do {  \
  out0 = in0 * in1;  \
  out1 = in2 * in3;  \
} while (0)

#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7,  \
             out0, out1, out2, out3) do {  \
  MUL2(in0, in1, in2, in3, out0, out1);  \
  MUL2(in4, in5, in6, in7, out2, out3);  \
} while (0)

/* Description : Sign extend halfword elements from right half of the vector
 * Arguments   : Input  - in (halfword vector)
 *               Output - out (sign extended word vector)
 *               Return Type - signed word
 * Details     : Sign bit of halfword elements from input vector 'in' is
 *               extracted and interleaved with the same vector 'in' to
 *               generate 4 word elements keeping sign intact
 */
#define UNPCK_R_SH_SW(in, out) do {  \
  const v8i16 sign_m = __msa_clti_s_h((v8i16)in, 0);  \
  out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in);  \
} while (0)

/* Description : Sign extend halfword elements from input vector and return
 *               the result in pair of vectors
 * Arguments   : Input   - in (halfword vector)
 *               Outputs - out0, out1 (sign extended word vectors)
 *               Return Type - signed word
 * Details     : Sign bit of halfword elements from input vector 'in' is
 *               extracted and interleaved right with the same vector 'in' to
 *               generate 4 signed word elements in 'out0'
 *               Then interleaved left with the same vector 'in' to
 *               generate 4 signed word elements in 'out1'
 */
#define UNPCK_SH_SW(in, out0, out1) do {  \
  const v8i16 tmp_m = __msa_clti_s_h((v8i16)in, 0);  \
  ILVRL_H2_SW(tmp_m, in, out0, out1);  \
} while (0)

/* Description : Butterfly of 4 input vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 * Details     : Butterfly operation
 */
#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) do {  \
  out0 = in0 + in3;  \
  out1 = in1 + in2;  \
  out2 = in1 - in2;  \
  out3 = in0 - in3;  \
} while (0)
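
/* Example (illustrative only; 'a0'..'a3' are hypothetical v4i32 rows): one
 * butterfly stage of a 4-point transform:
 *   v4i32 t0, t1, t2, t3;
 *   BUTTERFLY_4(a0, a1, a2, a3, t0, t1, t2, t3);
 *   // t0 = a0 + a3, t1 = a1 + a2, t2 = a1 - a2, t3 = a0 - a3
 */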

/* Description : Transpose 16x4 block into 4x16 with byte elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
 *                         in8, in9, in10, in11, in12, in13, in14, in15
 *               Outputs - out0, out1, out2, out3
 *               Return Type - unsigned byte
 */
#define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,  \
                            in8, in9, in10, in11, in12, in13, in14, in15,  \
                            out0, out1, out2, out3) do {  \
  v2i64 tmp0_m, tmp1_m, tmp2_m, tmp3_m, tmp4_m, tmp5_m;  \
  ILVEV_W2_SD(in0, in4, in8, in12, tmp2_m, tmp3_m);  \
  ILVEV_W2_SD(in1, in5, in9, in13, tmp0_m, tmp1_m);  \
  ILVEV_D2_UB(tmp2_m, tmp3_m, tmp0_m, tmp1_m, out1, out3);  \
  ILVEV_W2_SD(in2, in6, in10, in14, tmp4_m, tmp5_m);  \
  ILVEV_W2_SD(in3, in7, in11, in15, tmp0_m, tmp1_m);  \
  ILVEV_D2_SD(tmp4_m, tmp5_m, tmp0_m, tmp1_m, tmp2_m, tmp3_m);  \
  ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m);  \
  ILVEVOD_H2_UB(tmp0_m, tmp1_m, tmp0_m, tmp1_m, out0, out2);  \
  ILVOD_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m);  \
  ILVEVOD_H2_UB(tmp0_m, tmp1_m, tmp0_m, tmp1_m, out1, out3);  \
} while (0)

/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
 *                         in8, in9, in10, in11, in12, in13, in14, in15
 *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 *               Return Type - unsigned byte
 */
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,  \
                            in8, in9, in10, in11, in12, in13, in14, in15,  \
                            out0, out1, out2, out3, out4, out5,  \
                            out6, out7) do {  \
  v8i16 tmp0_m, tmp1_m, tmp4_m, tmp5_m, tmp6_m, tmp7_m;  \
  v4i32 tmp2_m, tmp3_m;  \
  ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);  \
  ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);  \
  ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);  \
  ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);  \
  ILVEV_B2_SH(out7, out6, out5, out4, tmp0_m, tmp1_m);  \
  ILVOD_B2_SH(out7, out6, out5, out4, tmp4_m, tmp5_m);  \
  ILVEV_B2_UB(out3, out2, out1, out0, out5, out7);  \
  ILVOD_B2_SH(out3, out2, out1, out0, tmp6_m, tmp7_m);  \
  ILVEV_H2_SW(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);  \
  ILVEVOD_W2_UB(tmp2_m, tmp3_m, tmp2_m, tmp3_m, out0, out4);  \
  ILVOD_H2_SW(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);  \
  ILVEVOD_W2_UB(tmp2_m, tmp3_m, tmp2_m, tmp3_m, out2, out6);  \
  ILVEV_H2_SW(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);  \
  ILVEVOD_W2_UB(tmp2_m, tmp3_m, tmp2_m, tmp3_m, out1, out5);  \
  ILVOD_H2_SW(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);  \
  ILVEVOD_W2_UB(tmp2_m, tmp3_m, tmp2_m, tmp3_m, out3, out7);  \
} while (0)

/* Description : Transpose 4x4 block with word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 *               Return Type - as per RTYPE
 */
#define TRANSPOSE4x4_W(RTYPE, in0, in1, in2, in3,  \
                       out0, out1, out2, out3) do {  \
  v4i32 s0_m, s1_m, s2_m, s3_m;  \
  ILVRL_W2_SW(in1, in0, s0_m, s1_m);  \
  ILVRL_W2_SW(in3, in2, s2_m, s3_m);  \
  out0 = (RTYPE)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m);  \
  out1 = (RTYPE)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m);  \
  out2 = (RTYPE)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m);  \
  out3 = (RTYPE)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m);  \
} while (0)
#define TRANSPOSE4x4_SW_SW(...) TRANSPOSE4x4_W(v4i32, __VA_ARGS__)
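
/* Example (illustrative only; 'r0'..'r3' are hypothetical v4i32 rows):
 * transpose a 4x4 block of words into column vectors:
 *   v4i32 c0, c1, c2, c3;
 *   TRANSPOSE4x4_SW_SW(r0, r1, r2, r3, c0, c1, c2, c3);
 */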

/* Description : Add block 4x4
 * Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
 * Details     : Least significant 4 bytes from each input vector are added to
 *               the destination bytes, clipped between 0-255 and stored.
 */
#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) do {  \
  uint32_t src0_m, src1_m, src2_m, src3_m;  \
  v8i16 inp0_m, inp1_m, res0_m, res1_m;  \
  v16i8 dst0_m = { 0 };  \
  v16i8 dst1_m = { 0 };  \
  const v16i8 zero_m = { 0 };  \
  ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m);  \
  LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m);  \
  INSERT_W2_SB(src0_m, src1_m, dst0_m);  \
  INSERT_W2_SB(src2_m, src3_m, dst1_m);  \
  ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m);  \
  ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m);  \
  CLIP_SH2_0_255(res0_m, res1_m);  \
  PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m);  \
  ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride);  \
} while (0)

/* Description : Pack even byte elements, extract 0 & 2 index words from pair
 *               of results and store 4 words in destination memory as per
 *               stride
 * Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
 */
#define PCKEV_ST4x4_UB(in0, in1, in2, in3, pdst, stride) do {  \
  v16i8 tmp0_m, tmp1_m;  \
  PCKEV_B2_SB(in1, in0, in3, in2, tmp0_m, tmp1_m);  \
  ST4x4_UB(tmp0_m, tmp1_m, 0, 2, 0, 2, pdst, stride);  \
} while (0)

/* Description : Average with rounding (in0 + in1 + 1) / 2.
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Each unsigned byte element from 'in0' vector is added with
 *               each unsigned byte element from 'in1' vector. Then the average
 *               with rounding is calculated and written to 'out0'
 */
#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1);  \
  out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3);  \
} while (0)
#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)

#endif  /* WEBP_DSP_MSA_MACRO_H_ */