1 /* 2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #ifndef AOM_AOM_DSP_MIPS_MACROS_MSA_H_ 13 #define AOM_AOM_DSP_MIPS_MACROS_MSA_H_ 14 15 #include <msa.h> 16 17 #include "config/aom_config.h" 18 19 #include "aom/aom_integer.h" 20 21 #define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc)) 22 #define LD_UB(...) LD_B(v16u8, __VA_ARGS__) 23 #define LD_SB(...) LD_B(v16i8, __VA_ARGS__) 24 25 #define LD_H(RTYPE, psrc) *((const RTYPE *)(psrc)) 26 #define LD_UH(...) LD_H(v8u16, __VA_ARGS__) 27 #define LD_SH(...) LD_H(v8i16, __VA_ARGS__) 28 29 #define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc)) 30 #define LD_SW(...) LD_W(v4i32, __VA_ARGS__) 31 32 #define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) 33 #define ST_UB(...) ST_B(v16u8, __VA_ARGS__) 34 #define ST_SB(...) ST_B(v16i8, __VA_ARGS__) 35 36 #define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) 37 #define ST_SH(...) ST_H(v8i16, __VA_ARGS__) 38 39 #define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) 40 #define ST_SW(...) 
ST_W(v4i32, __VA_ARGS__) 41 42 #if (__mips_isa_rev >= 6) 43 #define LH(psrc) \ 44 ({ \ 45 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ 46 uint16_t val_m; \ 47 \ 48 __asm__ __volatile__("lh %[val_m], %[psrc_m] \n\t" \ 49 \ 50 : [val_m] "=r"(val_m) \ 51 : [psrc_m] "m"(*psrc_m)); \ 52 \ 53 val_m; \ 54 }) 55 56 #define LW(psrc) \ 57 ({ \ 58 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ 59 uint32_t val_m; \ 60 \ 61 __asm__ __volatile__("lw %[val_m], %[psrc_m] \n\t" \ 62 \ 63 : [val_m] "=r"(val_m) \ 64 : [psrc_m] "m"(*psrc_m)); \ 65 \ 66 val_m; \ 67 }) 68 69 #if (__mips == 64) 70 #define LD(psrc) \ 71 ({ \ 72 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ 73 uint64_t val_m = 0; \ 74 \ 75 __asm__ __volatile__("ld %[val_m], %[psrc_m] \n\t" \ 76 \ 77 : [val_m] "=r"(val_m) \ 78 : [psrc_m] "m"(*psrc_m)); \ 79 \ 80 val_m; \ 81 }) 82 #else // !(__mips == 64) 83 #define LD(psrc) \ 84 ({ \ 85 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ 86 uint32_t val0_m, val1_m; \ 87 uint64_t val_m = 0; \ 88 \ 89 val0_m = LW(psrc_m); \ 90 val1_m = LW(psrc_m + 4); \ 91 \ 92 val_m = (uint64_t)(val1_m); \ 93 val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ 94 val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ 95 \ 96 val_m; \ 97 }) 98 #endif // (__mips == 64) 99 100 #define SH(val, pdst) \ 101 { \ 102 uint8_t *pdst_m = (uint8_t *)(pdst); \ 103 const uint16_t val_m = (val); \ 104 \ 105 __asm__ __volatile__("sh %[val_m], %[pdst_m] \n\t" \ 106 \ 107 : [pdst_m] "=m"(*pdst_m) \ 108 : [val_m] "r"(val_m)); \ 109 } 110 111 #define SW(val, pdst) \ 112 { \ 113 uint8_t *pdst_m = (uint8_t *)(pdst); \ 114 const uint32_t val_m = (val); \ 115 \ 116 __asm__ __volatile__("sw %[val_m], %[pdst_m] \n\t" \ 117 \ 118 : [pdst_m] "=m"(*pdst_m) \ 119 : [val_m] "r"(val_m)); \ 120 } 121 122 #define SD(val, pdst) \ 123 { \ 124 uint8_t *pdst_m = (uint8_t *)(pdst); \ 125 const uint64_t val_m = (val); \ 126 \ 127 __asm__ __volatile__("sd %[val_m], %[pdst_m] \n\t" \ 128 \ 129 : [pdst_m] "=m"(*pdst_m) \ 
130 : [val_m] "r"(val_m)); \ 131 } 132 #else // !(__mips_isa_rev >= 6) 133 #define LH(psrc) \ 134 ({ \ 135 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ 136 uint16_t val_m; \ 137 \ 138 __asm__ __volatile__("ulh %[val_m], %[psrc_m] \n\t" \ 139 \ 140 : [val_m] "=r"(val_m) \ 141 : [psrc_m] "m"(*psrc_m)); \ 142 \ 143 val_m; \ 144 }) 145 146 #define LW(psrc) \ 147 ({ \ 148 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ 149 uint32_t val_m; \ 150 \ 151 __asm__ __volatile__("ulw %[val_m], %[psrc_m] \n\t" \ 152 \ 153 : [val_m] "=r"(val_m) \ 154 : [psrc_m] "m"(*psrc_m)); \ 155 \ 156 val_m; \ 157 }) 158 159 #if (__mips == 64) 160 #define LD(psrc) \ 161 ({ \ 162 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ 163 uint64_t val_m = 0; \ 164 \ 165 __asm__ __volatile__("uld %[val_m], %[psrc_m] \n\t" \ 166 \ 167 : [val_m] "=r"(val_m) \ 168 : [psrc_m] "m"(*psrc_m)); \ 169 \ 170 val_m; \ 171 }) 172 #else // !(__mips == 64) 173 #define LD(psrc) \ 174 ({ \ 175 const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \ 176 uint32_t val0_m, val1_m; \ 177 uint64_t val_m_combined = 0; \ 178 \ 179 val0_m = LW(psrc_m1); \ 180 val1_m = LW(psrc_m1 + 4); \ 181 \ 182 val_m_combined = (uint64_t)(val1_m); \ 183 val_m_combined = (uint64_t)((val_m_combined << 32) & 0xFFFFFFFF00000000); \ 184 val_m_combined = (uint64_t)(val_m_combined | (uint64_t)val0_m); \ 185 \ 186 val_m_combined; \ 187 }) 188 #endif // (__mips == 64) 189 190 #define SH(val, pdst) \ 191 { \ 192 uint8_t *pdst_m = (uint8_t *)(pdst); \ 193 const uint16_t val_m = (val); \ 194 \ 195 __asm__ __volatile__("ush %[val_m], %[pdst_m] \n\t" \ 196 \ 197 : [pdst_m] "=m"(*pdst_m) \ 198 : [val_m] "r"(val_m)); \ 199 } 200 201 #define SW(val, pdst) \ 202 { \ 203 uint8_t *pdst_m = (uint8_t *)(pdst); \ 204 const uint32_t val_m = (val); \ 205 \ 206 __asm__ __volatile__("usw %[val_m], %[pdst_m] \n\t" \ 207 \ 208 : [pdst_m] "=m"(*pdst_m) \ 209 : [val_m] "r"(val_m)); \ 210 } 211 212 #define SD(val, pdst) \ 213 { \ 214 uint8_t *pdst_m1 = (uint8_t 
*)(pdst); \ 215 uint32_t val0_m, val1_m; \ 216 \ 217 val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ 218 val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ 219 \ 220 SW(val0_m, pdst_m1); \ 221 SW(val1_m, pdst_m1 + 4); \ 222 } 223 #endif // (__mips_isa_rev >= 6) 224 225 /* Description : Load 4 words with stride 226 Arguments : Inputs - psrc, stride 227 Outputs - out0, out1, out2, out3 228 Details : Load word in 'out0' from (psrc) 229 Load word in 'out1' from (psrc + stride) 230 Load word in 'out2' from (psrc + 2 * stride) 231 Load word in 'out3' from (psrc + 3 * stride) 232 */ 233 #define LW4(psrc, stride, out0, out1, out2, out3) \ 234 { \ 235 out0 = LW((psrc)); \ 236 out1 = LW((psrc) + stride); \ 237 out2 = LW((psrc) + 2 * stride); \ 238 out3 = LW((psrc) + 3 * stride); \ 239 } 240 241 /* Description : Load double words with stride 242 Arguments : Inputs - psrc, stride 243 Outputs - out0, out1 244 Details : Load double word in 'out0' from (psrc) 245 Load double word in 'out1' from (psrc + stride) 246 */ 247 #define LD2(psrc, stride, out0, out1) \ 248 { \ 249 out0 = LD((psrc)); \ 250 out1 = LD((psrc) + stride); \ 251 } 252 #define LD4(psrc, stride, out0, out1, out2, out3) \ 253 { \ 254 LD2((psrc), stride, out0, out1); \ 255 LD2((psrc) + 2 * stride, stride, out2, out3); \ 256 } 257 258 /* Description : Store 4 words with stride 259 Arguments : Inputs - in0, in1, in2, in3, pdst, stride 260 Details : Store word from 'in0' to (pdst) 261 Store word from 'in1' to (pdst + stride) 262 Store word from 'in2' to (pdst + 2 * stride) 263 Store word from 'in3' to (pdst + 3 * stride) 264 */ 265 #define SW4(in0, in1, in2, in3, pdst, stride) \ 266 { \ 267 SW(in0, (pdst)) \ 268 SW(in1, (pdst) + stride); \ 269 SW(in2, (pdst) + 2 * stride); \ 270 SW(in3, (pdst) + 3 * stride); \ 271 } 272 273 /* Description : Store 4 double words with stride 274 Arguments : Inputs - in0, in1, in2, in3, pdst, stride 275 Details : Store double word from 'in0' to (pdst) 276 Store double word from 
'in1' to (pdst + stride) 277 Store double word from 'in2' to (pdst + 2 * stride) 278 Store double word from 'in3' to (pdst + 3 * stride) 279 */ 280 #define SD4(in0, in1, in2, in3, pdst, stride) \ 281 { \ 282 SD(in0, (pdst)) \ 283 SD(in1, (pdst) + stride); \ 284 SD(in2, (pdst) + 2 * stride); \ 285 SD(in3, (pdst) + 3 * stride); \ 286 } 287 288 /* Description : Load vectors with 16 byte elements with stride 289 Arguments : Inputs - psrc, stride 290 Outputs - out0, out1 291 Return Type - as per RTYPE 292 Details : Load 16 byte elements in 'out0' from (psrc) 293 Load 16 byte elements in 'out1' from (psrc + stride) 294 */ 295 #define LD_B2(RTYPE, psrc, stride, out0, out1) \ 296 { \ 297 out0 = LD_B(RTYPE, (psrc)); \ 298 out1 = LD_B(RTYPE, (psrc) + stride); \ 299 } 300 #define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__) 301 #define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__) 302 303 #define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \ 304 { \ 305 LD_B2(RTYPE, (psrc), stride, out0, out1); \ 306 out2 = LD_B(RTYPE, (psrc) + 2 * stride); \ 307 } 308 #define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__) 309 310 #define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \ 311 { \ 312 LD_B2(RTYPE, (psrc), stride, out0, out1); \ 313 LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ 314 } 315 #define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__) 316 #define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__) 317 318 #define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \ 319 { \ 320 LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ 321 out4 = LD_B(RTYPE, (psrc) + 4 * stride); \ 322 } 323 #define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__) 324 #define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__) 325 326 #define LD_B7(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6) \ 327 { \ 328 LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \ 329 LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \ 330 } 331 #define LD_SB7(...) 
LD_B7(v16i8, __VA_ARGS__) 332 333 #define LD_B8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ 334 out7) \ 335 { \ 336 LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ 337 LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ 338 } 339 #define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__) 340 #define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__) 341 342 /* Description : Load vectors with 8 halfword elements with stride 343 Arguments : Inputs - psrc, stride 344 Outputs - out0, out1 345 Details : Load 8 halfword elements in 'out0' from (psrc) 346 Load 8 halfword elements in 'out1' from (psrc + stride) 347 */ 348 #define LD_H2(RTYPE, psrc, stride, out0, out1) \ 349 { \ 350 out0 = LD_H(RTYPE, (psrc)); \ 351 out1 = LD_H(RTYPE, (psrc) + (stride)); \ 352 } 353 #define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__) 354 355 #define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) \ 356 { \ 357 LD_H2(RTYPE, (psrc), stride, out0, out1); \ 358 LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ 359 } 360 #define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__) 361 362 #define LD_H8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ 363 out7) \ 364 { \ 365 LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ 366 LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ 367 } 368 #define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__) 369 370 #define LD_H16(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ 371 out7, out8, out9, out10, out11, out12, out13, out14, out15) \ 372 { \ 373 LD_H8(RTYPE, (psrc), stride, out0, out1, out2, out3, out4, out5, out6, \ 374 out7); \ 375 LD_H8(RTYPE, (psrc) + 8 * stride, stride, out8, out9, out10, out11, out12, \ 376 out13, out14, out15); \ 377 } 378 #define LD_SH16(...) 
LD_H16(v8i16, __VA_ARGS__) 379 380 /* Description : Load 4x4 block of signed halfword elements from 1D source 381 data into 4 vectors (Each vector with 4 signed halfwords) 382 Arguments : Input - psrc 383 Outputs - out0, out1, out2, out3 384 */ 385 #define LD4x4_SH(psrc, out0, out1, out2, out3) \ 386 { \ 387 out0 = LD_SH(psrc); \ 388 out2 = LD_SH(psrc + 8); \ 389 out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ 390 out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \ 391 } 392 393 /* Description : Load 2 vectors of signed word elements with stride 394 Arguments : Inputs - psrc, stride 395 Outputs - out0, out1 396 Return Type - signed word 397 */ 398 #define LD_SW2(psrc, stride, out0, out1) \ 399 { \ 400 out0 = LD_SW((psrc)); \ 401 out1 = LD_SW((psrc) + stride); \ 402 } 403 404 /* Description : Store vectors of 16 byte elements with stride 405 Arguments : Inputs - in0, in1, pdst, stride 406 Details : Store 16 byte elements from 'in0' to (pdst) 407 Store 16 byte elements from 'in1' to (pdst + stride) 408 */ 409 #define ST_B2(RTYPE, in0, in1, pdst, stride) \ 410 { \ 411 ST_B(RTYPE, in0, (pdst)); \ 412 ST_B(RTYPE, in1, (pdst) + stride); \ 413 } 414 #define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__) 415 416 #define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \ 417 { \ 418 ST_B2(RTYPE, in0, in1, (pdst), stride); \ 419 ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ 420 } 421 #define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__) 422 423 #define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ 424 { \ 425 ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \ 426 ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ 427 } 428 #define ST_UB8(...) 
ST_B8(v16u8, __VA_ARGS__) 429 430 /* Description : Store vectors of 8 halfword elements with stride 431 Arguments : Inputs - in0, in1, pdst, stride 432 Details : Store 8 halfword elements from 'in0' to (pdst) 433 Store 8 halfword elements from 'in1' to (pdst + stride) 434 */ 435 #define ST_H2(RTYPE, in0, in1, pdst, stride) \ 436 { \ 437 ST_H(RTYPE, in0, (pdst)); \ 438 ST_H(RTYPE, in1, (pdst) + stride); \ 439 } 440 #define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__) 441 442 #define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) \ 443 { \ 444 ST_H2(RTYPE, in0, in1, (pdst), stride); \ 445 ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ 446 } 447 #define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__) 448 449 #define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ 450 { \ 451 ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \ 452 ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ 453 } 454 #define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__) 455 456 /* Description : Store vectors of word elements with stride 457 Arguments : Inputs - in0, in1, pdst, stride 458 Details : Store 4 word elements from 'in0' to (pdst) 459 Store 4 word elements from 'in1' to (pdst + stride) 460 */ 461 #define ST_SW2(in0, in1, pdst, stride) \ 462 { \ 463 ST_SW(in0, (pdst)); \ 464 ST_SW(in1, (pdst) + stride); \ 465 } 466 467 /* Description : Store 2x4 byte block to destination memory from input vector 468 Arguments : Inputs - in, stidx, pdst, stride 469 Details : Index 'stidx' halfword element from 'in' vector is copied to 470 the GP register and stored to (pdst) 471 Index 'stidx+1' halfword element from 'in' vector is copied to 472 the GP register and stored to (pdst + stride) 473 Index 'stidx+2' halfword element from 'in' vector is copied to 474 the GP register and stored to (pdst + 2 * stride) 475 Index 'stidx+3' halfword element from 'in' vector is copied to 476 the GP register and stored to (pdst + 3 * stride) 477 */ 478 #define ST2x4_UB(in, stidx, pdst, stride) \ 479 { 
\ 480 uint16_t out0_m, out1_m, out2_m, out3_m; \ 481 uint8_t *pblk_2x4_m = (uint8_t *)(pdst); \ 482 \ 483 out0_m = __msa_copy_u_h((v8i16)in, (stidx)); \ 484 out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1)); \ 485 out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2)); \ 486 out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3)); \ 487 \ 488 SH(out0_m, pblk_2x4_m); \ 489 SH(out1_m, pblk_2x4_m + stride); \ 490 SH(out2_m, pblk_2x4_m + 2 * stride); \ 491 SH(out3_m, pblk_2x4_m + 3 * stride); \ 492 } 493 494 /* Description : Store 4x2 byte block to destination memory from input vector 495 Arguments : Inputs - in, pdst, stride 496 Details : Index 0 word element from 'in' vector is copied to the GP 497 register and stored to (pdst) 498 Index 1 word element from 'in' vector is copied to the GP 499 register and stored to (pdst + stride) 500 */ 501 #define ST4x2_UB(in, pdst, stride) \ 502 { \ 503 uint32_t out0_m, out1_m; \ 504 uint8_t *pblk_4x2_m = (uint8_t *)(pdst); \ 505 \ 506 out0_m = __msa_copy_u_w((v4i32)in, 0); \ 507 out1_m = __msa_copy_u_w((v4i32)in, 1); \ 508 \ 509 SW(out0_m, pblk_4x2_m); \ 510 SW(out1_m, pblk_4x2_m + stride); \ 511 } 512 513 /* Description : Store 4x4 byte block to destination memory from input vector 514 Arguments : Inputs - in0, in1, pdst, stride 515 Details : 'Idx0' word element from input vector 'in0' is copied to the 516 GP register and stored to (pdst) 517 'Idx1' word element from input vector 'in0' is copied to the 518 GP register and stored to (pdst + stride) 519 'Idx2' word element from input vector 'in0' is copied to the 520 GP register and stored to (pdst + 2 * stride) 521 'Idx3' word element from input vector 'in0' is copied to the 522 GP register and stored to (pdst + 3 * stride) 523 */ 524 #define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \ 525 { \ 526 uint32_t out0_m, out1_m, out2_m, out3_m; \ 527 uint8_t *pblk_4x4_m = (uint8_t *)(pdst); \ 528 \ 529 out0_m = __msa_copy_u_w((v4i32)in0, idx0); \ 530 out1_m = __msa_copy_u_w((v4i32)in0, 
idx1); \ 531 out2_m = __msa_copy_u_w((v4i32)in1, idx2); \ 532 out3_m = __msa_copy_u_w((v4i32)in1, idx3); \ 533 \ 534 SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \ 535 } 536 #define ST4x8_UB(in0, in1, pdst, stride) \ 537 { \ 538 uint8_t *pblk_4x8 = (uint8_t *)(pdst); \ 539 \ 540 ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride); \ 541 ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \ 542 } 543 544 /* Description : Store 8x1 byte block to destination memory from input vector 545 Arguments : Inputs - in, pdst 546 Details : Index 0 double word element from 'in' vector is copied to the 547 GP register and stored to (pdst) 548 */ 549 #define ST8x1_UB(in, pdst) \ 550 { \ 551 uint64_t out0_m; \ 552 \ 553 out0_m = __msa_copy_u_d((v2i64)in, 0); \ 554 SD(out0_m, pdst); \ 555 } 556 557 /* Description : Store 8x2 byte block to destination memory from input vector 558 Arguments : Inputs - in, pdst, stride 559 Details : Index 0 double word element from 'in' vector is copied to the 560 GP register and stored to (pdst) 561 Index 1 double word element from 'in' vector is copied to the 562 GP register and stored to (pdst + stride) 563 */ 564 #define ST8x2_UB(in, pdst, stride) \ 565 { \ 566 uint64_t out0_m, out1_m; \ 567 uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \ 568 \ 569 out0_m = __msa_copy_u_d((v2i64)in, 0); \ 570 out1_m = __msa_copy_u_d((v2i64)in, 1); \ 571 \ 572 SD(out0_m, pblk_8x2_m); \ 573 SD(out1_m, pblk_8x2_m + stride); \ 574 } 575 576 /* Description : Store 8x4 byte block to destination memory from input 577 vectors 578 Arguments : Inputs - in0, in1, pdst, stride 579 Details : Index 0 double word element from 'in0' vector is copied to the 580 GP register and stored to (pdst) 581 Index 1 double word element from 'in0' vector is copied to the 582 GP register and stored to (pdst + stride) 583 Index 0 double word element from 'in1' vector is copied to the 584 GP register and stored to (pdst + 2 * stride) 585 Index 1 double word element from 'in1' 
vector is copied to the 586 GP register and stored to (pdst + 3 * stride) 587 */ 588 #define ST8x4_UB(in0, in1, pdst, stride) \ 589 { \ 590 uint64_t out0_m, out1_m, out2_m, out3_m; \ 591 uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \ 592 \ 593 out0_m = __msa_copy_u_d((v2i64)in0, 0); \ 594 out1_m = __msa_copy_u_d((v2i64)in0, 1); \ 595 out2_m = __msa_copy_u_d((v2i64)in1, 0); \ 596 out3_m = __msa_copy_u_d((v2i64)in1, 1); \ 597 \ 598 SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \ 599 } 600 601 /* Description : average with rounding (in0 + in1 + 1) / 2. 602 Arguments : Inputs - in0, in1, in2, in3, 603 Outputs - out0, out1 604 Return Type - as per RTYPE 605 Details : Each unsigned byte element from 'in0' vector is added with 606 each unsigned byte element from 'in1' vector. Then the average 607 with rounding is calculated and written to 'out0' 608 */ 609 #define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \ 610 { \ 611 out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1); \ 612 out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3); \ 613 } 614 #define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__) 615 616 #define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ 617 out2, out3) \ 618 { \ 619 AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \ 620 AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3) \ 621 } 622 #define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__) 623 624 /* Description : Immediate number of elements to slide with zero 625 Arguments : Inputs - in0, in1, slide_val 626 Outputs - out0, out1 627 Return Type - as per RTYPE 628 Details : Byte elements from 'zero_m' vector are slid into 'in0' by 629 value specified in the 'slide_val' 630 */ 631 #define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) \ 632 { \ 633 v16i8 zero_m = { 0 }; \ 634 out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val); \ 635 out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val); \ 636 } 637 #define SLDI_B2_0_SW(...) 
SLDI_B2_0(v4i32, __VA_ARGS__) 638 639 #define SLDI_B4_0(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, \ 640 slide_val) \ 641 { \ 642 SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \ 643 SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val); \ 644 } 645 #define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__) 646 647 /* Description : Immediate number of elements to slide 648 Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val 649 Outputs - out0, out1 650 Return Type - as per RTYPE 651 Details : Byte elements from 'in0_0' vector are slid into 'in1_0' by 652 value specified in the 'slide_val' 653 */ 654 #define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \ 655 { \ 656 out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val); \ 657 out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val); \ 658 } 659 #define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__) 660 #define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__) 661 662 #define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, out0, out1, \ 663 out2, slide_val) \ 664 { \ 665 SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \ 666 out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val); \ 667 } 668 #define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__) 669 #define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__) 670 671 /* Description : Shuffle byte vector elements as per mask vector 672 Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 673 Outputs - out0, out1 674 Return Type - as per RTYPE 675 Details : Byte elements from 'in0' & 'in1' are copied selectively to 676 'out0' as per control vector 'mask0' 677 */ 678 #define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \ 679 { \ 680 out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \ 681 out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \ 682 } 683 #define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) 684 #define VSHF_B2_SB(...) 
VSHF_B2(v16i8, __VA_ARGS__) 685 #define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__) 686 687 #define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, out0, out1, out2, \ 688 out3) \ 689 { \ 690 VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \ 691 VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \ 692 } 693 #define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__) 694 #define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__) 695 696 /* Description : Dot product of byte vector elements 697 Arguments : Inputs - mult0, mult1, cnst0, cnst1 698 Outputs - out0, out1 699 Return Type - as per RTYPE 700 Details : Unsigned byte elements from 'mult0' are multiplied with 701 unsigned byte elements from 'cnst0' producing a result 702 twice the size of input i.e. unsigned halfword. 703 The multiplication result of adjacent odd-even elements 704 are added together and written to the 'out0' vector 705 */ 706 #define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 707 { \ 708 out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0); \ 709 out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1); \ 710 } 711 #define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__) 712 713 #define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ 714 cnst3, out0, out1, out2, out3) \ 715 { \ 716 DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ 717 DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ 718 } 719 #define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__) 720 721 /* Description : Dot product of byte vector elements 722 Arguments : Inputs - mult0, mult1, cnst0, cnst1 723 Outputs - out0, out1 724 Return Type - as per RTYPE 725 Details : Signed byte elements from 'mult0' are multiplied with 726 signed byte elements from 'cnst0' producing a result 727 twice the size of input i.e. signed halfword. 
728 The multiplication result of adjacent odd-even elements 729 are added together and written to the 'out0' vector 730 */ 731 #define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 732 { \ 733 out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0); \ 734 out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1); \ 735 } 736 #define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__) 737 738 #define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ 739 cnst3, out0, out1, out2, out3) \ 740 { \ 741 DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ 742 DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ 743 } 744 #define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__) 745 746 /* Description : Dot product of halfword vector elements 747 Arguments : Inputs - mult0, mult1, cnst0, cnst1 748 Outputs - out0, out1 749 Return Type - as per RTYPE 750 Details : Signed halfword elements from 'mult0' are multiplied with 751 signed halfword elements from 'cnst0' producing a result 752 twice the size of input i.e. signed word. 753 The multiplication result of adjacent odd-even elements 754 are added together and written to the 'out0' vector 755 */ 756 #define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 757 { \ 758 out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0); \ 759 out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1); \ 760 } 761 #define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__) 762 763 #define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ 764 cnst3, out0, out1, out2, out3) \ 765 { \ 766 DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ 767 DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ 768 } 769 #define DOTP_SH4_SW(...) 
DOTP_SH4(v4i32, __VA_ARGS__) 770 771 /* Description : Dot product of word vector elements 772 Arguments : Inputs - mult0, mult1, cnst0, cnst1 773 Outputs - out0, out1 774 Return Type - as per RTYPE 775 Details : Signed word elements from 'mult0' are multiplied with 776 signed word elements from 'cnst0' producing a result 777 twice the size of input i.e. signed double word. 778 The multiplication result of adjacent odd-even elements 779 are added together and written to the 'out0' vector 780 */ 781 #define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 782 { \ 783 out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0); \ 784 out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1); \ 785 } 786 #define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__) 787 788 /* Description : Dot product & addition of byte vector elements 789 Arguments : Inputs - mult0, mult1, cnst0, cnst1 790 Outputs - out0, out1 791 Return Type - as per RTYPE 792 Details : Signed byte elements from 'mult0' are multiplied with 793 signed byte elements from 'cnst0' producing a result 794 twice the size of input i.e. signed halfword. 795 The multiplication result of adjacent odd-even elements 796 are added to the 'out0' vector 797 */ 798 #define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 799 { \ 800 out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0); \ 801 out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1); \ 802 } 803 #define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__) 804 805 #define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ 806 cnst3, out0, out1, out2, out3) \ 807 { \ 808 DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ 809 DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ 810 } 811 #define DPADD_SB4_SH(...) 
DPADD_SB4(v8i16, __VA_ARGS__) 812 813 /* Description : Dot product & addition of halfword vector elements 814 Arguments : Inputs - mult0, mult1, cnst0, cnst1 815 Outputs - out0, out1 816 Return Type - as per RTYPE 817 Details : Signed halfword elements from 'mult0' are multiplied with 818 signed halfword elements from 'cnst0' producing a result 819 twice the size of input i.e. signed word. 820 The multiplication result of adjacent odd-even elements 821 are added to the 'out0' vector 822 */ 823 #define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 824 { \ 825 out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0); \ 826 out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1); \ 827 } 828 #define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__) 829 830 /* Description : Dot product & addition of double word vector elements 831 Arguments : Inputs - mult0, mult1 832 Outputs - out0, out1 833 Return Type - as per RTYPE 834 Details : Each signed word element from 'mult0' is multiplied with itself 835 producing an intermediate result twice the size of input 836 i.e. signed double word 837 The multiplication result of adjacent odd-even elements 838 are added to the 'out0' vector 839 */ 840 #define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) \ 841 { \ 842 out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)mult0, (v4i32)mult0); \ 843 out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)mult1, (v4i32)mult1); \ 844 } 845 #define DPADD_SD2_SD(...) 
DPADD_SD2(v2i64, __VA_ARGS__) 846 847 /* Description : Minimum values between unsigned elements of 848 either vector are copied to the output vector 849 Arguments : Inputs - in0, in1, min_vec 850 Outputs - in place operation 851 Return Type - as per RTYPE 852 Details : Minimum of unsigned halfword element values from 'in0' and 853 'min_vec' are written to output vector 'in0' 854 */ 855 #define MIN_UH2(RTYPE, in0, in1, min_vec) \ 856 { \ 857 in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec); \ 858 in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec); \ 859 } 860 #define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__) 861 862 #define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) \ 863 { \ 864 MIN_UH2(RTYPE, in0, in1, min_vec); \ 865 MIN_UH2(RTYPE, in2, in3, min_vec); \ 866 } 867 #define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__) 868 869 /* Description : Clips all signed halfword elements of input vector 870 between 0 & 255 871 Arguments : Input - in 872 Output - out_m 873 Return Type - signed halfword 874 */ 875 #define CLIP_SH_0_255(in) \ 876 ({ \ 877 v8i16 max_m = __msa_ldi_h(255); \ 878 v8i16 out_m; \ 879 \ 880 out_m = __msa_maxi_s_h((v8i16)in, 0); \ 881 out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \ 882 out_m; \ 883 }) 884 #define CLIP_SH2_0_255(in0, in1) \ 885 { \ 886 in0 = CLIP_SH_0_255(in0); \ 887 in1 = CLIP_SH_0_255(in1); \ 888 } 889 #define CLIP_SH4_0_255(in0, in1, in2, in3) \ 890 { \ 891 CLIP_SH2_0_255(in0, in1); \ 892 CLIP_SH2_0_255(in2, in3); \ 893 } 894 895 /* Description : Horizontal addition of 4 signed word elements of input vector 896 Arguments : Input - in (signed word vector) 897 Output - sum_m (i32 sum) 898 Return Type - signed word (GP) 899 Details : 4 signed word elements of 'in' vector are added together and 900 the resulting integer sum is returned 901 */ 902 #define HADD_SW_S32(in) \ 903 ({ \ 904 v2i64 res0_m, res1_m; \ 905 int32_t sum_m; \ 906 \ 907 res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \ 908 res1_m = __msa_splati_d(res0_m, 1); \ 909 
res0_m = res0_m + res1_m; \ 910 sum_m = __msa_copy_s_w((v4i32)res0_m, 0); \ 911 sum_m; \ 912 }) 913 914 /* Description : Horizontal addition of 8 unsigned halfword elements 915 Arguments : Inputs - in (unsigned halfword vector) 916 Outputs - sum_m (u32 sum) 917 Return Type - unsigned word 918 Details : 8 unsigned halfword elements of input vector are added 919 together and the resulting integer sum is returned 920 */ 921 #define HADD_UH_U32(in) \ 922 ({ \ 923 v4u32 res_m; \ 924 v2u64 res0_m, res1_m; \ 925 uint32_t sum_m; \ 926 \ 927 res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \ 928 res0_m = __msa_hadd_u_d(res_m, res_m); \ 929 res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \ 930 res0_m = res0_m + res1_m; \ 931 sum_m = __msa_copy_u_w((v4i32)res0_m, 0); \ 932 sum_m; \ 933 }) 934 935 /* Description : Horizontal addition of unsigned byte vector elements 936 Arguments : Inputs - in0, in1 937 Outputs - out0, out1 938 Return Type - as per RTYPE 939 Details : Each unsigned odd byte element from 'in0' is added to 940 even unsigned byte element from 'in0' (pairwise) and the 941 halfword result is written to 'out0' 942 */ 943 #define HADD_UB2(RTYPE, in0, in1, out0, out1) \ 944 { \ 945 out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0); \ 946 out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1); \ 947 } 948 #define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__) 949 950 #define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \ 951 { \ 952 HADD_UB2(RTYPE, in0, in1, out0, out1); \ 953 HADD_UB2(RTYPE, in2, in3, out2, out3); \ 954 } 955 #define HADD_UB4_UH(...) 
#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)

/* Description : Horizontal subtraction of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned odd byte element from 'in0' is subtracted from
                 even unsigned byte element from 'in0' (pairwise) and the
                 halfword result is written to 'out0'
*/
#define HSUB_UB2(RTYPE, in0, in1, out0, out1)             \
  {                                                       \
    out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0); \
    out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1); \
  }
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)

/* Description : SAD (Sum of Absolute Difference)
   Arguments   : Inputs  - in0, in1, ref0, ref1
                 Outputs - sad_m (halfword vector)
                 Return Type - unsigned halfword
   Details     : Absolute difference of all the byte elements from 'in0' with
                 'ref0' is calculated and preserved in 'diff0'. Then even-odd
                 pairs are added together to generate 8 halfword results.
*/
#define SAD_UB2_UH(in0, in1, ref0, ref1)                     \
  ({                                                         \
    v16u8 diff0_m, diff1_m;                                  \
    v8u16 sad_m = { 0 };                                     \
                                                             \
    diff0_m = __msa_asub_u_b((v16u8)in0, (v16u8)ref0);       \
    diff1_m = __msa_asub_u_b((v16u8)in1, (v16u8)ref1);       \
                                                             \
    sad_m += __msa_hadd_u_h((v16u8)diff0_m, (v16u8)diff0_m); \
    sad_m += __msa_hadd_u_h((v16u8)diff1_m, (v16u8)diff1_m); \
                                                             \
    sad_m;                                                   \
  })

/* Description : Horizontal subtraction of signed halfword vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each signed odd halfword element from 'in0' is subtracted from
                 even signed halfword element from 'in0' (pairwise) and the
                 word result is written to 'out0'
*/
#define HSUB_UH2(RTYPE, in0, in1, out0, out1)             \
  {                                                       \
    out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0); \
    out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1); \
  }
#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__)

/* Description : Set element n input vector to GPR value
   Arguments   : Inputs - in0, in1, in2, in3
                 Output - out
                 Return Type - as per RTYPE
   Details     : Set element 0 in vector 'out' to value specified in 'in0'
*/
#define INSERT_W2(RTYPE, in0, in1, out)             \
  {                                                 \
    out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \
    out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \
  }
#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)

#define INSERT_W4(RTYPE, in0, in1, in2, in3, out)    \
  {                                                  \
    out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \
    out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \
    out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2); \
    out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3); \
  }
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)

#define INSERT_D2(RTYPE, in0, in1, out)              \
  {                                                  \
    out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \
    out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \
  }
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)

/* Description : Interleave even byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'
*/
#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
  {                                                     \
    out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \
    out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \
  }
#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)

/* Description : Interleave even halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'
*/
#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)  \
  {                                                      \
    out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \
    out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \
  }
#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)

/* Description : Interleave even word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even word elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'
*/
#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1)  \
  {                                                      \
    out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \
    out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2); \
  }
#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)

/* Description : Interleave even double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double word elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'
*/
#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)  \
  {                                                      \
    out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \
    out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2); \
  }
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)

/* Description : Interleave left half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of byte elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'.
*/
#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1)  \
  {                                                     \
    out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
    out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \
  }
#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)

#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)

/* Description : Interleave left half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of halfword elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'.
*/
#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1)  \
  {                                                     \
    out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
    out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \
  }
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)

/* Description : Interleave left half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of word elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'.
*/
#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1)  \
  {                                                     \
    out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
    out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3); \
  }
#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)

/* Description : Interleave right half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements of 'in0' and 'in1' are interleaved
                 and written to out0.
*/
#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1)  \
  {                                                     \
    out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
    out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \
  }
#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)

#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)

#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \
                in11, in12, in13, in14, in15, out0, out1, out2, out3, out4,    \
                out5, out6, out7)                                              \
  {                                                                            \
    ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2,   \
            out3);                                                             \
    ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, out4, out5,   \
            out6, out7);                                                       \
  }
#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)

/* Description : Interleave right half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of halfword elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'.
*/
#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1)  \
  {                                                     \
    out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
    out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \
  }
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)

#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)

#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1)  \
  {                                                     \
    out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
    out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3); \
  }
#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)

#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)

/* Description : Interleave right half of double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of double word elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'.
*/
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                         \
    out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \
    out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \
  }
#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)

#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
  {                                                                    \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);                    \
    out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5));            \
  }
#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)

#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)

/* Description : Interleave both left and right half of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements from 'in0' and 'in1' are
                 interleaved and written to 'out0'
*/
#define ILVRL_B2(RTYPE, in0, in1, out0, out1)           \
  {                                                     \
    out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
    out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
  }
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)

#define ILVRL_H2(RTYPE, in0, in1, out0, out1)           \
  {                                                     \
    out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
    out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
  }
#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)

#define ILVRL_W2(RTYPE, in0, in1, out0, out1)           \
  {                                                     \
    out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
    out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
  }
#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)

/* Description : Saturate the halfword element values to the max
                 unsigned value of (sat_val + 1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each unsigned halfword element from 'in0' is saturated to the
                 value generated with (sat_val + 1) bit range.
                 The results are written in place
*/
#define SAT_UH2(RTYPE, in0, in1, sat_val)          \
  {                                                \
    in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \
    in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val); \
  }
#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)

#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \
  {                                                 \
    SAT_UH2(RTYPE, in0, in1, sat_val);              \
    SAT_UH2(RTYPE, in2, in3, sat_val);              \
  }
#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)

/* Description : Saturate the halfword element values to the max
                 unsigned value of (sat_val + 1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each unsigned halfword element from 'in0' is saturated to the
                 value generated with (sat_val + 1) bit range
                 The results are written in place
*/
#define SAT_SH2(RTYPE, in0, in1, sat_val)            \
  {                                                  \
    in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \
    in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val); \
  }
#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)

#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \
  {                                                 \
    SAT_SH2(RTYPE, in0, in1, sat_val);              \
    SAT_SH2(RTYPE, in2, in3, sat_val);              \
  }
#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)

/* Description : Indexed halfword element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, idx0, idx1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'idx0' element value from 'in' vector is replicated to all
                 elements in 'out0' vector
                 Valid index range for halfword operation is 0-7
*/
#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \
  {                                                  \
    out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0);   \
    out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1);   \
  }
#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)

#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, out0, out1, out2, out3) \
  {                                                                          \
    SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);                            \
    SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3);                            \
  }
#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)

/* Description : Pack even byte elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' are copied to the left half of
                 'out0' & even byte elements of 'in1' are copied to the right
                 half of 'out0'.
*/
#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)  \
  {                                                      \
    out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \
    out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \
  }
#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)

#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                 out2, out3)                                                \
  {                                                                         \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)

/* Description : Pack even halfword elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' are copied to the left half of
                 'out0' & even halfword elements of 'in1' are copied to the
                 right half of 'out0'.
*/
#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)  \
  {                                                      \
    out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \
    out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \
  }
#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)

#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                 out2, out3)                                                \
  {                                                                         \
    PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)

/* Description : Pack even double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double elements of 'in0' are copied to the left half of
                 'out0' & even double elements of 'in1' are copied to the right
                 half of 'out0'.
*/
#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)  \
  {                                                      \
    out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1); \
    out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \
  }
#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)

#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                 out2, out3)                                                \
  {                                                                         \
    PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)

/* Description : Each byte element is logically xor'ed with immediate 128
   Arguments   : Inputs  - in0, in1
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each unsigned byte element from input vector 'in0' is
                 logically xor'ed with 128 and the result is stored in-place.
*/
#define XORI_B2_128(RTYPE, in0, in1)           \
  {                                            \
    in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \
    in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \
  }
#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)

#define XORI_B3_128(RTYPE, in0, in1, in2)       \
  {                                             \
    XORI_B2_128(RTYPE, in0, in1);               \
    in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \
  }
#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)

#define XORI_B4_128(RTYPE, in0, in1, in2, in3) \
  {                                            \
    XORI_B2_128(RTYPE, in0, in1);              \
    XORI_B2_128(RTYPE, in2, in3);              \
  }
#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)

#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) \
  {                                                           \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                   \
    XORI_B3_128(RTYPE, in4, in5, in6);                        \
  }
#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)

/* Description : Average of signed halfword elements -> (a + b) / 2
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Each signed halfword element from 'in0' is added to each
                 signed halfword element of 'in1' with full precision resulting
                 in one extra bit in the result. The result is then divided by
                 2 and written to 'out0'
*/
#define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    out0 = (RTYPE)__msa_ave_s_h((v8i16)in0, (v8i16)in1);                   \
    out1 = (RTYPE)__msa_ave_s_h((v8i16)in2, (v8i16)in3);                   \
    out2 = (RTYPE)__msa_ave_s_h((v8i16)in4, (v8i16)in5);                   \
    out3 = (RTYPE)__msa_ave_s_h((v8i16)in6, (v8i16)in7);                   \
  }
#define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__)

/* Description : Addition of signed halfword elements and signed saturation
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'in0' are added to signed
                 halfword elements of 'in1'. The result is then signed
                 saturated between halfword data type range
*/
#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1)  \
  {                                                      \
    out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1); \
    out1 = (RTYPE)__msa_adds_s_h((v8i16)in2, (v8i16)in3); \
  }
#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)

#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                 out2, out3)                                                \
  {                                                                         \
    ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)

/* Description : Shift left all elements of vector (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in place operation
                 Return Type - as per input vector RTYPE
   Details     : Each element of vector 'in0' is left shifted by 'shift' and
                 the result is written in-place.
*/
#define SLLI_4V(in0, in1, in2, in3, shift) \
  {                                        \
    in0 = in0 << shift;                    \
    in1 = in1 << shift;                    \
    in2 = in2 << shift;                    \
    in3 = in3 << shift;                    \
  }
/* Description : Arithmetic shift right all elements of vector
                 (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in place operation
                 Return Type - as per input vector RTYPE
   Details     : Each element of vector 'in0' is right shifted by 'shift' and
                 the result is written in-place. 'shift' is a GP variable.
*/
#define SRA_4V(in0, in1, in2, in3, shift) \
  {                                       \
    in0 = in0 >> shift;                   \
    in1 = in1 >> shift;                   \
    in2 = in2 >> shift;                   \
    in3 = in3 >> shift;                   \
  }

/* Description : Shift right arithmetic rounded words
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetically
                 by the number of bits in the corresponding element in the
                 vector 'shift'. The last discarded bit is added to shifted
                 value for rounding and the result is written in-place.
                 'shift' is a vector.
*/
#define SRAR_W2(RTYPE, in0, in1, shift)                \
  {                                                    \
    in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift); \
    in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift); \
  }

#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \
  {                                               \
    SRAR_W2(RTYPE, in0, in1, shift);              \
    SRAR_W2(RTYPE, in2, in3, shift);              \
  }
#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)

/* Description : Shift right arithmetic rounded (immediate)
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetically
                 by the value in 'shift'. The last discarded bit is added to
                 the shifted value for rounding and the result is written
                 in-place. 'shift' is an immediate value.
*/
#define SRARI_H2(RTYPE, in0, in1, shift)         \
  {                                              \
    in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \
    in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift); \
  }
#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)

#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \
  {                                                \
    SRARI_H2(RTYPE, in0, in1, shift);              \
    SRARI_H2(RTYPE, in2, in3, shift);              \
  }
#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)

#define SRARI_W2(RTYPE, in0, in1, shift)           \
  {                                                \
    in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \
    in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \
  }
#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)

#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \
  {                                                \
    SRARI_W2(RTYPE, in0, in1, shift);              \
    SRARI_W2(RTYPE, in2, in3, shift);              \
  }
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)

/* Description : Logical shift right all elements of vector (immediate)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is right shifted by 'shift' and
                 the result is written in-place. 'shift' is an immediate value.
*/
#define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) \
  {                                                                       \
    out0 = (RTYPE)__msa_srli_h((v8i16)in0, shift);                        \
    out1 = (RTYPE)__msa_srli_h((v8i16)in1, shift);                        \
    out2 = (RTYPE)__msa_srli_h((v8i16)in2, shift);                        \
    out3 = (RTYPE)__msa_srli_h((v8i16)in3, shift);                        \
  }
#define SRLI_H4_SH(...) SRLI_H4(v8i16, __VA_ARGS__)

/* Description : Multiplication of pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from 'in0' is multiplied with elements from 'in1'
                 and the result is written to 'out0'
*/
#define MUL2(in0, in1, in2, in3, out0, out1) \
  {                                          \
    out0 = in0 * in1;                        \
    out1 = in2 * in3;                        \
  }
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
  {                                                                          \
    MUL2(in0, in1, in2, in3, out0, out1);                                    \
    MUL2(in4, in5, in6, in7, out2, out3);                                    \
  }

/* Description : Addition of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element in 'in0' is added to 'in1' and result is written
                 to 'out0'.
*/
#define ADD2(in0, in1, in2, in3, out0, out1) \
  {                                          \
    out0 = in0 + in1;                        \
    out1 = in2 + in3;                        \
  }
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
  {                                                                          \
    ADD2(in0, in1, in2, in3, out0, out1);                                    \
    ADD2(in4, in5, in6, in7, out2, out3);                                    \
  }
/* Description : Subtraction of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element in 'in1' is subtracted from 'in0' and result is
                 written to 'out0'.
*/
#define SUB2(in0, in1, in2, in3, out0, out1) \
  {                                          \
    out0 = in0 - in1;                        \
    out1 = in2 - in3;                        \
  }
/* Delegates to SUB2 for consistency with ADD4/MUL4; expansion is identical. */
#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
  {                                                                          \
    SUB2(in0, in1, in2, in3, out0, out1);                                    \
    SUB2(in4, in5, in6, in7, out2, out3);                                    \
  }

/* Description : Sign extend halfword elements from right half of the vector
   Arguments   : Input  - in  (halfword vector)
                 Output - out (sign extended word vector)
                 Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted and interleaved with same vector 'in0' to generate
                 4 word elements keeping sign intact
*/
#define UNPCK_R_SH_SW(in, out)                     \
  {                                                \
    v8i16 sign_m;                                  \
                                                   \
    sign_m = __msa_clti_s_h((v8i16)in, 0);         \
    out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in);  \
  }

/* Description : Zero extend unsigned byte elements to halfword elements
   Arguments   : Input   - in (unsigned byte vector)
                 Outputs - out0, out1 (unsigned halfword vectors)
                 Return Type - signed halfword
   Details     : Zero extended right half of vector is returned in 'out0'
                 Zero extended left half of vector is returned in 'out1'
*/
#define UNPCK_UB_SH(in, out0, out1)      \
  {                                      \
    v16i8 zero_m = { 0 };                \
                                         \
    ILVRL_B2_SH(zero_m, in, out0, out1); \
  }

/* Description : Sign extend halfword elements from input vector and return
                 the result in pair of vectors
   Arguments   : Input   - in (halfword vector)
                 Outputs - out0, out1 (sign extended word vectors)
                 Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted and interleaved right with same vector 'in0' to
                 generate 4 signed word elements in 'out0'
                 Then interleaved left with same vector 'in0' to
                 generate 4 signed word elements in 'out1'
*/
#define UNPCK_SH_SW(in, out0, out1)       \
  {                                       \
    v8i16 tmp_m;                          \
                                          \
    tmp_m = __msa_clti_s_h((v8i16)in, 0); \
    ILVRL_H2_SW(tmp_m, in, out0, out1);   \
  }

/* Description : Butterfly of 4 input vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
   Details     : Butterfly operation
*/
#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                             \
    out0 = in0 + in3;                                           \
    out1 = in1 + in2;                                           \
                                                                \
    out2 = in1 - in2;                                           \
    out3 = in0 - in3;                                           \
  }

/* Description : Butterfly of 8 input vectors
   Arguments   : Inputs  - in0 ... in7
                 Outputs - out0 .. out7
   Details     : Butterfly operation
*/
#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
                    out3, out4, out5, out6, out7)                             \
  {                                                                           \
    out0 = in0 + in7;                                                         \
    out1 = in1 + in6;                                                         \
    out2 = in2 + in5;                                                         \
    out3 = in3 + in4;                                                         \
                                                                              \
    out4 = in3 - in4;                                                         \
    out5 = in2 - in5;                                                         \
    out6 = in1 - in6;                                                         \
    out7 = in0 - in7;                                                         \
  }

/* Description : Butterfly of 16 input vectors
   Arguments   : Inputs  - in0 ... in15
                 Outputs - out0 .. out15
   Details     : Butterfly operation
*/
#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \
                     in11, in12, in13, in14, in15, out0, out1, out2, out3,   \
                     out4, out5, out6, out7, out8, out9, out10, out11,       \
                     out12, out13, out14, out15)                             \
  {                                                                          \
    out0 = in0 + in15;                                                       \
    out1 = in1 + in14;                                                       \
    out2 = in2 + in13;                                                       \
    out3 = in3 + in12;                                                       \
    out4 = in4 + in11;                                                       \
    out5 = in5 + in10;                                                       \
    out6 = in6 + in9;                                                        \
    out7 = in7 + in8;                                                        \
                                                                             \
    out8 = in7 - in8;                                                        \
    out9 = in6 - in9;                                                        \
    out10 = in5 - in10;                                                      \
    out11 = in4 - in11;                                                      \
    out12 = in3 - in12;                                                      \
    out13 = in2 - in13;                                                      \
    out14 = in1 - in14;                                                      \
    out15 = in0 - in15;                                                      \
  }

/* Description : Transpose input 8x8 byte block
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - as per RTYPE
*/
#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0,  \
                        out1, out2, out3, out4, out5, out6, out7)             \
  {                                                                           \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                     \
    v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                     \
                                                                              \
    ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, tmp0_m, tmp1_m, tmp2_m, \
               tmp3_m);                                                       \
    ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m);                              \
    ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m);                              \
    ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2);                              \
    ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6);                              \
    SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8);                              \
    SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8);                              \
  }
TRANSPOSE8x8_UB(v16u8, __VA_ARGS__) 1820 1821 /* Description : Transpose 16x8 block into 8x16 with byte elements in vectors 1822 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, 1823 in8, in9, in10, in11, in12, in13, in14, in15 1824 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 1825 Return Type - unsigned byte 1826 */ 1827 #define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \ 1828 in10, in11, in12, in13, in14, in15, out0, out1, \ 1829 out2, out3, out4, out5, out6, out7) \ 1830 { \ 1831 v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 1832 v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ 1833 \ 1834 ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \ 1835 ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \ 1836 ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \ 1837 ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \ 1838 \ 1839 tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7); \ 1840 tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7); \ 1841 tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5); \ 1842 tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5); \ 1843 out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3); \ 1844 tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3); \ 1845 out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1); \ 1846 tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1); \ 1847 \ 1848 ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \ 1849 out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ 1850 out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ 1851 \ 1852 tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \ 1853 tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5); \ 1854 out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ 1855 out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ 1856 \ 1857 ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \ 1858 out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ 1859 out5 = 
(v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ 1860 \ 1861 tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ 1862 tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ 1863 tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ 1864 tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ 1865 out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ 1866 out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ 1867 } 1868 1869 /* Description : Transpose 4x4 block with half word elements in vectors 1870 Arguments : Inputs - in0, in1, in2, in3 1871 Outputs - out0, out1, out2, out3 1872 Return Type - signed halfword 1873 */ 1874 #define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \ 1875 { \ 1876 v8i16 s0_m, s1_m; \ 1877 \ 1878 ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m); \ 1879 ILVRL_W2_SH(s1_m, s0_m, out0, out2); \ 1880 out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ 1881 out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2); \ 1882 } 1883 1884 /* Description : Transpose 4x8 block with half word elements in vectors 1885 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 1886 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 1887 Return Type - signed halfword 1888 */ 1889 #define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ 1890 out2, out3, out4, out5, out6, out7) \ 1891 { \ 1892 v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 1893 v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n; \ 1894 v8i16 zero_m = { 0 }; \ 1895 \ 1896 ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, tmp0_n, tmp1_n, tmp2_n, \ 1897 tmp3_n); \ 1898 ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m); \ 1899 ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m); \ 1900 \ 1901 out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m); \ 1902 out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m); \ 1903 out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m); \ 1904 out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, 
(v2i64)tmp2_m); \ 1905 \ 1906 out4 = zero_m; \ 1907 out5 = zero_m; \ 1908 out6 = zero_m; \ 1909 out7 = zero_m; \ 1910 } 1911 1912 /* Description : Transpose 8x4 block with half word elements in vectors 1913 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 1914 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 1915 Return Type - signed halfword 1916 */ 1917 #define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \ 1918 { \ 1919 v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 1920 \ 1921 ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m); \ 1922 ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m); \ 1923 ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \ 1924 ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \ 1925 } 1926 1927 /* Description : Transpose 8x8 block with half word elements in vectors 1928 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 1929 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 1930 Return Type - as per RTYPE 1931 */ 1932 #define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \ 1933 out1, out2, out3, out4, out5, out6, out7) \ 1934 { \ 1935 v8i16 s0_m, s1_m; \ 1936 v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 1937 v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ 1938 \ 1939 ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \ 1940 ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m); \ 1941 ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \ 1942 ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m); \ 1943 ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ 1944 ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m); \ 1945 ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ 1946 ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \ 1947 PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, tmp3_m, \ 1948 tmp7_m, out0, out2, out4, out6); \ 1949 out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \ 1950 out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \ 1951 out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \ 1952 out7 = 
(RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \ 1953 } 1954 #define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__) 1955 1956 /* Description : Transpose 4x4 block with word elements in vectors 1957 Arguments : Inputs - in0, in1, in2, in3 1958 Outputs - out0, out1, out2, out3 1959 Return Type - signed word 1960 */ 1961 #define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \ 1962 { \ 1963 v4i32 s0_m, s1_m, s2_m, s3_m; \ 1964 \ 1965 ILVRL_W2_SW(in1, in0, s0_m, s1_m); \ 1966 ILVRL_W2_SW(in3, in2, s2_m, s3_m); \ 1967 \ 1968 out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m); \ 1969 out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m); \ 1970 out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m); \ 1971 out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m); \ 1972 } 1973 1974 /* Description : Add block 4x4 1975 Arguments : Inputs - in0, in1, in2, in3, pdst, stride 1976 Details : Least significant 4 bytes from each input vector are added to 1977 the destination bytes, clipped between 0-255 and stored. 
1978 */ 1979 #define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) \ 1980 { \ 1981 uint32_t src0_m, src1_m, src2_m, src3_m; \ 1982 v8i16 inp0_m, inp1_m, res0_m, res1_m; \ 1983 v16i8 dst0_m = { 0 }; \ 1984 v16i8 dst1_m = { 0 }; \ 1985 v16i8 zero_m = { 0 }; \ 1986 \ 1987 ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m) \ 1988 LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \ 1989 INSERT_W2_SB(src0_m, src1_m, dst0_m); \ 1990 INSERT_W2_SB(src2_m, src3_m, dst1_m); \ 1991 ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \ 1992 ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \ 1993 CLIP_SH2_0_255(res0_m, res1_m); \ 1994 PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \ 1995 ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride); \ 1996 } 1997 1998 /* Description : Pack even elements of input vectors & xor with 128 1999 Arguments : Inputs - in0, in1 2000 Output - out_m 2001 Return Type - unsigned byte 2002 Details : Signed byte even elements from 'in0' and 'in1' are packed 2003 together in one vector and the resulting vector is xor'ed with 2004 128 to shift the range from signed to unsigned byte 2005 */ 2006 #define PCKEV_XORI128_UB(in0, in1) \ 2007 ({ \ 2008 v16u8 out_m; \ 2009 \ 2010 out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0); \ 2011 out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128); \ 2012 out_m; \ 2013 }) 2014 2015 /* Description : Converts inputs to unsigned bytes, interleave, average & store 2016 as 8x4 unsigned byte block 2017 Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3, 2018 pdst, stride 2019 */ 2020 #define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, dst2, dst3, \ 2021 pdst, stride) \ 2022 { \ 2023 v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 2024 \ 2025 tmp0_m = PCKEV_XORI128_UB(in0, in1); \ 2026 tmp1_m = PCKEV_XORI128_UB(in2, in3); \ 2027 ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ 2028 AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ 2029 ST8x4_UB(tmp0_m, tmp1_m, pdst, 
stride); \ 2030 } 2031 2032 /* Description : Pack even byte elements and store byte vector in destination 2033 memory 2034 Arguments : Inputs - in0, in1, pdst 2035 */ 2036 #define PCKEV_ST_SB(in0, in1, pdst) \ 2037 { \ 2038 v16i8 tmp_m; \ 2039 \ 2040 tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0); \ 2041 ST_SB(tmp_m, (pdst)); \ 2042 } 2043 2044 /* Description : Horizontal 2 tap filter kernel code 2045 Arguments : Inputs - in0, in1, mask, coeff, shift 2046 */ 2047 #define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) \ 2048 ({ \ 2049 v16i8 tmp0_m; \ 2050 v8u16 tmp1_m; \ 2051 \ 2052 tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0); \ 2053 tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff); \ 2054 tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift); \ 2055 \ 2056 tmp1_m; \ 2057 }) 2058 #endif // AOM_AOM_DSP_MIPS_MACROS_MSA_H_ 2059