1 /* 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #ifndef VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_ 12 #define VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_ 13 14 #include <msa.h> 15 16 #include "./vpx_config.h" 17 #include "vpx/vpx_integer.h" 18 19 #define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc)) 20 #define LD_UB(...) LD_B(v16u8, __VA_ARGS__) 21 #define LD_SB(...) LD_B(v16i8, __VA_ARGS__) 22 23 #define LD_H(RTYPE, psrc) *((const RTYPE *)(psrc)) 24 #define LD_UH(...) LD_H(v8u16, __VA_ARGS__) 25 #define LD_SH(...) LD_H(v8i16, __VA_ARGS__) 26 27 #define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc)) 28 #define LD_UW(...) LD_W(v4u32, __VA_ARGS__) 29 #define LD_SW(...) LD_W(v4i32, __VA_ARGS__) 30 31 #define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) 32 #define ST_UB(...) ST_B(v16u8, __VA_ARGS__) 33 #define ST_SB(...) ST_B(v16i8, __VA_ARGS__) 34 35 #define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) 36 #define ST_UH(...) ST_H(v8u16, __VA_ARGS__) 37 #define ST_SH(...) ST_H(v8i16, __VA_ARGS__) 38 39 #define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) 40 #define ST_SW(...) ST_W(v4i32, __VA_ARGS__) 41 42 #if (__mips_isa_rev >= 6) 43 #define LW(psrc) \ 44 ({ \ 45 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ 46 uint32_t val_m; \ 47 \ 48 asm volatile ( \ 49 "lw %[val_m], %[psrc_m] \n\t" \ 50 \ 51 : [val_m] "=r" (val_m) \ 52 : [psrc_m] "m" (*psrc_m) \ 53 ); \ 54 \ 55 val_m; \ 56 }) 57 58 #if (__mips == 64) 59 #define LD(psrc) \ 60 ({ \ 61 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ 62 uint64_t val_m = 0; \ 63 \ 64 asm volatile ( \ 65 "ld %[val_m], %[psrc_m] \n\t" \ 66 \ 67 : [val_m] "=r" (val_m) \ 68 : [psrc_m] "m" (*psrc_m) \ 69 ); \ 70 \ 71 val_m; \ 72 }) 73 #else // !(__mips == 64) 74 #define LD(psrc) \ 75 ({ \ 76 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ 77 uint32_t val0_m, val1_m; \ 78 uint64_t val_m = 0; \ 79 \ 80 val0_m = LW(psrc_m); \ 81 val1_m = LW(psrc_m + 4); \ 82 \ 83 val_m = (uint64_t)(val1_m); \ 84 val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ 85 val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ 86 \ 87 val_m; \ 88 }) 89 #endif // (__mips == 64) 90 91 #define SH(val, pdst) \ 92 { \ 93 uint8_t *pdst_m = (uint8_t *)(pdst); \ 94 const uint16_t val_m = (val); \ 95 \ 96 asm volatile ( \ 97 "sh %[val_m], %[pdst_m] \n\t" \ 98 \ 99 : [pdst_m] "=m" (*pdst_m) \ 100 : [val_m] "r" (val_m) \ 101 ); \ 102 } 103 104 #define SW(val, pdst) \ 105 { \ 106 uint8_t *pdst_m = (uint8_t *)(pdst); \ 107 const uint32_t val_m = (val); \ 108 \ 109 asm volatile ( \ 110 "sw %[val_m], %[pdst_m] \n\t" \ 111 \ 112 : [pdst_m] "=m" (*pdst_m) \ 113 : [val_m] "r" (val_m) \ 114 ); \ 115 } 116 117 #define SD(val, pdst) \ 118 { \ 119 uint8_t *pdst_m = (uint8_t *)(pdst); \ 120 const uint64_t val_m = (val); \ 121 \ 122 asm volatile ( \ 123 "sd %[val_m], %[pdst_m] \n\t" \ 124 \ 125 : [pdst_m] "=m" (*pdst_m) \ 126 : [val_m] "r" (val_m) \ 127 ); \ 128 } 129 #else // !(__mips_isa_rev >= 6) 130 #define LW(psrc) \ 131 ({ \ 132 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ 133 uint32_t val_m; \ 134 \ 135 asm volatile ( \ 136 "ulw %[val_m], %[psrc_m] \n\t" \ 137 \ 138 : [val_m] "=r" (val_m) \ 139 : [psrc_m] "m" (*psrc_m) \ 140 ); \ 141 \ 142 val_m; \ 143 }) 144 145 #if (__mips == 64) 146 #define LD(psrc) \ 147 ({ \ 148 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ 149 uint64_t val_m = 0; \ 150 \ 151 asm volatile ( \ 152 "uld %[val_m], %[psrc_m] \n\t" \ 153 \ 154 : [val_m] "=r" (val_m) \ 155 : [psrc_m] "m" (*psrc_m) \ 156 ); \ 157 \ 158 val_m; \ 159 }) 160 #else // !(__mips == 64) 161 #define LD(psrc) \ 162 ({ \ 163 const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \ 164 uint32_t val0_m, val1_m; \ 165 uint64_t val_m = 0; \ 166 \ 167 val0_m = LW(psrc_m1); \ 168 val1_m = LW(psrc_m1 + 4); \ 169 \ 170 val_m = (uint64_t)(val1_m); \ 171 val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ 172 val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ 173 \ 174 val_m; \ 175 }) 176 #endif // (__mips == 64) 177 #define SH(val, pdst) \ 178 { \ 179 uint8_t *pdst_m = (uint8_t *)(pdst); \ 180 const uint16_t val_m = (val); \ 181 \ 182 asm volatile ( \ 183 "ush %[val_m], %[pdst_m] \n\t" \ 184 \ 185 : [pdst_m] "=m" (*pdst_m) \ 186 : [val_m] "r" (val_m) \ 187 ); \ 188 } 189 190 #define SW(val, pdst) \ 191 { \ 192 uint8_t *pdst_m = (uint8_t *)(pdst); \ 193 const uint32_t val_m = (val); \ 194 \ 195 asm volatile ( \ 196 "usw %[val_m], %[pdst_m] \n\t" \ 197 \ 198 : [pdst_m] "=m" (*pdst_m) \ 199 : [val_m] "r" (val_m) \ 200 ); \ 201 } 202 203 #define SD(val, pdst) \ 204 { \ 205 uint8_t *pdst_m1 = (uint8_t *)(pdst); \ 206 uint32_t val0_m, val1_m; \ 207 \ 208 val0_m = (uint32_t)((val) & 0x00000000FFFFFFFF); \ 209 val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ 210 \ 211 SW(val0_m, pdst_m1); \ 212 SW(val1_m, pdst_m1 + 4); \ 213 } 214 #endif // (__mips_isa_rev >= 6) 215 216 /* Description : Load 4 words with stride 217 Arguments : Inputs - psrc, stride 218 Outputs - out0, out1, out2, out3 219 Details : Load word in 'out0' from (psrc) 220 Load word in 'out1' from (psrc + stride) 221 Load word in 'out2' from (psrc + 2 * stride) 222 Load word in 'out3' from (psrc + 3 * stride) 223 */ 224 #define LW4(psrc, stride, out0, out1, out2, out3) \ 225 { \ 226 out0 = LW((psrc)); \ 227 out1 = LW((psrc) + stride); \ 228 out2 = LW((psrc) + 2 * stride); \ 229 out3 = LW((psrc) + 3 * stride); \ 230 } 231 232 /* Description : Load double words with stride 233 Arguments : Inputs - psrc, stride 234 Outputs - out0, out1 235 Details : Load double word in 'out0' from (psrc) 236 Load double word in 'out1' from (psrc + stride) 237 */ 238 #define LD2(psrc, stride, out0, out1) \ 239 { \ 240 out0 = LD((psrc)); \ 241 out1 = LD((psrc) + stride); \ 242 } 243 #define LD4(psrc, stride, out0, out1, out2, out3) \ 244 { \ 245 LD2((psrc), stride, out0, out1); \ 246 LD2((psrc) + 2 * stride, stride, out2, out3); \ 247 } 248 249 /* Description : Store 4 words with stride 250 Arguments : Inputs - in0, in1, in2, in3, pdst, stride 251 Details : Store word from 'in0' to (pdst) 252 Store word from 'in1' to (pdst + stride) 253 Store word from 'in2' to (pdst + 2 * stride) 254 Store word from 'in3' to (pdst + 3 * stride) 255 */ 256 #define SW4(in0, in1, in2, in3, pdst, stride) \ 257 { \ 258 SW(in0, (pdst)); \ 259 SW(in1, (pdst) + stride); \ 260 SW(in2, (pdst) + 2 * stride); \ 261 SW(in3, (pdst) + 3 * stride); \ 262 } 263 264 /* Description : Store 4 double words with stride 265 Arguments : Inputs - in0, in1, in2, in3, pdst, stride 266 Details : Store double word from 'in0' to (pdst) 267 Store double word from 'in1' to (pdst + stride) 268 Store double word from 'in2' to (pdst + 2 * stride) 269 Store double word from 'in3' to (pdst + 3 * stride) 270 */ 271 #define SD4(in0, in1, in2, in3, pdst, stride) \ 272 { \ 273 SD(in0, (pdst)); \ 274 SD(in1, (pdst) + stride); \ 275 SD(in2, (pdst) + 2 * stride); \ 276 SD(in3, (pdst) + 3 * stride); \ 277 } 278 279 /* Description : Load vectors with 16 byte elements with stride 280 Arguments : Inputs - psrc, stride 281 Outputs - out0, out1 282 Return Type - as per RTYPE 283 Details : Load 16 byte elements in 'out0' from (psrc) 284 Load 16 byte elements in 'out1' from (psrc + stride) 285 */ 286 #define LD_B2(RTYPE, psrc, stride, out0, out1) \ 287 { \ 288 out0 = LD_B(RTYPE, (psrc)); \ 289 out1 = LD_B(RTYPE, (psrc) + stride); \ 290 } 291 #define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__) 292 #define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__) 293 294 #define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \ 295 { \ 296 LD_B2(RTYPE, (psrc), stride, out0, out1); \ 297 out2 = LD_B(RTYPE, (psrc) + 2 * stride); \ 298 } 299 #define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__) 300 #define LD_SB3(...) LD_B3(v16i8, __VA_ARGS__) 301 302 #define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \ 303 { \ 304 LD_B2(RTYPE, (psrc), stride, out0, out1); \ 305 LD_B2(RTYPE, (psrc) + 2 * stride , stride, out2, out3); \ 306 } 307 #define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__) 308 #define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__) 309 310 #define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \ 311 { \ 312 LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ 313 out4 = LD_B(RTYPE, (psrc) + 4 * stride); \ 314 } 315 #define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__) 316 #define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__) 317 318 #define LD_B8(RTYPE, psrc, stride, \ 319 out0, out1, out2, out3, out4, out5, out6, out7) \ 320 { \ 321 LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ 322 LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ 323 } 324 #define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__) 325 #define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__) 326 327 /* Description : Load vectors with 8 halfword elements with stride 328 Arguments : Inputs - psrc, stride 329 Outputs - out0, out1 330 Details : Load 8 halfword elements in 'out0' from (psrc) 331 Load 8 halfword elements in 'out1' from (psrc + stride) 332 */ 333 #define LD_H2(RTYPE, psrc, stride, out0, out1) \ 334 { \ 335 out0 = LD_H(RTYPE, (psrc)); \ 336 out1 = LD_H(RTYPE, (psrc) + (stride)); \ 337 } 338 #define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__) 339 340 #define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) \ 341 { \ 342 LD_H2(RTYPE, (psrc), stride, out0, out1); \ 343 LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ 344 } 345 #define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__) 346 347 /* Description : Load 2 vectors of signed word elements with stride 348 Arguments : Inputs - psrc, stride 349 Outputs - out0, out1 350 Return Type - signed word 351 */ 352 #define LD_SW2(psrc, stride, out0, out1) \ 353 { \ 354 out0 = LD_SW((psrc)); \ 355 out1 = LD_SW((psrc) + stride); \ 356 } 357 358 /* Description : Store vectors of 16 byte elements with stride 359 Arguments : Inputs - in0, in1, pdst, stride 360 Details : Store 16 byte elements from 'in0' to (pdst) 361 Store 16 byte elements from 'in1' to (pdst + stride) 362 */ 363 #define ST_B2(RTYPE, in0, in1, pdst, stride) \ 364 { \ 365 ST_B(RTYPE, in0, (pdst)); \ 366 ST_B(RTYPE, in1, (pdst) + stride); \ 367 } 368 #define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__) 369 370 #define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \ 371 { \ 372 ST_B2(RTYPE, in0, in1, (pdst), stride); \ 373 ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ 374 } 375 #define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__) 376 #define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__) 377 378 #define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 379 pdst, stride) \ 380 { \ 381 ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \ 382 ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ 383 } 384 #define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__) 385 386 /* Description : Store vectors of 8 halfword elements with stride 387 Arguments : Inputs - in0, in1, pdst, stride 388 Details : Store 8 halfword elements from 'in0' to (pdst) 389 Store 8 halfword elements from 'in1' to (pdst + stride) 390 */ 391 #define ST_H2(RTYPE, in0, in1, pdst, stride) \ 392 { \ 393 ST_H(RTYPE, in0, (pdst)); \ 394 ST_H(RTYPE, in1, (pdst) + stride); \ 395 } 396 #define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__) 397 398 /* Description : Store vectors of word elements with stride 399 Arguments : Inputs - in0, in1, pdst, stride 400 Details : Store 4 word elements from 'in0' to (pdst) 401 Store 4 word elements from 'in1' to (pdst + stride) 402 */ 403 #define ST_SW2(in0, in1, pdst, stride) \ 404 { \ 405 ST_SW(in0, (pdst)); \ 406 ST_SW(in1, (pdst) + stride); \ 407 } 408 409 /* Description : Store 2x4 byte block to destination memory from input vector 410 Arguments : Inputs - in, stidx, pdst, stride 411 Details : Index 'stidx' halfword element from 'in' vector is copied to 412 the GP register and stored to (pdst) 413 Index 'stidx+1' halfword element from 'in' vector is copied to 414 the GP register and stored to (pdst + stride) 415 Index 'stidx+2' halfword element from 'in' vector is copied to 416 the GP register and stored to (pdst + 2 * stride) 417 Index 'stidx+3' halfword element from 'in' vector is copied to 418 the GP register and stored to (pdst + 3 * stride) 419 */ 420 #define ST2x4_UB(in, stidx, pdst, stride) \ 421 { \ 422 uint16_t out0_m, out1_m, out2_m, out3_m; \ 423 uint8_t *pblk_2x4_m = (uint8_t *)(pdst); \ 424 \ 425 out0_m = __msa_copy_u_h((v8i16)in, (stidx)); \ 426 out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1)); \ 427 out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2)); \ 428 out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3)); \ 429 \ 430 SH(out0_m, pblk_2x4_m); \ 431 SH(out1_m, pblk_2x4_m + stride); \ 432 SH(out2_m, pblk_2x4_m + 2 * stride); \ 433 SH(out3_m, pblk_2x4_m + 3 * stride); \ 434 } 435 436 /* Description : Store 4x4 byte block to destination memory from input vector 437 Arguments : Inputs - in0, in1, pdst, stride 438 Details : 'Idx0' word element from input vector 'in0' is copied to the 439 GP register and stored to (pdst) 440 'Idx1' word element from input vector 'in0' is copied to the 441 GP register and stored to (pdst + stride) 442 'Idx2' word element from input vector 'in0' is copied to the 443 GP register and stored to (pdst + 2 * stride) 444 'Idx3' word element from input vector 'in0' is copied to the 445 GP register and stored to (pdst + 3 * stride) 446 */ 447 #define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \ 448 { \ 449 uint32_t out0_m, out1_m, out2_m, out3_m; \ 450 uint8_t *pblk_4x4_m = (uint8_t *)(pdst); \ 451 \ 452 out0_m = __msa_copy_u_w((v4i32)in0, idx0); \ 453 out1_m = __msa_copy_u_w((v4i32)in0, idx1); \ 454 out2_m = __msa_copy_u_w((v4i32)in1, idx2); \ 455 out3_m = __msa_copy_u_w((v4i32)in1, idx3); \ 456 \ 457 SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \ 458 } 459 #define ST4x8_UB(in0, in1, pdst, stride) \ 460 { \ 461 uint8_t *pblk_4x8 = (uint8_t *)(pdst); \ 462 \ 463 ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride); \ 464 ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \ 465 } 466 467 /* Description : Store 8x1 byte block to destination memory from input vector 468 Arguments : Inputs - in, pdst 469 Details : Index 0 double word element from 'in' vector is copied to the 470 GP register and stored to (pdst) 471 */ 472 #define ST8x1_UB(in, pdst) \ 473 { \ 474 uint64_t out0_m; \ 475 \ 476 out0_m = __msa_copy_u_d((v2i64)in, 0); \ 477 SD(out0_m, pdst); \ 478 } 479 480 /* Description : Store 8x2 byte block to destination memory from input vector 481 Arguments : Inputs - in, pdst, stride 482 Details : Index 0 double word element from 'in' vector is copied to the 483 GP register and stored to (pdst) 484 Index 1 double word element from 'in' vector is copied to the 485 GP register and stored to (pdst + stride) 486 */ 487 #define ST8x2_UB(in, pdst, stride) \ 488 { \ 489 uint64_t out0_m, out1_m; \ 490 uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \ 491 \ 492 out0_m = __msa_copy_u_d((v2i64)in, 0); \ 493 out1_m = __msa_copy_u_d((v2i64)in, 1); \ 494 \ 495 SD(out0_m, pblk_8x2_m); \ 496 SD(out1_m, pblk_8x2_m + stride); \ 497 } 498 499 /* Description : Store 8x4 byte block to destination memory from input 500 vectors 501 Arguments : Inputs - in0, in1, pdst, stride 502 Details : Index 0 double word element from 'in0' vector is copied to the 503 GP register and stored to (pdst) 504 Index 1 double word element from 'in0' vector is copied to the 505 GP register and stored to (pdst + stride) 506 Index 0 double word element from 'in1' vector is copied to the 507 GP register and stored to (pdst + 2 * stride) 508 Index 1 double word element from 'in1' vector is copied to the 509 GP register and stored to (pdst + 3 * stride) 510 */ 511 #define ST8x4_UB(in0, in1, pdst, stride) \ 512 { \ 513 uint64_t out0_m, out1_m, out2_m, out3_m; \ 514 uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \ 515 \ 516 out0_m = __msa_copy_u_d((v2i64)in0, 0); \ 517 out1_m = __msa_copy_u_d((v2i64)in0, 1); \ 518 out2_m = __msa_copy_u_d((v2i64)in1, 0); \ 519 out3_m = __msa_copy_u_d((v2i64)in1, 1); \ 520 \ 521 SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \ 522 } 523 524 /* Description : Immediate number of elements to slide with zero 525 Arguments : Inputs - in0, in1, slide_val 526 Outputs - out0, out1 527 Return Type - as per RTYPE 528 Details : Byte elements from 'zero_m' vector are slid into 'in0' by 529 value specified in the 'slide_val' 530 */ 531 #define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) \ 532 { \ 533 v16i8 zero_m = { 0 }; \ 534 \ 535 out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val); \ 536 out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val); \ 537 } 538 #define SLDI_B2_0_UB(...) SLDI_B2_0(v16u8, __VA_ARGS__) 539 540 /* Description : Immediate number of elements to slide 541 Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val 542 Outputs - out0, out1 543 Return Type - as per RTYPE 544 Details : Byte elements from 'in0_0' vector are slid into 'in1_0' by 545 value specified in the 'slide_val' 546 */ 547 #define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \ 548 { \ 549 out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val); \ 550 out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val); \ 551 } 552 553 #define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, \ 554 out0, out1, out2, slide_val) \ 555 { \ 556 SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val); \ 557 out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val); \ 558 } 559 #define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__) 560 561 /* Description : Shuffle byte vector elements as per mask vector 562 Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 563 Outputs - out0, out1 564 Return Type - as per RTYPE 565 Details : Byte elements from 'in0' & 'in1' are copied selectively to 566 'out0' as per control vector 'mask0' 567 */ 568 #define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \ 569 { \ 570 out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \ 571 out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \ 572 } 573 #define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) 574 #define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__) 575 #define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__) 576 577 #define VSHF_B3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2, \ 578 out0, out1, out2) \ 579 { \ 580 VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1); \ 581 out2 = (RTYPE)__msa_vshf_b((v16i8)mask2, (v16i8)in5, (v16i8)in4); \ 582 } 583 #define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__) 584 585 /* Description : Shuffle halfword vector elements as per mask vector 586 Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 587 Outputs - out0, out1 588 Return Type - as per RTYPE 589 Details : halfword elements from 'in0' & 'in1' are copied selectively to 590 'out0' as per control vector 'mask0' 591 */ 592 #define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \ 593 { \ 594 out0 = (RTYPE)__msa_vshf_h((v8i16)mask0, (v8i16)in1, (v8i16)in0); \ 595 out1 = (RTYPE)__msa_vshf_h((v8i16)mask1, (v8i16)in3, (v8i16)in2); \ 596 } 597 #define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__) 598 599 /* Description : Dot product of byte vector elements 600 Arguments : Inputs - mult0, mult1, cnst0, cnst1 601 Outputs - out0, out1 602 Return Type - as per RTYPE 603 Details : Unsigned byte elements from 'mult0' are multiplied with 604 unsigned byte elements from 'cnst0' producing a result 605 twice the size of input i.e. unsigned halfword. 606 The multiplication result of adjacent odd-even elements 607 are added together and written to the 'out0' vector 608 */ 609 #define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 610 { \ 611 out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0); \ 612 out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1); \ 613 } 614 #define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__) 615 616 #define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, \ 617 cnst0, cnst1, cnst2, cnst3, \ 618 out0, out1, out2, out3) \ 619 { \ 620 DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ 621 DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ 622 } 623 #define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__) 624 625 /* Description : Dot product of byte vector elements 626 Arguments : Inputs - mult0, mult1, cnst0, cnst1 627 Outputs - out0, out1 628 Return Type - as per RTYPE 629 Details : Signed byte elements from 'mult0' are multiplied with 630 signed byte elements from 'cnst0' producing a result 631 twice the size of input i.e. signed halfword. 632 The multiplication result of adjacent odd-even elements 633 are added together and written to the 'out0' vector 634 */ 635 #define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 636 { \ 637 out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0); \ 638 out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1); \ 639 } 640 #define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__) 641 642 #define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, \ 643 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) \ 644 { \ 645 DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ 646 DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ 647 } 648 #define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__) 649 650 /* Description : Dot product of halfword vector elements 651 Arguments : Inputs - mult0, mult1, cnst0, cnst1 652 Outputs - out0, out1 653 Return Type - as per RTYPE 654 Details : Signed halfword elements from 'mult0' are multiplied with 655 signed halfword elements from 'cnst0' producing a result 656 twice the size of input i.e. signed word. 657 The multiplication result of adjacent odd-even elements 658 are added together and written to the 'out0' vector 659 */ 660 #define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 661 { \ 662 out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0); \ 663 out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1); \ 664 } 665 666 #define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, \ 667 cnst0, cnst1, cnst2, cnst3, \ 668 out0, out1, out2, out3) \ 669 { \ 670 DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ 671 DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ 672 } 673 #define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__) 674 675 /* Description : Dot product of word vector elements 676 Arguments : Inputs - mult0, mult1, cnst0, cnst1 677 Outputs - out0, out1 678 Return Type - as per RTYPE 679 Details : Signed word elements from 'mult0' are multiplied with 680 signed word elements from 'cnst0' producing a result 681 twice the size of input i.e. signed double word. 682 The multiplication result of adjacent odd-even elements 683 are added together and written to the 'out0' vector 684 */ 685 #define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 686 { \ 687 out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0); \ 688 out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1); \ 689 } 690 #define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__) 691 692 /* Description : Dot product & addition of byte vector elements 693 Arguments : Inputs - mult0, mult1, cnst0, cnst1 694 Outputs - out0, out1 695 Return Type - as per RTYPE 696 Details : Signed byte elements from 'mult0' are multiplied with 697 signed byte elements from 'cnst0' producing a result 698 twice the size of input i.e. signed halfword. 699 The multiplication result of adjacent odd-even elements 700 are added to the 'out0' vector 701 */ 702 #define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 703 { \ 704 out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0); \ 705 out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1); \ 706 } 707 #define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__) 708 709 #define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, \ 710 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) \ 711 { \ 712 DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ 713 DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ 714 } 715 #define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__) 716 717 /* Description : Dot product & addition of halfword vector elements 718 Arguments : Inputs - mult0, mult1, cnst0, cnst1 719 Outputs - out0, out1 720 Return Type - as per RTYPE 721 Details : Signed halfword elements from 'mult0' are multiplied with 722 signed halfword elements from 'cnst0' producing a result 723 twice the size of input i.e. signed word. 724 The multiplication result of adjacent odd-even elements 725 are added to the 'out0' vector 726 */ 727 #define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 728 { \ 729 out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0); \ 730 out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1); \ 731 } 732 #define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__) 733 734 #define DPADD_SH4(RTYPE, mult0, mult1, mult2, mult3, \ 735 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) \ 736 { \ 737 DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ 738 DPADD_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ 739 } 740 #define DPADD_SH4_SW(...) DPADD_SH4(v4i32, __VA_ARGS__) 741 742 /* Description : Dot product & addition of double word vector elements 743 Arguments : Inputs - mult0, mult1 744 Outputs - out0, out1 745 Return Type - as per RTYPE 746 Details : Each signed word element from 'mult0' is multiplied with itself 747 producing an intermediate result twice the size of it 748 i.e. signed double word 749 The multiplication result of adjacent odd-even elements 750 are added to the 'out0' vector 751 */ 752 #define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) \ 753 { \ 754 out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)mult0, (v4i32)mult0); \ 755 out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)mult1, (v4i32)mult1); \ 756 } 757 #define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__) 758 759 /* Description : Clips all signed halfword elements of input vector 760 between 0 & 255 761 Arguments : Input - in 762 Output - out_m 763 Return Type - signed halfword 764 */ 765 #define CLIP_SH_0_255(in) \ 766 ({ \ 767 v8i16 max_m = __msa_ldi_h(255); \ 768 v8i16 out_m; \ 769 \ 770 out_m = __msa_maxi_s_h((v8i16)in, 0); \ 771 out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \ 772 out_m; \ 773 }) 774 #define CLIP_SH2_0_255(in0, in1) \ 775 { \ 776 in0 = CLIP_SH_0_255(in0); \ 777 in1 = CLIP_SH_0_255(in1); \ 778 } 779 #define CLIP_SH4_0_255(in0, in1, in2, in3) \ 780 { \ 781 CLIP_SH2_0_255(in0, in1); \ 782 CLIP_SH2_0_255(in2, in3); \ 783 } 784 785 /* Description : Clips all signed word elements of input vector 786 between 0 & 255 787 Arguments : Input - in 788 Output - out_m 789 Return Type - signed word 790 */ 791 #define CLIP_SW_0_255(in) \ 792 ({ \ 793 v4i32 max_m = __msa_ldi_w(255); \ 794 v4i32 out_m; \ 795 \ 796 out_m = __msa_maxi_s_w((v4i32)in, 0); \ 797 out_m = __msa_min_s_w((v4i32)max_m, (v4i32)out_m); \ 798 out_m; \ 799 }) 800 801 /* Description : Horizontal addition of 4 signed word elements of input vector 802 Arguments : Input - in (signed word vector) 803 Output - sum_m (i32 sum) 804 Return Type - signed word (GP) 805 Details : 4 signed word elements of 'in' vector are added together and 806 the resulting integer sum is returned 807 */ 808 #define HADD_SW_S32(in) \ 809 ({ \ 810 v2i64 res0_m, res1_m; \ 811 int32_t sum_m; \ 812 \ 813 res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \ 814 res1_m = __msa_splati_d(res0_m, 1); \ 815 res0_m = res0_m + res1_m; \ 816 sum_m = __msa_copy_s_w((v4i32)res0_m, 0); \ 817 sum_m; \ 818 }) 819 820 /* Description : Horizontal addition of 8 unsigned halfword elements 821 Arguments : Inputs - in (unsigned halfword vector) 822 Outputs - sum_m (u32 sum) 823 Return Type - unsigned word 824 Details : 8 unsigned halfword elements of input vector are added 825 together and the resulting integer sum is returned 826 */ 827 #define HADD_UH_U32(in) \ 828 ({ \ 829 v4u32 res_m; \ 830 v2u64 res0_m, res1_m; \ 831 uint32_t sum_m; \ 832 \ 833 res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \ 834 res0_m = __msa_hadd_u_d(res_m, res_m); \ 835 res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \ 836 res0_m = res0_m + res1_m; \ 837 sum_m = __msa_copy_u_w((v4i32)res0_m, 0); \ 838 sum_m; \ 839 }) 840 841 /* Description : Horizontal addition of unsigned byte vector elements 842 Arguments : Inputs - in0, in1 843 Outputs - out0, out1 844 Return Type - as per RTYPE 845 Details : Each unsigned odd byte element from 'in0' is added to 846 even unsigned byte element from 'in0' (pairwise) and the 847 halfword result is written to 'out0' 848 */ 849 #define HADD_UB2(RTYPE, in0, in1, out0, out1) \ 850 { \ 851 out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0); \ 852 out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1); \ 853 } 854 #define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__) 855 856 /* Description : Horizontal subtraction of unsigned byte vector elements 857 Arguments : Inputs - in0, in1 858 Outputs - out0, out1 859 Return Type - as per RTYPE 860 Details : Each unsigned odd byte element from 'in0' is subtracted from 861 even unsigned byte element from 'in0' (pairwise) and the 862 halfword result is written to 'out0' 863 */ 864 #define HSUB_UB2(RTYPE, in0, in1, out0, out1) \ 865 { \ 866 out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0); \ 867 out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1); \ 868 } 869 #define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__) 870 871 /* Description : Horizontal subtraction of signed halfword vector elements 872 Arguments : Inputs - in0, in1 873 Outputs - out0, out1 874 Return Type - as per RTYPE 875 Details : Each signed odd halfword element from 'in0' is subtracted from 876 even signed halfword element from 'in0' (pairwise) and the 877 word result is written to 'out0' 878 */ 879 #define HSUB_UH2(RTYPE, in0, in1, out0, out1) \ 880 { \ 881 out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0); \ 882 out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1); \ 883 } 884 #define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__) 885 886 /* Description : Set element n input vector to GPR value 887 Arguments : Inputs - in0, in1, in2, in3 888 Output - out 889 Return Type - as per RTYPE 890 Details : Set element 0 in vector 'out' to value specified in 'in0' 891 */ 892 #define INSERT_D2(RTYPE, in0, in1, out) \ 893 { \ 894 out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \ 895 out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \ 896 } 897 #define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__) 898 899 /* Description : Interleave even byte elements from vectors 900 Arguments : Inputs - in0, in1, in2, in3 901 Outputs - out0, out1 902 Return Type - as per RTYPE 903 Details : Even byte elements of 'in0' and 'in1' are interleaved 904 and written to 'out0' 905 */ 906 #define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ 907 { \ 908 out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \ 909 out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \ 910 } 911 #define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__) 912 #define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__) 913 #define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__) 914 915 /* Description : Interleave even halfword elements from vectors 916 Arguments : Inputs - in0, in1, in2, in3 917 Outputs - out0, out1 918 Return Type - as per RTYPE 919 Details : Even halfword elements of 'in0' and 'in1' are interleaved 920 and written to 'out0' 921 */ 922 #define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ 923 { \ 924 out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \ 925 out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \ 926 } 927 #define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__) 928 #define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__) 929 930 /* Description : Interleave even word elements from vectors 931 Arguments : Inputs - in0, in1, in2, in3 932 Outputs - out0, out1 933 Return Type - as per RTYPE 934 Details : Even word elements of 'in0' and 'in1' are interleaved 935 and written to 'out0' 936 */ 937 #define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ 938 { \ 939 out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \ 940 out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2); \ 941 } 942 #define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__) 943 944 /* Description : Interleave even double word elements from vectors 945 Arguments : Inputs - in0, in1, in2, in3 946 Outputs - out0, out1 947 Return Type - as per RTYPE 948 Details : Even double word elements of 'in0' and 'in1' are interleaved 949 and written to 'out0' 950 */ 951 #define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ 952 { \ 953 out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \ 954 out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2); \ 955 } 956 #define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__) 957 958 /* Description : Interleave left half of byte elements from vectors 959 Arguments : Inputs - in0, in1, in2, in3 960 Outputs - out0, out1 961 Return Type - as per RTYPE 962 Details : Left half of byte elements of 'in0' and 'in1' are interleaved 963 and written to 'out0'. 964 */ 965 #define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ 966 { \ 967 out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ 968 out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \ 969 } 970 #define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__) 971 #define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__) 972 #define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__) 973 974 #define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 975 out0, out1, out2, out3) \ 976 { \ 977 ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ 978 ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ 979 } 980 #define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__) 981 #define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__) 982 983 /* Description : Interleave left half of halfword elements from vectors 984 Arguments : Inputs - in0, in1, in2, in3 985 Outputs - out0, out1 986 Return Type - as per RTYPE 987 Details : Left half of halfword elements of 'in0' and 'in1' are 988 interleaved and written to 'out0'. 989 */ 990 #define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ 991 { \ 992 out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ 993 out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \ 994 } 995 #define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__) 996 #define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__) 997 998 /* Description : Interleave left half of word elements from vectors 999 Arguments : Inputs - in0, in1, in2, in3 1000 Outputs - out0, out1 1001 Return Type - as per RTYPE 1002 Details : Left half of word elements of 'in0' and 'in1' are interleaved 1003 and written to 'out0'. 1004 */ 1005 #define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1006 { \ 1007 out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ 1008 out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3); \ 1009 } 1010 #define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__) 1011 1012 /* Description : Interleave right half of byte elements from vectors 1013 Arguments : Inputs - in0, in1, in2, in3 1014 Outputs - out0, out1 1015 Return Type - as per RTYPE 1016 Details : Right half of byte elements of 'in0' and 'in1' are interleaved 1017 and written to out0. 1018 */ 1019 #define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1020 { \ 1021 out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ 1022 out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \ 1023 } 1024 #define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__) 1025 #define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__) 1026 #define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__) 1027 #define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__) 1028 1029 #define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 1030 out0, out1, out2, out3) \ 1031 { \ 1032 ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1033 ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1034 } 1035 #define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__) 1036 #define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__) 1037 #define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__) 1038 #define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__) 1039 #define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__) 1040 1041 /* Description : Interleave right half of halfword elements from vectors 1042 Arguments : Inputs - in0, in1, in2, in3 1043 Outputs - out0, out1 1044 Return Type - as per RTYPE 1045 Details : Right half of halfword elements of 'in0' and 'in1' are 1046 interleaved and written to 'out0'. 1047 */ 1048 #define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1049 { \ 1050 out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ 1051 out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \ 1052 } 1053 #define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__) 1054 #define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__) 1055 1056 #define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 1057 out0, out1, out2, out3) \ 1058 { \ 1059 ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1060 ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1061 } 1062 #define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__) 1063 #define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__) 1064 1065 #define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1066 { \ 1067 out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ 1068 out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3); \ 1069 } 1070 #define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__) 1071 1072 /* Description : Interleave right half of double word elements from vectors 1073 Arguments : Inputs - in0, in1, in2, in3 1074 Outputs - out0, out1 1075 Return Type - as per RTYPE 1076 Details : Right half of double word elements of 'in0' and 'in1' are 1077 interleaved and written to 'out0'. 1078 */ 1079 #define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1080 { \ 1081 out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \ 1082 out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \ 1083 } 1084 #define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__) 1085 #define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__) 1086 #define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__) 1087 1088 #define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 1089 out0, out1, out2, out3) \ 1090 { \ 1091 ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1092 ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1093 } 1094 #define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__) 1095 #define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__) 1096 1097 /* Description : Interleave both left and right half of input vectors 1098 Arguments : Inputs - in0, in1 1099 Outputs - out0, out1 1100 Return Type - as per RTYPE 1101 Details : Right half of byte elements from 'in0' and 'in1' are 1102 interleaved and written to 'out0' 1103 */ 1104 #define ILVRL_B2(RTYPE, in0, in1, out0, out1) \ 1105 { \ 1106 out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ 1107 out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ 1108 } 1109 #define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__) 1110 #define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__) 1111 #define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__) 1112 #define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__) 1113 1114 #define ILVRL_H2(RTYPE, in0, in1, out0, out1) \ 1115 { \ 1116 out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ 1117 out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ 1118 } 1119 #define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__) 1120 #define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__) 1121 1122 #define ILVRL_W2(RTYPE, in0, in1, out0, out1) \ 1123 { \ 1124 out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ 1125 out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ 1126 } 1127 #define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__) 1128 #define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__) 1129 #define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__) 1130 1131 /* Description : Maximum values between signed elements of vector and 1132 5-bit signed immediate value are copied to the output vector 1133 Arguments : Inputs - in0, in1, in2, in3, max_val 1134 Outputs - in place operation 1135 Return Type - unsigned halfword 1136 Details : Maximum of signed halfword element values from 'in0' and 1137 'max_val' are written in place 1138 */ 1139 #define MAXI_SH2(RTYPE, in0, in1, max_val) \ 1140 { \ 1141 in0 = (RTYPE)__msa_maxi_s_h((v8i16)in0, (max_val)); \ 1142 in1 = (RTYPE)__msa_maxi_s_h((v8i16)in1, (max_val)); \ 1143 } 1144 #define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__) 1145 1146 /* Description : Saturate the halfword element values to the max 1147 unsigned value of (sat_val + 1) bits 1148 The element data width remains unchanged 1149 Arguments : Inputs - in0, in1, sat_val 1150 Outputs - in place operation 1151 Return Type - as per RTYPE 1152 Details : Each unsigned halfword element from 'in0' is saturated to the 1153 value generated with (sat_val + 1) bit range. 1154 The results are written in place 1155 */ 1156 #define SAT_UH2(RTYPE, in0, in1, sat_val) \ 1157 { \ 1158 in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \ 1159 in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val); \ 1160 } 1161 #define SAT_UH2_SH(...) SAT_UH2(v8i16, __VA_ARGS__) 1162 1163 /* Description : Saturate the halfword element values to the max 1164 unsigned value of (sat_val + 1) bits 1165 The element data width remains unchanged 1166 Arguments : Inputs - in0, in1, sat_val 1167 Outputs - in place operation 1168 Return Type - as per RTYPE 1169 Details : Each unsigned halfword element from 'in0' is saturated to the 1170 value generated with (sat_val + 1) bit range 1171 The results are written in place 1172 */ 1173 #define SAT_SH2(RTYPE, in0, in1, sat_val) \ 1174 { \ 1175 in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \ 1176 in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val); \ 1177 } 1178 #define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__) 1179 1180 #define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \ 1181 { \ 1182 SAT_SH2(RTYPE, in0, in1, sat_val); \ 1183 SAT_SH2(RTYPE, in2, in3, sat_val); \ 1184 } 1185 #define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__) 1186 1187 /* Description : Indexed halfword element values are replicated to all 1188 elements in output vector 1189 Arguments : Inputs - in, idx0, idx1 1190 Outputs - out0, out1 1191 Return Type - as per RTYPE 1192 Details : 'idx0' element value from 'in' vector is replicated to all 1193 elements in 'out0' vector 1194 Valid index range for halfword operation is 0-7 1195 */ 1196 #define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \ 1197 { \ 1198 out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0); \ 1199 out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1); \ 1200 } 1201 #define SPLATI_H2_SB(...) SPLATI_H2(v16i8, __VA_ARGS__) 1202 #define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__) 1203 1204 #define SPLATI_H3(RTYPE, in, idx0, idx1, idx2, \ 1205 out0, out1, out2) \ 1206 { \ 1207 SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \ 1208 out2 = (RTYPE)__msa_splati_h((v8i16)in, idx2); \ 1209 } 1210 #define SPLATI_H3_SB(...) SPLATI_H3(v16i8, __VA_ARGS__) 1211 #define SPLATI_H3_SH(...) SPLATI_H3(v8i16, __VA_ARGS__) 1212 1213 /* Description : Indexed word element values are replicated to all 1214 elements in output vector 1215 Arguments : Inputs - in, stidx 1216 Outputs - out0, out1 1217 Return Type - as per RTYPE 1218 Details : 'stidx' element value from 'in' vector is replicated to all 1219 elements in 'out0' vector 1220 'stidx + 1' element value from 'in' vector is replicated to all 1221 elements in 'out1' vector 1222 Valid index range for word operation is 0-3 1223 */ 1224 #define SPLATI_W2(RTYPE, in, stidx, out0, out1) \ 1225 { \ 1226 out0 = (RTYPE)__msa_splati_w((v4i32)in, stidx); \ 1227 out1 = (RTYPE)__msa_splati_w((v4i32)in, (stidx+1)); \ 1228 } 1229 #define SPLATI_W2_SW(...) SPLATI_W2(v4i32, __VA_ARGS__) 1230 1231 /* Description : Pack even byte elements of vector pairs 1232 Arguments : Inputs - in0, in1, in2, in3 1233 Outputs - out0, out1 1234 Return Type - as per RTYPE 1235 Details : Even byte elements of 'in0' are copied to the left half of 1236 'out0' & even byte elements of 'in1' are copied to the right 1237 half of 'out0'. 1238 */ 1239 #define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1240 { \ 1241 out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \ 1242 out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \ 1243 } 1244 #define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__) 1245 #define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__) 1246 1247 #define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 1248 out0, out1, out2, out3) \ 1249 { \ 1250 PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1251 PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1252 } 1253 #define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__) 1254 #define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__) 1255 #define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__) 1256 1257 /* Description : Pack even halfword elements of vector pairs 1258 Arguments : Inputs - in0, in1, in2, in3 1259 Outputs - out0, out1 1260 Return Type - as per RTYPE 1261 Details : Even halfword elements of 'in0' are copied to the left half of 1262 'out0' & even halfword elements of 'in1' are copied to the 1263 right half of 'out0'. 1264 */ 1265 #define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1266 { \ 1267 out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \ 1268 out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \ 1269 } 1270 #define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__) 1271 1272 #define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 1273 out0, out1, out2, out3) \ 1274 { \ 1275 PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1276 PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1277 } 1278 #define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__) 1279 1280 /* Description : Pack even double word elements of vector pairs 1281 Arguments : Inputs - in0, in1, in2, in3 1282 Outputs - out0, out1 1283 Return Type - as per RTYPE 1284 Details : Even double elements of 'in0' are copied to the left half of 1285 'out0' & even double elements of 'in1' are copied to the right 1286 half of 'out0'. 1287 */ 1288 #define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1289 { \ 1290 out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1); \ 1291 out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \ 1292 } 1293 #define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__) 1294 #define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__) 1295 1296 /* Description : Pack odd double word elements of vector pairs 1297 Arguments : Inputs - in0, in1, in2, in3 1298 Outputs - out0, out1 1299 Return Type - as per RTYPE 1300 Details : Odd double word elements of 'in0' are copied to the left half 1301 of 'out0' & odd double word elements of 'in1' are copied to 1302 the right half of 'out0'. 1303 */ 1304 #define PCKOD_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1305 { \ 1306 out0 = (RTYPE)__msa_pckod_d((v2i64)in0, (v2i64)in1); \ 1307 out1 = (RTYPE)__msa_pckod_d((v2i64)in2, (v2i64)in3); \ 1308 } 1309 #define PCKOD_D2_UB(...) PCKOD_D2(v16u8, __VA_ARGS__) 1310 #define PCKOD_D2_SH(...) PCKOD_D2(v8i16, __VA_ARGS__) 1311 1312 /* Description : Each byte element is logically xor'ed with immediate 128 1313 Arguments : Inputs - in0, in1 1314 Outputs - in place operation 1315 Return Type - as per RTYPE 1316 Details : Each unsigned byte element from input vector 'in0' is 1317 logically xor'ed with 128 and the result is stored in-place. 1318 */ 1319 #define XORI_B2_128(RTYPE, in0, in1) \ 1320 { \ 1321 in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \ 1322 in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \ 1323 } 1324 #define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__) 1325 #define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__) 1326 1327 #define XORI_B3_128(RTYPE, in0, in1, in2) \ 1328 { \ 1329 XORI_B2_128(RTYPE, in0, in1); \ 1330 in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \ 1331 } 1332 #define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__) 1333 1334 #define XORI_B4_128(RTYPE, in0, in1, in2, in3) \ 1335 { \ 1336 XORI_B2_128(RTYPE, in0, in1); \ 1337 XORI_B2_128(RTYPE, in2, in3); \ 1338 } 1339 #define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__) 1340 #define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__) 1341 1342 #define XORI_B5_128(RTYPE, in0, in1, in2, in3, in4) \ 1343 { \ 1344 XORI_B3_128(RTYPE, in0, in1, in2); \ 1345 XORI_B2_128(RTYPE, in3, in4); \ 1346 } 1347 #define XORI_B5_128_SB(...) XORI_B5_128(v16i8, __VA_ARGS__) 1348 1349 #define XORI_B8_128(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7) \ 1350 { \ 1351 XORI_B4_128(RTYPE, in0, in1, in2, in3); \ 1352 XORI_B4_128(RTYPE, in4, in5, in6, in7); \ 1353 } 1354 #define XORI_B8_128_SB(...) XORI_B8_128(v16i8, __VA_ARGS__) 1355 1356 /* Description : Shift left all elements of vector (generic for all data types) 1357 Arguments : Inputs - in0, in1, in2, in3, shift 1358 Outputs - in place operation 1359 Return Type - as per input vector RTYPE 1360 Details : Each element of vector 'in0' is left shifted by 'shift' and 1361 the result is written in-place. 1362 */ 1363 #define SLLI_4V(in0, in1, in2, in3, shift) \ 1364 { \ 1365 in0 = in0 << shift; \ 1366 in1 = in1 << shift; \ 1367 in2 = in2 << shift; \ 1368 in3 = in3 << shift; \ 1369 } 1370 1371 /* Description : Arithmetic shift right all elements of vector 1372 (generic for all data types) 1373 Arguments : Inputs - in0, in1, in2, in3, shift 1374 Outputs - in place operation 1375 Return Type - as per input vector RTYPE 1376 Details : Each element of vector 'in0' is right shifted by 'shift' and 1377 the result is written in-place. 'shift' is a GP variable. 1378 */ 1379 #define SRA_4V(in0, in1, in2, in3, shift) \ 1380 { \ 1381 in0 = in0 >> shift; \ 1382 in1 = in1 >> shift; \ 1383 in2 = in2 >> shift; \ 1384 in3 = in3 >> shift; \ 1385 } 1386 1387 /* Description : Shift right arithmetic rounded words 1388 Arguments : Inputs - in0, in1, shift 1389 Outputs - in place operation 1390 Return Type - as per RTYPE 1391 Details : Each element of vector 'in0' is shifted right arithmetically by 1392 the number of bits in the corresponding element in the vector 1393 'shift'. The last discarded bit is added to shifted value for 1394 rounding and the result is written in-place. 1395 'shift' is a vector. 1396 */ 1397 #define SRAR_W2(RTYPE, in0, in1, shift) \ 1398 { \ 1399 in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift); \ 1400 in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift); \ 1401 } 1402 1403 #define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \ 1404 { \ 1405 SRAR_W2(RTYPE, in0, in1, shift); \ 1406 SRAR_W2(RTYPE, in2, in3, shift); \ 1407 } 1408 #define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__) 1409 1410 /* Description : Shift right arithmetic rounded (immediate) 1411 Arguments : Inputs - in0, in1, shift 1412 Outputs - in place operation 1413 Return Type - as per RTYPE 1414 Details : Each element of vector 'in0' is shifted right arithmetically by 1415 the value in 'shift'. The last discarded bit is added to the 1416 shifted value for rounding and the result is written in-place. 1417 'shift' is an immediate value. 1418 */ 1419 #define SRARI_H2(RTYPE, in0, in1, shift) \ 1420 { \ 1421 in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \ 1422 in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift); \ 1423 } 1424 #define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__) 1425 #define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__) 1426 1427 #define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \ 1428 { \ 1429 SRARI_H2(RTYPE, in0, in1, shift); \ 1430 SRARI_H2(RTYPE, in2, in3, shift); \ 1431 } 1432 #define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__) 1433 #define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__) 1434 1435 #define SRARI_W2(RTYPE, in0, in1, shift) \ 1436 { \ 1437 in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \ 1438 in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \ 1439 } 1440 1441 #define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \ 1442 { \ 1443 SRARI_W2(RTYPE, in0, in1, shift); \ 1444 SRARI_W2(RTYPE, in2, in3, shift); \ 1445 } 1446 #define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__) 1447 1448 /* Description : Multiplication of pairs of vectors 1449 Arguments : Inputs - in0, in1, in2, in3 1450 Outputs - out0, out1 1451 Details : Each element from 'in0' is multiplied with elements from 'in1' 1452 and the result is written to 'out0' 1453 */ 1454 #define MUL2(in0, in1, in2, in3, out0, out1) \ 1455 { \ 1456 out0 = in0 * in1; \ 1457 out1 = in2 * in3; \ 1458 } 1459 #define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, \ 1460 out0, out1, out2, out3) \ 1461 { \ 1462 MUL2(in0, in1, in2, in3, out0, out1); \ 1463 MUL2(in4, in5, in6, in7, out2, out3); \ 1464 } 1465 1466 /* Description : Addition of 2 pairs of vectors 1467 Arguments : Inputs - in0, in1, in2, in3 1468 Outputs - out0, out1 1469 Details : Each element in 'in0' is added to 'in1' and result is written 1470 to 'out0'. 1471 */ 1472 #define ADD2(in0, in1, in2, in3, out0, out1) \ 1473 { \ 1474 out0 = in0 + in1; \ 1475 out1 = in2 + in3; \ 1476 } 1477 #define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, \ 1478 out0, out1, out2, out3) \ 1479 { \ 1480 ADD2(in0, in1, in2, in3, out0, out1); \ 1481 ADD2(in4, in5, in6, in7, out2, out3); \ 1482 } 1483 1484 /* Description : Subtraction of 2 pairs of vectors 1485 Arguments : Inputs - in0, in1, in2, in3 1486 Outputs - out0, out1 1487 Details : Each element in 'in1' is subtracted from 'in0' and result is 1488 written to 'out0'. 1489 */ 1490 #define SUB2(in0, in1, in2, in3, out0, out1) \ 1491 { \ 1492 out0 = in0 - in1; \ 1493 out1 = in2 - in3; \ 1494 } 1495 #define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, \ 1496 out0, out1, out2, out3) \ 1497 { \ 1498 out0 = in0 - in1; \ 1499 out1 = in2 - in3; \ 1500 out2 = in4 - in5; \ 1501 out3 = in6 - in7; \ 1502 } 1503 1504 /* Description : Sign extend halfword elements from right half of the vector 1505 Arguments : Input - in (halfword vector) 1506 Output - out (sign extended word vector) 1507 Return Type - signed word 1508 Details : Sign bit of halfword elements from input vector 'in' is 1509 extracted and interleaved with same vector 'in0' to generate 1510 4 word elements keeping sign intact 1511 */ 1512 #define UNPCK_R_SH_SW(in, out) \ 1513 { \ 1514 v8i16 sign_m; \ 1515 \ 1516 sign_m = __msa_clti_s_h((v8i16)in, 0); \ 1517 out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \ 1518 } 1519 1520 /* Description : Zero extend unsigned byte elements to halfword elements 1521 Arguments : Input - in (unsigned byte vector) 1522 Outputs - out0, out1 (unsigned halfword vectors) 1523 Return Type - signed halfword 1524 Details : Zero extended right half of vector is returned in 'out0' 1525 Zero extended left half of vector is returned in 'out1' 1526 */ 1527 #define UNPCK_UB_SH(in, out0, out1) \ 1528 { \ 1529 v16i8 zero_m = { 0 }; \ 1530 \ 1531 ILVRL_B2_SH(zero_m, in, out0, out1); \ 1532 } 1533 1534 /* Description : Sign extend halfword elements from input vector and return 1535 the result in pair of vectors 1536 Arguments : Input - in (halfword vector) 1537 Outputs - out0, out1 (sign extended word vectors) 1538 Return Type - signed word 1539 Details : Sign bit of halfword elements from input vector 'in' is 1540 extracted and interleaved right with same vector 'in0' to 1541 generate 4 signed word elements in 'out0' 1542 Then interleaved left with same vector 'in0' to 1543 generate 4 signed word elements in 'out1' 1544 */ 1545 #define UNPCK_SH_SW(in, out0, out1) \ 1546 { \ 1547 v8i16 tmp_m; \ 1548 \ 1549 tmp_m = __msa_clti_s_h((v8i16)in, 0); \ 1550 ILVRL_H2_SW(tmp_m, in, out0, out1); \ 1551 } 1552 1553 /* Description : Butterfly of 4 input vectors 1554 Arguments : Inputs - in0, in1, in2, in3 1555 Outputs - out0, out1, out2, out3 1556 Details : Butterfly operation 1557 */ 1558 #define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \ 1559 { \ 1560 out0 = in0 + in3; \ 1561 out1 = in1 + in2; \ 1562 \ 1563 out2 = in1 - in2; \ 1564 out3 = in0 - in3; \ 1565 } 1566 1567 /* Description : Transpose input 8x8 byte block 1568 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 1569 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 1570 Return Type - as per RTYPE 1571 */ 1572 #define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 1573 out0, out1, out2, out3, out4, out5, out6, out7) \ 1574 { \ 1575 v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 1576 v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ 1577 \ 1578 ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, \ 1579 tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ 1580 ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \ 1581 ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \ 1582 ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \ 1583 ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \ 1584 SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8); \ 1585 SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8); \ 1586 } 1587 #define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__) 1588 1589 /* Description : Transpose 16x4 block into 4x16 with byte elements in vectors 1590 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, 1591 in8, in9, in10, in11, in12, in13, in14, in15 1592 Outputs - out0, out1, out2, out3 1593 Return Type - unsigned byte 1594 */ 1595 #define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \ 1596 in8, in9, in10, in11, in12, in13, in14, in15, \ 1597 out0, out1, out2, out3) \ 1598 { \ 1599 v2i64 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 1600 \ 1601 ILVEV_W2_SD(in0, in4, in8, in12, tmp0_m, tmp1_m); \ 1602 out1 = (v16u8)__msa_ilvev_d(tmp1_m, tmp0_m); \ 1603 \ 1604 ILVEV_W2_SD(in1, in5, in9, in13, tmp0_m, tmp1_m); \ 1605 out3 = (v16u8)__msa_ilvev_d(tmp1_m, tmp0_m); \ 1606 \ 1607 ILVEV_W2_SD(in2, in6, in10, in14, tmp0_m, tmp1_m); \ 1608 \ 1609 tmp2_m = __msa_ilvev_d(tmp1_m, tmp0_m); \ 1610 ILVEV_W2_SD(in3, in7, in11, in15, tmp0_m, tmp1_m); \ 1611 \ 1612 tmp3_m = __msa_ilvev_d(tmp1_m, tmp0_m); \ 1613 ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m); \ 1614 out0 = (v16u8)__msa_ilvev_h((v8i16)tmp1_m, (v8i16)tmp0_m); \ 1615 out2 = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \ 1616 \ 1617 tmp0_m = (v2i64)__msa_ilvod_b((v16i8)out3, (v16i8)out1); \ 1618 tmp1_m = (v2i64)__msa_ilvod_b((v16i8)tmp3_m, (v16i8)tmp2_m); \ 1619 out1 = (v16u8)__msa_ilvev_h((v8i16)tmp1_m, (v8i16)tmp0_m); \ 1620 out3 = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \ 1621 } 1622 1623 /* Description : Transpose 16x8 block into 8x16 with byte elements in vectors 1624 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, 1625 in8, in9, in10, in11, in12, in13, in14, in15 1626 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 1627 Return Type - unsigned byte 1628 */ 1629 #define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \ 1630 in8, in9, in10, in11, in12, in13, in14, in15, \ 1631 out0, out1, out2, out3, out4, out5, out6, out7) \ 1632 { \ 1633 v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 1634 v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ 1635 \ 1636 ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \ 1637 ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \ 1638 ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \ 1639 ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \ 1640 \ 1641 tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7); \ 1642 tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7); \ 1643 tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5); \ 1644 tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5); \ 1645 out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3); \ 1646 tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3); \ 1647 out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1); \ 1648 tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1); \ 1649 \ 1650 ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \ 1651 out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ 1652 out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ 1653 \ 1654 tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \ 1655 tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5); \ 1656 out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ 1657 out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ 1658 \ 1659 ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \ 1660 out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ 1661 out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ 1662 \ 1663 tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ 1664 tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ 1665 tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ 1666 tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ 1667 out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ 1668 out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ 1669 } 1670 1671 /* Description : Transpose 4x4 block with half word elements in vectors 1672 Arguments : Inputs - in0, in1, in2, in3 1673 Outputs - out0, out1, out2, out3 1674 Return Type - signed halfword 1675 */ 1676 #define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \ 1677 { \ 1678 v8i16 s0_m, s1_m; \ 1679 \ 1680 ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m); \ 1681 ILVRL_W2_SH(s1_m, s0_m, out0, out2); \ 1682 out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ 1683 out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2); \ 1684 } 1685 1686 /* Description : Transpose 8x4 block with half word elements in vectors 1687 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 1688 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 1689 Return Type - signed halfword 1690 */ 1691 #define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \ 1692 { \ 1693 v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 1694 \ 1695 ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m); \ 1696 ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m); \ 1697 ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \ 1698 ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \ 1699 } 1700 1701 /* Description : Transpose 4x4 block with word elements in vectors 1702 Arguments : Inputs - in0, in1, in2, in3 1703 Outputs - out0, out1, out2, out3 1704 Return Type - signed word 1705 */ 1706 #define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \ 1707 { \ 1708 v4i32 s0_m, s1_m, s2_m, s3_m; \ 1709 \ 1710 ILVRL_W2_SW(in1, in0, s0_m, s1_m); \ 1711 ILVRL_W2_SW(in3, in2, s2_m, s3_m); \ 1712 \ 1713 out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m); \ 1714 out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m); \ 1715 out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m); \ 1716 out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m); \ 1717 } 1718 1719 /* Description : Dot product and addition of 3 signed halfword input vectors 1720 Arguments : Inputs - in0, in1, in2, coeff0, coeff1, coeff2 1721 Output - out0_m 1722 Return Type - signed halfword 1723 Details : Dot product of 'in0' with 'coeff0' 1724 Dot product of 'in1' with 'coeff1' 1725 Dot product of 'in2' with 'coeff2' 1726 Addition of all the 3 vector results 1727 out0_m = (in0 * coeff0) + (in1 * coeff1) + (in2 * coeff2) 1728 */ 1729 #define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2) \ 1730 ({ \ 1731 v8i16 tmp1_m; \ 1732 v8i16 out0_m; \ 1733 \ 1734 out0_m = __msa_dotp_s_h((v16i8)in0, (v16i8)coeff0); \ 1735 out0_m = __msa_dpadd_s_h(out0_m, (v16i8)in1, (v16i8)coeff1); \ 1736 tmp1_m = __msa_dotp_s_h((v16i8)in2, (v16i8)coeff2); \ 1737 out0_m = __msa_adds_s_h(out0_m, tmp1_m); \ 1738 \ 1739 out0_m; \ 1740 }) 1741 1742 /* Description : Pack even elements of input vectors & xor with 128 1743 Arguments : Inputs - in0, in1 1744 Output - out_m 1745 Return Type - unsigned byte 1746 Details : Signed byte even elements from 'in0' and 'in1' are packed 1747 together in one vector and the resulting vector is xor'ed with 1748 128 to shift the range from signed to unsigned byte 1749 */ 1750 #define PCKEV_XORI128_UB(in0, in1) \ 1751 ({ \ 1752 v16u8 out_m; \ 1753 out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0); \ 1754 out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128); \ 1755 out_m; \ 1756 }) 1757 1758 /* Description : Pack even byte elements and store byte vector in destination 1759 memory 1760 Arguments : Inputs - in0, in1, pdst 1761 */ 1762 #define PCKEV_ST_SB(in0, in1, pdst) \ 1763 { \ 1764 v16i8 tmp_m; \ 1765 tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0); \ 1766 ST_SB(tmp_m, (pdst)); \ 1767 } 1768 1769 /* Description : Horizontal 2 tap filter kernel code 1770 Arguments : Inputs - in0, in1, mask, coeff, shift 1771 */ 1772 #define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) \ 1773 ({ \ 1774 v16i8 tmp0_m; \ 1775 v8u16 tmp1_m; \ 1776 \ 1777 tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0); \ 1778 tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff); \ 1779 tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift); \ 1780 \ 1781 tmp1_m; \ 1782 }) 1783 #endif /* VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_ */ 1784