/*===---- mmintrin.h - Implementation of MMX intrinsics on PowerPC ---------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is intended to help with porting code that uses Intel
   intrinsics from x86_64 to powerpc64/powerpc64le.

   Since the PowerPC target does not support a native 64-bit vector type,
   these MMX intrinsics typedef __m64 to a 64-bit unsigned long long, which
   works well for the _si64 and some of the _pi32 operations.

   For the _pi16 and _pi8 operations it is better to transfer the __m64
   value into a 128-bit PowerPC vector first.  Power8 introduced direct
   register move instructions, which allow a more efficient implementation.

   It is the user's responsibility to determine whether the results of such
   a port are acceptable or whether further changes are needed.  Note that
   much code using Intel intrinsics can be rewritten in more portable and
   efficient standard C or GNU C extensions, with 64-bit scalar operations
   or 128-bit SSE/Altivec operations, which is the recommended approach.  */
#error                                                                         \
    "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

#if defined(__linux__) && defined(__ppc64__)

#include <altivec.h>
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef __attribute__((__aligned__(8))) unsigned long long __m64;

typedef __attribute__((__aligned__(8))) union {
  __m64 as_m64;
  char as_char[8];
  signed char as_signed_char[8];
  short as_short[4];
  int as_int[2];
  long long as_long_long;
  float as_float[2];
  double as_double;
} __m64_union;

/* Empty the multimedia state.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_empty(void) {
  /* nothing to do on PowerPC.  */
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_empty(void) {
  /* nothing to do on PowerPC.  */
}

/* Convert I to a __m64 object.  The integer is zero-extended to 64-bits.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi32_si64(int __i) {
  return (__m64)(unsigned int)__i;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_from_int(int __i) {
  return _mm_cvtsi32_si64(__i);
}

/* Convert the lower 32 bits of the __m64 object into an integer.  */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_si32(__m64 __i) {
  return ((int)__i);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_to_int(__m64 __i) {
  return _mm_cvtsi64_si32(__i);
}
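/* Illustrative sketch (a hypothetical helper, assuming this header is
   included with -DNO_WARN_X86_INTRINSICS on a powerpc64le target).  Because
   __m64 is a plain 64-bit scalar in this port, the _si64 conversions are
   simple casts and ordinary integer operations work on the result:

     #include <mmintrin.h>

     long long pack_two_ints(int hi, int lo) {
       __m64 v = _mm_cvtsi32_si64(lo);        // zero-extend lo into bits 0..31
       v |= (__m64)(unsigned int)hi << 32;    // place hi in bits 32..63
       return _mm_cvtm64_si64(v);             // __m64 -> long long is a cast
     }
*/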
/* Convert I to a __m64 object.  */

/* Intel intrinsic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_from_int64(long long __i) {
  return (__m64)__i;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_m64(long long __i) {
  return (__m64)__i;
}

/* Microsoft intrinsic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64x_si64(long long __i) {
  return (__m64)__i;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi64x(long long __i) {
  return (__m64)__i;
}

/* Convert the __m64 object to a 64-bit integer.  */

/* Intel intrinsic.  */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_to_int64(__m64 __i) {
  return (long long)__i;
}

extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtm64_si64(__m64 __i) {
  return (long long)__i;
}

/* Microsoft intrinsic.  */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_si64x(__m64 __i) {
  return (long long)__i;
}

#ifdef _ARCH_PWR8
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short vm1;
  __vector signed char vresult;

  vm1 = (__vector signed short)(__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  vresult = vec_packs(vm1, vm1);
  return (__m64)((__vector long long)vresult)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packsswb(__m64 __m1, __m64 __m2) {
  return _mm_packs_pi16(__m1, __m2);
}
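/* Illustrative sketch (values chosen only for demonstration): the signed
   pack narrows eight 16-bit lanes, four from each operand, to 8-bit lanes
   with saturation:

     __m64 a = _mm_setr_pi16(300, -200, 127, -1);   // lanes 0..3
     __m64 b = _mm_setr_pi16(5, 6, 7, 8);
     __m64 r = _mm_packs_pi16(a, b);
     // bytes 0..3 (from a): 127, -128, 127, -1     (300 and -200 saturate)
     // bytes 4..7 (from b): 5, 6, 7, 8
*/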
/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pi32(__m64 __m1, __m64 __m2) {
  __vector signed int vm1;
  __vector signed short vresult;

  vm1 = (__vector signed int)(__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  vresult = vec_packs(vm1, vm1);
  return (__m64)((__vector long long)vresult)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packssdw(__m64 __m1, __m64 __m2) {
  return _mm_packs_pi32(__m1, __m2);
}

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pu16(__m64 __m1, __m64 __m2) {
  __vector unsigned char r;
  __vector signed short vm1 = (__vector signed short)(__vector long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  const __vector signed short __zero = {0};
  __vector __bool short __select = vec_cmplt(vm1, __zero);
  r = vec_packs((__vector unsigned short)vm1, (__vector unsigned short)vm1);
  __vector __bool char packsel = vec_pack(__select, __select);
  r = vec_sel(r, (const __vector unsigned char)__zero, packsel);
  return (__m64)((__vector long long)r)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packuswb(__m64 __m1, __m64 __m2) {
  return _mm_packs_pu16(__m1, __m2);
}
#endif /* end ARCH_PWR8 */

/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats(__m1);
  b = (__vector unsigned char)vec_splats(__m2);
  c = vec_mergel(a, b);
  return (__m64)((__vector long long)c)[1];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[4];
  res.as_char[1] = m2.as_char[4];
  res.as_char[2] = m1.as_char[5];
  res.as_char[3] = m2.as_char[5];
  res.as_char[4] = m1.as_char[6];
  res.as_char[5] = m2.as_char[6];
  res.as_char[6] = m1.as_char[7];
  res.as_char[7] = m2.as_char[7];

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhbw(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi8(__m1, __m2);
}
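/* Illustrative sketch (values chosen only for demonstration): the unpack
   intrinsics alternate lanes from the two operands.  For the high-half byte
   interleave:

     __m64 a = _mm_setr_pi8(0, 1, 2, 3, 4, 5, 6, 7);
     __m64 b = _mm_setr_pi8(10, 11, 12, 13, 14, 15, 16, 17);
     __m64 r = _mm_unpackhi_pi8(a, b);
     // r bytes 0..7: 4, 14, 5, 15, 6, 16, 7, 17
*/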
/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) {
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[2];
  res.as_short[1] = m2.as_short[2];
  res.as_short[2] = m1.as_short[3];
  res.as_short[3] = m2.as_short[3];

  return (__m64)res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhwd(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi16(__m1, __m2);
}

/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) {
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[1];
  res.as_int[1] = m2.as_int[1];

  return (__m64)res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhdq(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi32(__m1, __m2);
}

/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats(__m1);
  b = (__vector unsigned char)vec_splats(__m2);
  c = vec_mergel(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0];
  res.as_char[1] = m2.as_char[0];
  res.as_char[2] = m1.as_char[1];
  res.as_char[3] = m2.as_char[1];
  res.as_char[4] = m1.as_char[2];
  res.as_char[5] = m2.as_char[2];
  res.as_char[6] = m1.as_char[3];
  res.as_char[7] = m2.as_char[3];

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpcklbw(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi8(__m1, __m2);
}

/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) {
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0];
  res.as_short[1] = m2.as_short[0];
  res.as_short[2] = m1.as_short[1];
  res.as_short[3] = m2.as_short[1];

  return (__m64)res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpcklwd(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi16(__m1, __m2);
}
/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) {
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0];
  res.as_int[1] = m2.as_int[0];

  return (__m64)res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckldq(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi32(__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats(__m1);
  b = (__vector signed char)vec_splats(__m2);
  c = vec_add(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0] + m2.as_char[0];
  res.as_char[1] = m1.as_char[1] + m2.as_char[1];
  res.as_char[2] = m1.as_char[2] + m2.as_char[2];
  res.as_char[3] = m1.as_char[3] + m2.as_char[3];
  res.as_char[4] = m1.as_char[4] + m2.as_char[4];
  res.as_char[5] = m1.as_char[5] + m2.as_char[5];
  res.as_char[6] = m1.as_char[6] + m2.as_char[6];
  res.as_char[7] = m1.as_char[7] + m2.as_char[7];

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddb(__m64 __m1, __m64 __m2) {
  return _mm_add_pi8(__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = vec_add(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0] + m2.as_short[0];
  res.as_short[1] = m1.as_short[1] + m2.as_short[1];
  res.as_short[2] = m1.as_short[2] + m2.as_short[2];
  res.as_short[3] = m1.as_short[3] + m2.as_short[3];

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddw(__m64 __m1, __m64 __m2) {
  return _mm_add_pi16(__m1, __m2);
}
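/* Illustrative sketch (values chosen only for demonstration): the
   non-saturating adds wrap around modulo the lane width, matching the x86
   paddb/paddw behavior:

     __m64 a = _mm_set1_pi8(100);
     __m64 b = _mm_set1_pi8(100);
     __m64 r = _mm_add_pi8(a, b);   // each byte is 200, i.e. -56 as signed char
*/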
/* Add the 32-bit values in M1 to the 32-bit values in M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats(__m1);
  b = (__vector signed int)vec_splats(__m2);
  c = vec_add(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0] + m2.as_int[0];
  res.as_int[1] = m1.as_int[1] + m2.as_int[1];

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddd(__m64 __m1, __m64 __m2) {
  return _mm_add_pi32(__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats(__m1);
  b = (__vector signed char)vec_splats(__m2);
  c = vec_sub(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0] - m2.as_char[0];
  res.as_char[1] = m1.as_char[1] - m2.as_char[1];
  res.as_char[2] = m1.as_char[2] - m2.as_char[2];
  res.as_char[3] = m1.as_char[3] - m2.as_char[3];
  res.as_char[4] = m1.as_char[4] - m2.as_char[4];
  res.as_char[5] = m1.as_char[5] - m2.as_char[5];
  res.as_char[6] = m1.as_char[6] - m2.as_char[6];
  res.as_char[7] = m1.as_char[7] - m2.as_char[7];

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubb(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi8(__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = vec_sub(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0] - m2.as_short[0];
  res.as_short[1] = m1.as_short[1] - m2.as_short[1];
  res.as_short[2] = m1.as_short[2] - m2.as_short[2];
  res.as_short[3] = m1.as_short[3] - m2.as_short[3];

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubw(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi16(__m1, __m2);
}
/* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats(__m1);
  b = (__vector signed int)vec_splats(__m2);
  c = vec_sub(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0] - m2.as_int[0];
  res.as_int[1] = m1.as_int[1] - m2.as_int[1];

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubd(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi32(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_si64(__m64 __m1, __m64 __m2) {
  return (__m1 + __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_si64(__m64 __m1, __m64 __m2) {
  return (__m1 - __m2);
}

/* Shift the 64-bit value in M left by COUNT.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_si64(__m64 __m, __m64 __count) {
  return (__m << __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllq(__m64 __m, __m64 __count) {
  return _mm_sll_si64(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_si64(__m64 __m, const int __count) {
  return (__m << __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllqi(__m64 __m, const int __count) {
  return _mm_slli_si64(__m, __count);
}

/* Shift the 64-bit value in M right by COUNT; shift in zeros.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_si64(__m64 __m, __m64 __count) {
  return (__m >> __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlq(__m64 __m, __m64 __count) {
  return _mm_srl_si64(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_si64(__m64 __m, const int __count) {
  return (__m >> __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlqi(__m64 __m, const int __count) {
  return _mm_srli_si64(__m, __count);
}
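/* Illustrative sketch (values chosen only for demonstration): because __m64
   is a scalar unsigned long long in this port, the _si64 shift and logical
   intrinsics reduce to ordinary 64-bit integer operations:

     __m64 m  = _mm_cvtsi64_m64(0x00000000FFFFFFFFULL);
     __m64 hi = _mm_slli_si64(m, 32);           // 0xFFFFFFFF00000000
     __m64 lo = _mm_srli_si64(hi, 48);          // 0x000000000000FFFF
     __m64 r  = _mm_andnot_si64(lo, _mm_or_si64(hi, lo));   // clears low 16 bits
*/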
/* Bit-wise AND the 64-bit values in M1 and M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_and_si64(__m64 __m1, __m64 __m2) {
  return (__m1 & __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pand(__m64 __m1, __m64 __m2) {
  return _mm_and_si64(__m1, __m2);
}

/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_andnot_si64(__m64 __m1, __m64 __m2) {
  return (~__m1 & __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pandn(__m64 __m1, __m64 __m2) {
  return _mm_andnot_si64(__m1, __m2);
}

/* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_or_si64(__m64 __m1, __m64 __m2) {
  return (__m1 | __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_por(__m64 __m1, __m64 __m2) {
  return _mm_or_si64(__m1, __m2);
}

/* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_xor_si64(__m64 __m1, __m64 __m2) {
  return (__m1 ^ __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pxor(__m64 __m1, __m64 __m2) {
  return _mm_xor_si64(__m1, __m2);
}

/* Creates a 64-bit zero.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setzero_si64(void) {
  return (__m64)0;
}

/* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
   test is true and zero if false.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) {
#if defined(_ARCH_PWR6) && defined(__powerpc64__)
  __m64 res;
  __asm__("cmpb %0,%1,%2;\n" : "=r"(res) : "r"(__m1), "r"(__m2) :);
  return (res);
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = (m1.as_char[0] == m2.as_char[0]) ? -1 : 0;
  res.as_char[1] = (m1.as_char[1] == m2.as_char[1]) ? -1 : 0;
  res.as_char[2] = (m1.as_char[2] == m2.as_char[2]) ? -1 : 0;
  res.as_char[3] = (m1.as_char[3] == m2.as_char[3]) ? -1 : 0;
  res.as_char[4] = (m1.as_char[4] == m2.as_char[4]) ? -1 : 0;
  res.as_char[5] = (m1.as_char[5] == m2.as_char[5]) ? -1 : 0;
  res.as_char[6] = (m1.as_char[6] == m2.as_char[6]) ? -1 : 0;
  res.as_char[7] = (m1.as_char[7] == m2.as_char[7]) ? -1 : 0;

  return (__m64)res.as_m64;
#endif
}
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqb(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi8(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats(__m1);
  b = (__vector signed char)vec_splats(__m2);
  c = (__vector signed char)vec_cmpgt(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = (m1.as_char[0] > m2.as_char[0]) ? -1 : 0;
  res.as_char[1] = (m1.as_char[1] > m2.as_char[1]) ? -1 : 0;
  res.as_char[2] = (m1.as_char[2] > m2.as_char[2]) ? -1 : 0;
  res.as_char[3] = (m1.as_char[3] > m2.as_char[3]) ? -1 : 0;
  res.as_char[4] = (m1.as_char[4] > m2.as_char[4]) ? -1 : 0;
  res.as_char[5] = (m1.as_char[5] > m2.as_char[5]) ? -1 : 0;
  res.as_char[6] = (m1.as_char[6] > m2.as_char[6]) ? -1 : 0;
  res.as_char[7] = (m1.as_char[7] > m2.as_char[7]) ? -1 : 0;

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtb(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi8(__m1, __m2);
}

/* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
   the test is true and zero if false.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = (__vector signed short)vec_cmpeq(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = (m1.as_short[0] == m2.as_short[0]) ? -1 : 0;
  res.as_short[1] = (m1.as_short[1] == m2.as_short[1]) ? -1 : 0;
  res.as_short[2] = (m1.as_short[2] == m2.as_short[2]) ? -1 : 0;
  res.as_short[3] = (m1.as_short[3] == m2.as_short[3]) ? -1 : 0;

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqw(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi16(__m1, __m2);
}
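/* Illustrative sketch (values chosen only for demonstration): the compare
   intrinsics return all-ones or all-zeros lane masks, which combine with
   the logical intrinsics to select lanes without branches, e.g. a
   lane-wise signed maximum:

     __m64 a = _mm_setr_pi16(1, 5, -3, 7);
     __m64 b = _mm_setr_pi16(2, 4, -6, 7);
     __m64 mask = _mm_cmpgt_pi16(a, b);                    // -1 where a > b
     __m64 max  = _mm_or_si64(_mm_and_si64(mask, a),
                              _mm_andnot_si64(mask, b));   // per lane: a>b ? a : b
*/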
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = (__vector signed short)vec_cmpgt(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = (m1.as_short[0] > m2.as_short[0]) ? -1 : 0;
  res.as_short[1] = (m1.as_short[1] > m2.as_short[1]) ? -1 : 0;
  res.as_short[2] = (m1.as_short[2] > m2.as_short[2]) ? -1 : 0;
  res.as_short[3] = (m1.as_short[3] > m2.as_short[3]) ? -1 : 0;

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtw(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi16(__m1, __m2);
}

/* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
   the test is true and zero if false.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats(__m1);
  b = (__vector signed int)vec_splats(__m2);
  c = (__vector signed int)vec_cmpeq(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = (m1.as_int[0] == m2.as_int[0]) ? -1 : 0;
  res.as_int[1] = (m1.as_int[1] == m2.as_int[1]) ? -1 : 0;

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqd(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi32(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats(__m1);
  b = (__vector signed int)vec_splats(__m2);
  c = (__vector signed int)vec_cmpgt(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = (m1.as_int[0] > m2.as_int[0]) ? -1 : 0;
  res.as_int[1] = (m1.as_int[1] > m2.as_int[1]) ? -1 : 0;

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtd(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi32(__m1, __m2);
}

#if _ARCH_PWR8
/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pi8(__m64 __m1, __m64 __m2) {
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats(__m1);
  b = (__vector signed char)vec_splats(__m2);
  c = vec_adds(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddsb(__m64 __m1, __m64 __m2) {
  return _mm_adds_pi8(__m1, __m2);
}
/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = vec_adds(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddsw(__m64 __m1, __m64 __m2) {
  return _mm_adds_pi16(__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pu8(__m64 __m1, __m64 __m2) {
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats(__m1);
  b = (__vector unsigned char)vec_splats(__m2);
  c = vec_adds(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddusb(__m64 __m1, __m64 __m2) {
  return _mm_adds_pu8(__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pu16(__m64 __m1, __m64 __m2) {
  __vector unsigned short a, b, c;

  a = (__vector unsigned short)vec_splats(__m1);
  b = (__vector unsigned short)vec_splats(__m2);
  c = vec_adds(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddusw(__m64 __m1, __m64 __m2) {
  return _mm_adds_pu16(__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pi8(__m64 __m1, __m64 __m2) {
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats(__m1);
  b = (__vector signed char)vec_splats(__m2);
  c = vec_subs(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubsb(__m64 __m1, __m64 __m2) {
  return _mm_subs_pi8(__m1, __m2);
}
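/* Illustrative sketch (values chosen only for demonstration): the
   saturating forms clamp instead of wrapping, which is the usual
   requirement in pixel arithmetic:

     __m64 a = _mm_set1_pi8((signed char)200);   // 0xC8 in every byte
     __m64 b = _mm_set1_pi8(100);
     __m64 u = _mm_adds_pu8(a, b);   // every byte clamps to 255 (unsigned)
     __m64 s = _mm_adds_pi8(b, b);   // every byte clamps to 127 (signed)
*/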
/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = vec_subs(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubsw(__m64 __m1, __m64 __m2) {
  return _mm_subs_pi16(__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pu8(__m64 __m1, __m64 __m2) {
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats(__m1);
  b = (__vector unsigned char)vec_splats(__m2);
  c = vec_subs(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubusb(__m64 __m1, __m64 __m2) {
  return _mm_subs_pu8(__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pu16(__m64 __m1, __m64 __m2) {
  __vector unsigned short a, b, c;

  a = (__vector unsigned short)vec_splats(__m1);
  b = (__vector unsigned short)vec_splats(__m2);
  c = vec_subs(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubusw(__m64 __m1, __m64 __m2) {
  return _mm_subs_pu16(__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   four 32-bit intermediate results, which are then summed by pairs to
   produce two 32-bit results.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_madd_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short a, b;
  __vector signed int c;
  __vector signed int zero = {0, 0, 0, 0};

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = vec_vmsumshm(a, b, zero);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmaddwd(__m64 __m1, __m64 __m2) {
  return _mm_madd_pi16(__m1, __m2);
}
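/* Illustrative sketch of the multiply-add (values chosen only for
   demonstration): with
     a = _mm_setr_pi16(1, 2, 3, 4) and b = _mm_setr_pi16(10, 20, 30, 40),
   _mm_madd_pi16(a, b) yields the two 32-bit sums
     1*10 + 2*20 = 50   and   3*30 + 4*40 = 250,
   i.e. the same value as _mm_setr_pi32(50, 250).  */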
/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhi_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short a, b;
  __vector signed short c;
  __vector signed int w0, w1;
  __vector unsigned char xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
      0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00,
      0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
#endif
  };

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);

  w0 = vec_vmulesh(a, b);
  w1 = vec_vmulosh(a, b);
  c = (__vector signed short)vec_perm(w0, w1, xform1);

  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmulhw(__m64 __m1, __m64 __m2) {
  return _mm_mulhi_pi16(__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mullo_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = a * b;
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmullw(__m64 __m1, __m64 __m2) {
  return _mm_mullo_pi16(__m1, __m2);
}

/* Shift four 16-bit values in M left by COUNT.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_pi16(__m64 __m, __m64 __count) {
  __vector signed short m, r;
  __vector unsigned short c;

  if (__count <= 15) {
    m = (__vector signed short)vec_splats(__m);
    c = (__vector unsigned short)vec_splats((unsigned short)__count);
    r = vec_sl(m, (__vector unsigned short)c);
    return (__m64)((__vector long long)r)[0];
  } else
    return (0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllw(__m64 __m, __m64 __count) {
  return _mm_sll_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sll_pi16.  */
  return _mm_sll_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllwi(__m64 __m, int __count) {
  return _mm_slli_pi16(__m, __count);
}
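/* Illustrative sketch (values chosen only for demonstration): like the x86
   forms, the per-lane shifts treat the whole 64-bit count as one value and
   return zero once it exceeds the lane width:

     __m64 v = _mm_set1_pi16(0x0101);
     __m64 a = _mm_slli_pi16(v, 4);    // each lane becomes 0x1010
     __m64 b = _mm_slli_pi16(v, 16);   // count > 15, so the result is all zeros
*/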
/* Shift two 32-bit values in M left by COUNT.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_pi32(__m64 __m, __m64 __count) {
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = m.as_int[0] << __count;
  res.as_int[1] = m.as_int[1] << __count;
  return (res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pslld(__m64 __m, __m64 __count) {
  return _mm_sll_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sll_pi32.  */
  return _mm_sll_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pslldi(__m64 __m, int __count) {
  return _mm_slli_pi32(__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sra_pi16(__m64 __m, __m64 __count) {
  __vector signed short m, r;
  __vector unsigned short c;

  if (__count <= 15) {
    m = (__vector signed short)vec_splats(__m);
    c = (__vector unsigned short)vec_splats((unsigned short)__count);
    r = vec_sra(m, (__vector unsigned short)c);
    return (__m64)((__vector long long)r)[0];
  } else
    return (0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psraw(__m64 __m, __m64 __count) {
  return _mm_sra_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srai_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sra_pi16.  */
  return _mm_sra_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrawi(__m64 __m, int __count) {
  return _mm_srai_pi16(__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sra_pi32(__m64 __m, __m64 __count) {
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = m.as_int[0] >> __count;
  res.as_int[1] = m.as_int[1] >> __count;
  return (res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrad(__m64 __m, __m64 __count) {
  return _mm_sra_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srai_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sra_pi32.  */
  return _mm_sra_pi32(__m, __count);
}
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psradi(__m64 __m, int __count) {
  return _mm_srai_pi32(__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_pi16(__m64 __m, __m64 __count) {
  __vector unsigned short m, r;
  __vector unsigned short c;

  if (__count <= 15) {
    m = (__vector unsigned short)vec_splats(__m);
    c = (__vector unsigned short)vec_splats((unsigned short)__count);
    r = vec_sr(m, (__vector unsigned short)c);
    return (__m64)((__vector long long)r)[0];
  } else
    return (0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlw(__m64 __m, __m64 __count) {
  return _mm_srl_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_srl_pi16.  */
  return _mm_srl_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlwi(__m64 __m, int __count) {
  return _mm_srli_pi16(__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_pi32(__m64 __m, __m64 __count) {
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = (unsigned int)m.as_int[0] >> __count;
  res.as_int[1] = (unsigned int)m.as_int[1] >> __count;
  return (res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrld(__m64 __m, __m64 __count) {
  return _mm_srl_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_srl_pi32.  */
  return _mm_srl_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrldi(__m64 __m, int __count) {
  return _mm_srli_pi32(__m, __count);
}
#endif /* _ARCH_PWR8 */

/* Creates a vector of two 32-bit values; I0 is least significant.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi32(int __i1, int __i0) {
  __m64_union res;

  res.as_int[0] = __i0;
  res.as_int[1] = __i1;
  return (res.as_m64);
}
/* Creates a vector of four 16-bit values; W0 is least significant.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi16(short __w3, short __w2, short __w1, short __w0) {
  __m64_union res;

  res.as_short[0] = __w0;
  res.as_short[1] = __w1;
  res.as_short[2] = __w2;
  res.as_short[3] = __w3;
  return (res.as_m64);
}

/* Creates a vector of eight 8-bit values; B0 is least significant.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3,
                char __b2, char __b1, char __b0) {
  __m64_union res;

  res.as_char[0] = __b0;
  res.as_char[1] = __b1;
  res.as_char[2] = __b2;
  res.as_char[3] = __b3;
  res.as_char[4] = __b4;
  res.as_char[5] = __b5;
  res.as_char[6] = __b6;
  res.as_char[7] = __b7;
  return (res.as_m64);
}

/* Similar, but with the arguments in reverse order.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pi32(int __i0, int __i1) {
  __m64_union res;

  res.as_int[0] = __i0;
  res.as_int[1] = __i1;
  return (res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) {
  return _mm_set_pi16(__w3, __w2, __w1, __w0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4,
                 char __b5, char __b6, char __b7) {
  return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}

/* Creates a vector of two 32-bit values, both elements containing I.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pi32(int __i) {
  __m64_union res;

  res.as_int[0] = __i;
  res.as_int[1] = __i;
  return (res.as_m64);
}

/* Creates a vector of four 16-bit values, all elements containing W.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pi16(short __w) {
#if _ARCH_PWR9
  __vector signed short w;

  w = (__vector signed short)vec_splats(__w);
  return (__m64)((__vector long long)w)[0];
#else
  __m64_union res;

  res.as_short[0] = __w;
  res.as_short[1] = __w;
  res.as_short[2] = __w;
  res.as_short[3] = __w;
  return (res.as_m64);
#endif
}
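/* Illustrative sketch: _mm_set_* takes the most significant element first,
   while _mm_setr_* takes elements in lane (memory) order, so these two
   calls build the same __m64 value:

     __m64 a = _mm_set_pi16(4, 3, 2, 1);    // lane 0 = 1, lane 3 = 4
     __m64 b = _mm_setr_pi16(1, 2, 3, 4);   // identical layout
*/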
/* Creates a vector of eight 8-bit values, all elements containing B.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pi8(signed char __b) {
#if _ARCH_PWR8
  __vector signed char b;

  b = (__vector signed char)vec_splats(__b);
  return (__m64)((__vector long long)b)[0];
#else
  __m64_union res;

  res.as_char[0] = __b;
  res.as_char[1] = __b;
  res.as_char[2] = __b;
  res.as_char[3] = __b;
  res.as_char[4] = __b;
  res.as_char[5] = __b;
  res.as_char[6] = __b;
  res.as_char[7] = __b;
  return (res.as_m64);
#endif
}

#else
#include_next <mmintrin.h>
#endif /* defined(__linux__) && defined(__ppc64__) */

#endif /* _MMINTRIN_H_INCLUDED */