1 /*===---- tmmintrin.h - SSSE3 intrinsics -----------------------------------===
2  *
3  * Permission is hereby granted, free of charge, to any person obtaining a copy
4  * of this software and associated documentation files (the "Software"), to deal
5  * in the Software without restriction, including without limitation the rights
6  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7  * copies of the Software, and to permit persons to whom the Software is
8  * furnished to do so, subject to the following conditions:
9  *
10  * The above copyright notice and this permission notice shall be included in
11  * all copies or substantial portions of the Software.
12  *
13  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19  * THE SOFTWARE.
20  *
21  *===-----------------------------------------------------------------------===
22  */
23 
24 #ifndef __TMMINTRIN_H
25 #define __TMMINTRIN_H
26 
27 #include <pmmintrin.h>
28 
/* Attribute set applied to every intrinsic defined in this header: always
 * inline the wrapper, emit no debug info for it, and compile it with the
 * "ssse3" target feature enabled. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("ssse3")))
31 
32 /// \brief Computes the absolute value of each of the packed 8-bit signed
33 ///    integers in the source operand and stores the 8-bit unsigned integer
34 ///    results in the destination.
35 ///
36 /// \headerfile <x86intrin.h>
37 ///
38 /// This intrinsic corresponds to the \c PABSB instruction.
39 ///
40 /// \param __a
41 ///    A 64-bit vector of [8 x i8].
42 /// \returns A 64-bit integer vector containing the absolute values of the
43 ///    elements in the operand.
44 static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_abs_pi8(__m64 __a)45 _mm_abs_pi8(__m64 __a)
46 {
47     return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
48 }
49 
50 /// \brief Computes the absolute value of each of the packed 8-bit signed
51 ///    integers in the source operand and stores the 8-bit unsigned integer
52 ///    results in the destination.
53 ///
54 /// \headerfile <x86intrin.h>
55 ///
56 /// This intrinsic corresponds to the \c VPABSB instruction.
57 ///
58 /// \param __a
59 ///    A 128-bit vector of [16 x i8].
60 /// \returns A 128-bit integer vector containing the absolute values of the
61 ///    elements in the operand.
62 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_abs_epi8(__m128i __a)63 _mm_abs_epi8(__m128i __a)
64 {
65     return (__m128i)__builtin_ia32_pabsb128((__v16qi)__a);
66 }
67 
68 /// \brief Computes the absolute value of each of the packed 16-bit signed
69 ///    integers in the source operand and stores the 16-bit unsigned integer
70 ///    results in the destination.
71 ///
72 /// \headerfile <x86intrin.h>
73 ///
74 /// This intrinsic corresponds to the \c PABSW instruction.
75 ///
76 /// \param __a
77 ///    A 64-bit vector of [4 x i16].
78 /// \returns A 64-bit integer vector containing the absolute values of the
79 ///    elements in the operand.
80 static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_abs_pi16(__m64 __a)81 _mm_abs_pi16(__m64 __a)
82 {
83     return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
84 }
85 
86 /// \brief Computes the absolute value of each of the packed 16-bit signed
87 ///    integers in the source operand and stores the 16-bit unsigned integer
88 ///    results in the destination.
89 ///
90 /// \headerfile <x86intrin.h>
91 ///
92 /// This intrinsic corresponds to the \c VPABSW instruction.
93 ///
94 /// \param __a
95 ///    A 128-bit vector of [8 x i16].
96 /// \returns A 128-bit integer vector containing the absolute values of the
97 ///    elements in the operand.
98 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_abs_epi16(__m128i __a)99 _mm_abs_epi16(__m128i __a)
100 {
101     return (__m128i)__builtin_ia32_pabsw128((__v8hi)__a);
102 }
103 
104 /// \brief Computes the absolute value of each of the packed 32-bit signed
105 ///    integers in the source operand and stores the 32-bit unsigned integer
106 ///    results in the destination.
107 ///
108 /// \headerfile <x86intrin.h>
109 ///
110 /// This intrinsic corresponds to the \c PABSD instruction.
111 ///
112 /// \param __a
113 ///    A 64-bit vector of [2 x i32].
114 /// \returns A 64-bit integer vector containing the absolute values of the
115 ///    elements in the operand.
116 static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_abs_pi32(__m64 __a)117 _mm_abs_pi32(__m64 __a)
118 {
119     return (__m64)__builtin_ia32_pabsd((__v2si)__a);
120 }
121 
122 /// \brief Computes the absolute value of each of the packed 32-bit signed
123 ///    integers in the source operand and stores the 32-bit unsigned integer
124 ///    results in the destination.
125 ///
126 /// \headerfile <x86intrin.h>
127 ///
128 /// This intrinsic corresponds to the \c VPABSD instruction.
129 ///
130 /// \param __a
131 ///    A 128-bit vector of [4 x i32].
132 /// \returns A 128-bit integer vector containing the absolute values of the
133 ///    elements in the operand.
134 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_abs_epi32(__m128i __a)135 _mm_abs_epi32(__m128i __a)
136 {
137     return (__m128i)__builtin_ia32_pabsd128((__v4si)__a);
138 }
139 
/// \brief Joins the two 128-bit integer vector operands into a single value
///    and shifts that value right by the number of bytes given in the
///    immediate operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_alignr_epi8(__m128i a, __m128i b, const int n);
/// \endcode
///
/// This intrinsic corresponds to the \c PALIGNR instruction.
///
/// \param a
///    A 128-bit vector of [16 x i8] containing one of the source operands.
/// \param b
///    A 128-bit vector of [16 x i8] containing one of the source operands.
/// \param n
///    An immediate operand specifying how many bytes to right-shift the result.
/// \returns A 128-bit integer vector containing the concatenated right-shifted
///    value.
/* Expands to a plain parenthesized expression (not a GNU statement
 * expression) so it can be used anywhere an expression is permitted. Each
 * argument is parenthesized and evaluated exactly once. */
#define _mm_alignr_epi8(a, b, n) \
  ((__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
                                      (__v16qi)(__m128i)(b), (n)))
163 
/// \brief Joins the two 64-bit integer vector operands into a single value and
///    shifts that value right by the number of bytes given in the immediate
///    operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_alignr_pi8(__m64 a, __m64 b, const int n);
/// \endcode
///
/// This intrinsic corresponds to the \c PALIGNR instruction.
///
/// \param a
///    A 64-bit vector of [8 x i8] containing one of the source operands.
/// \param b
///    A 64-bit vector of [8 x i8] containing one of the source operands.
/// \param n
///    An immediate operand specifying how many bytes to right-shift the result.
/// \returns A 64-bit integer vector containing the concatenated right-shifted
///    value.
/* Expands to a plain parenthesized expression (not a GNU statement
 * expression) so it can be used anywhere an expression is permitted. Each
 * argument is parenthesized and evaluated exactly once. */
#define _mm_alignr_pi8(a, b, n) \
  ((__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)))
185 
186 /// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
187 ///    128-bit vectors of [8 x i16].
188 ///
189 /// \headerfile <x86intrin.h>
190 ///
191 /// This intrinsic corresponds to the \c VPHADDW instruction.
192 ///
193 /// \param __a
194 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
195 ///    horizontal sums of the values are stored in the lower bits of the
196 ///    destination.
197 /// \param __b
198 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
199 ///    horizontal sums of the values are stored in the upper bits of the
200 ///    destination.
201 /// \returns A 128-bit vector of [8 x i16] containing the horizontal sums of
202 ///    both operands.
203 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_hadd_epi16(__m128i __a,__m128i __b)204 _mm_hadd_epi16(__m128i __a, __m128i __b)
205 {
206     return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
207 }
208 
209 /// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
210 ///    128-bit vectors of [4 x i32].
211 ///
212 /// \headerfile <x86intrin.h>
213 ///
214 /// This intrinsic corresponds to the \c VPHADDD instruction.
215 ///
216 /// \param __a
217 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
218 ///    horizontal sums of the values are stored in the lower bits of the
219 ///    destination.
220 /// \param __b
221 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
222 ///    horizontal sums of the values are stored in the upper bits of the
223 ///    destination.
224 /// \returns A 128-bit vector of [4 x i32] containing the horizontal sums of
225 ///    both operands.
226 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_hadd_epi32(__m128i __a,__m128i __b)227 _mm_hadd_epi32(__m128i __a, __m128i __b)
228 {
229     return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
230 }
231 
232 /// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
233 ///    64-bit vectors of [4 x i16].
234 ///
235 /// \headerfile <x86intrin.h>
236 ///
237 /// This intrinsic corresponds to the \c PHADDW instruction.
238 ///
239 /// \param __a
240 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
241 ///    horizontal sums of the values are stored in the lower bits of the
242 ///    destination.
243 /// \param __b
244 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
245 ///    horizontal sums of the values are stored in the upper bits of the
246 ///    destination.
247 /// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
248 ///    operands.
249 static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_hadd_pi16(__m64 __a,__m64 __b)250 _mm_hadd_pi16(__m64 __a, __m64 __b)
251 {
252     return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
253 }
254 
255 /// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
256 ///    64-bit vectors of [2 x i32].
257 ///
258 /// \headerfile <x86intrin.h>
259 ///
260 /// This intrinsic corresponds to the \c PHADDD instruction.
261 ///
262 /// \param __a
263 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
264 ///    horizontal sums of the values are stored in the lower bits of the
265 ///    destination.
266 /// \param __b
267 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
268 ///    horizontal sums of the values are stored in the upper bits of the
269 ///    destination.
270 /// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
271 ///    operands.
272 static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_hadd_pi32(__m64 __a,__m64 __b)273 _mm_hadd_pi32(__m64 __a, __m64 __b)
274 {
275     return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
276 }
277 
278 /// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
279 ///    128-bit vectors of [8 x i16]. Positive sums greater than 7FFFh are
280 ///    saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
281 ///
282 /// \headerfile <x86intrin.h>
283 ///
284 /// This intrinsic corresponds to the \c VPHADDSW instruction.
285 ///
286 /// \param __a
287 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
288 ///    horizontal sums of the values are stored in the lower bits of the
289 ///    destination.
290 /// \param __b
291 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
292 ///    horizontal sums of the values are stored in the upper bits of the
293 ///    destination.
294 /// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
295 ///    sums of both operands.
296 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_hadds_epi16(__m128i __a,__m128i __b)297 _mm_hadds_epi16(__m128i __a, __m128i __b)
298 {
299     return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
300 }
301 
302 /// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
303 ///    64-bit vectors of [4 x i16]. Positive sums greater than 7FFFh are
304 ///    saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
305 ///
306 /// \headerfile <x86intrin.h>
307 ///
308 /// This intrinsic corresponds to the \c PHADDSW instruction.
309 ///
310 /// \param __a
311 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
312 ///    horizontal sums of the values are stored in the lower bits of the
313 ///    destination.
314 /// \param __b
315 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
316 ///    horizontal sums of the values are stored in the upper bits of the
317 ///    destination.
318 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
319 ///    sums of both operands.
320 static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_hadds_pi16(__m64 __a,__m64 __b)321 _mm_hadds_pi16(__m64 __a, __m64 __b)
322 {
323     return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
324 }
325 
326 /// \brief Horizontally subtracts the adjacent pairs of values contained in 2
327 ///    packed 128-bit vectors of [8 x i16].
328 ///
329 /// \headerfile <x86intrin.h>
330 ///
331 /// This intrinsic corresponds to the \c VPHSUBW instruction.
332 ///
333 /// \param __a
334 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
335 ///    horizontal differences between the values are stored in the lower bits of
336 ///    the destination.
337 /// \param __b
338 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
339 ///    horizontal differences between the values are stored in the upper bits of
340 ///    the destination.
341 /// \returns A 128-bit vector of [8 x i16] containing the horizontal differences
342 ///    of both operands.
343 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_hsub_epi16(__m128i __a,__m128i __b)344 _mm_hsub_epi16(__m128i __a, __m128i __b)
345 {
346     return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
347 }
348 
349 /// \brief Horizontally subtracts the adjacent pairs of values contained in 2
350 ///    packed 128-bit vectors of [4 x i32].
351 ///
352 /// \headerfile <x86intrin.h>
353 ///
354 /// This intrinsic corresponds to the \c VPHSUBD instruction.
355 ///
356 /// \param __a
357 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
358 ///    horizontal differences between the values are stored in the lower bits of
359 ///    the destination.
360 /// \param __b
361 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
362 ///    horizontal differences between the values are stored in the upper bits of
363 ///    the destination.
364 /// \returns A 128-bit vector of [4 x i32] containing the horizontal differences
365 ///    of both operands.
366 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_hsub_epi32(__m128i __a,__m128i __b)367 _mm_hsub_epi32(__m128i __a, __m128i __b)
368 {
369     return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
370 }
371 
372 /// \brief Horizontally subtracts the adjacent pairs of values contained in 2
373 ///    packed 64-bit vectors of [4 x i16].
374 ///
375 /// \headerfile <x86intrin.h>
376 ///
377 /// This intrinsic corresponds to the \c PHSUBW instruction.
378 ///
379 /// \param __a
380 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
381 ///    horizontal differences between the values are stored in the lower bits of
382 ///    the destination.
383 /// \param __b
384 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
385 ///    horizontal differences between the values are stored in the upper bits of
386 ///    the destination.
387 /// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
388 ///    of both operands.
389 static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_hsub_pi16(__m64 __a,__m64 __b)390 _mm_hsub_pi16(__m64 __a, __m64 __b)
391 {
392     return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
393 }
394 
395 /// \brief Horizontally subtracts the adjacent pairs of values contained in 2
396 ///    packed 64-bit vectors of [2 x i32].
397 ///
398 /// \headerfile <x86intrin.h>
399 ///
400 /// This intrinsic corresponds to the \c PHSUBD instruction.
401 ///
402 /// \param __a
403 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
404 ///    horizontal differences between the values are stored in the lower bits of
405 ///    the destination.
406 /// \param __b
407 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
408 ///    horizontal differences between the values are stored in the upper bits of
409 ///    the destination.
410 /// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
411 ///    of both operands.
412 static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_hsub_pi32(__m64 __a,__m64 __b)413 _mm_hsub_pi32(__m64 __a, __m64 __b)
414 {
415     return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
416 }
417 
418 /// \brief Horizontally subtracts the adjacent pairs of values contained in 2
419 ///    packed 128-bit vectors of [8 x i16]. Positive differences greater than
420 ///    7FFFh are saturated to 7FFFh. Negative differences less than 8000h are
421 ///    saturated to 8000h.
422 ///
423 /// \headerfile <x86intrin.h>
424 ///
425 /// This intrinsic corresponds to the \c VPHSUBSW instruction.
426 ///
427 /// \param __a
428 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
429 ///    horizontal differences between the values are stored in the lower bits of
430 ///    the destination.
431 /// \param __b
432 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
433 ///    horizontal differences between the values are stored in the upper bits of
434 ///    the destination.
435 /// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
436 ///    differences of both operands.
437 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_hsubs_epi16(__m128i __a,__m128i __b)438 _mm_hsubs_epi16(__m128i __a, __m128i __b)
439 {
440     return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
441 }
442 
443 /// \brief Horizontally subtracts the adjacent pairs of values contained in 2
444 ///    packed 64-bit vectors of [4 x i16]. Positive differences greater than
445 ///    7FFFh are saturated to 7FFFh. Negative differences less than 8000h are
446 ///    saturated to 8000h.
447 ///
448 /// \headerfile <x86intrin.h>
449 ///
450 /// This intrinsic corresponds to the \c PHSUBSW instruction.
451 ///
452 /// \param __a
453 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
454 ///    horizontal differences between the values are stored in the lower bits of
455 ///    the destination.
456 /// \param __b
457 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
458 ///    horizontal differences between the values are stored in the upper bits of
459 ///    the destination.
460 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
461 ///    differences of both operands.
462 static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_hsubs_pi16(__m64 __a,__m64 __b)463 _mm_hsubs_pi16(__m64 __a, __m64 __b)
464 {
465     return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
466 }
467 
468 /// \brief Multiplies corresponding pairs of packed 8-bit unsigned integer
469 ///    values contained in the first source operand and packed 8-bit signed
470 ///    integer values contained in the second source operand, adds pairs of
471 ///    contiguous products with signed saturation, and writes the 16-bit sums to
472 ///    the corresponding bits in the destination. For example, bits [7:0] of
473 ///    both operands are multiplied, bits [15:8] of both operands are
474 ///    multiplied, and the sum of both results is written to bits [15:0] of the
475 ///    destination.
476 ///
477 /// \headerfile <x86intrin.h>
478 ///
479 /// This intrinsic corresponds to the \c VPMADDUBSW instruction.
480 ///
481 /// \param __a
482 ///    A 128-bit integer vector containing the first source operand.
483 /// \param __b
484 ///    A 128-bit integer vector containing the second source operand.
485 /// \returns A 128-bit integer vector containing the sums of products of both
486 ///    operands:
487 ///    R0 := (__a0 * __b0) + (__a1 * __b1)
488 ///    R1 := (__a2 * __b2) + (__a3 * __b3)
489 ///    R2 := (__a4 * __b4) + (__a5 * __b5)
490 ///    R3 := (__a6 * __b6) + (__a7 * __b7)
491 ///    R4 := (__a8 * __b8) + (__a9 * __b9)
492 ///    R5 := (__a10 * __b10) + (__a11 * __b11)
493 ///    R6 := (__a12 * __b12) + (__a13 * __b13)
494 ///    R7 := (__a14 * __b14) + (__a15 * __b15)
495 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maddubs_epi16(__m128i __a,__m128i __b)496 _mm_maddubs_epi16(__m128i __a, __m128i __b)
497 {
498     return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
499 }
500 
501 /// \brief Multiplies corresponding pairs of packed 8-bit unsigned integer
502 ///    values contained in the first source operand and packed 8-bit signed
503 ///    integer values contained in the second source operand, adds pairs of
504 ///    contiguous products with signed saturation, and writes the 16-bit sums to
505 ///    the corresponding bits in the destination. For example, bits [7:0] of
506 ///    both operands are multiplied, bits [15:8] of both operands are
507 ///    multiplied, and the sum of both results is written to bits [15:0] of the
508 ///    destination.
509 ///
510 /// \headerfile <x86intrin.h>
511 ///
512 /// This intrinsic corresponds to the \c PMADDUBSW instruction.
513 ///
514 /// \param __a
515 ///    A 64-bit integer vector containing the first source operand.
516 /// \param __b
517 ///    A 64-bit integer vector containing the second source operand.
518 /// \returns A 64-bit integer vector containing the sums of products of both
519 ///    operands:
520 ///    R0 := (__a0 * __b0) + (__a1 * __b1)
521 ///    R1 := (__a2 * __b2) + (__a3 * __b3)
522 ///    R2 := (__a4 * __b4) + (__a5 * __b5)
523 ///    R3 := (__a6 * __b6) + (__a7 * __b7)
524 static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_maddubs_pi16(__m64 __a,__m64 __b)525 _mm_maddubs_pi16(__m64 __a, __m64 __b)
526 {
527     return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
528 }
529 
530 /// \brief Multiplies packed 16-bit signed integer values, truncates the 32-bit
531 ///    products to the 18 most significant bits by right-shifting, rounds the
532 ///    truncated value by adding 1, and writes bits [16:1] to the destination.
533 ///
534 /// \headerfile <x86intrin.h>
535 ///
536 /// This intrinsic corresponds to the \c VPMULHRSW instruction.
537 ///
538 /// \param __a
539 ///    A 128-bit vector of [8 x i16] containing one of the source operands.
540 /// \param __b
541 ///    A 128-bit vector of [8 x i16] containing one of the source operands.
542 /// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled
543 ///    products of both operands.
544 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mulhrs_epi16(__m128i __a,__m128i __b)545 _mm_mulhrs_epi16(__m128i __a, __m128i __b)
546 {
547     return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
548 }
549 
550 /// \brief Multiplies packed 16-bit signed integer values, truncates the 32-bit
551 ///    products to the 18 most significant bits by right-shifting, rounds the
552 ///    truncated value by adding 1, and writes bits [16:1] to the destination.
553 ///
554 /// \headerfile <x86intrin.h>
555 ///
556 /// This intrinsic corresponds to the \c PMULHRSW instruction.
557 ///
558 /// \param __a
559 ///    A 64-bit vector of [4 x i16] containing one of the source operands.
560 /// \param __b
561 ///    A 64-bit vector of [4 x i16] containing one of the source operands.
562 /// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
563 ///    products of both operands.
564 static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_mulhrs_pi16(__m64 __a,__m64 __b)565 _mm_mulhrs_pi16(__m64 __a, __m64 __b)
566 {
567     return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
568 }
569 
570 /// \brief Copies the 8-bit integers from a 128-bit integer vector to the
571 ///    destination or clears 8-bit values in the destination, as specified by
572 ///    the second source operand.
573 ///
574 /// \headerfile <x86intrin.h>
575 ///
576 /// This intrinsic corresponds to the \c VPSHUFB instruction.
577 ///
578 /// \param __a
579 ///    A 128-bit integer vector containing the values to be copied.
580 /// \param __b
581 ///    A 128-bit integer vector containing control bytes corresponding to
582 ///    positions in the destination:
583 ///    Bit 7:
584 ///    1: Clear the corresponding byte in the destination.
585 ///    0: Copy the selected source byte to the corresponding byte in the
586 ///    destination.
587 ///    Bits [6:4] Reserved.
588 ///    Bits [3:0] select the source byte to be copied.
589 /// \returns A 128-bit integer vector containing the copied or cleared values.
590 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_shuffle_epi8(__m128i __a,__m128i __b)591 _mm_shuffle_epi8(__m128i __a, __m128i __b)
592 {
593     return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
594 }
595 
596 /// \brief Copies the 8-bit integers from a 64-bit integer vector to the
597 ///    destination or clears 8-bit values in the destination, as specified by
598 ///    the second source operand.
599 ///
600 /// \headerfile <x86intrin.h>
601 ///
602 /// This intrinsic corresponds to the \c PSHUFB instruction.
603 ///
604 /// \param __a
605 ///    A 64-bit integer vector containing the values to be copied.
606 /// \param __b
607 ///    A 64-bit integer vector containing control bytes corresponding to
608 ///    positions in the destination:
609 ///    Bit 7:
610 ///    1: Clear the corresponding byte in the destination.
611 ///    0: Copy the selected source byte to the corresponding byte in the
612 ///    destination.
613 ///    Bits [3:0] select the source byte to be copied.
614 /// \returns A 64-bit integer vector containing the copied or cleared values.
615 static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_shuffle_pi8(__m64 __a,__m64 __b)616 _mm_shuffle_pi8(__m64 __a, __m64 __b)
617 {
618     return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
619 }
620 
621 /// \brief For each 8-bit integer in the first source operand, perform one of
622 ///    the following actions as specified by the second source operand: If the
623 ///    byte in the second source is negative, calculate the two's complement of
624 ///    the corresponding byte in the first source, and write that value to the
625 ///    destination. If the byte in the second source is positive, copy the
626 ///    corresponding byte from the first source to the destination. If the byte
627 ///    in the second source is zero, clear the corresponding byte in the
628 ///    destination.
629 ///
630 /// \headerfile <x86intrin.h>
631 ///
632 /// This intrinsic corresponds to the \c VPSIGNB instruction.
633 ///
634 /// \param __a
635 ///    A 128-bit integer vector containing the values to be copied.
636 /// \param __b
637 ///    A 128-bit integer vector containing control bytes corresponding to
638 ///    positions in the destination.
639 /// \returns A 128-bit integer vector containing the resultant values.
640 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sign_epi8(__m128i __a,__m128i __b)641 _mm_sign_epi8(__m128i __a, __m128i __b)
642 {
643     return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b);
644 }
645 
646 /// \brief For each 16-bit integer in the first source operand, perform one of
647 ///    the following actions as specified by the second source operand: If the
648 ///    word in the second source is negative, calculate the two's complement of
649 ///    the corresponding word in the first source, and write that value to the
650 ///    destination. If the word in the second source is positive, copy the
651 ///    corresponding word from the first source to the destination. If the word
652 ///    in the second source is zero, clear the corresponding word in the
653 ///    destination.
654 ///
655 /// \headerfile <x86intrin.h>
656 ///
657 /// This intrinsic corresponds to the \c VPSIGNW instruction.
658 ///
659 /// \param __a
660 ///    A 128-bit integer vector containing the values to be copied.
661 /// \param __b
662 ///    A 128-bit integer vector containing control words corresponding to
663 ///    positions in the destination.
664 /// \returns A 128-bit integer vector containing the resultant values.
665 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sign_epi16(__m128i __a,__m128i __b)666 _mm_sign_epi16(__m128i __a, __m128i __b)
667 {
668     return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b);
669 }
670 
671 /// \brief For each 32-bit integer in the first source operand, perform one of
672 ///    the following actions as specified by the second source operand: If the
673 ///    doubleword in the second source is negative, calculate the two's
674 ///    complement of the corresponding word in the first source, and write that
675 ///    value to the destination. If the doubleword in the second source is
676 ///    positive, copy the corresponding word from the first source to the
677 ///    destination. If the doubleword in the second source is zero, clear the
678 ///    corresponding word in the destination.
679 ///
680 /// \headerfile <x86intrin.h>
681 ///
682 /// This intrinsic corresponds to the \c VPSIGND instruction.
683 ///
684 /// \param __a
685 ///    A 128-bit integer vector containing the values to be copied.
686 /// \param __b
687 ///    A 128-bit integer vector containing control doublewords corresponding to
688 ///    positions in the destination.
689 /// \returns A 128-bit integer vector containing the resultant values.
690 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sign_epi32(__m128i __a,__m128i __b)691 _mm_sign_epi32(__m128i __a, __m128i __b)
692 {
693     return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b);
694 }
695 
696 /// \brief For each 8-bit integer in the first source operand, perform one of
697 ///    the following actions as specified by the second source operand: If the
698 ///    byte in the second source is negative, calculate the two's complement of
699 ///    the corresponding byte in the first source, and write that value to the
700 ///    destination. If the byte in the second source is positive, copy the
701 ///    corresponding byte from the first source to the destination. If the byte
702 ///    in the second source is zero, clear the corresponding byte in the
703 ///    destination.
704 ///
705 /// \headerfile <x86intrin.h>
706 ///
707 /// This intrinsic corresponds to the \c PSIGNB instruction.
708 ///
709 /// \param __a
710 ///    A 64-bit integer vector containing the values to be copied.
711 /// \param __b
712 ///    A 64-bit integer vector containing control bytes corresponding to
713 ///    positions in the destination.
714 /// \returns A 64-bit integer vector containing the resultant values.
715 static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_sign_pi8(__m64 __a,__m64 __b)716 _mm_sign_pi8(__m64 __a, __m64 __b)
717 {
718     return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
719 }
720 
721 /// \brief For each 16-bit integer in the first source operand, perform one of
722 ///    the following actions as specified by the second source operand: If the
723 ///    word in the second source is negative, calculate the two's complement of
724 ///    the corresponding word in the first source, and write that value to the
725 ///    destination. If the word in the second source is positive, copy the
726 ///    corresponding word from the first source to the destination. If the word
727 ///    in the second source is zero, clear the corresponding word in the
728 ///    destination.
729 ///
730 /// \headerfile <x86intrin.h>
731 ///
732 /// This intrinsic corresponds to the \c PSIGNW instruction.
733 ///
734 /// \param __a
735 ///    A 64-bit integer vector containing the values to be copied.
736 /// \param __b
737 ///    A 64-bit integer vector containing control words corresponding to
738 ///    positions in the destination.
739 /// \returns A 64-bit integer vector containing the resultant values.
740 static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_sign_pi16(__m64 __a,__m64 __b)741 _mm_sign_pi16(__m64 __a, __m64 __b)
742 {
743     return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
744 }
745 
746 /// \brief For each 32-bit integer in the first source operand, perform one of
747 ///    the following actions as specified by the second source operand: If the
748 ///    doubleword in the second source is negative, calculate the two's
749 ///    complement of the corresponding doubleword in the first source, and
750 ///    write that value to the destination. If the doubleword in the second
751 ///    source is positive, copy the corresponding doubleword from the first
752 ///    source to the destination. If the doubleword in the second source is
753 ///    zero, clear the corresponding doubleword in the destination.
754 ///
755 /// \headerfile <x86intrin.h>
756 ///
757 /// This intrinsic corresponds to the \c PSIGND instruction.
758 ///
759 /// \param __a
760 ///    A 64-bit integer vector containing the values to be copied.
761 /// \param __b
762 ///    A 64-bit integer vector containing two control doublewords corresponding
763 ///    to positions in the destination.
764 /// \returns A 64-bit integer vector containing the resultant values.
765 static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_sign_pi32(__m64 __a,__m64 __b)766 _mm_sign_pi32(__m64 __a, __m64 __b)
767 {
768     return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b);
769 }
770 
771 #undef __DEFAULT_FN_ATTRS
772 
773 #endif /* __TMMINTRIN_H */
774