1/****************************************************************************
2* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
3*
4* Permission is hereby granted, free of charge, to any person obtaining a
5* copy of this software and associated documentation files (the "Software"),
6* to deal in the Software without restriction, including without limitation
7* the rights to use, copy, modify, merge, publish, distribute, sublicense,
8* and/or sell copies of the Software, and to permit persons to whom the
9* Software is furnished to do so, subject to the following conditions:
10*
11* The above copyright notice and this permission notice (including the next
12* paragraph) shall be included in all copies or substantial portions of the
13* Software.
14*
15* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21* IN THE SOFTWARE.
22****************************************************************************/
23#if !defined(__SIMD_LIB_AVX_HPP__)
24#error Do not include this file directly, use "simdlib.hpp" instead.
25#endif
26
27//============================================================================
28// SIMD128 AVX (1) implementation
29//============================================================================
30
31#define SIMD_WRAPPER_1(op)  \
32    static SIMDINLINE Float SIMDCALL op(Float a)   \
33    {\
34        return _mm_##op(a);\
35    }
36
37#define SIMD_WRAPPER_2(op)  \
38    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
39    {\
40        return _mm_##op(a, b);\
41    }
42
43#define SIMD_DWRAPPER_2(op)  \
44    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
45    {\
46        return _mm_##op(a, b);\
47    }
48
49#define SIMD_WRAPPER_2I(op)  \
50    template<int ImmT>\
51    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
52    {\
53        return _mm_##op(a, b, ImmT);\
54    }
55
56#define SIMD_DWRAPPER_2I(op)  \
57    template<int ImmT>\
58    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
59    {\
60        return _mm_##op(a, b, ImmT);\
61    }
62
63#define SIMD_WRAPPER_3(op)  \
64    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)   \
65    {\
66        return _mm_##op(a, b, c);\
67    }
68
69#define SIMD_IWRAPPER_1(op)  \
70    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
71    {\
72        return _mm_##op(a);\
73    }
74
75#define SIMD_IWRAPPER_1I_(op, intrin)  \
76    template<int ImmT> \
77    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
78    {\
79        return intrin(a, ImmT);\
80    }
81#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm_##op)
82
83#define SIMD_IWRAPPER_2_(op, intrin)  \
84    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
85    {\
86        return intrin(a, b);\
87    }
88
89#define SIMD_IWRAPPER_2(op)  \
90    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
91    {\
92        return _mm_##op(a, b);\
93    }
94
95#define SIMD_IFWRAPPER_2(op, intrin)  \
96    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
97    {\
98        return castps_si( intrin(castsi_ps(a), castsi_ps(b)) );\
99    }
100
101#define SIMD_IWRAPPER_2I(op)  \
102    template<int ImmT>\
103    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
104    {\
105        return _mm_##op(a, b, ImmT);\
106    }
107
108//-----------------------------------------------------------------------
109// Single precision floating point arithmetic operations
110//-----------------------------------------------------------------------
111SIMD_WRAPPER_2(add_ps);     // return a + b
112SIMD_WRAPPER_2(div_ps);     // return a / b
113SIMD_WRAPPER_2(max_ps);     // return (a > b) ? a : b
114SIMD_WRAPPER_2(min_ps);     // return (a < b) ? a : b
115SIMD_WRAPPER_2(mul_ps);     // return a * b
116SIMD_WRAPPER_1(rcp_ps);     // return 1.0f / a
117SIMD_WRAPPER_1(rsqrt_ps);   // return 1.0f / sqrt(a)
118SIMD_WRAPPER_2(sub_ps);     // return a - b
119
120static SIMDINLINE Float SIMDCALL fmadd_ps(Float a, Float b, Float c)    // return (a * b) + c
121{
122    return add_ps(mul_ps(a, b), c);
123}
124static SIMDINLINE Float SIMDCALL fmsub_ps(Float a, Float b, Float c)    // return (a * b) - c
125{
126    return sub_ps(mul_ps(a, b), c);
127}
128
129template <RoundMode RMT>
130static SIMDINLINE Float SIMDCALL round_ps(Float a)
131{
132    return _mm_round_ps(a, static_cast<int>(RMT));
133}
134
135static SIMDINLINE Float SIMDCALL ceil_ps(Float a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
136static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }
137
138//-----------------------------------------------------------------------
139// Integer (various width) arithmetic operations
140//-----------------------------------------------------------------------
141SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
142SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
143SIMD_IWRAPPER_2(add_epi8);  // return a + b (int8)
144SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
145SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
146SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
147SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
148SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
149SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)
150
151// return (a * b) & 0xFFFFFFFF
152//
153// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
154// and store the low 32 bits of the intermediate integers in dst.
155SIMD_IWRAPPER_2(mullo_epi32);
156SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
157SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
158SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
159
160//-----------------------------------------------------------------------
161// Logical operations
162//-----------------------------------------------------------------------
163SIMD_WRAPPER_2(and_ps);                             // return a & b       (float treated as int)
164SIMD_IWRAPPER_2_(and_si, _mm_and_si128);        // return a & b       (int)
165SIMD_WRAPPER_2(andnot_ps);                          // return (~a) & b    (float treated as int)
166SIMD_IWRAPPER_2_(andnot_si, _mm_andnot_si128);  // return (~a) & b    (int)
167SIMD_WRAPPER_2(or_ps);                              // return a | b       (float treated as int)
168SIMD_IWRAPPER_2_(or_si, _mm_or_si128);          // return a | b       (int)
169SIMD_WRAPPER_2(xor_ps);                             // return a ^ b       (float treated as int)
170SIMD_IWRAPPER_2_(xor_si, _mm_xor_si128);        // return a ^ b       (int)
171
172
173//-----------------------------------------------------------------------
174// Shift operations
175//-----------------------------------------------------------------------
176SIMD_IWRAPPER_1I(slli_epi32);               // return a << ImmT
177
178static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer vA, Integer vB) // return a << b      (uint32)
179{
180    int32_t a, count;
181    a = _mm_extract_epi32(vA, 0);
182    count = _mm_extract_epi32(vB, 0);
183    a <<= count;
184    vA = _mm_insert_epi32(vA, a, 0);
185
186    a = _mm_extract_epi32(vA, 1);
187    count = _mm_extract_epi32(vB, 1);
188    a <<= count;
189    vA = _mm_insert_epi32(vA, a, 1);
190
191    a = _mm_extract_epi32(vA, 2);
192    count = _mm_extract_epi32(vB, 2);
193    a <<= count;
194    vA = _mm_insert_epi32(vA, a, 2);
195
196    a = _mm_extract_epi32(vA, 3);
197    count = _mm_extract_epi32(vB, 3);
198    a <<= count;
199    vA = _mm_insert_epi32(vA, a, 3);
200
201    return vA;
202}
203
204SIMD_IWRAPPER_1I(srai_epi32);               // return a >> ImmT   (int32)
205SIMD_IWRAPPER_1I(srli_epi32);               // return a >> ImmT   (uint32)
206SIMD_IWRAPPER_1I_(srli_si, _mm_srli_si128); // return a >> (ImmT*8) (uint)
207
208template<int ImmT>                              // same as srli_si, but with Float cast to int
209static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
210{
211    return castsi_ps(srli_si<ImmT>(castps_si(a)));
212}
213
214static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vB) // return a >> b      (uint32)
215{
216    int32_t a, count;
217    a = _mm_extract_epi32(vA, 0);
218    count = _mm_extract_epi32(vB, 0);
219    a >>= count;
220    vA = _mm_insert_epi32(vA, a, 0);
221
222    a = _mm_extract_epi32(vA, 1);
223    count = _mm_extract_epi32(vB, 1);
224    a >>= count;
225    vA = _mm_insert_epi32(vA, a, 1);
226
227    a = _mm_extract_epi32(vA, 2);
228    count = _mm_extract_epi32(vB, 2);
229    a >>= count;
230    vA = _mm_insert_epi32(vA, a, 2);
231
232    a = _mm_extract_epi32(vA, 3);
233    count = _mm_extract_epi32(vB, 3);
234    a >>= count;
235    vA = _mm_insert_epi32(vA, a, 3);
236
237    return vA;
238}
239
240
241
242//-----------------------------------------------------------------------
243// Conversion operations
244//-----------------------------------------------------------------------
245static SIMDINLINE Float SIMDCALL castpd_ps(Double a)   // return *(Float*)(&a)
246{
247    return _mm_castpd_ps(a);
248}
249
250static SIMDINLINE Integer SIMDCALL castps_si(Float a)   // return *(Integer*)(&a)
251{
252    return _mm_castps_si128(a);
253}
254
255static SIMDINLINE Double SIMDCALL castsi_pd(Integer a)   // return *(Double*)(&a)
256{
257    return _mm_castsi128_pd(a);
258}
259
260static SIMDINLINE Double SIMDCALL castps_pd(Float a)   // return *(Double*)(&a)
261{
262    return _mm_castps_pd(a);
263}
264
265static SIMDINLINE Float SIMDCALL castsi_ps(Integer a)   // return *(Float*)(&a)
266{
267    return _mm_castsi128_ps(a);
268}
269
270static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a    (int32 --> float)
271{
272    return _mm_cvtepi32_ps(a);
273}
274
275SIMD_IWRAPPER_1(cvtepu8_epi16);     // return (int16)a    (uint8 --> int16)
276SIMD_IWRAPPER_1(cvtepu8_epi32);     // return (int32)a    (uint8 --> int32)
277SIMD_IWRAPPER_1(cvtepu16_epi32);    // return (int32)a    (uint16 --> int32)
278SIMD_IWRAPPER_1(cvtepu16_epi64);    // return (int64)a    (uint16 --> int64)
279SIMD_IWRAPPER_1(cvtepu32_epi64);    // return (int64)a    (uint32 --> int64)
280
281static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a)            // return (int32)a    (float --> int32)
282{
283    return _mm_cvtps_epi32(a);
284}
285
286static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a)           // return (int32)a    (rnd_to_zero(float) --> int32)
287{
288    return _mm_cvttps_epi32(a);
289}
290
291//-----------------------------------------------------------------------
292// Comparison operations
293//-----------------------------------------------------------------------
294template<CompareType CmpTypeT>
295static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
296{
297    return _mm_cmp_ps(a, b, static_cast<const int>(CmpTypeT));
298}
299static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
300static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
301static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
302static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
303static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
304static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
305
306SIMD_IWRAPPER_2(cmpeq_epi8);    // return a == b (int8)
307SIMD_IWRAPPER_2(cmpeq_epi16);   // return a == b (int16)
308SIMD_IWRAPPER_2(cmpeq_epi32);   // return a == b (int32)
309SIMD_IWRAPPER_2(cmpeq_epi64);   // return a == b (int64)
310SIMD_IWRAPPER_2(cmpgt_epi8);    // return a > b (int8)
311SIMD_IWRAPPER_2(cmpgt_epi16);   // return a > b (int16)
312SIMD_IWRAPPER_2(cmpgt_epi32);   // return a > b (int32)
313SIMD_IWRAPPER_2(cmpgt_epi64);   // return a > b (int64)
314SIMD_IWRAPPER_2(cmplt_epi32);   // return a < b (int32)
315
316static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b)  // return all_lanes_zero(a & b) ? 1 : 0 (float)
317{
318    return  0 != _mm_testz_ps(a, b);
319}
320
321static SIMDINLINE bool SIMDCALL testz_si(Integer a, Integer b)  // return all_lanes_zero(a & b) ? 1 : 0 (int)
322{
323    return  0 != _mm_testz_si128(a, b);
324}
325
326//-----------------------------------------------------------------------
327// Blend / shuffle / permute operations
328//-----------------------------------------------------------------------
329SIMD_WRAPPER_2I(blend_ps);  // return ImmT ? b : a  (float)
330SIMD_WRAPPER_3(blendv_ps);  // return mask ? b : a  (float)
331
332static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask) // return mask ? b : a (int)
333{
334    return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask));
335}
336
337static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask) // return mask ? b : a (int)
338{
339    return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask)));
340}
341
342static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p)  // return *p (all elements in vector get same value)
343{
344    return _mm_broadcast_ss(p);
345}
346
347SIMD_IWRAPPER_2(packs_epi16);   // See documentation for _mm_packs_epi16 and _mm512_packs_epi16
348SIMD_IWRAPPER_2(packs_epi32);   // See documentation for _mm_packs_epi32 and _mm512_packs_epi32
349SIMD_IWRAPPER_2(packus_epi16);  // See documentation for _mm_packus_epi16 and _mm512_packus_epi16
350SIMD_IWRAPPER_2(packus_epi32);  // See documentation for _mm_packus_epi32 and _mm512_packus_epi32
351
352static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
353{
354    return castps_si(_mm_permutevar_ps(castsi_ps(a), swiz));
355}
356
357static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
358{
359    return _mm_permutevar_ps(a, swiz);
360}
361
362SIMD_IWRAPPER_1I(shuffle_epi32);
363
364template<int ImmT>
365static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b) = delete;
366
367SIMD_IWRAPPER_2(shuffle_epi8);
368SIMD_DWRAPPER_2I(shuffle_pd);
369SIMD_WRAPPER_2I(shuffle_ps);
370SIMD_IWRAPPER_2(unpackhi_epi16);
371
372//SIMD_IFWRAPPER_2(unpackhi_epi32, _mm_unpackhi_ps);
373static SIMDINLINE Integer SIMDCALL unpackhi_epi32(Integer a, Integer b)
374{
375    return castps_si(_mm_unpackhi_ps(castsi_ps(a), castsi_ps(b)));
376}
377
378SIMD_IWRAPPER_2(unpackhi_epi64);
379SIMD_IWRAPPER_2(unpackhi_epi8);
380SIMD_DWRAPPER_2(unpackhi_pd);
381SIMD_WRAPPER_2(unpackhi_ps);
382SIMD_IWRAPPER_2(unpacklo_epi16);
383SIMD_IFWRAPPER_2(unpacklo_epi32, _mm_unpacklo_ps);
384SIMD_IWRAPPER_2(unpacklo_epi64);
385SIMD_IWRAPPER_2(unpacklo_epi8);
386SIMD_DWRAPPER_2(unpacklo_pd);
387SIMD_WRAPPER_2(unpacklo_ps);
388
389//-----------------------------------------------------------------------
390// Load / store operations
391//-----------------------------------------------------------------------
392template<ScaleFactor ScaleT>
393static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
394{
395    uint32_t *pOffsets = (uint32_t*)&idx;
396    Float vResult;
397    float* pResult = (float*)&vResult;
398    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
399    {
400        uint32_t offset = pOffsets[i];
401        offset = offset * static_cast<uint32_t>(ScaleT);
402        pResult[i] = *(float const*)(((uint8_t const*)p + offset));
403    }
404
405    return vResult;
406}
407
408static SIMDINLINE Float SIMDCALL load1_ps(float const *p)  // return *p    (broadcast 1 value to all elements)
409{
410    return broadcast_ss(p);
411}
412
413static SIMDINLINE Float SIMDCALL load_ps(float const *p)   // return *p    (loads SIMD width elements from memory)
414{
415    return _mm_load_ps(p);
416}
417
418static SIMDINLINE Integer SIMDCALL load_si(Integer const *p)  // return *p
419{
420    return _mm_load_si128(&p->v);
421}
422
423static SIMDINLINE Float SIMDCALL loadu_ps(float const *p)  // return *p    (same as load_ps but allows for unaligned mem)
424{
425    return _mm_loadu_ps(p);
426}
427
428static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p    (same as load_si but allows for unaligned mem)
429{
430    return _mm_lddqu_si128(&p->v);
431}
432
433// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
434template<ScaleFactor ScaleT>
435static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
436{
437    uint32_t *pOffsets = (uint32_t*)&idx;
438    Float vResult = old;
439    float* pResult = (float*)&vResult;
440    DWORD index;
441    uint32_t umask = movemask_ps(mask);
442    while (_BitScanForward(&index, umask))
443    {
444        umask &= ~(1 << index);
445        uint32_t offset = pOffsets[index];
446        offset = offset * static_cast<uint32_t>(ScaleT);
447        pResult[index] = *(float const *)(((uint8_t const *)p + offset));
448    }
449
450    return vResult;
451}
452
453static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
454{
455    _mm_maskstore_ps(p, mask, src);
456}
457
458static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
459{
460    return static_cast<uint32_t>(_mm_movemask_epi8(a));
461}
462
463static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
464{
465    return static_cast<uint32_t>(_mm_movemask_pd(a));
466}
467static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
468{
469    return static_cast<uint32_t>(_mm_movemask_ps(a));
470}
471
472static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
473{
474    return _mm_set1_epi32(i);
475}
476
477static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
478{
479    return _mm_set1_epi8(i);
480}
481
482static SIMDINLINE Float SIMDCALL set1_ps(float f)  // return f (all elements are same value)
483{
484    return _mm_set1_ps(f);
485}
486
487static SIMDINLINE Float SIMDCALL setzero_ps()      // return 0 (float)
488{
489    return _mm_setzero_ps();
490}
491
492static SIMDINLINE Integer SIMDCALL setzero_si()      // return 0 (integer)
493{
494    return _mm_setzero_si128();
495}
496
497static SIMDINLINE void SIMDCALL store_ps(float *p, Float a)    // *p = a   (stores all elements contiguously in memory)
498{
499    _mm_store_ps(p, a);
500}
501
502static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a)   // *p = a
503{
504    _mm_store_si128(&p->v, a);
505}
506
507static SIMDINLINE void SIMDCALL storeu_si(Integer *p, Integer a) // *p = a    (same as store_si but allows for unaligned mem)
508{
509    _mm_storeu_si128(&p->v, a);
510}
511
512static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a)   // *p = a   (same as store_ps, but doesn't keep memory in cache)
513{
514    _mm_stream_ps(p, a);
515}
516
517static SIMDINLINE Float SIMDCALL set_ps(float in3, float in2, float in1, float in0)
518{
519    return _mm_set_ps(in3, in2, in1, in0);
520}
521
522static SIMDINLINE Integer SIMDCALL set_epi32(int in3, int in2, int in1, int in0)
523{
524    return _mm_set_epi32(in3, in2, in1, in0);
525}
526
527template <int ImmT>
528static SIMDINLINE float SIMDCALL extract_ps(Float a)
529{
530    int tmp = _mm_extract_ps(a, ImmT);
531    return *reinterpret_cast<float*>(&tmp);
532}
533
534static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
535{
536    Integer vec = set1_epi32(mask);
537    const Integer bit = set_epi32(
538        0x08, 0x04, 0x02, 0x01);
539    vec = and_si(vec, bit);
540    vec = cmplt_epi32(setzero_si(), vec);
541    return castsi_ps(vec);
542}
543
544#undef SIMD_WRAPPER_1
545#undef SIMD_WRAPPER_2
546#undef SIMD_DWRAPPER_2
547#undef SIMD_DWRAPPER_2I
548#undef SIMD_WRAPPER_2I
549#undef SIMD_WRAPPER_3
550#undef SIMD_IWRAPPER_1
551#undef SIMD_IWRAPPER_2
552#undef SIMD_IFWRAPPER_2
553#undef SIMD_IWRAPPER_2I
554#undef SIMD_IWRAPPER_1
555#undef SIMD_IWRAPPER_1I
556#undef SIMD_IWRAPPER_1I_
557#undef SIMD_IWRAPPER_2
558#undef SIMD_IWRAPPER_2_
559#undef SIMD_IWRAPPER_2I
560
561