1/****************************************************************************
2 * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 ****************************************************************************/
// This file is a fragment of simdlib.hpp and relies on declarations made
// there; guard against accidental direct inclusion.
#if !defined(__SIMD_LIB_AVX_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif

// 128-bit backend used below to emulate the 256-bit integer operations that
// AVX (1) lacks (no 256-bit integer ALU until AVX2).
using SIMD128T = SIMD128Impl::AVXImpl;

//============================================================================
// SIMD256 AVX (1) implementation
//============================================================================
32
// Wrapper-generator macros: each expands to a thin forwarding function that
// maps the portable op name onto the matching _mm256_* intrinsic.

// Float op(Float a) -> _mm256_<op>(a)
#define SIMD_WRAPPER_1(op) \
    static SIMDINLINE Float SIMDCALL op(Float const& a) { return _mm256_##op(a); }

// Float op(Float a, Float b) -> _mm256_<op>(a, b)
#define SIMD_WRAPPER_2(op)                                              \
    static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \
    {                                                                   \
        return _mm256_##op(a, b);                                       \
    }

// Double op(Double a, Double b) -> _mm256_<op>(a, b)
#define SIMD_DWRAPPER_2(op)                                                \
    static SIMDINLINE Double SIMDCALL op(Double const& a, Double const& b) \
    {                                                                      \
        return _mm256_##op(a, b);                                          \
    }

// Float op<ImmT>(Float a, Float b) -> _mm256_<op>(a, b, ImmT)
#define SIMD_WRAPPER_2I(op)                                             \
    template <int ImmT>                                                 \
    static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \
    {                                                                   \
        return _mm256_##op(a, b, ImmT);                                 \
    }

// Double op<ImmT>(Double a, Double b) -> _mm256_<op>(a, b, ImmT)
#define SIMD_DWRAPPER_2I(op)                                               \
    template <int ImmT>                                                    \
    static SIMDINLINE Double SIMDCALL op(Double const& a, Double const& b) \
    {                                                                      \
        return _mm256_##op(a, b, ImmT);                                    \
    }

// Float op(Float a, Float b, Float c) -> _mm256_<op>(a, b, c)
#define SIMD_WRAPPER_3(op)                                                              \
    static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b, Float const& c) \
    {                                                                                   \
        return _mm256_##op(a, b, c);                                                    \
    }

// Integer op(Integer a) -> _mm256_<op>(a)
#define SIMD_IWRAPPER_1(op) \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a) { return _mm256_##op(a); }

// Integer op(Integer a, Integer b) -> _mm256_<op>(a, b)
#define SIMD_IWRAPPER_2(op)                                                   \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
    {                                                                         \
        return _mm256_##op(a, b);                                             \
    }

// Integer op implemented by bouncing through the float domain (AVX1 bitwise
// integer ops are emulated with the ps forms): cast to ps, run 'intrin',
// cast the result back to integer.
#define SIMD_IFWRAPPER_2(op, intrin)                                          \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
    {                                                                         \
        return castps_si(intrin(castsi_ps(a), castsi_ps(b)));                 \
    }

// Same as SIMD_IFWRAPPER_2, but with an immediate template argument.
#define SIMD_IFWRAPPER_2I(op, intrin)                                         \
    template <int ImmT>                                                       \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
    {                                                                         \
        return castps_si(intrin(castsi_ps(a), castsi_ps(b), ImmT));           \
    }

// Integer op<ImmT>(a, b) forwarding to a differently-named intrinsic.
#define SIMD_IWRAPPER_2I_(op, intrin)                                         \
    template <int ImmT>                                                       \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
    {                                                                         \
        return _mm256_##intrin(a, b, ImmT);                                   \
    }
#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)

// Integer op(a, b, c) -> _mm256_<op>(a, b, c)
#define SIMD_IWRAPPER_3(op)                                                                     \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b, Integer const& c) \
    {                                                                                           \
        return _mm256_##op(a, b, c);                                                            \
    }
103
// emulated integer simd: AVX (1) has no 256-bit integer instructions, so
// these macros run the 128-bit implementation (SIMD128T) on each half of
// the vector (a.v4[0] = low 128 bits, a.v4[1] = high 128 bits).

// Unary op applied independently to both 128-bit halves.
#define SIMD_EMU_IWRAPPER_1(op)                             \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a) \
    {                                                       \
        return Integer{                                     \
            SIMD128T::op(a.v4[0]),                          \
            SIMD128T::op(a.v4[1]),                          \
        };                                                  \
    }
// Widening conversion helper: both result halves are produced from the LOW
// 128 bits of the source; the upper result half consumes the low half
// shifted right by 'shift' bytes (a widening op reads only part of its
// input). A second overload accepts a bare 128-bit source directly.
#define SIMD_EMU_IWRAPPER_1L(op, shift)                                  \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a)              \
    {                                                                    \
        return Integer{                                                  \
            SIMD128T::op(a.v4[0]),                                       \
            SIMD128T::op(SIMD128T::template srli_si<shift>(a.v4[0])),    \
        };                                                               \
    }                                                                    \
    static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer const& a) \
    {                                                                    \
        return Integer{                                                  \
            SIMD128T::op(a),                                             \
            SIMD128T::op(SIMD128T::template srli_si<shift>(a)),          \
        };                                                               \
    }

// Unary op with an immediate template argument, applied per half.
#define SIMD_EMU_IWRAPPER_1I(op)                            \
    template <int ImmT>                                     \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a) \
    {                                                       \
        return Integer{                                     \
            SIMD128T::template op<ImmT>(a.v4[0]),           \
            SIMD128T::template op<ImmT>(a.v4[1]),           \
        };                                                  \
    }

// Binary op applied per half.
#define SIMD_EMU_IWRAPPER_2(op)                                               \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
    {                                                                         \
        return Integer{                                                       \
            SIMD128T::op(a.v4[0], b.v4[0]),                                   \
            SIMD128T::op(a.v4[1], b.v4[1]),                                   \
        };                                                                    \
    }
147
// Binary op with an immediate template argument, applied per 128-bit half.
// Fix: the second operand must use the 128-bit half array b.v4[0]/b.v4[1]
// (as in SIMD_EMU_IWRAPPER_2 above); the previous b.v[0]/b.v[1] referenced
// the raw 256-bit member and passed the wrong type to the 128-bit backend.
#define SIMD_EMU_IWRAPPER_2I(op)                                              \
    template <int ImmT>                                                       \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
    {                                                                         \
        return Integer{                                                       \
            SIMD128T::template op<ImmT>(a.v4[0], b.v4[0]),                    \
            SIMD128T::template op<ImmT>(a.v4[1], b.v4[1]),                    \
        };                                                                    \
    }
157
158//-----------------------------------------------------------------------
159// Single precision floating point arithmetic operations
160//-----------------------------------------------------------------------
161SIMD_WRAPPER_2(add_ps); // return a + b
162SIMD_WRAPPER_2(div_ps); // return a / b
163
164static SIMDINLINE Float SIMDCALL fmadd_ps(Float const& a,
165                                          Float const& b,
166                                          Float const& c) // return (a * b) + c
167{
168    return add_ps(mul_ps(a, b), c);
169}
170
171static SIMDINLINE Float SIMDCALL fmsub_ps(Float const& a,
172                                          Float const& b,
173                                          Float const& c) // return (a * b) - c
174{
175    return sub_ps(mul_ps(a, b), c);
176}
177
SIMD_WRAPPER_2(max_ps);   // return (a > b) ? a : b
SIMD_WRAPPER_2(min_ps);   // return (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps);   // return a * b
SIMD_WRAPPER_1(rcp_ps);   // return 1.0f / a
SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a)
SIMD_WRAPPER_2(sub_ps);   // return a - b

// Round each lane with the rounding mode chosen at compile time; RMT must
// be a template argument because vroundps takes an immediate operand.
template <RoundMode RMT>
static SIMDINLINE Float SIMDCALL round_ps(Float const& a)
{
    return _mm256_round_ps(a, static_cast<int>(RMT));
}

// round up toward +inf, suppressing precision exceptions
static SIMDINLINE Float SIMDCALL ceil_ps(Float const& a)
{
    return round_ps<RoundMode::CEIL_NOEXC>(a);
}
// round down toward -inf, suppressing precision exceptions
static SIMDINLINE Float SIMDCALL floor_ps(Float const& a)
{
    return round_ps<RoundMode::FLOOR_NOEXC>(a);
}

//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
// (all emulated per 128-bit half -- see SIMD_EMU_IWRAPPER_* above)
//-----------------------------------------------------------------------
SIMD_EMU_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
SIMD_EMU_IWRAPPER_2(add_epi32); // return a + b (int32)
SIMD_EMU_IWRAPPER_2(add_epi8);  // return a + b (int8)
SIMD_EMU_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_EMU_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
SIMD_EMU_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_EMU_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
SIMD_EMU_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
SIMD_EMU_IWRAPPER_2(mul_epi32); // return a * b (int32)

// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
SIMD_EMU_IWRAPPER_2(mullo_epi32);
SIMD_EMU_IWRAPPER_2(sub_epi32); // return a - b (int32)
SIMD_EMU_IWRAPPER_2(sub_epi64); // return a - b (int64)
SIMD_EMU_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)

//-----------------------------------------------------------------------
// Logical operations
// (integer forms bounce through the ps domain -- see SIMD_IFWRAPPER_2)
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(and_ps);                         // return a & b       (float treated as int)
SIMD_IFWRAPPER_2(and_si, _mm256_and_ps);        // return a & b       (int)
SIMD_WRAPPER_2(andnot_ps);                      // return (~a) & b    (float treated as int)
SIMD_IFWRAPPER_2(andnot_si, _mm256_andnot_ps);  // return (~a) & b    (int)
SIMD_WRAPPER_2(or_ps);                          // return a | b       (float treated as int)
SIMD_IFWRAPPER_2(or_si, _mm256_or_ps);          // return a | b       (int)
SIMD_WRAPPER_2(xor_ps);                         // return a ^ b       (float treated as int)
SIMD_IFWRAPPER_2(xor_si, _mm256_xor_ps);        // return a ^ b       (int)

//-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
SIMD_EMU_IWRAPPER_1I(slli_epi32); // return a << ImmT
238
239static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer const& vA,
240                                              Integer const& vCount) // return a << b      (uint32)
241{
242    int32_t aHi, aLow, countHi, countLow;
243    __m128i vAHi      = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
244    __m128i vALow     = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
245    __m128i vCountHi  = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
246    __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));
247
248    aHi     = _mm_extract_epi32(vAHi, 0);
249    countHi = _mm_extract_epi32(vCountHi, 0);
250    aHi <<= countHi;
251    vAHi = _mm_insert_epi32(vAHi, aHi, 0);
252
253    aLow     = _mm_extract_epi32(vALow, 0);
254    countLow = _mm_extract_epi32(vCountLow, 0);
255    aLow <<= countLow;
256    vALow = _mm_insert_epi32(vALow, aLow, 0);
257
258    aHi     = _mm_extract_epi32(vAHi, 1);
259    countHi = _mm_extract_epi32(vCountHi, 1);
260    aHi <<= countHi;
261    vAHi = _mm_insert_epi32(vAHi, aHi, 1);
262
263    aLow     = _mm_extract_epi32(vALow, 1);
264    countLow = _mm_extract_epi32(vCountLow, 1);
265    aLow <<= countLow;
266    vALow = _mm_insert_epi32(vALow, aLow, 1);
267
268    aHi     = _mm_extract_epi32(vAHi, 2);
269    countHi = _mm_extract_epi32(vCountHi, 2);
270    aHi <<= countHi;
271    vAHi = _mm_insert_epi32(vAHi, aHi, 2);
272
273    aLow     = _mm_extract_epi32(vALow, 2);
274    countLow = _mm_extract_epi32(vCountLow, 2);
275    aLow <<= countLow;
276    vALow = _mm_insert_epi32(vALow, aLow, 2);
277
278    aHi     = _mm_extract_epi32(vAHi, 3);
279    countHi = _mm_extract_epi32(vCountHi, 3);
280    aHi <<= countHi;
281    vAHi = _mm_insert_epi32(vAHi, aHi, 3);
282
283    aLow     = _mm_extract_epi32(vALow, 3);
284    countLow = _mm_extract_epi32(vCountLow, 3);
285    aLow <<= countLow;
286    vALow = _mm_insert_epi32(vALow, aLow, 3);
287
288    __m256i ret = _mm256_set1_epi32(0);
289    ret         = _mm256_insertf128_si256(ret, vAHi, 1);
290    ret         = _mm256_insertf128_si256(ret, vALow, 0);
291    return ret;
292}
293
SIMD_EMU_IWRAPPER_1I(srai_epi32); // return a >> ImmT   (int32)
SIMD_EMU_IWRAPPER_1I(srli_epi32); // return a >> ImmT   (uint32)
SIMD_EMU_IWRAPPER_1I(srli_si);    // return a >> (ImmT*8) (uint)

template <int ImmT> // same as srli_si, but with Float cast to int
static SIMDINLINE Float SIMDCALL srlisi_ps(Float const& a)
{
    // Byte-shift the raw bits right by ImmT*8, staying in the float domain.
    return castsi_ps(srli_si<ImmT>(castps_si(a)));
}
303
304static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer const& vA,
305                                              Integer const& vCount) // return a >> b      (uint32)
306{
307    int32_t aHi, aLow, countHi, countLow;
308    __m128i vAHi      = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
309    __m128i vALow     = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
310    __m128i vCountHi  = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
311    __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));
312
313    aHi     = _mm_extract_epi32(vAHi, 0);
314    countHi = _mm_extract_epi32(vCountHi, 0);
315    aHi >>= countHi;
316    vAHi = _mm_insert_epi32(vAHi, aHi, 0);
317
318    aLow     = _mm_extract_epi32(vALow, 0);
319    countLow = _mm_extract_epi32(vCountLow, 0);
320    aLow >>= countLow;
321    vALow = _mm_insert_epi32(vALow, aLow, 0);
322
323    aHi     = _mm_extract_epi32(vAHi, 1);
324    countHi = _mm_extract_epi32(vCountHi, 1);
325    aHi >>= countHi;
326    vAHi = _mm_insert_epi32(vAHi, aHi, 1);
327
328    aLow     = _mm_extract_epi32(vALow, 1);
329    countLow = _mm_extract_epi32(vCountLow, 1);
330    aLow >>= countLow;
331    vALow = _mm_insert_epi32(vALow, aLow, 1);
332
333    aHi     = _mm_extract_epi32(vAHi, 2);
334    countHi = _mm_extract_epi32(vCountHi, 2);
335    aHi >>= countHi;
336    vAHi = _mm_insert_epi32(vAHi, aHi, 2);
337
338    aLow     = _mm_extract_epi32(vALow, 2);
339    countLow = _mm_extract_epi32(vCountLow, 2);
340    aLow >>= countLow;
341    vALow = _mm_insert_epi32(vALow, aLow, 2);
342
343    aHi     = _mm_extract_epi32(vAHi, 3);
344    countHi = _mm_extract_epi32(vCountHi, 3);
345    aHi >>= countHi;
346    vAHi = _mm_insert_epi32(vAHi, aHi, 3);
347
348    aLow     = _mm_extract_epi32(vALow, 3);
349    countLow = _mm_extract_epi32(vCountLow, 3);
350    aLow >>= countLow;
351    vALow = _mm_insert_epi32(vALow, aLow, 3);
352
353    __m256i ret = _mm256_set1_epi32(0);
354    ret         = _mm256_insertf128_si256(ret, vAHi, 1);
355    ret         = _mm256_insertf128_si256(ret, vALow, 0);
356    return ret;
357}
358
359//-----------------------------------------------------------------------
360// Conversion operations
361//-----------------------------------------------------------------------
362static SIMDINLINE Float SIMDCALL castpd_ps(Double const& a) // return *(Float*)(&a)
363{
364    return _mm256_castpd_ps(a);
365}
366
367static SIMDINLINE Integer SIMDCALL castps_si(Float const& a) // return *(Integer*)(&a)
368{
369    return _mm256_castps_si256(a);
370}
371
372static SIMDINLINE Double SIMDCALL castsi_pd(Integer const& a) // return *(Double*)(&a)
373{
374    return _mm256_castsi256_pd(a);
375}
376
377static SIMDINLINE Double SIMDCALL castps_pd(Float const& a) // return *(Double*)(&a)
378{
379    return _mm256_castps_pd(a);
380}
381
382static SIMDINLINE Integer SIMDCALL castpd_si(Double const& a) // return *(Integer*)(&a)
383{
384    return _mm256_castpd_si256(a);
385}
386
387static SIMDINLINE Float SIMDCALL castsi_ps(Integer const& a) // return *(Float*)(&a)
388{
389    return _mm256_castsi256_ps(a);
390}
391
392static SIMDINLINE Float SIMDCALL
393                        cvtepi32_ps(Integer const& a) // return (float)a    (int32 --> float)
394{
395    return _mm256_cvtepi32_ps(a);
396}
397
398SIMD_EMU_IWRAPPER_1L(cvtepu8_epi16, 8);  // return (int16)a    (uint8 --> int16)
399SIMD_EMU_IWRAPPER_1L(cvtepu8_epi32, 4);  // return (int32)a    (uint8 --> int32)
400SIMD_EMU_IWRAPPER_1L(cvtepu16_epi32, 8); // return (int32)a    (uint16 --> int32)
401SIMD_EMU_IWRAPPER_1L(cvtepu16_epi64, 4); // return (int64)a    (uint16 --> int64)
402SIMD_EMU_IWRAPPER_1L(cvtepu32_epi64, 8); // return (int64)a    (uint32 --> int64)
403
404static SIMDINLINE Integer SIMDCALL
405                          cvtps_epi32(Float const& a) // return (int32)a    (float --> int32)
406{
407    return _mm256_cvtps_epi32(a);
408}
409
410static SIMDINLINE Integer SIMDCALL
411                          cvttps_epi32(Float const& a) // return (int32)a    (rnd_to_zero(float) --> int32)
412{
413    return _mm256_cvttps_epi32(a);
414}
415
416//-----------------------------------------------------------------------
417// Comparison operations
418//-----------------------------------------------------------------------
419template <CompareType CmpTypeT>
420static SIMDINLINE Float SIMDCALL cmp_ps(Float const& a, Float const& b) // return a (CmpTypeT) b
421{
422    return _mm256_cmp_ps(a, b, static_cast<const int>(CmpTypeT));
423}
424static SIMDINLINE Float SIMDCALL cmplt_ps(Float const& a, Float const& b)
425{
426    return cmp_ps<CompareType::LT_OQ>(a, b);
427}
428static SIMDINLINE Float SIMDCALL cmpgt_ps(Float const& a, Float const& b)
429{
430    return cmp_ps<CompareType::GT_OQ>(a, b);
431}
432static SIMDINLINE Float SIMDCALL cmpneq_ps(Float const& a, Float const& b)
433{
434    return cmp_ps<CompareType::NEQ_OQ>(a, b);
435}
436static SIMDINLINE Float SIMDCALL cmpeq_ps(Float const& a, Float const& b)
437{
438    return cmp_ps<CompareType::EQ_OQ>(a, b);
439}
440static SIMDINLINE Float SIMDCALL cmpge_ps(Float const& a, Float const& b)
441{
442    return cmp_ps<CompareType::GE_OQ>(a, b);
443}
444static SIMDINLINE Float SIMDCALL cmple_ps(Float const& a, Float const& b)
445{
446    return cmp_ps<CompareType::LE_OQ>(a, b);
447}
448
449SIMD_EMU_IWRAPPER_2(cmpeq_epi8);  // return a == b (int8)
450SIMD_EMU_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
451SIMD_EMU_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
452SIMD_EMU_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
453SIMD_EMU_IWRAPPER_2(cmpgt_epi8);  // return a > b (int8)
454SIMD_EMU_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
455SIMD_EMU_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
456SIMD_EMU_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
457SIMD_EMU_IWRAPPER_2(cmplt_epi32); // return a < b (int32)
458
459static SIMDINLINE bool SIMDCALL
460                       testz_ps(Float const& a, Float const& b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
461{
462    return 0 != _mm256_testz_ps(a, b);
463}
464
465static SIMDINLINE bool SIMDCALL
466                       testz_si(Integer const& a, Integer const& b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
467{
468    return 0 != _mm256_testz_si256(a, b);
469}
470
471//-----------------------------------------------------------------------
472// Blend / shuffle / permute operations
473//-----------------------------------------------------------------------
474SIMD_WRAPPER_2I(blend_ps);                       // return ImmT ? b : a  (float)
475SIMD_IFWRAPPER_2I(blend_epi32, _mm256_blend_ps); // return ImmT ? b : a  (int32)
476SIMD_WRAPPER_3(blendv_ps);                       // return mask ? b : a  (float)
477
478static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a,
479                                                Integer const& b,
480                                                Float const&   mask) // return mask ? b : a (int)
481{
482    return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask));
483}
484
485static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a,
486                                                Integer const& b,
487                                                Integer const& mask) // return mask ? b : a (int)
488{
489    return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask)));
490}
491
492static SIMDINLINE Float SIMDCALL
493                        broadcast_ss(float const* p) // return *p (all elements in vector get same value)
494{
495    return _mm256_broadcast_ss(p);
496}
497
498SIMD_EMU_IWRAPPER_2(packs_epi16); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
499SIMD_EMU_IWRAPPER_2(packs_epi32); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
500SIMD_EMU_IWRAPPER_2(
501    packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
502SIMD_EMU_IWRAPPER_2(
503    packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
504
505template <int ImmT>
506static SIMDINLINE Float SIMDCALL permute_ps(Float const& a)
507{
508    return _mm256_permute_ps(a, ImmT);
509}
510
511static SIMDINLINE Integer SIMDCALL permute_epi32(
512    Integer const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
513{
514    Integer result;
515
516    // Ugly slow implementation
517    uint32_t const* pA      = reinterpret_cast<uint32_t const*>(&a);
518    uint32_t const* pSwiz   = reinterpret_cast<uint32_t const*>(&swiz);
519    uint32_t*       pResult = reinterpret_cast<uint32_t*>(&result);
520
521    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
522    {
523        pResult[i] = pA[0xF & pSwiz[i]];
524    }
525
526    return result;
527}
528
529static SIMDINLINE Float SIMDCALL
530                        permute_ps(Float const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (float)
531{
532    Float result;
533
534    // Ugly slow implementation
535    float const*    pA      = reinterpret_cast<float const*>(&a);
536    uint32_t const* pSwiz   = reinterpret_cast<uint32_t const*>(&swiz);
537    float*          pResult = reinterpret_cast<float*>(&result);
538
539    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
540    {
541        pResult[i] = pA[0xF & pSwiz[i]];
542    }
543
544    return result;
545}
546
SIMD_WRAPPER_2I(permute2f128_ps);
SIMD_DWRAPPER_2I(permute2f128_pd);
SIMD_IWRAPPER_2I_(permute2f128_si, permute2f128_si256);

SIMD_EMU_IWRAPPER_1I(shuffle_epi32);

// 64-bit integer shuffle, built on shuffle_pd by reinterpreting the
// integer operands as doubles.
template <int ImmT>
static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer const& a, Integer const& b)
{
    return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
}
SIMD_EMU_IWRAPPER_2(shuffle_epi8);
SIMD_DWRAPPER_2I(shuffle_pd);
SIMD_WRAPPER_2I(shuffle_ps);
// Interleave high/low elements of a and b (integer forms emulated per half
// or via the ps-domain unpack).
SIMD_EMU_IWRAPPER_2(unpackhi_epi16);
SIMD_IFWRAPPER_2(unpackhi_epi32, _mm256_unpackhi_ps);
SIMD_EMU_IWRAPPER_2(unpackhi_epi64);
SIMD_EMU_IWRAPPER_2(unpackhi_epi8);
SIMD_DWRAPPER_2(unpackhi_pd);
SIMD_WRAPPER_2(unpackhi_ps);
SIMD_EMU_IWRAPPER_2(unpacklo_epi16);
SIMD_IFWRAPPER_2(unpacklo_epi32, _mm256_unpacklo_ps);
SIMD_EMU_IWRAPPER_2(unpacklo_epi64);
SIMD_EMU_IWRAPPER_2(unpacklo_epi8);
SIMD_DWRAPPER_2(unpacklo_pd);
SIMD_WRAPPER_2(unpacklo_ps);

//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
577template <ScaleFactor ScaleT = ScaleFactor::SF_1>
578static SIMDINLINE Float SIMDCALL
579                        i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
580{
581    uint32_t* pOffsets = (uint32_t*)&idx;
582    Float     vResult;
583    float*    pResult = (float*)&vResult;
584    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
585    {
586        uint32_t offset = pOffsets[i];
587        offset          = offset * static_cast<uint32_t>(ScaleT);
588        pResult[i]      = *(float const*)(((uint8_t const*)p + offset));
589    }
590
591    return vResult;
592}
593
// Software-gather alias; on AVX (1) the "hardware" gather above is already
// a software loop, so this simply forwards.
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
sw_i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
    return i32gather_ps<ScaleT>(p, idx);
}

static SIMDINLINE Float SIMDCALL
                        load1_ps(float const* p) // return *p    (broadcast 1 value to all elements)
{
    return broadcast_ss(p);
}

static SIMDINLINE Float SIMDCALL
                        load_ps(float const* p) // return *p    (loads SIMD width elements from memory)
{
    // p must be 32-byte aligned.
    return _mm256_load_ps(p);
}

static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p
{
    // p must be 32-byte aligned.
    return _mm256_load_si256(&p->v);
}

static SIMDINLINE Float SIMDCALL
                        loadu_ps(float const* p) // return *p    (same as load_ps but allows for unaligned mem)
{
    return _mm256_loadu_ps(p);
}

static SIMDINLINE Integer SIMDCALL
                          loadu_si(Integer const* p) // return *p    (same as load_si but allows for unaligned mem)
{
    // lddqu: unaligned load tuned for data that may cross cache lines.
    return _mm256_lddqu_si256(&p->v);
}
629
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
                        mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask)
{
    // Scalar emulation: start from 'old' and overwrite only the lanes whose
    // mask sign bit is set, iterating set bits of the movemask.
    uint32_t* pOffsets = (uint32_t*)&idx;
    Float     vResult  = old;
    float*    pResult  = (float*)&vResult;
    unsigned long index = 0;
    uint32_t  umask = movemask_ps(mask); // one bit per lane (lane's sign bit)
    // NOTE(review): _BitScanForward is the MSVC-style bit-scan intrinsic;
    // presumably shimmed for other compilers elsewhere in the project -- confirm.
    while (_BitScanForward(&index, umask))
    {
        umask &= ~(1 << index); // clear the bit we are about to process
        uint32_t offset = pOffsets[index];
        offset          = offset * static_cast<uint32_t>(ScaleT);
        pResult[index]  = *(float const*)(((uint8_t const*)p + offset));
    }

    return vResult;
}
650
// Software masked-gather alias; forwards to the scalar emulation above.
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
sw_mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask)
{
    return mask_i32gather_ps<ScaleT>(old, p, idx, mask);
}

// Store only the lanes whose mask sign bit is set; other memory untouched.
static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer const& mask, Float const& src)
{
    _mm256_maskstore_ps(p, mask, src);
}

// 32-bit byte mask built from the two 128-bit halves: low half occupies
// bits 0-15, high half bits 16-31.
static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer const& a)
{
    return SIMD128T::movemask_epi8(a.v4[0]) | (SIMD128T::movemask_epi8(a.v4[1]) << 16);
}

static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double const& a)
{
    // One bit per 64-bit lane (sign bits).
    return static_cast<uint32_t>(_mm256_movemask_pd(a));
}
static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float const& a)
{
    // One bit per 32-bit lane (sign bits).
    return static_cast<uint32_t>(_mm256_movemask_ps(a));
}

static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
{
    return _mm256_set1_epi32(i);
}

static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
{
    return _mm256_set1_epi8(i);
}

static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
{
    return _mm256_set1_ps(f);
}

static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
{
    return _mm256_setzero_ps();
}

static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
{
    return _mm256_setzero_si256();
}

static SIMDINLINE void SIMDCALL
                       store_ps(float* p, Float const& a) // *p = a   (stores all elements contiguously in memory)
{
    // p must be 32-byte aligned.
    _mm256_store_ps(p, a);
}

static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer const& a) // *p = a
{
    // p must be 32-byte aligned.
    _mm256_store_si256(&p->v, a);
}

static SIMDINLINE void SIMDCALL
                       stream_ps(float* p, Float const& a) // *p = a   (same as store_ps, but doesn't keep memory in cache)
{
    // Non-temporal store; p must be 32-byte aligned.
    _mm256_stream_ps(p, a);
}
718
719//=======================================================================
720// Legacy interface (available only in SIMD256 width)
721//=======================================================================
722
723static SIMDINLINE Float SIMDCALL broadcast_ps(SIMD128Impl::Float const* p)
724{
725    return _mm256_broadcast_ps(&p->v);
726}
727
728template <int ImmT>
729static SIMDINLINE SIMD128Impl::Double SIMDCALL extractf128_pd(Double const& a)
730{
731    return _mm256_extractf128_pd(a, ImmT);
732}
733
734template <int ImmT>
735static SIMDINLINE SIMD128Impl::Float SIMDCALL extractf128_ps(Float const& a)
736{
737    return _mm256_extractf128_ps(a, ImmT);
738}
739
740template <int ImmT>
741static SIMDINLINE SIMD128Impl::Integer SIMDCALL extractf128_si(Integer const& a)
742{
743    return _mm256_extractf128_si256(a, ImmT);
744}
745
746template <int ImmT>
747static SIMDINLINE Double SIMDCALL insertf128_pd(Double const& a, SIMD128Impl::Double const& b)
748{
749    return _mm256_insertf128_pd(a, b, ImmT);
750}
751
752template <int ImmT>
753static SIMDINLINE Float SIMDCALL insertf128_ps(Float const& a, SIMD128Impl::Float const& b)
754{
755    return _mm256_insertf128_ps(a, b, ImmT);
756}
757
758template <int ImmT>
759static SIMDINLINE Integer SIMDCALL insertf128_si(Integer const& a, SIMD128Impl::Integer const& b)
760{
761    return _mm256_insertf128_si256(a, b, ImmT);
762}
763
764#ifndef _mm256_set_m128i
765#define _mm256_set_m128i(/* SIMD128Impl::Integer */ hi, /* SIMD128Impl::Integer */ lo) \
766    _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
767#endif
768
769#ifndef _mm256_loadu2_m128i
770#define _mm256_loadu2_m128i(/* SIMD128Impl::Integer const* */ hiaddr, \
771                            /* SIMD128Impl::Integer const* */ loaddr) \
772    _mm256_set_m128i(_mm_loadu_si128(hiaddr), _mm_loadu_si128(loaddr))
773#endif
774
775static SIMDINLINE Integer SIMDCALL loadu2_si(SIMD128Impl::Integer const* phi,
776                                             SIMD128Impl::Integer const* plo)
777{
778    return _mm256_loadu2_m128i(&phi->v, &plo->v);
779}
780
781static SIMDINLINE Integer SIMDCALL
782                          set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
783{
784    return _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0);
785}
786
787static SIMDINLINE Float SIMDCALL
788                        set_ps(float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
789{
790    return _mm256_set_ps(i7, i6, i5, i4, i3, i2, i1, i0);
791}
792
793static SIMDINLINE void SIMDCALL storeu2_si(SIMD128Impl::Integer* phi,
794                                           SIMD128Impl::Integer* plo,
795                                           Integer const&        src)
796{
797    _mm256_storeu2_m128i(&phi->v, &plo->v, src);
798}
799
800static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
801{
802    Integer       vec = set1_epi32(mask);
803    const Integer bit = set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
804    vec               = and_si(vec, bit);
805    vec               = cmplt_epi32(setzero_si(), vec);
806    return castsi_ps(vec);
807}
808
// Scope cleanup: undefine every helper macro so they do not leak into files
// that include simdlib.hpp.
// Fix: SIMD_EMU_IWRAPPER_1L (defined above) was missing from this list and
// leaked out of the header; it is now undefined with its siblings.
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_2
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2I
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_3
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_2
#undef SIMD_IFWRAPPER_2
#undef SIMD_IFWRAPPER_2I
#undef SIMD_IWRAPPER_2I
#undef SIMD_IWRAPPER_2I_
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_3
#undef SIMD_EMU_IWRAPPER_1
#undef SIMD_EMU_IWRAPPER_1L
#undef SIMD_EMU_IWRAPPER_1I
#undef SIMD_EMU_IWRAPPER_2
#undef SIMD_EMU_IWRAPPER_2I
827