1/****************************************************************************
2* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
3*
4* Permission is hereby granted, free of charge, to any person obtaining a
5* copy of this software and associated documentation files (the "Software"),
6* to deal in the Software without restriction, including without limitation
7* the rights to use, copy, modify, merge, publish, distribute, sublicense,
8* and/or sell copies of the Software, and to permit persons to whom the
9* Software is furnished to do so, subject to the following conditions:
10*
11* The above copyright notice and this permission notice (including the next
12* paragraph) shall be included in all copies or substantial portions of the
13* Software.
14*
15* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21* IN THE SOFTWARE.
22****************************************************************************/
23#if !defined(__SIMD_LIB_AVX_HPP__)
24#error Do not include this file directly, use "simdlib.hpp" instead.
25#endif
26
27//============================================================================
28// SIMD16 AVX (1) implementation
29//============================================================================
30
// Value used below as the float-element offset between the two v8[] halves
// in memory (load/store) and as the bit shift between their mask bits.
static const int TARGET_SIMD_WIDTH = 8;
// SIMD128Impl AVX implementation; used below (srli_si<8>) by the cvtepu*
// conversions that need the upper part of a v4[] element.
using SIMD128T = SIMD128Impl::AVXImpl;
33
// Generates a static unary Float function named `op`.
// The generated function evaluates SIMD256T::op on a.v8[0] and a.v8[1] and
// packs the two partial results into the returned Float.
#define SIMD_WRAPPER_1(op) \
    static SIMDINLINE Float SIMDCALL op(Float const &a) \
    { \
        const auto lo = SIMD256T::op(a.v8[0]); \
        const auto hi = SIMD256T::op(a.v8[1]); \
        return Float{lo, hi}; \
    }
43
// Generates a static binary Float function named `op`.
// The generated function evaluates SIMD256T::op on the matching v8[] halves
// of a and b and packs the two partial results into the returned Float.
#define SIMD_WRAPPER_2(op) \
    static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b) \
    { \
        const auto lo = SIMD256T::op(a.v8[0], b.v8[0]); \
        const auto hi = SIMD256T::op(a.v8[1], b.v8[1]); \
        return Float{lo, hi}; \
    }
53
// Generates a static binary Float function template `op<ImmT>`.
// The immediate is split between the halves: v8[0] receives the low 8 bits
// of ImmT, v8[1] receives the 8 bits that follow (ImmT >> TARGET_SIMD_WIDTH).
#define SIMD_WRAPPER_2I(op) \
    template<int ImmT> \
    static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b) \
    { \
        const auto lo = SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]); \
        const auto hi = SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]); \
        return Float{lo, hi}; \
    }
64
// Generates a static binary Float function template `op<ImmT>`.
// Unlike SIMD_WRAPPER_2I, the same unmodified ImmT is forwarded to
// SIMD256T::op for both v8[] halves.
#define SIMD_WRAPPER_2I_1(op) \
    template<int ImmT> \
    static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b) \
    { \
        const auto lo = SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]); \
        const auto hi = SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]); \
        return Float{lo, hi}; \
    }
75
// Generates a static three-operand Float function named `op`.
// The generated function evaluates SIMD256T::op on the matching v8[] halves
// of a, b and c and packs the two partial results into the returned Float.
#define SIMD_WRAPPER_3(op) \
    static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b, Float const &c) \
    { \
        const auto lo = SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]); \
        const auto hi = SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]); \
        return Float{lo, hi}; \
    }
85
// Generates a static unary Integer function named `op`.
// The generated function evaluates SIMD256T::op on a.v8[0] and a.v8[1] and
// packs the two partial results into the returned Integer.
#define SIMD_IWRAPPER_1(op) \
    static SIMDINLINE Integer SIMDCALL op(Integer const &a) \
    { \
        const auto lo = SIMD256T::op(a.v8[0]); \
        const auto hi = SIMD256T::op(a.v8[1]); \
        return Integer{lo, hi}; \
    }
95
// Generates a static binary Integer function named `op`.
// The generated function evaluates SIMD256T::op on the matching v8[] halves
// of a and b and packs the two partial results into the returned Integer.
#define SIMD_IWRAPPER_2(op) \
    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b) \
    { \
        const auto lo = SIMD256T::op(a.v8[0], b.v8[0]); \
        const auto hi = SIMD256T::op(a.v8[1], b.v8[1]); \
        return Integer{lo, hi}; \
    }
105
// Generates a static binary Integer function template `op<ImmT>`.
// The immediate is split between the halves: v8[0] receives the low 8 bits
// of ImmT, v8[1] receives the 8 bits that follow (ImmT >> TARGET_SIMD_WIDTH).
#define SIMD_IWRAPPER_2I(op) \
    template<int ImmT> \
    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b) \
    { \
        const auto lo = SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]); \
        const auto hi = SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]); \
        return Integer{lo, hi}; \
    }
116
// Generates a static binary Integer function template `op<ImmT>`.
// Unlike SIMD_IWRAPPER_2I, the same unmodified ImmT is forwarded to
// SIMD256T::op for both v8[] halves.
#define SIMD_IWRAPPER_2I_1(op) \
    template<int ImmT> \
    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b) \
    { \
        const auto lo = SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]); \
        const auto hi = SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]); \
        return Integer{lo, hi}; \
    }
127
// Generates a static binary Integer function template `op<ImmT>`.
// The immediate is split in 4-bit fields: v8[0] receives ImmT[3:0] and
// v8[1] receives ImmT[7:4].
#define SIMD_IWRAPPER_2I_2(op) \
    template<int ImmT> \
    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b) \
    { \
        const auto lo = SIMD256T::template op<0xF & ImmT>(a.v8[0], b.v8[0]); \
        const auto hi = SIMD256T::template op<0xF & (ImmT >> 4)>(a.v8[1], b.v8[1]); \
        return Integer{lo, hi}; \
    }
138
// Generates a static three-operand Integer function named `op`.
// The generated function evaluates SIMD256T::op on the matching v8[] halves
// of a, b and c and packs the two partial results into the returned Integer.
#define SIMD_IWRAPPER_3(op) \
    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b, Integer const &c) \
    { \
        const auto lo = SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]); \
        const auto hi = SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]); \
        return Integer{lo, hi}; \
    }
148
149//-----------------------------------------------------------------------
150// Single precision floating point arithmetic operations
151//-----------------------------------------------------------------------
// Each line instantiates a Float wrapper (see SIMD_WRAPPER_1/2/3) that
// forwards to the same-named SIMD256T operation on both v8[] halves.
SIMD_WRAPPER_2(add_ps);     // return a + b
SIMD_WRAPPER_2(div_ps);     // return a / b
SIMD_WRAPPER_3(fmadd_ps);   // return (a * b) + c
SIMD_WRAPPER_3(fmsub_ps);   // return (a * b) - c
SIMD_WRAPPER_2(max_ps);     // return (a > b) ? a : b
SIMD_WRAPPER_2(min_ps);     // return (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps);     // return a * b
SIMD_WRAPPER_1(rcp_ps);     // return 1.0f / a
SIMD_WRAPPER_1(rsqrt_ps);   // return 1.0f / sqrt(a)
SIMD_WRAPPER_2(sub_ps);     // return a - b
162
163template <RoundMode RMT>
164static SIMDINLINE Float SIMDCALL round_ps(Float const &a)
165{
166    return Float
167    {
168        SIMD256T::template round_ps<RMT>(a.v8[0]),
169        SIMD256T::template round_ps<RMT>(a.v8[1]),
170    };
171}
172
173static SIMDINLINE Float SIMDCALL ceil_ps(Float const &a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
174static SIMDINLINE Float SIMDCALL floor_ps(Float const &a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }
175
176//-----------------------------------------------------------------------
177// Integer (various width) arithmetic operations
178//-----------------------------------------------------------------------
// Each line instantiates an Integer wrapper (see SIMD_IWRAPPER_1/2) that
// forwards to the same-named SIMD256T operation on both v8[] halves.
SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
SIMD_IWRAPPER_2(add_epi8);  // return a + b (int8)
SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
// NOTE(review): if SIMD256T::mul_epi32 mirrors _mm256_mul_epi32, it multiplies
// the low int32 of each 64-bit element into a 64-bit product — confirm.
SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)

// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
SIMD_IWRAPPER_2(mullo_epi32);
SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
197
198//-----------------------------------------------------------------------
199// Logical operations
200//-----------------------------------------------------------------------
// Bitwise operations, instantiated in Float (_ps) and Integer (_si) flavours;
// each forwards to the same-named SIMD256T operation on both v8[] halves.
SIMD_WRAPPER_2(and_ps);     // return a & b       (float treated as int)
SIMD_IWRAPPER_2(and_si);    // return a & b       (int)
SIMD_WRAPPER_2(andnot_ps);  // return (~a) & b    (float treated as int)
SIMD_IWRAPPER_2(andnot_si); // return (~a) & b    (int)
SIMD_WRAPPER_2(or_ps);      // return a | b       (float treated as int)
SIMD_IWRAPPER_2(or_si);     // return a | b       (int)
SIMD_WRAPPER_2(xor_ps);     // return a ^ b       (float treated as int)
SIMD_IWRAPPER_2(xor_si);    // return a ^ b       (int)
209
210
211//-----------------------------------------------------------------------
212// Shift operations
213//-----------------------------------------------------------------------
214template<int ImmT>
215static SIMDINLINE Integer SIMDCALL slli_epi32(Integer const &a)      // return a << ImmT
216{
217    return Integer
218    {
219        SIMD256T::template slli_epi32<ImmT>(a.v8[0]),
220        SIMD256T::template slli_epi32<ImmT>(a.v8[1]),
221    };
222}
223
// Per-element variable left shift; forwards to SIMD256T::sllv_epi32 on both v8[] halves.
SIMD_IWRAPPER_2(sllv_epi32);                                // return a << b      (uint32)
225
226template<int ImmT>
227static SIMDINLINE Integer SIMDCALL srai_epi32(Integer const &a)      // return a >> ImmT   (int32)
228{
229    return Integer
230    {
231        SIMD256T::template srai_epi32<ImmT>(a.v8[0]),
232        SIMD256T::template srai_epi32<ImmT>(a.v8[1]),
233    };
234}
235
236template<int ImmT>
237static SIMDINLINE Integer SIMDCALL srli_epi32(Integer const &a)      // return a >> ImmT   (uint32)
238{
239    return Integer
240    {
241        SIMD256T::template srli_epi32<ImmT>(a.v8[0]),
242        SIMD256T::template srli_epi32<ImmT>(a.v8[1]),
243    };
244}
245
246template<int ImmT>                                          // for each 128-bit lane:
247static SIMDINLINE Integer SIMDCALL srli_si(Integer const &a)         //  return a >> (ImmT*8) (uint)
248{
249    return Integer
250    {
251        SIMD256T::template srli_si<ImmT>(a.v8[0]),
252        SIMD256T::template srli_si<ImmT>(a.v8[1]),
253    };
254}
255template<int ImmT>
256static SIMDINLINE Float SIMDCALL srlisi_ps(Float const &a)       // same as srli_si, but with Float cast to int
257{
258    return Float
259    {
260        SIMD256T::template srlisi_ps<ImmT>(a.v8[0]),
261        SIMD256T::template srlisi_ps<ImmT>(a.v8[1]),
262    };
263}
264
// Per-element variable right shift; forwards to SIMD256T::srlv_epi32 on both v8[] halves.
SIMD_IWRAPPER_2(srlv_epi32);                                // return a >> b      (uint32)
266
267//-----------------------------------------------------------------------
268// Conversion operations
269//-----------------------------------------------------------------------
270static SIMDINLINE Float SIMDCALL castpd_ps(Double const &a)              // return *(Float*)(&a)
271{
272    return Float
273    {
274        SIMD256T::castpd_ps(a.v8[0]),
275        SIMD256T::castpd_ps(a.v8[1]),
276    };
277}
278
279static SIMDINLINE Integer SIMDCALL castps_si(Float const &a)              // return *(Integer*)(&a)
280{
281    return Integer
282    {
283        SIMD256T::castps_si(a.v8[0]),
284        SIMD256T::castps_si(a.v8[1]),
285    };
286}
287
288static SIMDINLINE Double SIMDCALL castsi_pd(Integer const &a)              // return *(Double*)(&a)
289{
290    return Double
291    {
292        SIMD256T::castsi_pd(a.v8[0]),
293        SIMD256T::castsi_pd(a.v8[1]),
294    };
295}
296
297static SIMDINLINE Double SIMDCALL castps_pd(Float const &a)   // return *(Double*)(&a)
298{
299    return Double
300    {
301        SIMD256T::castps_pd(a.v8[0]),
302        SIMD256T::castps_pd(a.v8[1]),
303    };
304}
305
306static SIMDINLINE Float SIMDCALL castsi_ps(Integer const &a)              // return *(Float*)(&a)
307{
308    return Float
309    {
310        SIMD256T::castsi_ps(a.v8[0]),
311        SIMD256T::castsi_ps(a.v8[1]),
312    };
313}
314
315static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer const &a)            // return (float)a    (int32 --> float)
316{
317    return Float
318    {
319        SIMD256T::cvtepi32_ps(a.v8[0]),
320        SIMD256T::cvtepi32_ps(a.v8[1]),
321    };
322}
323
324static SIMDINLINE Integer SIMDCALL cvtepu8_epi16(SIMD256Impl::Integer const &a)          // return (int16)a    (uint8 --> int16)
325{
326    return Integer
327    {
328        SIMD256T::cvtepu8_epi16(a.v4[0]),
329        SIMD256T::cvtepu8_epi16(a.v4[1]),
330    };
331}
332
333static SIMDINLINE Integer SIMDCALL cvtepu8_epi32(SIMD256Impl::Integer const &a)          // return (int32)a    (uint8 --> int32)
334{
335    return Integer
336	{
337        SIMD256T::cvtepu8_epi32(a.v4[0]),
338        SIMD256T::cvtepu8_epi32(SIMD128T::template srli_si<8>(a.v4[0])),
339	};
340}
341
342static SIMDINLINE Integer SIMDCALL cvtepu16_epi32(SIMD256Impl::Integer const &a)         // return (int32)a    (uint16 --> int32)
343{
344    return Integer
345    {
346        SIMD256T::cvtepu16_epi32(a.v4[0]),
347        SIMD256T::cvtepu16_epi32(a.v4[1]),
348    };
349}
350
351static SIMDINLINE Integer SIMDCALL cvtepu16_epi64(SIMD256Impl::Integer const &a)         // return (int64)a    (uint16 --> int64)
352{
353    return Integer
354    {
355        SIMD256T::cvtepu16_epi64(a.v4[0]),
356        SIMD256T::cvtepu16_epi64(SIMD128T::template srli_si<8>(a.v4[0])),
357    };
358}
359
360static SIMDINLINE Integer SIMDCALL cvtepu32_epi64(SIMD256Impl::Integer const &a)         // return (int64)a    (uint32 --> int64)
361{
362    return Integer
363    {
364        SIMD256T::cvtepu32_epi64(a.v4[0]),
365        SIMD256T::cvtepu32_epi64(a.v4[1]),
366    };
367}
368
369static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float const &a)            // return (int32)a    (float --> int32)
370{
371    return Integer
372    {
373        SIMD256T::cvtps_epi32(a.v8[0]),
374        SIMD256T::cvtps_epi32(a.v8[1]),
375    };
376}
377
378static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float const &a)           // return (int32)a    (rnd_to_zero(float) --> int32)
379{
380    return Integer
381    {
382        SIMD256T::cvtps_epi32(a.v8[0]),
383        SIMD256T::cvtps_epi32(a.v8[1]),
384    };
385}
386
387//-----------------------------------------------------------------------
388// Comparison operations
389//-----------------------------------------------------------------------
390template<CompareType CmpTypeT>
391static SIMDINLINE Float SIMDCALL cmp_ps(Float const &a, Float const &b) // return a (CmpTypeT) b
392{
393    return Float
394    {
395        SIMD256T::template cmp_ps<CmpTypeT>(a.v8[0], b.v8[0]),
396        SIMD256T::template cmp_ps<CmpTypeT>(a.v8[1], b.v8[1]),
397    };
398}
399static SIMDINLINE Float SIMDCALL cmplt_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
400static SIMDINLINE Float SIMDCALL cmpgt_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
401static SIMDINLINE Float SIMDCALL cmpneq_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
402static SIMDINLINE Float SIMDCALL cmpeq_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
403static SIMDINLINE Float SIMDCALL cmpge_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
404static SIMDINLINE Float SIMDCALL cmple_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
405
406template<CompareType CmpTypeT>
407static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float const &a, Float const &b)
408{
409    return static_cast<Mask>(movemask_ps(cmp_ps<CmpTypeT>(a, b)));
410}
411
412
// Integer comparisons; each forwards to the same-named SIMD256T operation on
// both v8[] halves (see SIMD_IWRAPPER_2).
SIMD_IWRAPPER_2(cmpeq_epi8);    // return a == b (int8)
SIMD_IWRAPPER_2(cmpeq_epi16);   // return a == b (int16)
SIMD_IWRAPPER_2(cmpeq_epi32);   // return a == b (int32)
SIMD_IWRAPPER_2(cmpeq_epi64);   // return a == b (int64)
SIMD_IWRAPPER_2(cmpgt_epi8);    // return a > b (int8)
SIMD_IWRAPPER_2(cmpgt_epi16);   // return a > b (int16)
SIMD_IWRAPPER_2(cmpgt_epi32);   // return a > b (int32)
SIMD_IWRAPPER_2(cmpgt_epi64);   // return a > b (int64)
SIMD_IWRAPPER_2(cmplt_epi32);   // return a < b (int32)
422
423static SIMDINLINE bool SIMDCALL testz_ps(Float const &a, Float const &b)  // return all_lanes_zero(a & b) ? 1 : 0 (float)
424{
425    return  0 != (SIMD256T::testz_ps(a.v8[0], b.v8[0]) &
426                  SIMD256T::testz_ps(a.v8[1], b.v8[1]));
427}
428
429static SIMDINLINE int SIMDCALL testz_si(Integer const &a, Integer const &b)  // return all_lanes_zero(a & b) ? 1 : 0 (int)
430{
431    return  0 != (SIMD256T::testz_si(a.v8[0], b.v8[0]) &
432                  SIMD256T::testz_si(a.v8[1], b.v8[1]));
433}
434
435//-----------------------------------------------------------------------
436// Blend / shuffle / permute operations
437//-----------------------------------------------------------------------
// The immediate blends split ImmT between the halves (low 8 bits -> v8[0],
// next 8 bits -> v8[1]); blendv_ps takes its selection mask as a third operand.
SIMD_WRAPPER_2I(blend_ps);  // return ImmT ? b : a  (float)
SIMD_IWRAPPER_2I(blend_epi32);  // return ImmT ? b : a  (int32)
SIMD_WRAPPER_3(blendv_ps);  // return mask ? b : a  (float)
441static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const &a, Integer const &b, Float const &mask) // return mask ? b : a (int)
442{
443    return Integer
444    {
445        SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]),
446        SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]),
447    };
448}
449
450static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const &a, Integer const &b, Integer const &mask) // return mask ? b : a (int)
451{
452    return Integer
453    {
454        SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]),
455        SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]),
456    };
457}
458
459static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p)         // return *p (all elements in vector get same value)
460{
461    float f = *p;
462    return Float
463    {
464        SIMD256T::set1_ps(f),
465        SIMD256T::set1_ps(f),
466    };
467}
468
469template<int imm>
470static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float const &a)
471{
472    SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
473    return a.v8[imm];
474}
475
476template<int imm>
477static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double const &a)
478{
479    SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
480    return a.v8[imm];
481}
482
483template<int imm>
484static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer const &a)
485{
486    SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
487    return a.v8[imm];
488}
489
490template<int imm>
491static SIMDINLINE Float SIMDCALL insert_ps(Float const &a, SIMD256Impl::Float const &b)
492{
493    SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
494    Float r = a;
495    r.v8[imm] = b;
496    return r;
497}
498
499template<int imm>
500static SIMDINLINE Double SIMDCALL insert_pd(Double const &a, SIMD256Impl::Double const &b)
501{
502    SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
503    Double r = a;
504    r.v8[imm] = b;
505    return r;
506}
507
508template<int imm>
509static SIMDINLINE Integer SIMDCALL insert_si(Integer const &a, SIMD256Impl::Integer const &b)
510{
511    SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
512    Integer r = a;
513    r.v8[imm] = b;
514    return r;
515}
516
// Pack operations; each forwards to the same-named SIMD256T operation on both
// v8[] halves independently (see SIMD_IWRAPPER_2).
SIMD_IWRAPPER_2(packs_epi16);      // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
SIMD_IWRAPPER_2(packs_epi32);      // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
SIMD_IWRAPPER_2(packus_epi16);     // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
SIMD_IWRAPPER_2(packus_epi32);     // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
521
522static SIMDINLINE Integer SIMDCALL permute_epi32(Integer const &a, Integer const &swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
523{
524    return castps_si(permute_ps(castsi_ps(a), swiz));
525}
526
527static SIMDINLINE Float SIMDCALL permute_ps(Float const &a, Integer const &swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
528{
529    const auto mask = SIMD256T::set1_epi32(7);
530
531    auto lolo = SIMD256T::permute_ps(a.v8[0], SIMD256T::and_si(swiz.v8[0], mask));
532    auto lohi = SIMD256T::permute_ps(a.v8[1], SIMD256T::and_si(swiz.v8[0], mask));
533
534    auto hilo = SIMD256T::permute_ps(a.v8[0], SIMD256T::and_si(swiz.v8[1], mask));
535    auto hihi = SIMD256T::permute_ps(a.v8[1], SIMD256T::and_si(swiz.v8[1], mask));
536
537    return Float
538    {
539        SIMD256T::blendv_ps(lolo, lohi, SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[0], mask))),
540        SIMD256T::blendv_ps(hilo, hihi, SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[1], mask))),
541    };
542}
543
// All of the 512-bit permute2f128_XX intrinsics do the following:
//
//      SELECT4(src, control) {
//          CASE(control[1:0])
//              0 : tmp[127:0] := src[127:0]
//              1 : tmp[127:0] := src[255:128]
//              2 : tmp[127:0] := src[383:256]
//              3 : tmp[127:0] := src[511:384]
//          ESAC
//          RETURN tmp[127:0]
//      }
//
//      dst[127:0]   := SELECT4(a[511:0], imm8[1:0])
//      dst[255:128] := SELECT4(a[511:0], imm8[3:2])
//      dst[383:256] := SELECT4(b[511:0], imm8[5:4])
//      dst[511:384] := SELECT4(b[511:0], imm8[7:6])
//      dst[MAX:512] := 0
//
// Since the 256-bit AVX instructions use a 4-bit control field (instead
// of 2-bit for AVX512), we need to expand the control bits sent to the
// AVX instructions for emulation.
//
566template <int shuf>
567static SIMDINLINE Float SIMDCALL permute2f128_ps(Float const &a, Float const &b)
568{
569    return Float
570    {
571        SIMD256T::template permute2f128_ps<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]),
572        SIMD256T::template permute2f128_ps<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]),
573    };
574}
575
576template <int shuf>
577static SIMDINLINE Double SIMDCALL permute2f128_pd(Double const &a, Double const &b)
578{
579    return Double
580    {
581        SIMD256T::template permute2f128_pd<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]),
582        SIMD256T::template permute2f128_pd<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]),
583    };
584}
585
586template <int shuf>
587static SIMDINLINE Integer SIMDCALL permute2f128_si(Integer const &a, Integer const &b)
588{
589    return Integer
590	{
591        SIMD256T::template permute2f128_si<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]),
592        SIMD256T::template permute2f128_si<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]),
593	};
594}
595
// Shuffle / unpack operations; each forwards to the same-named SIMD256T
// operation on both v8[] halves. The *_2I_1 forms pass the same ImmT to both
// halves; the *_2I_2 form gives ImmT[3:0] to v8[0] and ImmT[7:4] to v8[1].
// NOTE(review): the *_pd entries are generated with the Float wrapper macros,
// and shuffle_pd forwards one ImmT to both halves — confirm both are intended.
SIMD_IWRAPPER_2I_1(shuffle_epi32);
SIMD_IWRAPPER_2I_2(shuffle_epi64);
SIMD_IWRAPPER_2(shuffle_epi8);
SIMD_WRAPPER_2I_1(shuffle_pd);
SIMD_WRAPPER_2I_1(shuffle_ps);
SIMD_IWRAPPER_2(unpackhi_epi16);
SIMD_IWRAPPER_2(unpackhi_epi32);
SIMD_IWRAPPER_2(unpackhi_epi64);
SIMD_IWRAPPER_2(unpackhi_epi8);
SIMD_WRAPPER_2(unpackhi_pd);
SIMD_WRAPPER_2(unpackhi_ps);
SIMD_IWRAPPER_2(unpacklo_epi16);
SIMD_IWRAPPER_2(unpacklo_epi32);
SIMD_IWRAPPER_2(unpacklo_epi64);
SIMD_IWRAPPER_2(unpacklo_epi8);
SIMD_WRAPPER_2(unpacklo_pd);
SIMD_WRAPPER_2(unpacklo_ps);
613
614//-----------------------------------------------------------------------
615// Load / store operations
616//-----------------------------------------------------------------------
617template<ScaleFactor ScaleT>
618static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer const &idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
619{
620    return Float
621    {
622        SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[0]),
623        SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[1]),
624    };
625}
626
627static SIMDINLINE Float SIMDCALL load1_ps(float const *p)  // return *p    (broadcast 1 value to all elements)
628{
629    return broadcast_ss(p);
630}
631
632static SIMDINLINE Float SIMDCALL load_ps(float const *p)   // return *p    (loads SIMD width elements from memory)
633{
634    return Float
635    {
636        SIMD256T::load_ps(p),
637        SIMD256T::load_ps(p + TARGET_SIMD_WIDTH)
638    };
639}
640
641static SIMDINLINE Integer SIMDCALL load_si(Integer const *p)  // return *p
642{
643    return Integer
644    {
645        SIMD256T::load_si(&p->v8[0]),
646        SIMD256T::load_si(&p->v8[1]),
647    };
648}
649
650static SIMDINLINE Float SIMDCALL loadu_ps(float const *p)  // return *p    (same as load_ps but allows for unaligned mem)
651{
652    return Float
653    {
654        SIMD256T::loadu_ps(p),
655        SIMD256T::loadu_ps(p + TARGET_SIMD_WIDTH)
656    };
657}
658
659static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p    (same as load_si but allows for unaligned mem)
660{
661    return Integer
662    {
663        SIMD256T::loadu_si(&p->v8[0]),
664        SIMD256T::loadu_si(&p->v8[1]),
665    };
666}
667
668// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
669template<ScaleFactor ScaleT>
670static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float const &old, float const* p, Integer const &idx, Float const &mask)
671{
672    return Float
673    {
674        SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[0], p, idx.v8[0], mask.v8[0]),
675        SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[1], p, idx.v8[1], mask.v8[1]),
676    };
677}
678
679static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer const &mask, Float const &src)
680{
681    SIMD256T::maskstore_ps(p, mask.v8[0], src.v8[0]);
682    SIMD256T::maskstore_ps(p + TARGET_SIMD_WIDTH, mask.v8[1], src.v8[1]);
683}
684
685static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer const &a)
686{
687    uint64_t mask = static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[0]));
688             mask |= static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[1])) << (TARGET_SIMD_WIDTH * 4);
689
690    return mask;
691}
692
693static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double const &a)
694{
695    uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[0]));
696             mask |= static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[1])) << (TARGET_SIMD_WIDTH / 2);
697
698    return mask;
699}
700static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float const &a)
701{
702    uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[0]));
703             mask |= static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[1])) << TARGET_SIMD_WIDTH;
704
705    return mask;
706}
707
708static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
709{
710    return Integer
711    {
712        SIMD256T::set1_epi32(i),
713        SIMD256T::set1_epi32(i)
714    };
715}
716
717static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
718{
719    return Integer
720    {
721        SIMD256T::set1_epi8(i),
722        SIMD256T::set1_epi8(i)
723    };
724}
725
726static SIMDINLINE Float SIMDCALL set1_ps(float f)  // return f (all elements are same value)
727{
728    return Float
729    {
730        SIMD256T::set1_ps(f),
731        SIMD256T::set1_ps(f)
732    };
733}
734
735static SIMDINLINE Float SIMDCALL setzero_ps()      // return 0 (float)
736{
737    return Float
738    {
739        SIMD256T::setzero_ps(),
740        SIMD256T::setzero_ps()
741    };
742}
743
744static SIMDINLINE Integer SIMDCALL setzero_si()      // return 0 (integer)
745{
746    return Integer
747    {
748        SIMD256T::setzero_si(),
749        SIMD256T::setzero_si()
750    };
751}
752
753static SIMDINLINE void SIMDCALL store_ps(float *p, Float const &a)    // *p = a   (stores all elements contiguously in memory)
754{
755    SIMD256T::store_ps(p, a.v8[0]);
756    SIMD256T::store_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
757}
758
759static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer const &a)   // *p = a
760{
761    SIMD256T::store_si(&p->v8[0], a.v8[0]);
762    SIMD256T::store_si(&p->v8[1], a.v8[1]);
763}
764
765static SIMDINLINE void SIMDCALL stream_ps(float *p, Float const &a)   // *p = a   (same as store_ps, but doesn't keep memory in cache)
766{
767    SIMD256T::stream_ps(p, a.v8[0]);
768    SIMD256T::stream_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
769}
770
771static SIMDINLINE Integer SIMDCALL set_epi32(
772    int i15, int i14, int i13, int i12, int i11, int i10, int i9, int i8,
773    int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
774{
775    return Integer
776    {
777        SIMD256T::set_epi32(
778            i7, i6, i5, i4, i3, i2, i1, i0),
779        SIMD256T::set_epi32(
780            i15, i14, i13, i12, i11, i10, i9, i8)
781    };
782}
783
784static SIMDINLINE Integer SIMDCALL set_epi32(
785    int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
786{
787    return set_epi32(
788        0, 0, 0, 0, 0, 0, 0, 0,
789        i7, i6, i5, i4, i3, i2, i1, i0);
790}
791
792static SIMDINLINE Float SIMDCALL set_ps(
793    float i15, float i14, float i13, float i12, float i11, float i10, float i9, float i8,
794    float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
795{
796    return Float
797    {
798        SIMD256T::set_ps(
799            i7, i6, i5, i4, i3, i2, i1, i0),
800        SIMD256T::set_ps(
801            i15, i14, i13, i12, i11, i10, i9, i8)
802    };
803}
804
805static SIMDINLINE Float SIMDCALL set_ps(
806    float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
807{
808    return set_ps(
809        0, 0, 0, 0, 0, 0, 0, 0,
810        i7, i6, i5, i4, i3, i2, i1, i0);
811}
812
813static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
814{
815    return Float
816    {
817        SIMD256T::vmask_ps(mask),
818        SIMD256T::vmask_ps(mask >> TARGET_SIMD_WIDTH)
819    };
820}
821
// Remove every helper macro defined at the top of this file so none of them
// leaks into code included afterwards. SIMD_IWRAPPER_2I_2 was previously
// missing from this list even though it is defined above.
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_2I_1
#undef SIMD_WRAPPER_3
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2I
#undef SIMD_IWRAPPER_2I_1
#undef SIMD_IWRAPPER_2I_2
#undef SIMD_IWRAPPER_3
832