1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 ****************************************************************************/
23 
24 #ifndef __SWR_INTRIN_H__
25 #define __SWR_INTRIN_H__
26 
27 #include "os.h"
28 
29 #define SIMD_ARCH KNOB_ARCH
30 #include "simdlib_types.hpp"
31 
32 typedef SIMDImpl::SIMD128Impl::Float                      simd4scalar;
33 typedef SIMDImpl::SIMD128Impl::Double                     simd4scalard;
34 typedef SIMDImpl::SIMD128Impl::Integer                    simd4scalari;
35 typedef SIMDImpl::SIMD128Impl::Vec4                       simd4vector;
36 typedef SIMDImpl::SIMD128Impl::Mask                       simd4mask;
37 
38 typedef SIMDImpl::SIMD256Impl::Float                      simd8scalar;
39 typedef SIMDImpl::SIMD256Impl::Double                     simd8scalard;
40 typedef SIMDImpl::SIMD256Impl::Integer                    simd8scalari;
41 typedef SIMDImpl::SIMD256Impl::Vec4                       simd8vector;
42 typedef SIMDImpl::SIMD256Impl::Mask                       simd8mask;
43 
44 typedef SIMDImpl::SIMD512Impl::Float                      simd16scalar;
45 typedef SIMDImpl::SIMD512Impl::Double                     simd16scalard;
46 typedef SIMDImpl::SIMD512Impl::Integer                    simd16scalari;
47 typedef SIMDImpl::SIMD512Impl::Vec4                       simd16vector;
48 typedef SIMDImpl::SIMD512Impl::Mask                       simd16mask;
49 
50 #if KNOB_SIMD_WIDTH == 8
51 typedef simd8scalar     simdscalar;
52 typedef simd8scalard    simdscalard;
53 typedef simd8scalari    simdscalari;
54 typedef simd8vector     simdvector;
55 typedef simd8mask       simdmask;
56 #else
57 #error Unsupported vector width
58 #endif
59 
60 INLINE
pdep_u32(UINT a,UINT mask)61 UINT pdep_u32(UINT a, UINT mask)
62 {
63 #if KNOB_ARCH >= KNOB_ARCH_AVX2
64     return _pdep_u32(a, mask);
65 #else
66     UINT result = 0;
67 
68     // copied from http://wm.ite.pl/articles/pdep-soft-emu.html
69     // using bsf instead of funky loop
70     DWORD maskIndex;
71     while (_BitScanForward(&maskIndex, mask))
72     {
73         // 1. isolate lowest set bit of mask
74         const UINT lowest = 1 << maskIndex;
75 
76         // 2. populate LSB from src
77         const UINT LSB = (UINT)((int)(a << 31) >> 31);
78 
79         // 3. copy bit from mask
80         result |= LSB & lowest;
81 
82         // 4. clear lowest bit
83         mask &= ~lowest;
84 
85         // 5. prepare for next iteration
86         a >>= 1;
87     }
88 
89     return result;
90 #endif
91 }
92 
93 INLINE
pext_u32(UINT a,UINT mask)94 UINT pext_u32(UINT a, UINT mask)
95 {
96 #if KNOB_ARCH >= KNOB_ARCH_AVX2
97     return _pext_u32(a, mask);
98 #else
99     UINT result = 0;
100     DWORD maskIndex;
101     uint32_t currentBit = 0;
102     while (_BitScanForward(&maskIndex, mask))
103     {
104         // 1. isolate lowest set bit of mask
105         const UINT lowest = 1 << maskIndex;
106 
107         // 2. copy bit from mask
108         result |= ((a & lowest) > 0) << currentBit++;
109 
110         // 3. clear lowest bit
111         mask &= ~lowest;
112     }
113     return result;
114 #endif
115 }
116 
117 #endif//__SWR_INTRIN_H__
118