1 /****************************************************************************
2  * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  ****************************************************************************/
23 
24 #ifndef __SWR_INTRIN_H__
25 #define __SWR_INTRIN_H__
26 
27 #include "os.h"
28 
// Fall back to KNOB_ARCH for SIMD_ARCH when nothing has defined it before
// this point; an earlier definition is left untouched.
#ifndef SIMD_ARCH
#define SIMD_ARCH KNOB_ARCH
#endif
32 
33 #include "simdlib_types.hpp"
34 
35 typedef SIMDImpl::SIMD128Impl::Float   simd4scalar;
36 typedef SIMDImpl::SIMD128Impl::Double  simd4scalard;
37 typedef SIMDImpl::SIMD128Impl::Integer simd4scalari;
38 typedef SIMDImpl::SIMD128Impl::Vec4    simd4vector;
39 typedef SIMDImpl::SIMD128Impl::Mask    simd4mask;
40 
41 typedef SIMDImpl::SIMD256Impl::Float   simd8scalar;
42 typedef SIMDImpl::SIMD256Impl::Double  simd8scalard;
43 typedef SIMDImpl::SIMD256Impl::Integer simd8scalari;
44 typedef SIMDImpl::SIMD256Impl::Vec4    simd8vector;
45 typedef SIMDImpl::SIMD256Impl::Mask    simd8mask;
46 
47 typedef SIMDImpl::SIMD512Impl::Float   simd16scalar;
48 typedef SIMDImpl::SIMD512Impl::Double  simd16scalard;
49 typedef SIMDImpl::SIMD512Impl::Integer simd16scalari;
50 typedef SIMDImpl::SIMD512Impl::Vec4    simd16vector;
51 typedef SIMDImpl::SIMD512Impl::Mask    simd16mask;
52 
53 #if KNOB_SIMD_WIDTH == 8
54 typedef simd8scalar  simdscalar;
55 typedef simd8scalard simdscalard;
56 typedef simd8scalari simdscalari;
57 typedef simd8vector  simdvector;
58 typedef simd8mask    simdmask;
59 #else
60 #error Unsupported vector width
61 #endif
62 
63 INLINE
pdep_u32(UINT a,UINT mask)64 UINT pdep_u32(UINT a, UINT mask)
65 {
66 #if KNOB_ARCH >= KNOB_ARCH_AVX2
67     return _pdep_u32(a, mask);
68 #else
69     UINT result = 0;
70 
71     // copied from http://wm.ite.pl/articles/pdep-soft-emu.html
72     // using bsf instead of funky loop
73     unsigned long maskIndex = 0;
74     while (_BitScanForward(&maskIndex, mask))
75     {
76         // 1. isolate lowest set bit of mask
77         const UINT lowest = 1 << maskIndex;
78 
79         // 2. populate LSB from src
80         const UINT LSB = (UINT)((int)(a << 31) >> 31);
81 
82         // 3. copy bit from mask
83         result |= LSB & lowest;
84 
85         // 4. clear lowest bit
86         mask &= ~lowest;
87 
88         // 5. prepare for next iteration
89         a >>= 1;
90     }
91 
92     return result;
93 #endif
94 }
95 
96 INLINE
pext_u32(UINT a,UINT mask)97 UINT pext_u32(UINT a, UINT mask)
98 {
99 #if KNOB_ARCH >= KNOB_ARCH_AVX2
100     return _pext_u32(a, mask);
101 #else
102     UINT     result = 0;
103     unsigned long maskIndex;
104     uint32_t currentBit = 0;
105     while (_BitScanForward(&maskIndex, mask))
106     {
107         // 1. isolate lowest set bit of mask
108         const UINT lowest = 1 << maskIndex;
109 
110         // 2. copy bit from mask
111         result |= ((a & lowest) > 0) << currentBit++;
112 
113         // 3. clear lowest bit
114         mask &= ~lowest;
115     }
116     return result;
117 #endif
118 }
119 
120 #endif //__SWR_INTRIN_H__
121