1 /*
2  * Copyright 2015 Red Hat Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * on the rights to use, copy, modify, merge, publish, distribute, sub
8  * license, and/or sell copies of the Software, and to permit persons to whom
9  * the Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21  * USE OR OTHER DEALINGS IN THE SOFTWARE.
22  *
23  * Author: Oded Gabbay <oded.gabbay@redhat.com>
24  */
25 
26 /**
27  * @file
28  * POWER8 intrinsics portability header.
29  *
30  */
31 
32 #ifndef U_PWR8_H_
33 #define U_PWR8_H_
34 
35 #if defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN)
36 
/* Attribute requesting 16-byte alignment for the declaration it is applied to. */
#define VECTOR_ALIGN_16 __attribute__ ((__aligned__ (16)))

/*
 * Generic 128-bit vector type used by every helper in this header,
 * declared as an AltiVec vector of unsigned chars.
 */
typedef VECTOR_ALIGN_16 vector unsigned char __m128i;

/*
 * Union giving several views onto the same 128-bit vector, so the helpers
 * below can move between vector types and address individual elements.
 *
 * NOTE(review): ubyte, ushort and uint are not declared in this header;
 * they are presumably the project's 8/16/32-bit unsigned typedefs --
 * confirm against the including translation unit.
 */
typedef VECTOR_ALIGN_16 union m128i {
   __m128i m128i;                /* generic byte-vector view */
   vector signed int m128si;     /* vector of signed ints */
   vector unsigned int m128ui;   /* vector of unsigned ints */
   ubyte ub[16];                 /* 16 scalar elements */
   ushort us[8];                 /* 8 scalar elements */
   int i[4];                     /* 4 signed scalar elements */
   uint ui[4];                   /* 4 unsigned scalar elements */
} __m128i_union;
50 
51 static inline __m128i
vec_set_epi32(int i3,int i2,int i1,int i0)52 vec_set_epi32 (int i3, int i2, int i1, int i0)
53 {
54    __m128i_union vdst;
55 
56 #ifdef PIPE_ARCH_LITTLE_ENDIAN
57    vdst.i[0] = i0;
58    vdst.i[1] = i1;
59    vdst.i[2] = i2;
60    vdst.i[3] = i3;
61 #else
62    vdst.i[3] = i0;
63    vdst.i[2] = i1;
64    vdst.i[1] = i2;
65    vdst.i[0] = i3;
66 #endif
67 
68    return (__m128i) vdst.m128si;
69 }
70 
71 static inline __m128i
vec_setr_epi32(int i0,int i1,int i2,int i3)72 vec_setr_epi32 (int i0, int i1, int i2, int i3)
73 {
74   return vec_set_epi32 (i3, i2, i1, i0);
75 }
76 
77 static inline __m128i
vec_unpacklo_epi32(__m128i even,__m128i odd)78 vec_unpacklo_epi32 (__m128i even, __m128i odd)
79 {
80    static const __m128i perm_mask =
81 #ifdef PIPE_ARCH_LITTLE_ENDIAN
82       { 0,  1,  2,  3, 16, 17, 18, 19,  4,  5,  6,  7, 20, 21, 22, 23};
83 #else
84       {24, 25, 26, 27,  8,  9, 10, 11, 28, 29, 30, 31, 12, 13, 14, 15};
85 #endif
86 
87    return vec_perm (even, odd, perm_mask);
88 }
89 
90 static inline __m128i
vec_unpackhi_epi32(__m128i even,__m128i odd)91 vec_unpackhi_epi32 (__m128i even, __m128i odd)
92 {
93    static const __m128i perm_mask =
94 #ifdef PIPE_ARCH_LITTLE_ENDIAN
95       { 8,  9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31};
96 #else
97       {16, 17, 18, 19,  0,  1,  2,  3, 20, 21, 22, 23,  4,  5,  6,  7};
98 #endif
99 
100    return vec_perm (even, odd, perm_mask);
101 }
102 
103 static inline __m128i
vec_unpacklo_epi64(__m128i even,__m128i odd)104 vec_unpacklo_epi64 (__m128i even, __m128i odd)
105 {
106    static const __m128i perm_mask =
107 #ifdef PIPE_ARCH_LITTLE_ENDIAN
108       { 0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23};
109 #else
110       {24, 25, 26, 27, 28, 29, 30, 31,  8,  9, 10, 11, 12, 13, 14, 15};
111 #endif
112 
113    return vec_perm (even, odd, perm_mask);
114 }
115 
116 static inline __m128i
vec_unpackhi_epi64(__m128i even,__m128i odd)117 vec_unpackhi_epi64 (__m128i even, __m128i odd)
118 {
119    static const __m128i perm_mask =
120 #ifdef PIPE_ARCH_LITTLE_ENDIAN
121       { 8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
122 #else
123       {16, 17, 18, 19, 20, 21, 22, 23,  0,  1,  2,  3,  4,  5,  6,  7};
124 #endif
125 
126    return vec_perm (even, odd, perm_mask);
127 }
128 
129 static inline __m128i
vec_add_epi32(__m128i a,__m128i b)130 vec_add_epi32 (__m128i a, __m128i b)
131 {
132    return (__m128i) vec_add ((vector signed int) a, (vector signed int) b);
133 }
134 
135 static inline __m128i
vec_sub_epi32(__m128i a,__m128i b)136 vec_sub_epi32 (__m128i a, __m128i b)
137 {
138    return (__m128i) vec_sub ((vector signed int) a, (vector signed int) b);
139 }
140 
141 /* Call this function ONLY on POWER8 and newer platforms */
142 static inline __m128i
vec_mullo_epi32(__m128i a,__m128i b)143 vec_mullo_epi32 (__m128i a, __m128i b)
144 {
145    __m128i v;
146 
147    __asm__(
148            "vmuluwm %0, %1, %2   \n"
149            : "=v" (v)
150            : "v" (a), "v" (b)
151            );
152 
153    return v;
154 }
155 
156 static inline __m128i
vec_andnot_si128(__m128i a,__m128i b)157 vec_andnot_si128 (__m128i a, __m128i b)
158 {
159    return vec_andc (b, a);
160 }
161 
162 static inline void
transpose4_epi32(const __m128i * restrict a,const __m128i * restrict b,const __m128i * restrict c,const __m128i * restrict d,__m128i * restrict o,__m128i * restrict p,__m128i * restrict q,__m128i * restrict r)163 transpose4_epi32(const __m128i * restrict a,
164                  const __m128i * restrict b,
165                  const __m128i * restrict c,
166                  const __m128i * restrict d,
167                  __m128i * restrict o,
168                  __m128i * restrict p,
169                  __m128i * restrict q,
170                  __m128i * restrict r)
171 {
172    __m128i t0 = vec_unpacklo_epi32(*a, *b);
173    __m128i t1 = vec_unpacklo_epi32(*c, *d);
174    __m128i t2 = vec_unpackhi_epi32(*a, *b);
175    __m128i t3 = vec_unpackhi_epi32(*c, *d);
176 
177    *o = vec_unpacklo_epi64(t0, t1);
178    *p = vec_unpackhi_epi64(t0, t1);
179    *q = vec_unpacklo_epi64(t2, t3);
180    *r = vec_unpackhi_epi64(t2, t3);
181 }
182 
183 static inline __m128i
vec_slli_epi32(__m128i vsrc,unsigned int count)184 vec_slli_epi32 (__m128i vsrc, unsigned int count)
185 {
186    __m128i_union vec_count;
187 
188    if (count >= 32)
189       return (__m128i) vec_splats (0);
190    else if (count == 0)
191       return vsrc;
192 
193    /* In VMX, all shift count fields must contain the same value */
194    vec_count.m128si = (vector signed int) vec_splats (count);
195    return (__m128i) vec_sl ((vector signed int) vsrc, vec_count.m128ui);
196 }
197 
198 static inline __m128i
vec_srli_epi32(__m128i vsrc,unsigned int count)199 vec_srli_epi32 (__m128i vsrc, unsigned int count)
200 {
201    __m128i_union vec_count;
202 
203    if (count >= 32)
204       return (__m128i) vec_splats (0);
205    else if (count == 0)
206       return vsrc;
207 
208    /* In VMX, all shift count fields must contain the same value */
209    vec_count.m128si = (vector signed int) vec_splats (count);
210    return (__m128i) vec_sr ((vector signed int) vsrc, vec_count.m128ui);
211 }
212 
213 static inline __m128i
vec_srai_epi32(__m128i vsrc,unsigned int count)214 vec_srai_epi32 (__m128i vsrc, unsigned int count)
215 {
216    __m128i_union vec_count;
217 
218    if (count >= 32)
219       return (__m128i) vec_splats (0);
220    else if (count == 0)
221       return vsrc;
222 
223    /* In VMX, all shift count fields must contain the same value */
224    vec_count.m128si = (vector signed int) vec_splats (count);
225    return (__m128i) vec_sra ((vector signed int) vsrc, vec_count.m128ui);
226 }
227 
228 static inline __m128i
vec_cmpeq_epi32(__m128i a,__m128i b)229 vec_cmpeq_epi32 (__m128i a, __m128i b)
230 {
231    return (__m128i) vec_cmpeq ((vector signed int) a, (vector signed int) b);
232 }
233 
234 static inline __m128i
vec_loadu_si128(const uint32_t * src)235 vec_loadu_si128 (const uint32_t* src)
236 {
237    __m128i_union vsrc;
238 
239 #ifdef PIPE_ARCH_LITTLE_ENDIAN
240 
241    vsrc.m128ui = *((vector unsigned int *) src);
242 
243 #else
244 
245    __m128i vmask, tmp1, tmp2;
246 
247    vmask = vec_lvsl(0, src);
248 
249    tmp1 = (__m128i) vec_ld (0, src);
250    tmp2 = (__m128i) vec_ld (15, src);
251    vsrc.m128ui = (vector unsigned int) vec_perm (tmp1, tmp2, vmask);
252 
253 #endif
254 
255    return vsrc.m128i;
256 }
257 
258 static inline __m128i
vec_load_si128(const uint32_t * src)259 vec_load_si128 (const uint32_t* src)
260 {
261    __m128i_union vsrc;
262 
263    vsrc.m128ui = *((vector unsigned int *) src);
264 
265    return vsrc.m128i;
266 }
267 
268 static inline void
vec_store_si128(uint32_t * dest,__m128i vdata)269 vec_store_si128 (uint32_t* dest, __m128i vdata)
270 {
271    vec_st ((vector unsigned int) vdata, 0, dest);
272 }
273 
274 /* Call this function ONLY on POWER8 and newer platforms */
275 static inline int
vec_movemask_epi8(__m128i vsrc)276 vec_movemask_epi8 (__m128i vsrc)
277 {
278    __m128i_union vtemp;
279    int result;
280 
281    vtemp.m128i = vec_vgbbd(vsrc);
282 
283 #ifdef PIPE_ARCH_LITTLE_ENDIAN
284    result = vtemp.ub[15] << 8 | vtemp.ub[7];
285 #else
286    result = vtemp.ub[0] << 8 | vtemp.ub[8];
287 #endif
288 
289    return result;
290 }
291 
292 static inline __m128i
vec_packs_epi16(__m128i a,__m128i b)293 vec_packs_epi16 (__m128i a, __m128i b)
294 {
295 #ifdef PIPE_ARCH_LITTLE_ENDIAN
296    return (__m128i) vec_packs ((vector signed short) a,
297                                (vector signed short) b);
298 #else
299    return (__m128i) vec_packs ((vector signed short) b,
300                                (vector signed short) a);
301 #endif
302 }
303 
304 static inline __m128i
vec_packs_epi32(__m128i a,__m128i b)305 vec_packs_epi32 (__m128i a, __m128i b)
306 {
307 #ifdef PIPE_ARCH_LITTLE_ENDIAN
308    return (__m128i) vec_packs ((vector signed int) a, (vector signed int) b);
309 #else
310    return (__m128i) vec_packs ((vector signed int) b, (vector signed int) a);
311 #endif
312 }
313 
314 #endif /* _ARCH_PWR8 && PIPE_ARCH_LITTLE_ENDIAN */
315 
316 #endif /* U_PWR8_H_ */
317