1 
2 /*---------------------------------------------------------------*/
3 /*--- begin                            host_generic_simd128.c ---*/
4 /*---------------------------------------------------------------*/
5 
6 /*
7    This file is part of Valgrind, a dynamic binary instrumentation
8    framework.
9 
10    Copyright (C) 2010-2015 OpenWorks GbR
11       info@open-works.net
12 
13    This program is free software; you can redistribute it and/or
14    modify it under the terms of the GNU General Public License as
15    published by the Free Software Foundation; either version 2 of the
16    License, or (at your option) any later version.
17 
18    This program is distributed in the hope that it will be useful, but
19    WITHOUT ANY WARRANTY; without even the implied warranty of
20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21    General Public License for more details.
22 
23    You should have received a copy of the GNU General Public License
24    along with this program; if not, write to the Free Software
25    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26    02110-1301, USA.
27 
28    The GNU General Public License is contained in the file COPYING.
29 */
30 
31 /* Generic helper functions for doing 128-bit SIMD arithmetic in cases
32    where the instruction selectors cannot generate code in-line.
33    These are purely back-end entities and cannot be seen/referenced
34    from IR. */
35 
36 #include "libvex_basictypes.h"
37 #include "host_generic_simd128.h"
38 
39 
40 /* Primitive helpers always take args of the real type (signed vs
41    unsigned) but return an unsigned result, so there's no conversion
42    weirdness when stuffing results back in the V128 union fields,
43    which are all unsigned. */
44 
mul32(Int xx,Int yy)45 static inline UInt mul32 ( Int xx, Int yy )
46 {
47    Long t = ((Long)xx) * ((Long)yy);
48    return toUInt(t);
49 }
50 
max32S(Int xx,Int yy)51 static inline UInt max32S ( Int xx, Int yy )
52 {
53    return toUInt((xx > yy) ? xx : yy);
54 }
55 
min32S(Int xx,Int yy)56 static inline UInt min32S ( Int xx, Int yy )
57 {
58    return toUInt((xx < yy) ? xx : yy);
59 }
60 
/* Unsigned maximum of xx and yy, returned via toUInt. */
static inline UInt max32U ( UInt xx, UInt yy )
{
   if (xx > yy)
      return toUInt(xx);
   return toUInt(yy);
}
65 
/* Unsigned minimum of xx and yy, returned via toUInt. */
static inline UInt min32U ( UInt xx, UInt yy )
{
   if (xx < yy)
      return toUInt(xx);
   return toUInt(yy);
}
70 
max16U(UShort xx,UShort yy)71 static inline UShort max16U ( UShort xx, UShort yy )
72 {
73    return toUShort((xx > yy) ? xx : yy);
74 }
75 
min16U(UShort xx,UShort yy)76 static inline UShort min16U ( UShort xx, UShort yy )
77 {
78    return toUShort((xx < yy) ? xx : yy);
79 }
80 
max8S(Char xx,Char yy)81 static inline UChar max8S ( Char xx, Char yy )
82 {
83    return toUChar((xx > yy) ? xx : yy);
84 }
85 
min8S(Char xx,Char yy)86 static inline UChar min8S ( Char xx, Char yy )
87 {
88    return toUChar((xx < yy) ? xx : yy);
89 }
90 
cmpEQ64(Long xx,Long yy)91 static inline ULong cmpEQ64 ( Long xx, Long yy )
92 {
93    return (((Long)xx) == ((Long)yy))
94              ? 0xFFFFFFFFFFFFFFFFULL : 0ULL;
95 }
96 
cmpGT64S(Long xx,Long yy)97 static inline ULong cmpGT64S ( Long xx, Long yy )
98 {
99    return (((Long)xx) > ((Long)yy))
100              ? 0xFFFFFFFFFFFFFFFFULL : 0ULL;
101 }
102 
/* Right-shift v by n places after reinterpreting it as a signed
   Long, then hand the bits back as an unsigned value.  The caller
   is responsible for keeping n in range. */
static inline ULong sar64 ( ULong v, UInt n )
{
   Long sv = (Long)v;
   sv >>= n;
   return (ULong)sv;
}
107 
/* Right-shift v by n places after reinterpreting it as a signed
   Char, converting the result back with toUChar.  The caller is
   responsible for keeping n in range. */
static inline UChar sar8 ( UChar v, UInt n )
{
   const Char sv = (Char)v;
   return toUChar(sv >> n);
}
112 
qnarrow32Sto16U(UInt xx0)113 static inline UShort qnarrow32Sto16U ( UInt xx0 )
114 {
115    Int xx = (Int)xx0;
116    if (xx < 0)     xx = 0;
117    if (xx > 65535) xx = 65535;
118    return (UShort)xx;
119 }
120 
narrow32to16(UInt xx)121 static inline UShort narrow32to16 ( UInt xx )
122 {
123    return (UShort)xx;
124 }
125 
narrow16to8(UShort xx)126 static inline UChar narrow16to8 ( UShort xx )
127 {
128    return (UChar)xx;
129 }
130 
131 
132 void VEX_REGPARM(3)
h_generic_calc_Mul32x4(V128 * res,V128 * argL,V128 * argR)133      h_generic_calc_Mul32x4 ( /*OUT*/V128* res,
134                               V128* argL, V128* argR )
135 {
136    res->w32[0] = mul32(argL->w32[0], argR->w32[0]);
137    res->w32[1] = mul32(argL->w32[1], argR->w32[1]);
138    res->w32[2] = mul32(argL->w32[2], argR->w32[2]);
139    res->w32[3] = mul32(argL->w32[3], argR->w32[3]);
140 }
141 
142 void VEX_REGPARM(3)
h_generic_calc_Max32Sx4(V128 * res,V128 * argL,V128 * argR)143      h_generic_calc_Max32Sx4 ( /*OUT*/V128* res,
144                                V128* argL, V128* argR )
145 {
146    res->w32[0] = max32S(argL->w32[0], argR->w32[0]);
147    res->w32[1] = max32S(argL->w32[1], argR->w32[1]);
148    res->w32[2] = max32S(argL->w32[2], argR->w32[2]);
149    res->w32[3] = max32S(argL->w32[3], argR->w32[3]);
150 }
151 
152 void VEX_REGPARM(3)
h_generic_calc_Min32Sx4(V128 * res,V128 * argL,V128 * argR)153      h_generic_calc_Min32Sx4 ( /*OUT*/V128* res,
154                                V128* argL, V128* argR )
155 {
156    res->w32[0] = min32S(argL->w32[0], argR->w32[0]);
157    res->w32[1] = min32S(argL->w32[1], argR->w32[1]);
158    res->w32[2] = min32S(argL->w32[2], argR->w32[2]);
159    res->w32[3] = min32S(argL->w32[3], argR->w32[3]);
160 }
161 
162 void VEX_REGPARM(3)
h_generic_calc_Max32Ux4(V128 * res,V128 * argL,V128 * argR)163      h_generic_calc_Max32Ux4 ( /*OUT*/V128* res,
164                                V128* argL, V128* argR )
165 {
166    res->w32[0] = max32U(argL->w32[0], argR->w32[0]);
167    res->w32[1] = max32U(argL->w32[1], argR->w32[1]);
168    res->w32[2] = max32U(argL->w32[2], argR->w32[2]);
169    res->w32[3] = max32U(argL->w32[3], argR->w32[3]);
170 }
171 
172 void VEX_REGPARM(3)
h_generic_calc_Min32Ux4(V128 * res,V128 * argL,V128 * argR)173      h_generic_calc_Min32Ux4 ( /*OUT*/V128* res,
174                                V128* argL, V128* argR )
175 {
176    res->w32[0] = min32U(argL->w32[0], argR->w32[0]);
177    res->w32[1] = min32U(argL->w32[1], argR->w32[1]);
178    res->w32[2] = min32U(argL->w32[2], argR->w32[2]);
179    res->w32[3] = min32U(argL->w32[3], argR->w32[3]);
180 }
181 
182 void VEX_REGPARM(3)
h_generic_calc_Max16Ux8(V128 * res,V128 * argL,V128 * argR)183      h_generic_calc_Max16Ux8 ( /*OUT*/V128* res,
184                                V128* argL, V128* argR )
185 {
186    res->w16[0] = max16U(argL->w16[0], argR->w16[0]);
187    res->w16[1] = max16U(argL->w16[1], argR->w16[1]);
188    res->w16[2] = max16U(argL->w16[2], argR->w16[2]);
189    res->w16[3] = max16U(argL->w16[3], argR->w16[3]);
190    res->w16[4] = max16U(argL->w16[4], argR->w16[4]);
191    res->w16[5] = max16U(argL->w16[5], argR->w16[5]);
192    res->w16[6] = max16U(argL->w16[6], argR->w16[6]);
193    res->w16[7] = max16U(argL->w16[7], argR->w16[7]);
194 }
195 
196 void VEX_REGPARM(3)
h_generic_calc_Min16Ux8(V128 * res,V128 * argL,V128 * argR)197      h_generic_calc_Min16Ux8 ( /*OUT*/V128* res,
198                                V128* argL, V128* argR )
199 {
200    res->w16[0] = min16U(argL->w16[0], argR->w16[0]);
201    res->w16[1] = min16U(argL->w16[1], argR->w16[1]);
202    res->w16[2] = min16U(argL->w16[2], argR->w16[2]);
203    res->w16[3] = min16U(argL->w16[3], argR->w16[3]);
204    res->w16[4] = min16U(argL->w16[4], argR->w16[4]);
205    res->w16[5] = min16U(argL->w16[5], argR->w16[5]);
206    res->w16[6] = min16U(argL->w16[6], argR->w16[6]);
207    res->w16[7] = min16U(argL->w16[7], argR->w16[7]);
208 }
209 
210 void VEX_REGPARM(3)
h_generic_calc_Max8Sx16(V128 * res,V128 * argL,V128 * argR)211      h_generic_calc_Max8Sx16 ( /*OUT*/V128* res,
212                                V128* argL, V128* argR )
213 {
214    res->w8[ 0] = max8S(argL->w8[ 0], argR->w8[ 0]);
215    res->w8[ 1] = max8S(argL->w8[ 1], argR->w8[ 1]);
216    res->w8[ 2] = max8S(argL->w8[ 2], argR->w8[ 2]);
217    res->w8[ 3] = max8S(argL->w8[ 3], argR->w8[ 3]);
218    res->w8[ 4] = max8S(argL->w8[ 4], argR->w8[ 4]);
219    res->w8[ 5] = max8S(argL->w8[ 5], argR->w8[ 5]);
220    res->w8[ 6] = max8S(argL->w8[ 6], argR->w8[ 6]);
221    res->w8[ 7] = max8S(argL->w8[ 7], argR->w8[ 7]);
222    res->w8[ 8] = max8S(argL->w8[ 8], argR->w8[ 8]);
223    res->w8[ 9] = max8S(argL->w8[ 9], argR->w8[ 9]);
224    res->w8[10] = max8S(argL->w8[10], argR->w8[10]);
225    res->w8[11] = max8S(argL->w8[11], argR->w8[11]);
226    res->w8[12] = max8S(argL->w8[12], argR->w8[12]);
227    res->w8[13] = max8S(argL->w8[13], argR->w8[13]);
228    res->w8[14] = max8S(argL->w8[14], argR->w8[14]);
229    res->w8[15] = max8S(argL->w8[15], argR->w8[15]);
230 }
231 
232 void VEX_REGPARM(3)
h_generic_calc_Min8Sx16(V128 * res,V128 * argL,V128 * argR)233      h_generic_calc_Min8Sx16 ( /*OUT*/V128* res,
234                                V128* argL, V128* argR )
235 {
236    res->w8[ 0] = min8S(argL->w8[ 0], argR->w8[ 0]);
237    res->w8[ 1] = min8S(argL->w8[ 1], argR->w8[ 1]);
238    res->w8[ 2] = min8S(argL->w8[ 2], argR->w8[ 2]);
239    res->w8[ 3] = min8S(argL->w8[ 3], argR->w8[ 3]);
240    res->w8[ 4] = min8S(argL->w8[ 4], argR->w8[ 4]);
241    res->w8[ 5] = min8S(argL->w8[ 5], argR->w8[ 5]);
242    res->w8[ 6] = min8S(argL->w8[ 6], argR->w8[ 6]);
243    res->w8[ 7] = min8S(argL->w8[ 7], argR->w8[ 7]);
244    res->w8[ 8] = min8S(argL->w8[ 8], argR->w8[ 8]);
245    res->w8[ 9] = min8S(argL->w8[ 9], argR->w8[ 9]);
246    res->w8[10] = min8S(argL->w8[10], argR->w8[10]);
247    res->w8[11] = min8S(argL->w8[11], argR->w8[11]);
248    res->w8[12] = min8S(argL->w8[12], argR->w8[12]);
249    res->w8[13] = min8S(argL->w8[13], argR->w8[13]);
250    res->w8[14] = min8S(argL->w8[14], argR->w8[14]);
251    res->w8[15] = min8S(argL->w8[15], argR->w8[15]);
252 }
253 
254 void VEX_REGPARM(3)
h_generic_calc_CmpEQ64x2(V128 * res,V128 * argL,V128 * argR)255      h_generic_calc_CmpEQ64x2 ( /*OUT*/V128* res,
256                                 V128* argL, V128* argR )
257 {
258    res->w64[0] = cmpEQ64(argL->w64[0], argR->w64[0]);
259    res->w64[1] = cmpEQ64(argL->w64[1], argR->w64[1]);
260 }
261 
262 void VEX_REGPARM(3)
h_generic_calc_CmpGT64Sx2(V128 * res,V128 * argL,V128 * argR)263      h_generic_calc_CmpGT64Sx2 ( /*OUT*/V128* res,
264                                  V128* argL, V128* argR )
265 {
266    res->w64[0] = cmpGT64S(argL->w64[0], argR->w64[0]);
267    res->w64[1] = cmpGT64S(argL->w64[1], argR->w64[1]);
268 }
269 
270 /* ------------ Shifting ------------ */
/* These primops are undefined if the shift amount equals or exceeds
   the lane width, so the shift amount is masked to keep the scalar
   shifts always in range.  Given the semantics of these primops
   (Sar64x2, etc), it is an error if we are ever handed an
   out-of-range shift amount.
*/
277 void /*not-regparm*/
h_generic_calc_SarN64x2(V128 * res,V128 * argL,UInt nn)278      h_generic_calc_SarN64x2 ( /*OUT*/V128* res,
279                                V128* argL, UInt nn)
280 {
281    /* vassert(nn < 64); */
282    nn &= 63;
283    res->w64[0] = sar64(argL->w64[0], nn);
284    res->w64[1] = sar64(argL->w64[1], nn);
285 }
286 
287 void /*not-regparm*/
h_generic_calc_SarN8x16(V128 * res,V128 * argL,UInt nn)288      h_generic_calc_SarN8x16 ( /*OUT*/V128* res,
289                               V128* argL, UInt nn)
290 {
291    /* vassert(nn < 8); */
292    nn &= 7;
293    res->w8[ 0] = sar8(argL->w8[ 0], nn);
294    res->w8[ 1] = sar8(argL->w8[ 1], nn);
295    res->w8[ 2] = sar8(argL->w8[ 2], nn);
296    res->w8[ 3] = sar8(argL->w8[ 3], nn);
297    res->w8[ 4] = sar8(argL->w8[ 4], nn);
298    res->w8[ 5] = sar8(argL->w8[ 5], nn);
299    res->w8[ 6] = sar8(argL->w8[ 6], nn);
300    res->w8[ 7] = sar8(argL->w8[ 7], nn);
301    res->w8[ 8] = sar8(argL->w8[ 8], nn);
302    res->w8[ 9] = sar8(argL->w8[ 9], nn);
303    res->w8[10] = sar8(argL->w8[10], nn);
304    res->w8[11] = sar8(argL->w8[11], nn);
305    res->w8[12] = sar8(argL->w8[12], nn);
306    res->w8[13] = sar8(argL->w8[13], nn);
307    res->w8[14] = sar8(argL->w8[14], nn);
308    res->w8[15] = sar8(argL->w8[15], nn);
309 }
310 
311 void VEX_REGPARM(3)
h_generic_calc_QNarrowBin32Sto16Ux8(V128 * res,V128 * argL,V128 * argR)312      h_generic_calc_QNarrowBin32Sto16Ux8 ( /*OUT*/V128* res,
313                                            V128* argL, V128* argR )
314 {
315    res->w16[0] = qnarrow32Sto16U(argR->w32[0]);
316    res->w16[1] = qnarrow32Sto16U(argR->w32[1]);
317    res->w16[2] = qnarrow32Sto16U(argR->w32[2]);
318    res->w16[3] = qnarrow32Sto16U(argR->w32[3]);
319    res->w16[4] = qnarrow32Sto16U(argL->w32[0]);
320    res->w16[5] = qnarrow32Sto16U(argL->w32[1]);
321    res->w16[6] = qnarrow32Sto16U(argL->w32[2]);
322    res->w16[7] = qnarrow32Sto16U(argL->w32[3]);
323 }
324 
325 void VEX_REGPARM(3)
h_generic_calc_NarrowBin16to8x16(V128 * res,V128 * argL,V128 * argR)326      h_generic_calc_NarrowBin16to8x16 ( /*OUT*/V128* res,
327                                         V128* argL, V128* argR )
328 {
329    res->w8[ 0] = narrow16to8(argR->w16[0]);
330    res->w8[ 1] = narrow16to8(argR->w16[1]);
331    res->w8[ 2] = narrow16to8(argR->w16[2]);
332    res->w8[ 3] = narrow16to8(argR->w16[3]);
333    res->w8[ 4] = narrow16to8(argR->w16[4]);
334    res->w8[ 5] = narrow16to8(argR->w16[5]);
335    res->w8[ 6] = narrow16to8(argR->w16[6]);
336    res->w8[ 7] = narrow16to8(argR->w16[7]);
337    res->w8[ 8] = narrow16to8(argL->w16[0]);
338    res->w8[ 9] = narrow16to8(argL->w16[1]);
339    res->w8[10] = narrow16to8(argL->w16[2]);
340    res->w8[11] = narrow16to8(argL->w16[3]);
341    res->w8[12] = narrow16to8(argL->w16[4]);
342    res->w8[13] = narrow16to8(argL->w16[5]);
343    res->w8[14] = narrow16to8(argL->w16[6]);
344    res->w8[15] = narrow16to8(argL->w16[7]);
345 }
346 
347 void VEX_REGPARM(3)
h_generic_calc_NarrowBin32to16x8(V128 * res,V128 * argL,V128 * argR)348      h_generic_calc_NarrowBin32to16x8 ( /*OUT*/V128* res,
349                                         V128* argL, V128* argR )
350 {
351    res->w16[0] = narrow32to16(argR->w32[0]);
352    res->w16[1] = narrow32to16(argR->w32[1]);
353    res->w16[2] = narrow32to16(argR->w32[2]);
354    res->w16[3] = narrow32to16(argR->w32[3]);
355    res->w16[4] = narrow32to16(argL->w32[0]);
356    res->w16[5] = narrow32to16(argL->w32[1]);
357    res->w16[6] = narrow32to16(argL->w32[2]);
358    res->w16[7] = narrow32to16(argL->w32[3]);
359 }
360 
361 void VEX_REGPARM(3)
h_generic_calc_Perm32x4(V128 * res,V128 * argL,V128 * argR)362      h_generic_calc_Perm32x4 ( /*OUT*/V128* res,
363                                V128* argL, V128* argR )
364 {
365    res->w32[0] = argL->w32[ argR->w32[0] & 3 ];
366    res->w32[1] = argL->w32[ argR->w32[1] & 3 ];
367    res->w32[2] = argL->w32[ argR->w32[2] & 3 ];
368    res->w32[3] = argL->w32[ argR->w32[3] & 3 ];
369 }
370 
371 UInt /*not-regparm*/
h_generic_calc_GetMSBs8x16(ULong w64hi,ULong w64lo)372      h_generic_calc_GetMSBs8x16 ( ULong w64hi, ULong w64lo )
373 {
374    UInt r = 0;
375    if (w64hi & (1ULL << (64-1))) r |= (1<<15);
376    if (w64hi & (1ULL << (56-1))) r |= (1<<14);
377    if (w64hi & (1ULL << (48-1))) r |= (1<<13);
378    if (w64hi & (1ULL << (40-1))) r |= (1<<12);
379    if (w64hi & (1ULL << (32-1))) r |= (1<<11);
380    if (w64hi & (1ULL << (24-1))) r |= (1<<10);
381    if (w64hi & (1ULL << (16-1))) r |= (1<<9);
382    if (w64hi & (1ULL << ( 8-1))) r |= (1<<8);
383    if (w64lo & (1ULL << (64-1))) r |= (1<<7);
384    if (w64lo & (1ULL << (56-1))) r |= (1<<6);
385    if (w64lo & (1ULL << (48-1))) r |= (1<<5);
386    if (w64lo & (1ULL << (40-1))) r |= (1<<4);
387    if (w64lo & (1ULL << (32-1))) r |= (1<<3);
388    if (w64lo & (1ULL << (24-1))) r |= (1<<2);
389    if (w64lo & (1ULL << (16-1))) r |= (1<<1);
390    if (w64lo & (1ULL << ( 8-1))) r |= (1<<0);
391    return r;
392 }
393 
394 /*---------------------------------------------------------------*/
395 /*--- end                              host_generic_simd128.c ---*/
396 /*---------------------------------------------------------------*/
397