
/*---------------------------------------------------------------*/
/*--- begin                             host_generic_simd64.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2015 OpenWorks LLP
      info@open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

/* Generic helper functions for doing 64-bit SIMD arithmetic in cases
   where the instruction selectors cannot generate code in-line.
   These are purely back-end entities and cannot be seen/referenced
   from IR.  There are also helpers for 32-bit arithmetic in here. */

#include "libvex_basictypes.h"
#include "main_util.h"              // LIKELY, UNLIKELY
#include "host_generic_simd64.h"



/* Tuple/select functions for 32x2 vectors. */

static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
   return (((ULong)w1) << 32) | ((ULong)w0);
}

static inline UInt sel32x2_1 ( ULong w64 ) {
   return 0xFFFFFFFF & toUInt(w64 >> 32);
}
static inline UInt sel32x2_0 ( ULong w64 ) {
   return 0xFFFFFFFF & toUInt(w64);
}
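
/* A worked example of the pack/select pattern used throughout this
   file (an illustrative sketch, not compiled in; the 16x4 and 8x8
   variants below behave analogously):

      ULong v = mk32x2(0xDEADBEEF, 0xCAFEBABE);
      // v            == 0xDEADBEEFCAFEBABEULL
      // sel32x2_1(v) == 0xDEADBEEF   (high lane)
      // sel32x2_0(v) == 0xCAFEBABE   (low lane)
*/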

/* Tuple/select functions for 16x4 vectors.  gcc is pretty hopeless
   with 64-bit shifts so we give it a hand. */

static inline ULong mk16x4 ( UShort w3, UShort w2,
                             UShort w1, UShort w0 ) {
   UInt hi32 = (((UInt)w3) << 16) | ((UInt)w2);
   UInt lo32 = (((UInt)w1) << 16) | ((UInt)w0);
   return mk32x2(hi32, lo32);
}

static inline UShort sel16x4_3 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUShort(0xFFFF & (hi32 >> 16));
}
static inline UShort sel16x4_2 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUShort(0xFFFF & hi32);
}
static inline UShort sel16x4_1 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUShort(0xFFFF & (lo32 >> 16));
}
static inline UShort sel16x4_0 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUShort(0xFFFF & lo32);
}


/* Tuple/select functions for 8x8 vectors. */

static inline ULong mk8x8 ( UChar w7, UChar w6,
                            UChar w5, UChar w4,
                            UChar w3, UChar w2,
                            UChar w1, UChar w0 ) {
   UInt hi32 =   (((UInt)w7) << 24) | (((UInt)w6) << 16)
               | (((UInt)w5) << 8)  | (((UInt)w4) << 0);
   UInt lo32 =   (((UInt)w3) << 24) | (((UInt)w2) << 16)
               | (((UInt)w1) << 8)  | (((UInt)w0) << 0);
   return mk32x2(hi32, lo32);
}

static inline UChar sel8x8_7 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 24));
}
static inline UChar sel8x8_6 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 16));
}
static inline UChar sel8x8_5 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 8));
}
static inline UChar sel8x8_4 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 0));
}
static inline UChar sel8x8_3 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 24));
}
static inline UChar sel8x8_2 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 16));
}
static inline UChar sel8x8_1 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 8));
}
static inline UChar sel8x8_0 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 0));
}

static inline UChar index8x8 ( ULong w64, UChar ix ) {
   ix &= 7;
   return toUChar((w64 >> (8*ix)) & 0xFF);
}


/* Scalar helpers. */

static inline Int qadd32S ( Int xx, Int yy )
{
   Long t = ((Long)xx) + ((Long)yy);
   const Long loLim = -0x80000000LL;
   const Long hiLim =  0x7FFFFFFFLL;
   if (t < loLim) t = loLim;
   if (t > hiLim) t = hiLim;
   return (Int)t;
}

static inline Short qadd16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) + ((Int)yy);
   if (t < -32768) t = -32768;
   if (t > 32767)  t = 32767;
   return (Short)t;
}

static inline Char qadd8S ( Char xx, Char yy )
{
   Int t = ((Int)xx) + ((Int)yy);
   if (t < -128) t = -128;
   if (t > 127)  t = 127;
   return (Char)t;
}

static inline UShort qadd16U ( UShort xx, UShort yy )
{
   UInt t = ((UInt)xx) + ((UInt)yy);
   if (t > 0xFFFF) t = 0xFFFF;
   return (UShort)t;
}

static inline UChar qadd8U ( UChar xx, UChar yy )
{
   UInt t = ((UInt)xx) + ((UInt)yy);
   if (t > 0xFF) t = 0xFF;
   return (UChar)t;
}

static inline Int qsub32S ( Int xx, Int yy )
{
   Long t = ((Long)xx) - ((Long)yy);
   const Long loLim = -0x80000000LL;
   const Long hiLim =  0x7FFFFFFFLL;
   if (t < loLim) t = loLim;
   if (t > hiLim) t = hiLim;
   return (Int)t;
}

static inline Short qsub16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < -32768) t = -32768;
   if (t > 32767)  t = 32767;
   return (Short)t;
}

static inline Char qsub8S ( Char xx, Char yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < -128) t = -128;
   if (t > 127)  t = 127;
   return (Char)t;
}

static inline UShort qsub16U ( UShort xx, UShort yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < 0)      t = 0;
   if (t > 0xFFFF) t = 0xFFFF;
   return (UShort)t;
}

static inline UChar qsub8U ( UChar xx, UChar yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < 0)    t = 0;
   if (t > 0xFF) t = 0xFF;
   return (UChar)t;
}
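
/* Worked examples of the saturating scalar helpers (an illustrative
   sketch, not compiled in; results follow directly from the
   definitions above):

      qadd16S(30000, 10000)   == 32767   // 40000 clamped to 0x7FFF
      qadd16S(-30000, -10000) == -32768  // -40000 clamped to 0x8000
      qadd8U(200, 100)        == 255     // 300 clamped to 0xFF
      qsub8U(10, 20)          == 0       // unsigned never goes below 0
*/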

static inline Short mul16 ( Short xx, Short yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   return (Short)t;
}

static inline Int mul32 ( Int xx, Int yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   return (Int)t;
}

static inline Short mulhi16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   t >>=/*s*/ 16;
   return (Short)t;
}

static inline UShort mulhi16U ( UShort xx, UShort yy )
{
   UInt t = ((UInt)xx) * ((UInt)yy);
   t >>=/*u*/ 16;
   return (UShort)t;
}

static inline UInt cmpeq32 ( UInt xx, UInt yy )
{
   return xx==yy ? 0xFFFFFFFF : 0;
}

static inline UShort cmpeq16 ( UShort xx, UShort yy )
{
   return toUShort(xx==yy ? 0xFFFF : 0);
}

static inline UChar cmpeq8 ( UChar xx, UChar yy )
{
   return toUChar(xx==yy ? 0xFF : 0);
}

static inline UInt cmpgt32S ( Int xx, Int yy )
{
   return xx>yy ? 0xFFFFFFFF : 0;
}

static inline UShort cmpgt16S ( Short xx, Short yy )
{
   return toUShort(xx>yy ? 0xFFFF : 0);
}

static inline UChar cmpgt8S ( Char xx, Char yy )
{
   return toUChar(xx>yy ? 0xFF : 0);
}

static inline UInt cmpnez32 ( UInt xx )
{
   return xx==0 ? 0 : 0xFFFFFFFF;
}

static inline UShort cmpnez16 ( UShort xx )
{
   return toUShort(xx==0 ? 0 : 0xFFFF);
}

static inline UChar cmpnez8 ( UChar xx )
{
   return toUChar(xx==0 ? 0 : 0xFF);
}

static inline Short qnarrow32Sto16S ( UInt xx0 )
{
   Int xx = (Int)xx0;
   if (xx < -32768) xx = -32768;
   if (xx > 32767)  xx = 32767;
   return (Short)xx;
}

static inline Char qnarrow16Sto8S ( UShort xx0 )
{
   Short xx = (Short)xx0;
   if (xx < -128) xx = -128;
   if (xx > 127)  xx = 127;
   return (Char)xx;
}

static inline UChar qnarrow16Sto8U ( UShort xx0 )
{
   Short xx = (Short)xx0;
   if (xx < 0)   xx = 0;
   if (xx > 255) xx = 255;
   return (UChar)xx;
}

static inline UShort narrow32to16 ( UInt xx )
{
   return (UShort)xx;
}

static inline UChar narrow16to8 ( UShort xx )
{
   return (UChar)xx;
}

/* shifts: we don't care about out-of-range ones, since
   that is dealt with at a higher level. */

static inline UChar shl8 ( UChar v, UInt n )
{
   return toUChar(v << n);
}

static inline UChar sar8 ( UChar v, UInt n )
{
   return toUChar(((Char)v) >> n);
}

static inline UShort shl16 ( UShort v, UInt n )
{
   return toUShort(v << n);
}

static inline UShort shr16 ( UShort v, UInt n )
{
   return toUShort((((UShort)v) >> n));
}

static inline UShort sar16 ( UShort v, UInt n )
{
   return toUShort(((Short)v) >> n);
}

static inline UInt shl32 ( UInt v, UInt n )
{
   return v << n;
}

static inline UInt shr32 ( UInt v, UInt n )
{
   return (((UInt)v) >> n);
}

static inline UInt sar32 ( UInt v, UInt n )
{
   return ((Int)v) >> n;
}
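
/* Worked examples of the scalar shifts (an illustrative sketch, not
   compiled in).  Note that sar8/sar16/sar32 rely on the compiler
   implementing ">>" of a negative value as an arithmetic shift,
   which gcc does:

      shl8(0x81, 1)    == 0x02    // high bit shifted out
      shr16(0x8000, 4) == 0x0800  // logical: zero-fill
      sar8(0xF0, 4)    == 0xFF    // arithmetic: sign-fill
*/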

static inline UChar avg8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi + 1) >> 1;
   return (UChar)r;
}

static inline UShort avg16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi + 1) >> 1;
   return (UShort)r;
}

static inline Short max16S ( Short xx, Short yy )
{
   return toUShort((xx > yy) ? xx : yy);
}

static inline UChar max8U ( UChar xx, UChar yy )
{
   return toUChar((xx > yy) ? xx : yy);
}

static inline Short min16S ( Short xx, Short yy )
{
   return toUShort((xx < yy) ? xx : yy);
}

static inline UChar min8U ( UChar xx, UChar yy )
{
   return toUChar((xx < yy) ? xx : yy);
}

static inline UShort hadd16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi) >> 1;
   return (UShort)r;
}

static inline Short hadd16S ( Short xx, Short yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi + yyi) >> 1;
   return (Short)r;
}

static inline UShort hsub16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi - yyi) >> 1;
   return (UShort)r;
}

static inline Short hsub16S ( Short xx, Short yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi - yyi) >> 1;
   return (Short)r;
}

static inline UChar hadd8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi) >> 1;
   return (UChar)r;
}

static inline Char hadd8S ( Char xx, Char yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi + yyi) >> 1;
   return (Char)r;
}

static inline UChar hsub8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi - yyi) >> 1;
   return (UChar)r;
}

static inline Char hsub8S ( Char xx, Char yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi - yyi) >> 1;
   return (Char)r;
}

static inline UInt absdiff8U ( UChar xx, UChar yy )
{
   UInt xxu = (UChar)xx;
   UInt yyu = (UChar)yy;
   return xxu >= yyu  ? xxu - yyu  : yyu - xxu;
}
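
/* The rounding behaviour of the averaging vs halving helpers
   differs, which is easy to miss.  An illustrative sketch (not
   compiled in):

      avg8U(1, 2)    == 2    // (1+2+1) >> 1: halved sum rounded up
      hadd8U(1, 2)   == 1    // (1+2)   >> 1: halved sum truncated
      hadd8S(-1, -2) == -2   // (-3) >> 1 floors towards -infinity
*/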

/* ----------------------------------------------------- */
/* Start of the externally visible functions.  These simply
   implement the corresponding IR primops. */
/* ----------------------------------------------------- */

/* ------------ Normal addition ------------ */

ULong h_generic_calc_Add32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             sel32x2_1(xx) + sel32x2_1(yy),
             sel32x2_0(xx) + sel32x2_0(yy)
          );
}

ULong h_generic_calc_Add16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             toUShort( sel16x4_3(xx) + sel16x4_3(yy) ),
             toUShort( sel16x4_2(xx) + sel16x4_2(yy) ),
             toUShort( sel16x4_1(xx) + sel16x4_1(yy) ),
             toUShort( sel16x4_0(xx) + sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Add8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             toUChar( sel8x8_7(xx) + sel8x8_7(yy) ),
             toUChar( sel8x8_6(xx) + sel8x8_6(yy) ),
             toUChar( sel8x8_5(xx) + sel8x8_5(yy) ),
             toUChar( sel8x8_4(xx) + sel8x8_4(yy) ),
             toUChar( sel8x8_3(xx) + sel8x8_3(yy) ),
             toUChar( sel8x8_2(xx) + sel8x8_2(yy) ),
             toUChar( sel8x8_1(xx) + sel8x8_1(yy) ),
             toUChar( sel8x8_0(xx) + sel8x8_0(yy) )
          );
}

/* ------------ Saturating addition ------------ */

ULong h_generic_calc_QAdd16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qadd16S( sel16x4_3(xx), sel16x4_3(yy) ),
             qadd16S( sel16x4_2(xx), sel16x4_2(yy) ),
             qadd16S( sel16x4_1(xx), sel16x4_1(yy) ),
             qadd16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QAdd8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qadd8S( sel8x8_7(xx), sel8x8_7(yy) ),
             qadd8S( sel8x8_6(xx), sel8x8_6(yy) ),
             qadd8S( sel8x8_5(xx), sel8x8_5(yy) ),
             qadd8S( sel8x8_4(xx), sel8x8_4(yy) ),
             qadd8S( sel8x8_3(xx), sel8x8_3(yy) ),
             qadd8S( sel8x8_2(xx), sel8x8_2(yy) ),
             qadd8S( sel8x8_1(xx), sel8x8_1(yy) ),
             qadd8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_QAdd16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qadd16U( sel16x4_3(xx), sel16x4_3(yy) ),
             qadd16U( sel16x4_2(xx), sel16x4_2(yy) ),
             qadd16U( sel16x4_1(xx), sel16x4_1(yy) ),
             qadd16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QAdd8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qadd8U( sel8x8_7(xx), sel8x8_7(yy) ),
             qadd8U( sel8x8_6(xx), sel8x8_6(yy) ),
             qadd8U( sel8x8_5(xx), sel8x8_5(yy) ),
             qadd8U( sel8x8_4(xx), sel8x8_4(yy) ),
             qadd8U( sel8x8_3(xx), sel8x8_3(yy) ),
             qadd8U( sel8x8_2(xx), sel8x8_2(yy) ),
             qadd8U( sel8x8_1(xx), sel8x8_1(yy) ),
             qadd8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}
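
/* The difference between the Add and QAdd families, per lane: Add
   wraps around, QAdd clamps at the lane's limit.  An illustrative
   sketch (not compiled in):

      h_generic_calc_Add8x8  (0xFFFFFFFFFFFFFFFFULL, 0x0101010101010101ULL)
         == 0x0000000000000000ULL   // each 0xFF+0x01 wraps to 0x00
      h_generic_calc_QAdd8Ux8(0xFFFFFFFFFFFFFFFFULL, 0x0101010101010101ULL)
         == 0xFFFFFFFFFFFFFFFFULL   // each lane saturates at 0xFF
*/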

/* ------------ Normal subtraction ------------ */

ULong h_generic_calc_Sub32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             sel32x2_1(xx) - sel32x2_1(yy),
             sel32x2_0(xx) - sel32x2_0(yy)
          );
}

ULong h_generic_calc_Sub16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             toUShort( sel16x4_3(xx) - sel16x4_3(yy) ),
             toUShort( sel16x4_2(xx) - sel16x4_2(yy) ),
             toUShort( sel16x4_1(xx) - sel16x4_1(yy) ),
             toUShort( sel16x4_0(xx) - sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Sub8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             toUChar( sel8x8_7(xx) - sel8x8_7(yy) ),
             toUChar( sel8x8_6(xx) - sel8x8_6(yy) ),
             toUChar( sel8x8_5(xx) - sel8x8_5(yy) ),
             toUChar( sel8x8_4(xx) - sel8x8_4(yy) ),
             toUChar( sel8x8_3(xx) - sel8x8_3(yy) ),
             toUChar( sel8x8_2(xx) - sel8x8_2(yy) ),
             toUChar( sel8x8_1(xx) - sel8x8_1(yy) ),
             toUChar( sel8x8_0(xx) - sel8x8_0(yy) )
          );
}

/* ------------ Saturating subtraction ------------ */

ULong h_generic_calc_QSub16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qsub16S( sel16x4_3(xx), sel16x4_3(yy) ),
             qsub16S( sel16x4_2(xx), sel16x4_2(yy) ),
             qsub16S( sel16x4_1(xx), sel16x4_1(yy) ),
             qsub16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QSub8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qsub8S( sel8x8_7(xx), sel8x8_7(yy) ),
             qsub8S( sel8x8_6(xx), sel8x8_6(yy) ),
             qsub8S( sel8x8_5(xx), sel8x8_5(yy) ),
             qsub8S( sel8x8_4(xx), sel8x8_4(yy) ),
             qsub8S( sel8x8_3(xx), sel8x8_3(yy) ),
             qsub8S( sel8x8_2(xx), sel8x8_2(yy) ),
             qsub8S( sel8x8_1(xx), sel8x8_1(yy) ),
             qsub8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_QSub16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qsub16U( sel16x4_3(xx), sel16x4_3(yy) ),
             qsub16U( sel16x4_2(xx), sel16x4_2(yy) ),
             qsub16U( sel16x4_1(xx), sel16x4_1(yy) ),
             qsub16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QSub8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qsub8U( sel8x8_7(xx), sel8x8_7(yy) ),
             qsub8U( sel8x8_6(xx), sel8x8_6(yy) ),
             qsub8U( sel8x8_5(xx), sel8x8_5(yy) ),
             qsub8U( sel8x8_4(xx), sel8x8_4(yy) ),
             qsub8U( sel8x8_3(xx), sel8x8_3(yy) ),
             qsub8U( sel8x8_2(xx), sel8x8_2(yy) ),
             qsub8U( sel8x8_1(xx), sel8x8_1(yy) ),
             qsub8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ Multiplication ------------ */

ULong h_generic_calc_Mul16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mul16( sel16x4_3(xx), sel16x4_3(yy) ),
             mul16( sel16x4_2(xx), sel16x4_2(yy) ),
             mul16( sel16x4_1(xx), sel16x4_1(yy) ),
             mul16( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Mul32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             mul32( sel32x2_1(xx), sel32x2_1(yy) ),
             mul32( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_MulHi16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mulhi16S( sel16x4_3(xx), sel16x4_3(yy) ),
             mulhi16S( sel16x4_2(xx), sel16x4_2(yy) ),
             mulhi16S( sel16x4_1(xx), sel16x4_1(yy) ),
             mulhi16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_MulHi16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mulhi16U( sel16x4_3(xx), sel16x4_3(yy) ),
             mulhi16U( sel16x4_2(xx), sel16x4_2(yy) ),
             mulhi16U( sel16x4_1(xx), sel16x4_1(yy) ),
             mulhi16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

/* ------------ Comparison ------------ */

ULong h_generic_calc_CmpEQ32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             cmpeq32( sel32x2_1(xx), sel32x2_1(yy) ),
             cmpeq32( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_CmpEQ16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             cmpeq16( sel16x4_3(xx), sel16x4_3(yy) ),
             cmpeq16( sel16x4_2(xx), sel16x4_2(yy) ),
             cmpeq16( sel16x4_1(xx), sel16x4_1(yy) ),
             cmpeq16( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_CmpEQ8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             cmpeq8( sel8x8_7(xx), sel8x8_7(yy) ),
             cmpeq8( sel8x8_6(xx), sel8x8_6(yy) ),
             cmpeq8( sel8x8_5(xx), sel8x8_5(yy) ),
             cmpeq8( sel8x8_4(xx), sel8x8_4(yy) ),
             cmpeq8( sel8x8_3(xx), sel8x8_3(yy) ),
             cmpeq8( sel8x8_2(xx), sel8x8_2(yy) ),
             cmpeq8( sel8x8_1(xx), sel8x8_1(yy) ),
             cmpeq8( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_CmpGT32Sx2 ( ULong xx, ULong yy )
{
   return mk32x2(
             cmpgt32S( sel32x2_1(xx), sel32x2_1(yy) ),
             cmpgt32S( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_CmpGT16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             cmpgt16S( sel16x4_3(xx), sel16x4_3(yy) ),
             cmpgt16S( sel16x4_2(xx), sel16x4_2(yy) ),
             cmpgt16S( sel16x4_1(xx), sel16x4_1(yy) ),
             cmpgt16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_CmpGT8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             cmpgt8S( sel8x8_7(xx), sel8x8_7(yy) ),
             cmpgt8S( sel8x8_6(xx), sel8x8_6(yy) ),
             cmpgt8S( sel8x8_5(xx), sel8x8_5(yy) ),
             cmpgt8S( sel8x8_4(xx), sel8x8_4(yy) ),
             cmpgt8S( sel8x8_3(xx), sel8x8_3(yy) ),
             cmpgt8S( sel8x8_2(xx), sel8x8_2(yy) ),
             cmpgt8S( sel8x8_1(xx), sel8x8_1(yy) ),
             cmpgt8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_CmpNEZ32x2 ( ULong xx )
{
   return mk32x2(
             cmpnez32( sel32x2_1(xx) ),
             cmpnez32( sel32x2_0(xx) )
          );
}

ULong h_generic_calc_CmpNEZ16x4 ( ULong xx )
{
   return mk16x4(
             cmpnez16( sel16x4_3(xx) ),
             cmpnez16( sel16x4_2(xx) ),
             cmpnez16( sel16x4_1(xx) ),
             cmpnez16( sel16x4_0(xx) )
          );
}

ULong h_generic_calc_CmpNEZ8x8 ( ULong xx )
{
   return mk8x8(
             cmpnez8( sel8x8_7(xx) ),
             cmpnez8( sel8x8_6(xx) ),
             cmpnez8( sel8x8_5(xx) ),
             cmpnez8( sel8x8_4(xx) ),
             cmpnez8( sel8x8_3(xx) ),
             cmpnez8( sel8x8_2(xx) ),
             cmpnez8( sel8x8_1(xx) ),
             cmpnez8( sel8x8_0(xx) )
          );
}

/* ------------ Saturating narrowing ------------ */

ULong h_generic_calc_QNarrowBin32Sto16Sx4 ( ULong aa, ULong bb )
{
   UInt d = sel32x2_1(aa);
   UInt c = sel32x2_0(aa);
   UInt b = sel32x2_1(bb);
   UInt a = sel32x2_0(bb);
   return mk16x4(
             qnarrow32Sto16S(d),
             qnarrow32Sto16S(c),
             qnarrow32Sto16S(b),
             qnarrow32Sto16S(a)
          );
}

ULong h_generic_calc_QNarrowBin16Sto8Sx8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             qnarrow16Sto8S(h),
             qnarrow16Sto8S(g),
             qnarrow16Sto8S(f),
             qnarrow16Sto8S(e),
             qnarrow16Sto8S(d),
             qnarrow16Sto8S(c),
             qnarrow16Sto8S(b),
             qnarrow16Sto8S(a)
          );
}

ULong h_generic_calc_QNarrowBin16Sto8Ux8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             qnarrow16Sto8U(h),
             qnarrow16Sto8U(g),
             qnarrow16Sto8U(f),
             qnarrow16Sto8U(e),
             qnarrow16Sto8U(d),
             qnarrow16Sto8U(c),
             qnarrow16Sto8U(b),
             qnarrow16Sto8U(a)
          );
}
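
/* Lane ordering for the binary narrowing ops: aa supplies the high
   half of the result and bb the low half.  An illustrative sketch
   (not compiled in):

      h_generic_calc_QNarrowBin32Sto16Sx4(0x00040000FFFF0000ULL,
                                          0x00000005FFFFFFFFULL)
         == 0x7FFF80000005FFFFULL
      // 0x00040000 saturates to 0x7FFF, 0xFFFF0000 (-65536) to
      // 0x8000; 0x00000005 -> 0x0005, 0xFFFFFFFF (-1) -> 0xFFFF
*/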

/* ------------ Truncating narrowing ------------ */

ULong h_generic_calc_NarrowBin32to16x4 ( ULong aa, ULong bb )
{
   UInt d = sel32x2_1(aa);
   UInt c = sel32x2_0(aa);
   UInt b = sel32x2_1(bb);
   UInt a = sel32x2_0(bb);
   return mk16x4(
             narrow32to16(d),
             narrow32to16(c),
             narrow32to16(b),
             narrow32to16(a)
          );
}

ULong h_generic_calc_NarrowBin16to8x8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             narrow16to8(h),
             narrow16to8(g),
             narrow16to8(f),
             narrow16to8(e),
             narrow16to8(d),
             narrow16to8(c),
             narrow16to8(b),
             narrow16to8(a)
          );
}

/* ------------ Interleaving ------------ */

ULong h_generic_calc_InterleaveHI8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             sel8x8_7(aa),
             sel8x8_7(bb),
             sel8x8_6(aa),
             sel8x8_6(bb),
             sel8x8_5(aa),
             sel8x8_5(bb),
             sel8x8_4(aa),
             sel8x8_4(bb)
          );
}

ULong h_generic_calc_InterleaveLO8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             sel8x8_3(aa),
             sel8x8_3(bb),
             sel8x8_2(aa),
             sel8x8_2(bb),
             sel8x8_1(aa),
             sel8x8_1(bb),
             sel8x8_0(aa),
             sel8x8_0(bb)
          );
}

ULong h_generic_calc_InterleaveHI16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_3(aa),
             sel16x4_3(bb),
             sel16x4_2(aa),
             sel16x4_2(bb)
          );
}

ULong h_generic_calc_InterleaveLO16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_1(aa),
             sel16x4_1(bb),
             sel16x4_0(aa),
             sel16x4_0(bb)
          );
}

ULong h_generic_calc_InterleaveHI32x2 ( ULong aa, ULong bb )
{
   return mk32x2(
             sel32x2_1(aa),
             sel32x2_1(bb)
          );
}

ULong h_generic_calc_InterleaveLO32x2 ( ULong aa, ULong bb )
{
   return mk32x2(
             sel32x2_0(aa),
             sel32x2_0(bb)
          );
}
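
/* Interleave semantics, shown on bytes (an illustrative sketch, not
   compiled in).  HI merges the top halves of aa and bb, alternating,
   with aa's lanes taking the more significant positions; LO does the
   same with the bottom halves:

      h_generic_calc_InterleaveHI8x8(0x7766554433221100ULL,
                                     0xFFEEDDCCBBAA9988ULL)
         == 0x77FF66EE55DD44CCULL
      h_generic_calc_InterleaveLO8x8(0x7766554433221100ULL,
                                     0xFFEEDDCCBBAA9988ULL)
         == 0x33BB22AA11990088ULL
*/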

/* ------------ Concatenation ------------ */

ULong h_generic_calc_CatOddLanes16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_3(aa),
             sel16x4_1(aa),
             sel16x4_3(bb),
             sel16x4_1(bb)
          );
}

ULong h_generic_calc_CatEvenLanes16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_2(aa),
             sel16x4_0(aa),
             sel16x4_2(bb),
             sel16x4_0(bb)
          );
}

/* misc hack looking for a proper home */
ULong h_generic_calc_Perm8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             index8x8(aa, sel8x8_7(bb)),
             index8x8(aa, sel8x8_6(bb)),
             index8x8(aa, sel8x8_5(bb)),
             index8x8(aa, sel8x8_4(bb)),
             index8x8(aa, sel8x8_3(bb)),
             index8x8(aa, sel8x8_2(bb)),
             index8x8(aa, sel8x8_1(bb)),
             index8x8(aa, sel8x8_0(bb))
          );
}
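
/* Perm8x8 treats aa as an 8-entry byte table and the low 3 bits of
   each byte of bb as indices into it.  For instance, an index vector
   of 0x0001020304050607 reverses aa's bytes (an illustrative sketch,
   not compiled in):

      h_generic_calc_Perm8x8(0x7766554433221100ULL,
                             0x0001020304050607ULL)
         == 0x0011223344556677ULL
*/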

/* ------------ Shifting ------------ */
/* Note that because these primops are undefined if the shift amount
   equals or exceeds the lane width, the shift amount is masked so
   that the scalar shifts are always in range.  In fact, given the
   semantics of these primops (ShlN16x4, etc) it is an error if in
   fact we are ever given an out-of-range shift amount.
*/
ULong h_generic_calc_ShlN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             shl32( sel32x2_1(xx), nn ),
             shl32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_ShlN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             shl16( sel16x4_3(xx), nn ),
             shl16( sel16x4_2(xx), nn ),
             shl16( sel16x4_1(xx), nn ),
             shl16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_ShlN8x8  ( ULong xx, UInt nn )
{
   /* vassert(nn < 8); */
   nn &= 7;
   return mk8x8(
             shl8( sel8x8_7(xx), nn ),
             shl8( sel8x8_6(xx), nn ),
             shl8( sel8x8_5(xx), nn ),
             shl8( sel8x8_4(xx), nn ),
             shl8( sel8x8_3(xx), nn ),
             shl8( sel8x8_2(xx), nn ),
             shl8( sel8x8_1(xx), nn ),
             shl8( sel8x8_0(xx), nn )
          );
}

ULong h_generic_calc_ShrN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             shr32( sel32x2_1(xx), nn ),
             shr32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_ShrN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             shr16( sel16x4_3(xx), nn ),
             shr16( sel16x4_2(xx), nn ),
             shr16( sel16x4_1(xx), nn ),
             shr16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_SarN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             sar32( sel32x2_1(xx), nn ),
             sar32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_SarN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             sar16( sel16x4_3(xx), nn ),
             sar16( sel16x4_2(xx), nn ),
             sar16( sel16x4_1(xx), nn ),
             sar16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_SarN8x8 ( ULong xx, UInt nn )
{
   /* vassert(nn < 8); */
   nn &= 7;
   return mk8x8(
             sar8( sel8x8_7(xx), nn ),
             sar8( sel8x8_6(xx), nn ),
             sar8( sel8x8_5(xx), nn ),
             sar8( sel8x8_4(xx), nn ),
             sar8( sel8x8_3(xx), nn ),
             sar8( sel8x8_2(xx), nn ),
             sar8( sel8x8_1(xx), nn ),
             sar8( sel8x8_0(xx), nn )
          );
}

/* ------------ Averaging ------------ */

ULong h_generic_calc_Avg8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             avg8U( sel8x8_7(xx), sel8x8_7(yy) ),
             avg8U( sel8x8_6(xx), sel8x8_6(yy) ),
             avg8U( sel8x8_5(xx), sel8x8_5(yy) ),
             avg8U( sel8x8_4(xx), sel8x8_4(yy) ),
             avg8U( sel8x8_3(xx), sel8x8_3(yy) ),
             avg8U( sel8x8_2(xx), sel8x8_2(yy) ),
             avg8U( sel8x8_1(xx), sel8x8_1(yy) ),
             avg8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_Avg16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             avg16U( sel16x4_3(xx), sel16x4_3(yy) ),
             avg16U( sel16x4_2(xx), sel16x4_2(yy) ),
             avg16U( sel16x4_1(xx), sel16x4_1(yy) ),
             avg16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

/* ------------ max/min ------------ */

ULong h_generic_calc_Max16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             max16S( sel16x4_3(xx), sel16x4_3(yy) ),
             max16S( sel16x4_2(xx), sel16x4_2(yy) ),
             max16S( sel16x4_1(xx), sel16x4_1(yy) ),
             max16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Max8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             max8U( sel8x8_7(xx), sel8x8_7(yy) ),
             max8U( sel8x8_6(xx), sel8x8_6(yy) ),
             max8U( sel8x8_5(xx), sel8x8_5(yy) ),
             max8U( sel8x8_4(xx), sel8x8_4(yy) ),
             max8U( sel8x8_3(xx), sel8x8_3(yy) ),
             max8U( sel8x8_2(xx), sel8x8_2(yy) ),
             max8U( sel8x8_1(xx), sel8x8_1(yy) ),
             max8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_Min16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             min16S( sel16x4_3(xx), sel16x4_3(yy) ),
             min16S( sel16x4_2(xx), sel16x4_2(yy) ),
             min16S( sel16x4_1(xx), sel16x4_1(yy) ),
             min16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Min8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             min8U( sel8x8_7(xx), sel8x8_7(yy) ),
             min8U( sel8x8_6(xx), sel8x8_6(yy) ),
             min8U( sel8x8_5(xx), sel8x8_5(yy) ),
             min8U( sel8x8_4(xx), sel8x8_4(yy) ),
             min8U( sel8x8_3(xx), sel8x8_3(yy) ),
             min8U( sel8x8_2(xx), sel8x8_2(yy) ),
             min8U( sel8x8_1(xx), sel8x8_1(yy) ),
             min8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

UInt h_generic_calc_GetMSBs8x8 ( ULong xx )
{
   UInt r = 0;
   if (xx & (1ULL << (64-1))) r |= (1<<7);
   if (xx & (1ULL << (56-1))) r |= (1<<6);
   if (xx & (1ULL << (48-1))) r |= (1<<5);
   if (xx & (1ULL << (40-1))) r |= (1<<4);
   if (xx & (1ULL << (32-1))) r |= (1<<3);
   if (xx & (1ULL << (24-1))) r |= (1<<2);
   if (xx & (1ULL << (16-1))) r |= (1<<1);
   if (xx & (1ULL << ( 8-1))) r |= (1<<0);
   return r;
}
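
/* GetMSBs8x8 packs the top bit of each byte into an 8-bit result,
   with byte 7's MSB landing in bit 7 (cf. x86 PMOVMSKB).  An
   illustrative sketch (not compiled in):

      h_generic_calc_GetMSBs8x8(0x80000080FF000000ULL) == 0x98
      // bytes 7, 4 and 3 have their MSB set, so bits 7, 4 and 3
      // of the result are set: 0b10011000
*/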

/* ------------ SOME 32-bit SIMD HELPERS TOO ------------ */

/* Tuple/select functions for 16x2 vectors. */
static inline UInt mk16x2 ( UShort w1, UShort w2 ) {
   return (((UInt)w1) << 16) | ((UInt)w2);
}

static inline UShort sel16x2_1 ( UInt w32 ) {
   return 0xFFFF & (UShort)(w32 >> 16);
}
static inline UShort sel16x2_0 ( UInt w32 ) {
   return 0xFFFF & (UShort)(w32);
}

static inline UInt mk8x4 ( UChar w3, UChar w2,
                           UChar w1, UChar w0 ) {
   UInt w32 =   (((UInt)w3) << 24) | (((UInt)w2) << 16)
              | (((UInt)w1) << 8)  | (((UInt)w0) << 0);
   return w32;
}

static inline UChar sel8x4_3 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 24));
}
static inline UChar sel8x4_2 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 16));
}
static inline UChar sel8x4_1 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 8));
}
static inline UChar sel8x4_0 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 0));
}


/* ----------------------------------------------------- */
/* More externally visible functions.  These simply
   implement the corresponding IR primops. */
/* ----------------------------------------------------- */

/* ------ 16x2 ------ */

UInt h_generic_calc_Add16x2 ( UInt xx, UInt yy )
{
   return mk16x2( sel16x2_1(xx) + sel16x2_1(yy),
                  sel16x2_0(xx) + sel16x2_0(yy) );
}

UInt h_generic_calc_Sub16x2 ( UInt xx, UInt yy )
{
   return mk16x2( sel16x2_1(xx) - sel16x2_1(yy),
                  sel16x2_0(xx) - sel16x2_0(yy) );
}

UInt h_generic_calc_HAdd16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( hadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  hadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HAdd16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( hadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  hadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HSub16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( hsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  hsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HSub16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( hsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  hsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QAdd16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( qadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  qadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QAdd16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( qadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  qadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QSub16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( qsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  qsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QSub16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( qsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  qsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

/* ------ 8x4 ------ */

UInt h_generic_calc_Add8x4 ( UInt xx, UInt yy )
{
   return mk8x4(
             sel8x4_3(xx) + sel8x4_3(yy),
             sel8x4_2(xx) + sel8x4_2(yy),
             sel8x4_1(xx) + sel8x4_1(yy),
             sel8x4_0(xx) + sel8x4_0(yy)
          );
}

UInt h_generic_calc_Sub8x4 ( UInt xx, UInt yy )
{
   return mk8x4(
             sel8x4_3(xx) - sel8x4_3(yy),
             sel8x4_2(xx) - sel8x4_2(yy),
             sel8x4_1(xx) - sel8x4_1(yy),
             sel8x4_0(xx) - sel8x4_0(yy)
          );
}

UInt h_generic_calc_HAdd8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
             hadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
             hadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
             hadd8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HAdd8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
             hadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
             hadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
             hadd8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HSub8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
             hsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
             hsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
             hsub8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HSub8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
             hsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
             hsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
             hsub8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QAdd8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
             qadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
             qadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
             qadd8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QAdd8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
             qadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
             qadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
             qadd8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QSub8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
             qsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
             qsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
             qsub8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QSub8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
             qsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
             qsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
             qsub8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_CmpNEZ16x2 ( UInt xx )
{
   return mk16x2(
             cmpnez16( sel16x2_1(xx) ),
             cmpnez16( sel16x2_0(xx) )
          );
}

UInt h_generic_calc_CmpNEZ8x4 ( UInt xx )
{
   return mk8x4(
             cmpnez8( sel8x4_3(xx) ),
             cmpnez8( sel8x4_2(xx) ),
             cmpnez8( sel8x4_1(xx) ),
             cmpnez8( sel8x4_0(xx) )
          );
}

UInt h_generic_calc_Sad8Ux4 ( UInt xx, UInt yy )
{
   return absdiff8U( sel8x4_3(xx), sel8x4_3(yy) )
          + absdiff8U( sel8x4_2(xx), sel8x4_2(yy) )
          + absdiff8U( sel8x4_1(xx), sel8x4_1(yy) )
          + absdiff8U( sel8x4_0(xx), sel8x4_0(yy) );
}

UInt h_generic_calc_QAdd32S ( UInt xx, UInt yy )
{
   return qadd32S( xx, yy );
}

UInt h_generic_calc_QSub32S ( UInt xx, UInt yy )
{
   return qsub32S( xx, yy );
}


/*------------------------------------------------------------------*/
/* Decimal Floating Point (DFP) externally visible helper functions */
/* that implement Iop_BCDtoDPB and Iop_DPBtoBCD                     */
/*------------------------------------------------------------------*/

#define NOT( x )    ( ( ( x ) == 0) ? 1 : 0)
#define GET( x, y ) ( ( ( x ) & ( 0x1UL << ( y ) ) ) >> ( y ) )
#define PUT( x, y ) ( ( x )<< ( y ) )

static ULong dpb_to_bcd( ULong chunk )
{
   Short a, b, c, d, e, f, g, h, i, j, k, m;
   Short p, q, r, s, t, u, v, w, x, y;
   ULong value;

   /* convert 10 bit Densely Packed Decimal (DPD) to BCD */
   p = GET( chunk, 9 );
   q = GET( chunk, 8 );
   r = GET( chunk, 7 );
   s = GET( chunk, 6 );
   t = GET( chunk, 5 );
   u = GET( chunk, 4 );
   v = GET( chunk, 3 );
   w = GET( chunk, 2 );
   x = GET( chunk, 1 );
   y = GET( chunk, 0 );

   /* The BCD bit values are given by the following boolean equations. */
   a = ( NOT(s) & v & w ) | ( t & v & w & s ) | ( v & w & NOT(x) );
   b = ( p & s & x & NOT(t) ) | ( p & NOT(w) ) | ( p & NOT(v) );
   c = ( q & s & x & NOT(t) ) | ( q & NOT(w) ) | ( q & NOT(v) );
   d = r;
   e = ( v & NOT(w) & x ) | ( s & v & w & x ) | ( NOT(t) & v & x & w );
   f = ( p & t & v & w & x & NOT(s) ) | ( s & NOT(x) & v ) | ( s & NOT(v) );
   g = ( q & t & w & v & x & NOT(s) ) | ( t & NOT(x) & v ) | ( t & NOT(v) );
   h = u;
   i = ( t & v & w & x ) | ( s & v & w & x ) | ( v & NOT(w) & NOT(x) );
   j = ( p & NOT(s) & NOT(t) & w & v ) | ( s & v & NOT(w) & x )
            | ( p & w & NOT(x) & v ) | ( w & NOT(v) );
   k = ( q & NOT(s) & NOT(t) & v & w ) | ( t & v & NOT(w) & x )
            | ( q & v & w & NOT(x) ) | ( x & NOT(v) );
   m = y;

   value = PUT(a, 11) | PUT(b, 10) | PUT(c, 9) | PUT(d, 8) | PUT(e, 7)
            | PUT(f, 6) | PUT(g, 5) | PUT(h, 4) | PUT(i, 3) | PUT(j, 2)
            | PUT(k, 1) | PUT(m, 0);
   return value;
}

static ULong bcd_to_dpb( ULong chunk )
{
   Short a, b, c, d, e, f, g, h, i, j, k, m;
   Short p, q, r, s, t, u, v, w, x, y;
   ULong value;
   /* Convert a 3 digit BCD value to a 10 bit Densely Packed Decimal
      (DPD) value.  The boolean equations for each of the DPD bits are
      given in Appendix B of Book 1: Power ISA User Instruction Set.
      The bits of the DPD number are [abcdefghijkm].  The bits of the
      BCD value are [pqrstuvwxy].  The boolean logic equations in
      pseudo C code are:
    */
   a = GET( chunk, 11 );
   b = GET( chunk, 10 );
   c = GET( chunk, 9 );
   d = GET( chunk, 8 );
   e = GET( chunk, 7 );
   f = GET( chunk, 6 );
   g = GET( chunk, 5 );
   h = GET( chunk, 4 );
   i = GET( chunk, 3 );
   j = GET( chunk, 2 );
   k = GET( chunk, 1 );
   m = GET( chunk, 0 );

   p = ( f & a & i & NOT(e) ) | ( j & a & NOT(i) ) | ( b & NOT(a) );
   q = ( g & a & i & NOT(e) ) | ( k & a & NOT(i) ) | ( c & NOT(a) );
   r = d;
   s = ( j & NOT(a) & e & NOT(i) ) | ( f & NOT(i) & NOT(e) )
            | ( f & NOT(a) & NOT(e) ) | ( e & i );
   t = ( k & NOT(a) & e & NOT(i) ) | ( g & NOT(i) & NOT(e) )
            | ( g & NOT(a) & NOT(e) ) | ( a & i );
   u = h;
   v = a | e | i;
   w = ( NOT(e) & j & NOT(i) ) | ( e & i ) | a;
   x = ( NOT(a) & k & NOT(i) ) | ( a & i ) | e;
   y = m;

   value = PUT(p, 9) | PUT(q, 8) | PUT(r, 7) | PUT(s, 6) | PUT(t, 5)
            | PUT(u, 4) | PUT(v, 3) | PUT(w, 2) | PUT(x, 1) | y;

   return value;
}

ULong h_calc_DPBtoBCD( ULong dpb )
{
   ULong result, chunk;
   Int i;

   result = 0;

   for (i = 0; i < 5; i++) {
      chunk = dpb >> ( 4 - i ) * 10;
      result = result << 12;
      result |= dpb_to_bcd( chunk & 0x3FF );
   }
   return result;
}

ULong h_calc_BCDtoDPB( ULong bcd )
{
   ULong result, chunk;
   Int i;

   result = 0;

   for (i = 0; i < 5; i++) {
      chunk = bcd >> ( 4 - i ) * 12;
      result = result << 10;
      result |= bcd_to_dpb( chunk & 0xFFF );
   }
   return result;
}
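
/* A worked example of the BCD <-> DPD round trip, derived by hand
   from the boolean equations above (an illustrative sketch, not
   compiled in):

      h_calc_BCDtoDPB(0x123ULL) == 0xA3ULL
      // digits 1,2,3 are all "small" (0..7), so the 10-bit DPD
      // chunk is just 001 010 0 011 = 0x0A3
      h_calc_DPBtoBCD(0xA3ULL)  == 0x123ULL
      h_calc_BCDtoDPB(0x999ULL) == 0xFFULL   // all-large digit case
*/
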
#undef NOT
#undef GET
#undef PUT


/* ----------------------------------------------------- */
/* Signed and unsigned integer division that behaves like
   the ARMv7 UDIV and SDIV instructions.

   sdiv32 also behaves like 64-bit v8 SDIV on w-regs.
   udiv32 also behaves like 64-bit v8 UDIV on w-regs.
*/
/* ----------------------------------------------------- */

UInt h_calc_udiv32_w_arm_semantics ( UInt x, UInt y )
{
   // Division by zero --> zero
   if (UNLIKELY(y == 0)) return 0;
   // C requires rounding towards zero, which is also what we need.
   return x / y;
}

ULong h_calc_udiv64_w_arm_semantics ( ULong x, ULong y )
{
   // Division by zero --> zero
   if (UNLIKELY(y == 0)) return 0;
   // C requires rounding towards zero, which is also what we need.
   return x / y;
}

Int h_calc_sdiv32_w_arm_semantics ( Int x, Int y )
{
   // Division by zero --> zero
   if (UNLIKELY(y == 0)) return 0;
   // The single case that produces an unrepresentable result
   if (UNLIKELY( ((UInt)x) == ((UInt)0x80000000)
                 && ((UInt)y) == ((UInt)0xFFFFFFFF) ))
      return (Int)(UInt)0x80000000;
   // Else return the result rounded towards zero.  C89 says
   // this is implementation defined (in the signed case), but gcc
   // promises to round towards zero.  Nevertheless, at startup,
   // in main_main.c, do a check for that.
   return x / y;
}

Long h_calc_sdiv64_w_arm_semantics ( Long x, Long y )
{
   // Division by zero --> zero
   if (UNLIKELY(y == 0)) return 0;
   // The single case that produces an unrepresentable result
   if (UNLIKELY( ((ULong)x) == ((ULong)0x8000000000000000ULL )
                 && ((ULong)y) == ((ULong)0xFFFFFFFFFFFFFFFFULL ) ))
      return (Long)(ULong)0x8000000000000000ULL;
   // Else return the result rounded towards zero.  C89 says
   // this is implementation defined (in the signed case), but gcc
   // promises to round towards zero.  Nevertheless, at startup,
   // in main_main.c, do a check for that.
   return x / y;
}
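
/* Behaviour at the edge cases, matching the ARM semantics described
   above (an illustrative sketch, not compiled in):

      h_calc_udiv32_w_arm_semantics(7, 0)  == 0      // div-by-zero -> 0
      h_calc_sdiv32_w_arm_semantics(-8, 3) == -2     // rounds towards zero
      h_calc_sdiv32_w_arm_semantics((Int)0x80000000, -1)
         == (Int)0x80000000
      // INT_MIN / -1 is unrepresentable; ARM defines it as INT_MIN
*/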


/*---------------------------------------------------------------*/
/*--- end                               host_generic_simd64.c ---*/
/*---------------------------------------------------------------*/