1
2 /*---------------------------------------------------------------*/
3 /*--- begin host_generic_simd64.c ---*/
4 /*---------------------------------------------------------------*/
5
6 /*
7 This file is part of Valgrind, a dynamic binary instrumentation
8 framework.
9
10 Copyright (C) 2004-2015 OpenWorks LLP
11 info@open-works.net
12
13 This program is free software; you can redistribute it and/or
14 modify it under the terms of the GNU General Public License as
15 published by the Free Software Foundation; either version 2 of the
16 License, or (at your option) any later version.
17
18 This program is distributed in the hope that it will be useful, but
19 WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 General Public License for more details.
22
23 You should have received a copy of the GNU General Public License
24 along with this program; if not, write to the Free Software
25 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26 02110-1301, USA.
27
28 The GNU General Public License is contained in the file COPYING.
29
30 Neither the names of the U.S. Department of Energy nor the
31 University of California nor the names of its contributors may be
32 used to endorse or promote products derived from this software
33 without prior written permission.
34 */
35
36 /* Generic helper functions for doing 64-bit SIMD arithmetic in cases
37 where the instruction selectors cannot generate code in-line.
38 These are purely back-end entities and cannot be seen/referenced
39 from IR. There are also helpers for 32-bit arithmetic in here. */
40
41 #include "libvex_basictypes.h"
42 #include "main_util.h" // LIKELY, UNLIKELY
43 #include "host_generic_simd64.h"
44
45
46
47 /* Tuple/select functions for 32x2 vectors. */
48
mk32x2(UInt w1,UInt w0)49 static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
50 return (((ULong)w1) << 32) | ((ULong)w0);
51 }
52
sel32x2_1(ULong w64)53 static inline UInt sel32x2_1 ( ULong w64 ) {
54 return 0xFFFFFFFF & toUInt(w64 >> 32);
55 }
sel32x2_0(ULong w64)56 static inline UInt sel32x2_0 ( ULong w64 ) {
57 return 0xFFFFFFFF & toUInt(w64);
58 }
59
60
61 /* Tuple/select functions for 16x4 vectors. gcc is pretty hopeless
62 with 64-bit shifts so we give it a hand. */
63
mk16x4(UShort w3,UShort w2,UShort w1,UShort w0)64 static inline ULong mk16x4 ( UShort w3, UShort w2,
65 UShort w1, UShort w0 ) {
66 UInt hi32 = (((UInt)w3) << 16) | ((UInt)w2);
67 UInt lo32 = (((UInt)w1) << 16) | ((UInt)w0);
68 return mk32x2(hi32, lo32);
69 }
70
sel16x4_3(ULong w64)71 static inline UShort sel16x4_3 ( ULong w64 ) {
72 UInt hi32 = toUInt(w64 >> 32);
73 return toUShort(0xFFFF & (hi32 >> 16));
74 }
sel16x4_2(ULong w64)75 static inline UShort sel16x4_2 ( ULong w64 ) {
76 UInt hi32 = toUInt(w64 >> 32);
77 return toUShort(0xFFFF & hi32);
78 }
sel16x4_1(ULong w64)79 static inline UShort sel16x4_1 ( ULong w64 ) {
80 UInt lo32 = (UInt)w64;
81 return toUShort(0xFFFF & (lo32 >> 16));
82 }
sel16x4_0(ULong w64)83 static inline UShort sel16x4_0 ( ULong w64 ) {
84 UInt lo32 = (UInt)w64;
85 return toUShort(0xFFFF & lo32);
86 }
87
88
89 /* Tuple/select functions for 8x8 vectors. */
90
mk8x8(UChar w7,UChar w6,UChar w5,UChar w4,UChar w3,UChar w2,UChar w1,UChar w0)91 static inline ULong mk8x8 ( UChar w7, UChar w6,
92 UChar w5, UChar w4,
93 UChar w3, UChar w2,
94 UChar w1, UChar w0 ) {
95 UInt hi32 = (((UInt)w7) << 24) | (((UInt)w6) << 16)
96 | (((UInt)w5) << 8) | (((UInt)w4) << 0);
97 UInt lo32 = (((UInt)w3) << 24) | (((UInt)w2) << 16)
98 | (((UInt)w1) << 8) | (((UInt)w0) << 0);
99 return mk32x2(hi32, lo32);
100 }
101
sel8x8_7(ULong w64)102 static inline UChar sel8x8_7 ( ULong w64 ) {
103 UInt hi32 = toUInt(w64 >> 32);
104 return toUChar(0xFF & (hi32 >> 24));
105 }
sel8x8_6(ULong w64)106 static inline UChar sel8x8_6 ( ULong w64 ) {
107 UInt hi32 = toUInt(w64 >> 32);
108 return toUChar(0xFF & (hi32 >> 16));
109 }
sel8x8_5(ULong w64)110 static inline UChar sel8x8_5 ( ULong w64 ) {
111 UInt hi32 = toUInt(w64 >> 32);
112 return toUChar(0xFF & (hi32 >> 8));
113 }
sel8x8_4(ULong w64)114 static inline UChar sel8x8_4 ( ULong w64 ) {
115 UInt hi32 = toUInt(w64 >> 32);
116 return toUChar(0xFF & (hi32 >> 0));
117 }
sel8x8_3(ULong w64)118 static inline UChar sel8x8_3 ( ULong w64 ) {
119 UInt lo32 = (UInt)w64;
120 return toUChar(0xFF & (lo32 >> 24));
121 }
sel8x8_2(ULong w64)122 static inline UChar sel8x8_2 ( ULong w64 ) {
123 UInt lo32 = (UInt)w64;
124 return toUChar(0xFF & (lo32 >> 16));
125 }
sel8x8_1(ULong w64)126 static inline UChar sel8x8_1 ( ULong w64 ) {
127 UInt lo32 = (UInt)w64;
128 return toUChar(0xFF & (lo32 >> 8));
129 }
sel8x8_0(ULong w64)130 static inline UChar sel8x8_0 ( ULong w64 ) {
131 UInt lo32 = (UInt)w64;
132 return toUChar(0xFF & (lo32 >> 0));
133 }
134
/* Return the byte lane of w64 selected by the low three bits of ix;
   the higher bits of ix are ignored. */
static inline UChar index8x8 ( ULong w64, UChar ix )
{
   Int   shift = 8 * (ix & 7);
   ULong lane  = (w64 >> shift) & 0xFF;
   return toUChar(lane);
}
139
140
141 /* Scalar helpers. */
142
qadd32S(Int xx,Int yy)143 static inline Int qadd32S ( Int xx, Int yy )
144 {
145 Long t = ((Long)xx) + ((Long)yy);
146 const Long loLim = -0x80000000LL;
147 const Long hiLim = 0x7FFFFFFFLL;
148 if (t < loLim) t = loLim;
149 if (t > hiLim) t = hiLim;
150 return (Int)t;
151 }
152
qadd16S(Short xx,Short yy)153 static inline Short qadd16S ( Short xx, Short yy )
154 {
155 Int t = ((Int)xx) + ((Int)yy);
156 if (t < -32768) t = -32768;
157 if (t > 32767) t = 32767;
158 return (Short)t;
159 }
160
qadd8S(Char xx,Char yy)161 static inline Char qadd8S ( Char xx, Char yy )
162 {
163 Int t = ((Int)xx) + ((Int)yy);
164 if (t < -128) t = -128;
165 if (t > 127) t = 127;
166 return (Char)t;
167 }
168
qadd16U(UShort xx,UShort yy)169 static inline UShort qadd16U ( UShort xx, UShort yy )
170 {
171 UInt t = ((UInt)xx) + ((UInt)yy);
172 if (t > 0xFFFF) t = 0xFFFF;
173 return (UShort)t;
174 }
175
/* Unsigned saturating 8-bit add: sums above 0xFF yield 0xFF. */
static inline UChar qadd8U ( UChar xx, UChar yy )
{
   UInt sum = (UInt)xx + (UInt)yy;
   return (UChar)(sum > 0xFF ? 0xFF : sum);
}
182
qsub32S(Int xx,Int yy)183 static inline Int qsub32S ( Int xx, Int yy )
184 {
185 Long t = ((Long)xx) - ((Long)yy);
186 const Long loLim = -0x80000000LL;
187 const Long hiLim = 0x7FFFFFFFLL;
188 if (t < loLim) t = loLim;
189 if (t > hiLim) t = hiLim;
190 return (Int)t;
191 }
192
qsub16S(Short xx,Short yy)193 static inline Short qsub16S ( Short xx, Short yy )
194 {
195 Int t = ((Int)xx) - ((Int)yy);
196 if (t < -32768) t = -32768;
197 if (t > 32767) t = 32767;
198 return (Short)t;
199 }
200
qsub8S(Char xx,Char yy)201 static inline Char qsub8S ( Char xx, Char yy )
202 {
203 Int t = ((Int)xx) - ((Int)yy);
204 if (t < -128) t = -128;
205 if (t > 127) t = 127;
206 return (Char)t;
207 }
208
qsub16U(UShort xx,UShort yy)209 static inline UShort qsub16U ( UShort xx, UShort yy )
210 {
211 Int t = ((Int)xx) - ((Int)yy);
212 if (t < 0) t = 0;
213 if (t > 0xFFFF) t = 0xFFFF;
214 return (UShort)t;
215 }
216
/* Unsigned saturating 8-bit subtract: returns xx - yy, or 0 when
   yy exceeds xx. */
static inline UChar qsub8U ( UChar xx, UChar yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   /* The difference of two 8-bit unsigned values can never exceed
      0xFF, so only the lower bound needs clamping. */
   if (t < 0) t = 0;
   return (UChar)t;
}
224
mul16(Short xx,Short yy)225 static inline Short mul16 ( Short xx, Short yy )
226 {
227 Int t = ((Int)xx) * ((Int)yy);
228 return (Short)t;
229 }
230
mul32(Int xx,Int yy)231 static inline Int mul32 ( Int xx, Int yy )
232 {
233 Int t = ((Int)xx) * ((Int)yy);
234 return (Int)t;
235 }
236
mulhi16S(Short xx,Short yy)237 static inline Short mulhi16S ( Short xx, Short yy )
238 {
239 Int t = ((Int)xx) * ((Int)yy);
240 t >>=/*s*/ 16;
241 return (Short)t;
242 }
243
mulhi16U(UShort xx,UShort yy)244 static inline UShort mulhi16U ( UShort xx, UShort yy )
245 {
246 UInt t = ((UInt)xx) * ((UInt)yy);
247 t >>=/*u*/ 16;
248 return (UShort)t;
249 }
250
/* 32-bit equality mask: 0xFFFFFFFF when xx == yy, else 0. */
static inline UInt cmpeq32 ( UInt xx, UInt yy )
{
   if (xx != yy)
      return 0;
   return 0xFFFFFFFF;
}
255
cmpeq16(UShort xx,UShort yy)256 static inline UShort cmpeq16 ( UShort xx, UShort yy )
257 {
258 return toUShort(xx==yy ? 0xFFFF : 0);
259 }
260
/* 8-bit equality mask: 0xFF when xx == yy, else 0. */
static inline UChar cmpeq8 ( UChar xx, UChar yy )
{
   if (xx != yy)
      return toUChar(0);
   return toUChar(0xFF);
}
265
cmpgt32S(Int xx,Int yy)266 static inline UInt cmpgt32S ( Int xx, Int yy )
267 {
268 return xx>yy ? 0xFFFFFFFF : 0;
269 }
270
cmpgt16S(Short xx,Short yy)271 static inline UShort cmpgt16S ( Short xx, Short yy )
272 {
273 return toUShort(xx>yy ? 0xFFFF : 0);
274 }
275
cmpgt8S(Char xx,Char yy)276 static inline UChar cmpgt8S ( Char xx, Char yy )
277 {
278 return toUChar(xx>yy ? 0xFF : 0);
279 }
280
/* 32-bit non-zero mask: 0xFFFFFFFF when xx != 0, else 0. */
static inline UInt cmpnez32 ( UInt xx )
{
   if (xx != 0)
      return 0xFFFFFFFF;
   return 0;
}
285
cmpnez16(UShort xx)286 static inline UShort cmpnez16 ( UShort xx )
287 {
288 return toUShort(xx==0 ? 0 : 0xFFFF);
289 }
290
/* 8-bit non-zero mask: 0xFF when xx != 0, else 0. */
static inline UChar cmpnez8 ( UChar xx )
{
   if (xx != 0)
      return toUChar(0xFF);
   return toUChar(0);
}
295
qnarrow32Sto16S(UInt xx0)296 static inline Short qnarrow32Sto16S ( UInt xx0 )
297 {
298 Int xx = (Int)xx0;
299 if (xx < -32768) xx = -32768;
300 if (xx > 32767) xx = 32767;
301 return (Short)xx;
302 }
303
qnarrow16Sto8S(UShort xx0)304 static inline Char qnarrow16Sto8S ( UShort xx0 )
305 {
306 Short xx = (Short)xx0;
307 if (xx < -128) xx = -128;
308 if (xx > 127) xx = 127;
309 return (Char)xx;
310 }
311
qnarrow16Sto8U(UShort xx0)312 static inline UChar qnarrow16Sto8U ( UShort xx0 )
313 {
314 Short xx = (Short)xx0;
315 if (xx < 0) xx = 0;
316 if (xx > 255) xx = 255;
317 return (UChar)xx;
318 }
319
narrow32to16(UInt xx)320 static inline UShort narrow32to16 ( UInt xx )
321 {
322 return (UShort)xx;
323 }
324
narrow16to8(UShort xx)325 static inline UChar narrow16to8 ( UShort xx )
326 {
327 return (UChar)xx;
328 }
329
330 /* shifts: we don't care about out-of-range ones, since
331 that is dealt with at a higher level. */
332
/* Shift an 8-bit value left by n and narrow the result with toUChar.
   No range check is made on n. */
static inline UChar shl8 ( UChar v, UInt n )
{
   Int shifted = v << n;
   return toUChar(shifted);
}
337
/* Arithmetic right shift of an 8-bit value: v is treated as a Char
   before shifting.  No range check is made on n. */
static inline UChar sar8 ( UChar v, UInt n )
{
   Char sv = (Char)v;
   Int  shifted = sv >> n;
   return toUChar(shifted);
}
342
shl16(UShort v,UInt n)343 static inline UShort shl16 ( UShort v, UInt n )
344 {
345 return toUShort(v << n);
346 }
347
shr16(UShort v,UInt n)348 static inline UShort shr16 ( UShort v, UInt n )
349 {
350 return toUShort((((UShort)v) >> n));
351 }
352
sar16(UShort v,UInt n)353 static inline UShort sar16 ( UShort v, UInt n )
354 {
355 return toUShort(((Short)v) >> n);
356 }
357
/* Shift a 32-bit value left by n.  No range check is made on n. */
static inline UInt shl32 ( UInt v, UInt n )
{
   UInt shifted = v;
   shifted <<= n;
   return shifted;
}
362
/* Logical right shift of a 32-bit value by n.  No range check is
   made on n. */
static inline UInt shr32 ( UInt v, UInt n )
{
   UInt shifted = (UInt)v;
   shifted >>= n;
   return shifted;
}
367
/* Arithmetic right shift of a 32-bit value: v is treated as an Int
   before shifting.  No range check is made on n. */
static inline UInt sar32 ( UInt v, UInt n )
{
   Int sv = (Int)v;
   sv >>= n;
   return (UInt)sv;
}
372
/* Unsigned 8-bit rounding average: (xx + yy + 1) >> 1, computed in
   UInt. */
static inline UChar avg8U ( UChar xx, UChar yy )
{
   UInt sum = (UInt)xx + (UInt)yy + 1;
   return (UChar)(sum >> 1);
}
380
avg16U(UShort xx,UShort yy)381 static inline UShort avg16U ( UShort xx, UShort yy )
382 {
383 UInt xxi = (UInt)xx;
384 UInt yyi = (UInt)yy;
385 UInt r = (xxi + yyi + 1) >> 1;
386 return (UShort)r;
387 }
388
max16S(Short xx,Short yy)389 static inline Short max16S ( Short xx, Short yy )
390 {
391 return toUShort((xx > yy) ? xx : yy);
392 }
393
/* Unsigned 8-bit maximum of xx and yy. */
static inline UChar max8U ( UChar xx, UChar yy )
{
   if (xx > yy)
      return toUChar(xx);
   return toUChar(yy);
}
398
min16S(Short xx,Short yy)399 static inline Short min16S ( Short xx, Short yy )
400 {
401 return toUShort((xx < yy) ? xx : yy);
402 }
403
/* Unsigned 8-bit minimum of xx and yy. */
static inline UChar min8U ( UChar xx, UChar yy )
{
   if (xx < yy)
      return toUChar(xx);
   return toUChar(yy);
}
408
hadd16U(UShort xx,UShort yy)409 static inline UShort hadd16U ( UShort xx, UShort yy )
410 {
411 UInt xxi = (UInt)xx;
412 UInt yyi = (UInt)yy;
413 UInt r = (xxi + yyi) >> 1;
414 return (UShort)r;
415 }
416
hadd16S(Short xx,Short yy)417 static inline Short hadd16S ( Short xx, Short yy )
418 {
419 Int xxi = (Int)xx;
420 Int yyi = (Int)yy;
421 Int r = (xxi + yyi) >> 1;
422 return (Short)r;
423 }
424
hsub16U(UShort xx,UShort yy)425 static inline UShort hsub16U ( UShort xx, UShort yy )
426 {
427 UInt xxi = (UInt)xx;
428 UInt yyi = (UInt)yy;
429 UInt r = (xxi - yyi) >> 1;
430 return (UShort)r;
431 }
432
hsub16S(Short xx,Short yy)433 static inline Short hsub16S ( Short xx, Short yy )
434 {
435 Int xxi = (Int)xx;
436 Int yyi = (Int)yy;
437 Int r = (xxi - yyi) >> 1;
438 return (Short)r;
439 }
440
/* Unsigned 8-bit halving add: (xx + yy) >> 1, computed in UInt. */
static inline UChar hadd8U ( UChar xx, UChar yy )
{
   UInt sum = (UInt)xx + (UInt)yy;
   return (UChar)(sum >> 1);
}
448
hadd8S(Char xx,Char yy)449 static inline Char hadd8S ( Char xx, Char yy )
450 {
451 Int xxi = (Int)xx;
452 Int yyi = (Int)yy;
453 Int r = (xxi + yyi) >> 1;
454 return (Char)r;
455 }
456
/* Unsigned 8-bit halving subtract: (xx - yy) >> 1, computed in UInt
   and then cast to UChar. */
static inline UChar hsub8U ( UChar xx, UChar yy )
{
   UInt diff = (UInt)xx - (UInt)yy;
   return (UChar)(diff >> 1);
}
464
hsub8S(Char xx,Char yy)465 static inline Char hsub8S ( Char xx, Char yy )
466 {
467 Int xxi = (Int)xx;
468 Int yyi = (Int)yy;
469 Int r = (xxi - yyi) >> 1;
470 return (Char)r;
471 }
472
absdiff8U(UChar xx,UChar yy)473 static inline UInt absdiff8U ( UChar xx, UChar yy )
474 {
475 UInt xxu = (UChar)xx;
476 UInt yyu = (UChar)yy;
477 return xxu >= yyu ? xxu - yyu : yyu - xxu;
478 }
479
480 /* ----------------------------------------------------- */
481 /* Start of the externally visible functions. These simply
482 implement the corresponding IR primops. */
483 /* ----------------------------------------------------- */
484
485 /* ------------ Normal addition ------------ */
486
/* Lane-wise add of two 32x2 vectors; each lane wraps as unsigned
   32-bit arithmetic. */
ULong h_generic_calc_Add32x2 ( ULong xx, ULong yy )
{
   UInt hi = sel32x2_1(xx) + sel32x2_1(yy);
   UInt lo = sel32x2_0(xx) + sel32x2_0(yy);
   return mk32x2(hi, lo);
}
494
/* Lane-wise add of two 16x4 vectors; each sum is narrowed back to
   16 bits with toUShort. */
ULong h_generic_calc_Add16x4 ( ULong xx, ULong yy )
{
   UShort r3 = toUShort( sel16x4_3(xx) + sel16x4_3(yy) );
   UShort r2 = toUShort( sel16x4_2(xx) + sel16x4_2(yy) );
   UShort r1 = toUShort( sel16x4_1(xx) + sel16x4_1(yy) );
   UShort r0 = toUShort( sel16x4_0(xx) + sel16x4_0(yy) );
   return mk16x4(r3, r2, r1, r0);
}
504
/* Lane-wise add of two 8x8 vectors; each sum is narrowed back to
   8 bits with toUChar. */
ULong h_generic_calc_Add8x8 ( ULong xx, ULong yy )
{
   UChar r7 = toUChar( sel8x8_7(xx) + sel8x8_7(yy) );
   UChar r6 = toUChar( sel8x8_6(xx) + sel8x8_6(yy) );
   UChar r5 = toUChar( sel8x8_5(xx) + sel8x8_5(yy) );
   UChar r4 = toUChar( sel8x8_4(xx) + sel8x8_4(yy) );
   UChar r3 = toUChar( sel8x8_3(xx) + sel8x8_3(yy) );
   UChar r2 = toUChar( sel8x8_2(xx) + sel8x8_2(yy) );
   UChar r1 = toUChar( sel8x8_1(xx) + sel8x8_1(yy) );
   UChar r0 = toUChar( sel8x8_0(xx) + sel8x8_0(yy) );
   return mk8x8(r7, r6, r5, r4, r3, r2, r1, r0);
}
518
519 /* ------------ Saturating addition ------------ */
520
/* Lane-wise signed saturating add of two 16x4 vectors, using
   qadd16S on each lane. */
ULong h_generic_calc_QAdd16Sx4 ( ULong xx, ULong yy )
{
   UShort r3 = qadd16S( sel16x4_3(xx), sel16x4_3(yy) );
   UShort r2 = qadd16S( sel16x4_2(xx), sel16x4_2(yy) );
   UShort r1 = qadd16S( sel16x4_1(xx), sel16x4_1(yy) );
   UShort r0 = qadd16S( sel16x4_0(xx), sel16x4_0(yy) );
   return mk16x4(r3, r2, r1, r0);
}
530
/* Lane-wise signed saturating add of two 8x8 vectors, using qadd8S
   on each lane. */
ULong h_generic_calc_QAdd8Sx8 ( ULong xx, ULong yy )
{
   UChar r7 = qadd8S( sel8x8_7(xx), sel8x8_7(yy) );
   UChar r6 = qadd8S( sel8x8_6(xx), sel8x8_6(yy) );
   UChar r5 = qadd8S( sel8x8_5(xx), sel8x8_5(yy) );
   UChar r4 = qadd8S( sel8x8_4(xx), sel8x8_4(yy) );
   UChar r3 = qadd8S( sel8x8_3(xx), sel8x8_3(yy) );
   UChar r2 = qadd8S( sel8x8_2(xx), sel8x8_2(yy) );
   UChar r1 = qadd8S( sel8x8_1(xx), sel8x8_1(yy) );
   UChar r0 = qadd8S( sel8x8_0(xx), sel8x8_0(yy) );
   return mk8x8(r7, r6, r5, r4, r3, r2, r1, r0);
}
544
/* Lane-wise unsigned saturating add of two 16x4 vectors, using
   qadd16U on each lane. */
ULong h_generic_calc_QAdd16Ux4 ( ULong xx, ULong yy )
{
   UShort r3 = qadd16U( sel16x4_3(xx), sel16x4_3(yy) );
   UShort r2 = qadd16U( sel16x4_2(xx), sel16x4_2(yy) );
   UShort r1 = qadd16U( sel16x4_1(xx), sel16x4_1(yy) );
   UShort r0 = qadd16U( sel16x4_0(xx), sel16x4_0(yy) );
   return mk16x4(r3, r2, r1, r0);
}
554
/* Lane-wise unsigned saturating add of two 8x8 vectors, using
   qadd8U on each lane. */
ULong h_generic_calc_QAdd8Ux8 ( ULong xx, ULong yy )
{
   UChar r7 = qadd8U( sel8x8_7(xx), sel8x8_7(yy) );
   UChar r6 = qadd8U( sel8x8_6(xx), sel8x8_6(yy) );
   UChar r5 = qadd8U( sel8x8_5(xx), sel8x8_5(yy) );
   UChar r4 = qadd8U( sel8x8_4(xx), sel8x8_4(yy) );
   UChar r3 = qadd8U( sel8x8_3(xx), sel8x8_3(yy) );
   UChar r2 = qadd8U( sel8x8_2(xx), sel8x8_2(yy) );
   UChar r1 = qadd8U( sel8x8_1(xx), sel8x8_1(yy) );
   UChar r0 = qadd8U( sel8x8_0(xx), sel8x8_0(yy) );
   return mk8x8(r7, r6, r5, r4, r3, r2, r1, r0);
}
568
569 /* ------------ Normal subtraction ------------ */
570
/* Lane-wise subtract (xx - yy) of two 32x2 vectors; each lane wraps
   as unsigned 32-bit arithmetic. */
ULong h_generic_calc_Sub32x2 ( ULong xx, ULong yy )
{
   UInt hi = sel32x2_1(xx) - sel32x2_1(yy);
   UInt lo = sel32x2_0(xx) - sel32x2_0(yy);
   return mk32x2(hi, lo);
}
578
/* Lane-wise subtract (xx - yy) of two 16x4 vectors; each difference
   is narrowed back to 16 bits with toUShort. */
ULong h_generic_calc_Sub16x4 ( ULong xx, ULong yy )
{
   UShort r3 = toUShort( sel16x4_3(xx) - sel16x4_3(yy) );
   UShort r2 = toUShort( sel16x4_2(xx) - sel16x4_2(yy) );
   UShort r1 = toUShort( sel16x4_1(xx) - sel16x4_1(yy) );
   UShort r0 = toUShort( sel16x4_0(xx) - sel16x4_0(yy) );
   return mk16x4(r3, r2, r1, r0);
}
588
/* Lane-wise subtract (xx - yy) of two 8x8 vectors; each difference
   is narrowed back to 8 bits with toUChar. */
ULong h_generic_calc_Sub8x8 ( ULong xx, ULong yy )
{
   UChar r7 = toUChar( sel8x8_7(xx) - sel8x8_7(yy) );
   UChar r6 = toUChar( sel8x8_6(xx) - sel8x8_6(yy) );
   UChar r5 = toUChar( sel8x8_5(xx) - sel8x8_5(yy) );
   UChar r4 = toUChar( sel8x8_4(xx) - sel8x8_4(yy) );
   UChar r3 = toUChar( sel8x8_3(xx) - sel8x8_3(yy) );
   UChar r2 = toUChar( sel8x8_2(xx) - sel8x8_2(yy) );
   UChar r1 = toUChar( sel8x8_1(xx) - sel8x8_1(yy) );
   UChar r0 = toUChar( sel8x8_0(xx) - sel8x8_0(yy) );
   return mk8x8(r7, r6, r5, r4, r3, r2, r1, r0);
}
602
603 /* ------------ Saturating subtraction ------------ */
604
/* Lane-wise signed saturating subtract (xx - yy) of two 16x4
   vectors, using qsub16S on each lane. */
ULong h_generic_calc_QSub16Sx4 ( ULong xx, ULong yy )
{
   UShort r3 = qsub16S( sel16x4_3(xx), sel16x4_3(yy) );
   UShort r2 = qsub16S( sel16x4_2(xx), sel16x4_2(yy) );
   UShort r1 = qsub16S( sel16x4_1(xx), sel16x4_1(yy) );
   UShort r0 = qsub16S( sel16x4_0(xx), sel16x4_0(yy) );
   return mk16x4(r3, r2, r1, r0);
}
614
/* Lane-wise signed saturating subtract (xx - yy) of two 8x8
   vectors, using qsub8S on each lane. */
ULong h_generic_calc_QSub8Sx8 ( ULong xx, ULong yy )
{
   UChar r7 = qsub8S( sel8x8_7(xx), sel8x8_7(yy) );
   UChar r6 = qsub8S( sel8x8_6(xx), sel8x8_6(yy) );
   UChar r5 = qsub8S( sel8x8_5(xx), sel8x8_5(yy) );
   UChar r4 = qsub8S( sel8x8_4(xx), sel8x8_4(yy) );
   UChar r3 = qsub8S( sel8x8_3(xx), sel8x8_3(yy) );
   UChar r2 = qsub8S( sel8x8_2(xx), sel8x8_2(yy) );
   UChar r1 = qsub8S( sel8x8_1(xx), sel8x8_1(yy) );
   UChar r0 = qsub8S( sel8x8_0(xx), sel8x8_0(yy) );
   return mk8x8(r7, r6, r5, r4, r3, r2, r1, r0);
}
628
/* Lane-wise unsigned saturating subtract (xx - yy) of two 16x4
   vectors, using qsub16U on each lane. */
ULong h_generic_calc_QSub16Ux4 ( ULong xx, ULong yy )
{
   UShort r3 = qsub16U( sel16x4_3(xx), sel16x4_3(yy) );
   UShort r2 = qsub16U( sel16x4_2(xx), sel16x4_2(yy) );
   UShort r1 = qsub16U( sel16x4_1(xx), sel16x4_1(yy) );
   UShort r0 = qsub16U( sel16x4_0(xx), sel16x4_0(yy) );
   return mk16x4(r3, r2, r1, r0);
}
638
/* Lane-wise unsigned saturating subtract (xx - yy) of two 8x8
   vectors, using qsub8U on each lane. */
ULong h_generic_calc_QSub8Ux8 ( ULong xx, ULong yy )
{
   UChar r7 = qsub8U( sel8x8_7(xx), sel8x8_7(yy) );
   UChar r6 = qsub8U( sel8x8_6(xx), sel8x8_6(yy) );
   UChar r5 = qsub8U( sel8x8_5(xx), sel8x8_5(yy) );
   UChar r4 = qsub8U( sel8x8_4(xx), sel8x8_4(yy) );
   UChar r3 = qsub8U( sel8x8_3(xx), sel8x8_3(yy) );
   UChar r2 = qsub8U( sel8x8_2(xx), sel8x8_2(yy) );
   UChar r1 = qsub8U( sel8x8_1(xx), sel8x8_1(yy) );
   UChar r0 = qsub8U( sel8x8_0(xx), sel8x8_0(yy) );
   return mk8x8(r7, r6, r5, r4, r3, r2, r1, r0);
}
652
653 /* ------------ Multiplication ------------ */
654
/* Lane-wise multiply of two 16x4 vectors, using mul16 on each
   lane. */
ULong h_generic_calc_Mul16x4 ( ULong xx, ULong yy )
{
   UShort r3 = mul16( sel16x4_3(xx), sel16x4_3(yy) );
   UShort r2 = mul16( sel16x4_2(xx), sel16x4_2(yy) );
   UShort r1 = mul16( sel16x4_1(xx), sel16x4_1(yy) );
   UShort r0 = mul16( sel16x4_0(xx), sel16x4_0(yy) );
   return mk16x4(r3, r2, r1, r0);
}
664
/* Lane-wise multiply of two 32x2 vectors, using mul32 on each
   lane. */
ULong h_generic_calc_Mul32x2 ( ULong xx, ULong yy )
{
   UInt hi = mul32( sel32x2_1(xx), sel32x2_1(yy) );
   UInt lo = mul32( sel32x2_0(xx), sel32x2_0(yy) );
   return mk32x2(hi, lo);
}
672
/* Lane-wise signed high-half multiply of two 16x4 vectors, using
   mulhi16S on each lane. */
ULong h_generic_calc_MulHi16Sx4 ( ULong xx, ULong yy )
{
   UShort r3 = mulhi16S( sel16x4_3(xx), sel16x4_3(yy) );
   UShort r2 = mulhi16S( sel16x4_2(xx), sel16x4_2(yy) );
   UShort r1 = mulhi16S( sel16x4_1(xx), sel16x4_1(yy) );
   UShort r0 = mulhi16S( sel16x4_0(xx), sel16x4_0(yy) );
   return mk16x4(r3, r2, r1, r0);
}
682
/* Lane-wise unsigned high-half multiply of two 16x4 vectors, using
   mulhi16U on each lane. */
ULong h_generic_calc_MulHi16Ux4 ( ULong xx, ULong yy )
{
   UShort r3 = mulhi16U( sel16x4_3(xx), sel16x4_3(yy) );
   UShort r2 = mulhi16U( sel16x4_2(xx), sel16x4_2(yy) );
   UShort r1 = mulhi16U( sel16x4_1(xx), sel16x4_1(yy) );
   UShort r0 = mulhi16U( sel16x4_0(xx), sel16x4_0(yy) );
   return mk16x4(r3, r2, r1, r0);
}
692
693 /* ------------ Comparison ------------ */
694
/* Lane-wise equality test on two 32x2 vectors; each result lane is
   all ones when the lanes match and zero otherwise. */
ULong h_generic_calc_CmpEQ32x2 ( ULong xx, ULong yy )
{
   UInt hi = cmpeq32( sel32x2_1(xx), sel32x2_1(yy) );
   UInt lo = cmpeq32( sel32x2_0(xx), sel32x2_0(yy) );
   return mk32x2(hi, lo);
}
702
/* Lane-wise equality test on two 16x4 vectors, using cmpeq16 on
   each lane. */
ULong h_generic_calc_CmpEQ16x4 ( ULong xx, ULong yy )
{
   UShort r3 = cmpeq16( sel16x4_3(xx), sel16x4_3(yy) );
   UShort r2 = cmpeq16( sel16x4_2(xx), sel16x4_2(yy) );
   UShort r1 = cmpeq16( sel16x4_1(xx), sel16x4_1(yy) );
   UShort r0 = cmpeq16( sel16x4_0(xx), sel16x4_0(yy) );
   return mk16x4(r3, r2, r1, r0);
}
712
/* Lane-wise equality test on two 8x8 vectors, using cmpeq8 on each
   lane. */
ULong h_generic_calc_CmpEQ8x8 ( ULong xx, ULong yy )
{
   UChar r7 = cmpeq8( sel8x8_7(xx), sel8x8_7(yy) );
   UChar r6 = cmpeq8( sel8x8_6(xx), sel8x8_6(yy) );
   UChar r5 = cmpeq8( sel8x8_5(xx), sel8x8_5(yy) );
   UChar r4 = cmpeq8( sel8x8_4(xx), sel8x8_4(yy) );
   UChar r3 = cmpeq8( sel8x8_3(xx), sel8x8_3(yy) );
   UChar r2 = cmpeq8( sel8x8_2(xx), sel8x8_2(yy) );
   UChar r1 = cmpeq8( sel8x8_1(xx), sel8x8_1(yy) );
   UChar r0 = cmpeq8( sel8x8_0(xx), sel8x8_0(yy) );
   return mk8x8(r7, r6, r5, r4, r3, r2, r1, r0);
}
726
/* Lane-wise signed greater-than test (xx > yy) on two 32x2 vectors,
   using cmpgt32S on each lane. */
ULong h_generic_calc_CmpGT32Sx2 ( ULong xx, ULong yy )
{
   UInt hi = cmpgt32S( sel32x2_1(xx), sel32x2_1(yy) );
   UInt lo = cmpgt32S( sel32x2_0(xx), sel32x2_0(yy) );
   return mk32x2(hi, lo);
}
734
/* Lane-wise signed greater-than test (xx > yy) on two 16x4 vectors,
   using cmpgt16S on each lane. */
ULong h_generic_calc_CmpGT16Sx4 ( ULong xx, ULong yy )
{
   UShort r3 = cmpgt16S( sel16x4_3(xx), sel16x4_3(yy) );
   UShort r2 = cmpgt16S( sel16x4_2(xx), sel16x4_2(yy) );
   UShort r1 = cmpgt16S( sel16x4_1(xx), sel16x4_1(yy) );
   UShort r0 = cmpgt16S( sel16x4_0(xx), sel16x4_0(yy) );
   return mk16x4(r3, r2, r1, r0);
}
744
/* Lane-wise signed greater-than test (xx > yy) on two 8x8 vectors,
   using cmpgt8S on each lane. */
ULong h_generic_calc_CmpGT8Sx8 ( ULong xx, ULong yy )
{
   UChar r7 = cmpgt8S( sel8x8_7(xx), sel8x8_7(yy) );
   UChar r6 = cmpgt8S( sel8x8_6(xx), sel8x8_6(yy) );
   UChar r5 = cmpgt8S( sel8x8_5(xx), sel8x8_5(yy) );
   UChar r4 = cmpgt8S( sel8x8_4(xx), sel8x8_4(yy) );
   UChar r3 = cmpgt8S( sel8x8_3(xx), sel8x8_3(yy) );
   UChar r2 = cmpgt8S( sel8x8_2(xx), sel8x8_2(yy) );
   UChar r1 = cmpgt8S( sel8x8_1(xx), sel8x8_1(yy) );
   UChar r0 = cmpgt8S( sel8x8_0(xx), sel8x8_0(yy) );
   return mk8x8(r7, r6, r5, r4, r3, r2, r1, r0);
}
758
/* Lane-wise non-zero test on a 32x2 vector; each result lane is all
   ones when the input lane is non-zero and zero otherwise. */
ULong h_generic_calc_CmpNEZ32x2 ( ULong xx )
{
   UInt hi = cmpnez32( sel32x2_1(xx) );
   UInt lo = cmpnez32( sel32x2_0(xx) );
   return mk32x2(hi, lo);
}
766
/* Lane-wise non-zero test on a 16x4 vector, using cmpnez16 on each
   lane. */
ULong h_generic_calc_CmpNEZ16x4 ( ULong xx )
{
   UShort r3 = cmpnez16( sel16x4_3(xx) );
   UShort r2 = cmpnez16( sel16x4_2(xx) );
   UShort r1 = cmpnez16( sel16x4_1(xx) );
   UShort r0 = cmpnez16( sel16x4_0(xx) );
   return mk16x4(r3, r2, r1, r0);
}
776
/* Lane-wise non-zero test on an 8x8 vector, using cmpnez8 on each
   lane. */
ULong h_generic_calc_CmpNEZ8x8 ( ULong xx )
{
   UChar r7 = cmpnez8( sel8x8_7(xx) );
   UChar r6 = cmpnez8( sel8x8_6(xx) );
   UChar r5 = cmpnez8( sel8x8_5(xx) );
   UChar r4 = cmpnez8( sel8x8_4(xx) );
   UChar r3 = cmpnez8( sel8x8_3(xx) );
   UChar r2 = cmpnez8( sel8x8_2(xx) );
   UChar r1 = cmpnez8( sel8x8_1(xx) );
   UChar r0 = cmpnez8( sel8x8_0(xx) );
   return mk8x8(r7, r6, r5, r4, r3, r2, r1, r0);
}
790
791 /* ------------ Saturating narrowing ------------ */
792
/* Saturating narrow of two 32x2 vectors (signed 32 to signed 16)
   into one 16x4 vector.  The lanes of aa fill the upper two result
   lanes and the lanes of bb the lower two. */
ULong h_generic_calc_QNarrowBin32Sto16Sx4 ( ULong aa, ULong bb )
{
   return mk16x4(
             qnarrow32Sto16S( sel32x2_1(aa) ),
             qnarrow32Sto16S( sel32x2_0(aa) ),
             qnarrow32Sto16S( sel32x2_1(bb) ),
             qnarrow32Sto16S( sel32x2_0(bb) )
          );
}
806
/* Saturating narrow of two 16x4 vectors (signed 16 to signed 8)
   into one 8x8 vector.  The lanes of aa fill the upper four result
   bytes and the lanes of bb the lower four. */
ULong h_generic_calc_QNarrowBin16Sto8Sx8 ( ULong aa, ULong bb )
{
   return mk8x8(
             qnarrow16Sto8S( sel16x4_3(aa) ),
             qnarrow16Sto8S( sel16x4_2(aa) ),
             qnarrow16Sto8S( sel16x4_1(aa) ),
             qnarrow16Sto8S( sel16x4_0(aa) ),
             qnarrow16Sto8S( sel16x4_3(bb) ),
             qnarrow16Sto8S( sel16x4_2(bb) ),
             qnarrow16Sto8S( sel16x4_1(bb) ),
             qnarrow16Sto8S( sel16x4_0(bb) )
          );
}
828
/* Saturating narrow of two 16x4 vectors (signed 16 to unsigned 8)
   into one 8x8 vector.  The lanes of aa fill the upper four result
   bytes and the lanes of bb the lower four. */
ULong h_generic_calc_QNarrowBin16Sto8Ux8 ( ULong aa, ULong bb )
{
   return mk8x8(
             qnarrow16Sto8U( sel16x4_3(aa) ),
             qnarrow16Sto8U( sel16x4_2(aa) ),
             qnarrow16Sto8U( sel16x4_1(aa) ),
             qnarrow16Sto8U( sel16x4_0(aa) ),
             qnarrow16Sto8U( sel16x4_3(bb) ),
             qnarrow16Sto8U( sel16x4_2(bb) ),
             qnarrow16Sto8U( sel16x4_1(bb) ),
             qnarrow16Sto8U( sel16x4_0(bb) )
          );
}
850
851 /* ------------ Truncating narrowing ------------ */
852
/* Truncating narrow of two 32x2 vectors into one 16x4 vector.  The
   lanes of aa fill the upper two result lanes and the lanes of bb
   the lower two. */
ULong h_generic_calc_NarrowBin32to16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             narrow32to16( sel32x2_1(aa) ),
             narrow32to16( sel32x2_0(aa) ),
             narrow32to16( sel32x2_1(bb) ),
             narrow32to16( sel32x2_0(bb) )
          );
}
866
/* Truncating narrow of two 16x4 vectors into one 8x8 vector.  The
   lanes of aa fill the upper four result bytes and the lanes of bb
   the lower four. */
ULong h_generic_calc_NarrowBin16to8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             narrow16to8( sel16x4_3(aa) ),
             narrow16to8( sel16x4_2(aa) ),
             narrow16to8( sel16x4_1(aa) ),
             narrow16to8( sel16x4_0(aa) ),
             narrow16to8( sel16x4_3(bb) ),
             narrow16to8( sel16x4_2(bb) ),
             narrow16to8( sel16x4_1(bb) ),
             narrow16to8( sel16x4_0(bb) )
          );
}
888
889 /* ------------ Interleaving ------------ */
890
/* Interleave the upper four bytes of aa and bb.  From the most
   significant byte down the result is a7 b7 a6 b6 a5 b5 a4 b4. */
ULong h_generic_calc_InterleaveHI8x8 ( ULong aa, ULong bb )
{
   UChar a7 = sel8x8_7(aa), b7 = sel8x8_7(bb);
   UChar a6 = sel8x8_6(aa), b6 = sel8x8_6(bb);
   UChar a5 = sel8x8_5(aa), b5 = sel8x8_5(bb);
   UChar a4 = sel8x8_4(aa), b4 = sel8x8_4(bb);
   return mk8x8(a7, b7, a6, b6, a5, b5, a4, b4);
}
904
/* Interleave the lower four bytes of aa and bb.  From the most
   significant byte down the result is a3 b3 a2 b2 a1 b1 a0 b0. */
ULong h_generic_calc_InterleaveLO8x8 ( ULong aa, ULong bb )
{
   UChar a3 = sel8x8_3(aa), b3 = sel8x8_3(bb);
   UChar a2 = sel8x8_2(aa), b2 = sel8x8_2(bb);
   UChar a1 = sel8x8_1(aa), b1 = sel8x8_1(bb);
   UChar a0 = sel8x8_0(aa), b0 = sel8x8_0(bb);
   return mk8x8(a3, b3, a2, b2, a1, b1, a0, b0);
}
918
/* Interleave the upper two 16-bit lanes of aa and bb.  From the
   most significant lane down the result is a3 b3 a2 b2. */
ULong h_generic_calc_InterleaveHI16x4 ( ULong aa, ULong bb )
{
   UShort a3 = sel16x4_3(aa), b3 = sel16x4_3(bb);
   UShort a2 = sel16x4_2(aa), b2 = sel16x4_2(bb);
   return mk16x4(a3, b3, a2, b2);
}
928
/* Interleave the lower two 16-bit lanes of aa and bb.  From the
   most significant lane down the result is a1 b1 a0 b0. */
ULong h_generic_calc_InterleaveLO16x4 ( ULong aa, ULong bb )
{
   UShort a1 = sel16x4_1(aa), b1 = sel16x4_1(bb);
   UShort a0 = sel16x4_0(aa), b0 = sel16x4_0(bb);
   return mk16x4(a1, b1, a0, b0);
}
938
/* Combine the upper 32-bit lanes of aa and bb: aa's lane becomes
   the upper result lane and bb's the lower. */
ULong h_generic_calc_InterleaveHI32x2 ( ULong aa, ULong bb )
{
   UInt fromA = sel32x2_1(aa);
   UInt fromB = sel32x2_1(bb);
   return mk32x2(fromA, fromB);
}
946
/* Combine the lower 32-bit lanes of aa and bb: aa's lane becomes
   the upper result lane and bb's the lower. */
ULong h_generic_calc_InterleaveLO32x2 ( ULong aa, ULong bb )
{
   UInt fromA = sel32x2_0(aa);
   UInt fromB = sel32x2_0(bb);
   return mk32x2(fromA, fromB);
}
954
955 /* ------------ Concatenation ------------ */
956
/* Concatenate the odd-numbered 16-bit lanes of aa and bb.  From the
   most significant lane down the result is a3 a1 b3 b1. */
ULong h_generic_calc_CatOddLanes16x4 ( ULong aa, ULong bb )
{
   UShort a3 = sel16x4_3(aa), a1 = sel16x4_1(aa);
   UShort b3 = sel16x4_3(bb), b1 = sel16x4_1(bb);
   return mk16x4(a3, a1, b3, b1);
}
966
/* Concatenate the even-numbered 16-bit lanes of aa and bb.  From
   the most significant lane down the result is a2 a0 b2 b0. */
ULong h_generic_calc_CatEvenLanes16x4 ( ULong aa, ULong bb )
{
   UShort a2 = sel16x4_2(aa), a0 = sel16x4_0(aa);
   UShort b2 = sel16x4_2(bb), b0 = sel16x4_0(bb);
   return mk16x4(a2, a0, b2, b0);
}
976
977 /* misc hack looking for a proper home */
/* Byte permute: each byte of bb selects (via index8x8, which uses
   only its low three bits) the byte of aa that is placed in the
   corresponding result lane. */
ULong h_generic_calc_Perm8x8 ( ULong aa, ULong bb )
{
   UChar r7 = index8x8(aa, sel8x8_7(bb));
   UChar r6 = index8x8(aa, sel8x8_6(bb));
   UChar r5 = index8x8(aa, sel8x8_5(bb));
   UChar r4 = index8x8(aa, sel8x8_4(bb));
   UChar r3 = index8x8(aa, sel8x8_3(bb));
   UChar r2 = index8x8(aa, sel8x8_2(bb));
   UChar r1 = index8x8(aa, sel8x8_1(bb));
   UChar r0 = index8x8(aa, sel8x8_0(bb));
   return mk8x8(r7, r6, r5, r4, r3, r2, r1, r0);
}
991
992 /* ------------ Shifting ------------ */
/* Note that because these primops are undefined if the shift amount
   equals or exceeds the lane width, the shift amount is masked so
   that the scalar shifts are always in range.  Given the semantics
   of these primops (ShlN16x4, etc) it is in fact an error if we are
   ever given an out-of-range shift amount.
*/
/* Shift both 32-bit lanes of xx left by nn.  Only the low five bits
   of nn are used, so the scalar shifts stay in range. */
ULong h_generic_calc_ShlN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   const UInt amt = nn & 31;
   UInt hi = shl32( sel32x2_1(xx), amt );
   UInt lo = shl32( sel32x2_0(xx), amt );
   return mk32x2(hi, lo);
}
1008
/* Shift all four 16-bit lanes of xx left by nn.  Only the low four
   bits of nn are used, so the scalar shifts stay in range. */
ULong h_generic_calc_ShlN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   const UInt amt = nn & 15;
   UShort r3 = shl16( sel16x4_3(xx), amt );
   UShort r2 = shl16( sel16x4_2(xx), amt );
   UShort r1 = shl16( sel16x4_1(xx), amt );
   UShort r0 = shl16( sel16x4_0(xx), amt );
   return mk16x4(r3, r2, r1, r0);
}
1020
/* Shift all eight byte lanes of xx left by nn.  Only the low three
   bits of nn are used, so the scalar shifts stay in range. */
ULong h_generic_calc_ShlN8x8 ( ULong xx, UInt nn )
{
   /* vassert(nn < 8); */
   const UInt amt = nn & 7;
   UChar r7 = shl8( sel8x8_7(xx), amt );
   UChar r6 = shl8( sel8x8_6(xx), amt );
   UChar r5 = shl8( sel8x8_5(xx), amt );
   UChar r4 = shl8( sel8x8_4(xx), amt );
   UChar r3 = shl8( sel8x8_3(xx), amt );
   UChar r2 = shl8( sel8x8_2(xx), amt );
   UChar r1 = shl8( sel8x8_1(xx), amt );
   UChar r0 = shl8( sel8x8_0(xx), amt );
   return mk8x8(r7, r6, r5, r4, r3, r2, r1, r0);
}
1036
/* Logical right shift of both 32-bit lanes of xx by nn.  Only the
   low five bits of nn are used, so the scalar shifts stay in
   range. */
ULong h_generic_calc_ShrN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   const UInt amt = nn & 31;
   UInt hi = shr32( sel32x2_1(xx), amt );
   UInt lo = shr32( sel32x2_0(xx), amt );
   return mk32x2(hi, lo);
}
1046
/* Logical right shift of all four 16-bit lanes of xx by nn.  Only
   the low four bits of nn are used, so the scalar shifts stay in
   range. */
ULong h_generic_calc_ShrN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   const UInt amt = nn & 15;
   UShort r3 = shr16( sel16x4_3(xx), amt );
   UShort r2 = shr16( sel16x4_2(xx), amt );
   UShort r1 = shr16( sel16x4_1(xx), amt );
   UShort r0 = shr16( sel16x4_0(xx), amt );
   return mk16x4(r3, r2, r1, r0);
}
1058
/* Arithmetic right shift of both 32-bit lanes of xx by nn.  Only
   the low five bits of nn are used, so the scalar shifts stay in
   range. */
ULong h_generic_calc_SarN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   const UInt amt = nn & 31;
   UInt hi = sar32( sel32x2_1(xx), amt );
   UInt lo = sar32( sel32x2_0(xx), amt );
   return mk32x2(hi, lo);
}
1068
/* Arithmetic right shift of all four 16-bit lanes of xx by nn.
   Only the low four bits of nn are used, so the scalar shifts stay
   in range. */
ULong h_generic_calc_SarN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   const UInt amt = nn & 15;
   UShort r3 = sar16( sel16x4_3(xx), amt );
   UShort r2 = sar16( sel16x4_2(xx), amt );
   UShort r1 = sar16( sel16x4_1(xx), amt );
   UShort r0 = sar16( sel16x4_0(xx), amt );
   return mk16x4(r3, r2, r1, r0);
}
1080
/* Arithmetic right shift of all eight byte lanes of xx by nn.  Only
   the low three bits of nn are used, so the scalar shifts stay in
   range. */
ULong h_generic_calc_SarN8x8 ( ULong xx, UInt nn )
{
   /* vassert(nn < 8); */
   const UInt amt = nn & 7;
   UChar r7 = sar8( sel8x8_7(xx), amt );
   UChar r6 = sar8( sel8x8_6(xx), amt );
   UChar r5 = sar8( sel8x8_5(xx), amt );
   UChar r4 = sar8( sel8x8_4(xx), amt );
   UChar r3 = sar8( sel8x8_3(xx), amt );
   UChar r2 = sar8( sel8x8_2(xx), amt );
   UChar r1 = sar8( sel8x8_1(xx), amt );
   UChar r0 = sar8( sel8x8_0(xx), amt );
   return mk8x8(r7, r6, r5, r4, r3, r2, r1, r0);
}
1096
1097 /* ------------ Averaging ------------ */
1098
/* Lane-wise unsigned rounding average of two 8x8 vectors, using
   avg8U on each lane. */
ULong h_generic_calc_Avg8Ux8 ( ULong xx, ULong yy )
{
   UChar r7 = avg8U( sel8x8_7(xx), sel8x8_7(yy) );
   UChar r6 = avg8U( sel8x8_6(xx), sel8x8_6(yy) );
   UChar r5 = avg8U( sel8x8_5(xx), sel8x8_5(yy) );
   UChar r4 = avg8U( sel8x8_4(xx), sel8x8_4(yy) );
   UChar r3 = avg8U( sel8x8_3(xx), sel8x8_3(yy) );
   UChar r2 = avg8U( sel8x8_2(xx), sel8x8_2(yy) );
   UChar r1 = avg8U( sel8x8_1(xx), sel8x8_1(yy) );
   UChar r0 = avg8U( sel8x8_0(xx), sel8x8_0(yy) );
   return mk8x8(r7, r6, r5, r4, r3, r2, r1, r0);
}
1112
/* Lane-wise unsigned rounding average of two 16x4 vectors, using
   avg16U on each lane. */
ULong h_generic_calc_Avg16Ux4 ( ULong xx, ULong yy )
{
   UShort r3 = avg16U( sel16x4_3(xx), sel16x4_3(yy) );
   UShort r2 = avg16U( sel16x4_2(xx), sel16x4_2(yy) );
   UShort r1 = avg16U( sel16x4_1(xx), sel16x4_1(yy) );
   UShort r0 = avg16U( sel16x4_0(xx), sel16x4_0(yy) );
   return mk16x4(r3, r2, r1, r0);
}
1122
1123 /* ------------ max/min ------------ */
1124
/* Combine each pair of corresponding 16-bit lanes of xx and yy with
   max16S, and repack the four results with mk16x4. */
ULong h_generic_calc_Max16Sx4 ( ULong xx, ULong yy )
{
#  define MAX16S_LANE(ix)  max16S( sel16x4_##ix(xx), sel16x4_##ix(yy) )
   return mk16x4( MAX16S_LANE(3), MAX16S_LANE(2),
                  MAX16S_LANE(1), MAX16S_LANE(0) );
#  undef MAX16S_LANE
}
1134
/* Combine each pair of corresponding 8-bit lanes of xx and yy with
   max8U, and repack the eight results with mk8x8. */
ULong h_generic_calc_Max8Ux8 ( ULong xx, ULong yy )
{
#  define MAX8U_LANE(ix)  max8U( sel8x8_##ix(xx), sel8x8_##ix(yy) )
   return mk8x8( MAX8U_LANE(7), MAX8U_LANE(6), MAX8U_LANE(5), MAX8U_LANE(4),
                 MAX8U_LANE(3), MAX8U_LANE(2), MAX8U_LANE(1), MAX8U_LANE(0) );
#  undef MAX8U_LANE
}
1148
/* Combine each pair of corresponding 16-bit lanes of xx and yy with
   min16S, and repack the four results with mk16x4. */
ULong h_generic_calc_Min16Sx4 ( ULong xx, ULong yy )
{
#  define MIN16S_LANE(ix)  min16S( sel16x4_##ix(xx), sel16x4_##ix(yy) )
   return mk16x4( MIN16S_LANE(3), MIN16S_LANE(2),
                  MIN16S_LANE(1), MIN16S_LANE(0) );
#  undef MIN16S_LANE
}
1158
/* Combine each pair of corresponding 8-bit lanes of xx and yy with
   min8U, and repack the eight results with mk8x8. */
ULong h_generic_calc_Min8Ux8 ( ULong xx, ULong yy )
{
#  define MIN8U_LANE(ix)  min8U( sel8x8_##ix(xx), sel8x8_##ix(yy) )
   return mk8x8( MIN8U_LANE(7), MIN8U_LANE(6), MIN8U_LANE(5), MIN8U_LANE(4),
                 MIN8U_LANE(3), MIN8U_LANE(2), MIN8U_LANE(1), MIN8U_LANE(0) );
#  undef MIN8U_LANE
}
1172
h_generic_calc_GetMSBs8x8(ULong xx)1173 UInt h_generic_calc_GetMSBs8x8 ( ULong xx )
1174 {
1175 UInt r = 0;
1176 if (xx & (1ULL << (64-1))) r |= (1<<7);
1177 if (xx & (1ULL << (56-1))) r |= (1<<6);
1178 if (xx & (1ULL << (48-1))) r |= (1<<5);
1179 if (xx & (1ULL << (40-1))) r |= (1<<4);
1180 if (xx & (1ULL << (32-1))) r |= (1<<3);
1181 if (xx & (1ULL << (24-1))) r |= (1<<2);
1182 if (xx & (1ULL << (16-1))) r |= (1<<1);
1183 if (xx & (1ULL << ( 8-1))) r |= (1<<0);
1184 return r;
1185 }
1186
1187 /* ------------ SOME 32-bit SIMD HELPERS TOO ------------ */
1188
1189 /* Tuple/select functions for 16x2 vectors. */
mk16x2(UShort w1,UShort w2)1190 static inline UInt mk16x2 ( UShort w1, UShort w2 ) {
1191 return (((UInt)w1) << 16) | ((UInt)w2);
1192 }
1193
sel16x2_1(UInt w32)1194 static inline UShort sel16x2_1 ( UInt w32 ) {
1195 return 0xFFFF & (UShort)(w32 >> 16);
1196 }
sel16x2_0(UInt w32)1197 static inline UShort sel16x2_0 ( UInt w32 ) {
1198 return 0xFFFF & (UShort)(w32);
1199 }
1200
mk8x4(UChar w3,UChar w2,UChar w1,UChar w0)1201 static inline UInt mk8x4 ( UChar w3, UChar w2,
1202 UChar w1, UChar w0 ) {
1203 UInt w32 = (((UInt)w3) << 24) | (((UInt)w2) << 16)
1204 | (((UInt)w1) << 8) | (((UInt)w0) << 0);
1205 return w32;
1206 }
1207
sel8x4_3(UInt w32)1208 static inline UChar sel8x4_3 ( UInt w32 ) {
1209 return toUChar(0xFF & (w32 >> 24));
1210 }
sel8x4_2(UInt w32)1211 static inline UChar sel8x4_2 ( UInt w32 ) {
1212 return toUChar(0xFF & (w32 >> 16));
1213 }
sel8x4_1(UInt w32)1214 static inline UChar sel8x4_1 ( UInt w32 ) {
1215 return toUChar(0xFF & (w32 >> 8));
1216 }
sel8x4_0(UInt w32)1217 static inline UChar sel8x4_0 ( UInt w32 ) {
1218 return toUChar(0xFF & (w32 >> 0));
1219 }
1220
1221
1222 /* ----------------------------------------------------- */
1223 /* More externally visible functions. These simply
1224 implement the corresponding IR primops. */
1225 /* ----------------------------------------------------- */
1226
1227 /* ------ 16x2 ------ */
1228
/* Add corresponding 16-bit lanes of xx and yy.  mk16x2 takes UShort
   arguments, so each sum is reduced to its low 16 bits and no carry
   crosses from the low lane into the high lane. */
UInt h_generic_calc_Add16x2 ( UInt xx, UInt yy )
{
   const UInt hi = sel16x2_1(xx) + sel16x2_1(yy);
   const UInt lo = sel16x2_0(xx) + sel16x2_0(yy);
   return mk16x2( hi, lo );
}
1234
/* Subtract corresponding 16-bit lanes of yy from those of xx.  mk16x2
   takes UShort arguments, so each difference is reduced to its low 16
   bits and no borrow crosses between lanes. */
UInt h_generic_calc_Sub16x2 ( UInt xx, UInt yy )
{
   const UInt hi = sel16x2_1(xx) - sel16x2_1(yy);
   const UInt lo = sel16x2_0(xx) - sel16x2_0(yy);
   return mk16x2( hi, lo );
}
1240
/* Combine corresponding 16-bit lanes of xx and yy with hadd16U and
   repack the two results (low 16 bits of each) with mk16x2. */
UInt h_generic_calc_HAdd16Ux2 ( UInt xx, UInt yy )
{
   const UInt hi = hadd16U( sel16x2_1(xx), sel16x2_1(yy) );
   const UInt lo = hadd16U( sel16x2_0(xx), sel16x2_0(yy) );
   return mk16x2( hi, lo );
}
1246
/* Combine corresponding 16-bit lanes of xx and yy with hadd16S and
   repack the two results (low 16 bits of each) with mk16x2. */
UInt h_generic_calc_HAdd16Sx2 ( UInt xx, UInt yy )
{
   const UInt hi = hadd16S( sel16x2_1(xx), sel16x2_1(yy) );
   const UInt lo = hadd16S( sel16x2_0(xx), sel16x2_0(yy) );
   return mk16x2( hi, lo );
}
1252
/* Combine corresponding 16-bit lanes of xx and yy with hsub16U and
   repack the two results (low 16 bits of each) with mk16x2. */
UInt h_generic_calc_HSub16Ux2 ( UInt xx, UInt yy )
{
   const UInt hi = hsub16U( sel16x2_1(xx), sel16x2_1(yy) );
   const UInt lo = hsub16U( sel16x2_0(xx), sel16x2_0(yy) );
   return mk16x2( hi, lo );
}
1258
/* Combine corresponding 16-bit lanes of xx and yy with hsub16S and
   repack the two results (low 16 bits of each) with mk16x2. */
UInt h_generic_calc_HSub16Sx2 ( UInt xx, UInt yy )
{
   const UInt hi = hsub16S( sel16x2_1(xx), sel16x2_1(yy) );
   const UInt lo = hsub16S( sel16x2_0(xx), sel16x2_0(yy) );
   return mk16x2( hi, lo );
}
1264
/* Combine corresponding 16-bit lanes of xx and yy with qadd16U and
   repack the two results (low 16 bits of each) with mk16x2. */
UInt h_generic_calc_QAdd16Ux2 ( UInt xx, UInt yy )
{
   const UInt hi = qadd16U( sel16x2_1(xx), sel16x2_1(yy) );
   const UInt lo = qadd16U( sel16x2_0(xx), sel16x2_0(yy) );
   return mk16x2( hi, lo );
}
1270
/* Combine corresponding 16-bit lanes of xx and yy with qadd16S and
   repack the two results (low 16 bits of each) with mk16x2. */
UInt h_generic_calc_QAdd16Sx2 ( UInt xx, UInt yy )
{
   const UInt hi = qadd16S( sel16x2_1(xx), sel16x2_1(yy) );
   const UInt lo = qadd16S( sel16x2_0(xx), sel16x2_0(yy) );
   return mk16x2( hi, lo );
}
1276
/* Combine corresponding 16-bit lanes of xx and yy with qsub16U and
   repack the two results (low 16 bits of each) with mk16x2. */
UInt h_generic_calc_QSub16Ux2 ( UInt xx, UInt yy )
{
   const UInt hi = qsub16U( sel16x2_1(xx), sel16x2_1(yy) );
   const UInt lo = qsub16U( sel16x2_0(xx), sel16x2_0(yy) );
   return mk16x2( hi, lo );
}
1282
/* Combine corresponding 16-bit lanes of xx and yy with qsub16S and
   repack the two results (low 16 bits of each) with mk16x2. */
UInt h_generic_calc_QSub16Sx2 ( UInt xx, UInt yy )
{
   const UInt hi = qsub16S( sel16x2_1(xx), sel16x2_1(yy) );
   const UInt lo = qsub16S( sel16x2_0(xx), sel16x2_0(yy) );
   return mk16x2( hi, lo );
}
1288
1289 /* ------ 8x4 ------ */
1290
/* Add corresponding bytes of xx and yy.  mk8x4 takes UChar arguments,
   so each sum is reduced to its low 8 bits and no carry crosses
   between lanes. */
UInt h_generic_calc_Add8x4 ( UInt xx, UInt yy )
{
#  define ADD8_LANE(ix)  (sel8x4_##ix(xx) + sel8x4_##ix(yy))
   return mk8x4( ADD8_LANE(3), ADD8_LANE(2), ADD8_LANE(1), ADD8_LANE(0) );
#  undef ADD8_LANE
}
1300
/* Subtract corresponding bytes of yy from those of xx.  mk8x4 takes
   UChar arguments, so each difference is reduced to its low 8 bits
   and no borrow crosses between lanes. */
UInt h_generic_calc_Sub8x4 ( UInt xx, UInt yy )
{
#  define SUB8_LANE(ix)  (sel8x4_##ix(xx) - sel8x4_##ix(yy))
   return mk8x4( SUB8_LANE(3), SUB8_LANE(2), SUB8_LANE(1), SUB8_LANE(0) );
#  undef SUB8_LANE
}
1310
/* Combine corresponding bytes of xx and yy with hadd8U and repack the
   four results with mk8x4. */
UInt h_generic_calc_HAdd8Ux4 ( UInt xx, UInt yy )
{
#  define HADD8U_LANE(ix)  hadd8U( sel8x4_##ix(xx), sel8x4_##ix(yy) )
   return mk8x4( HADD8U_LANE(3), HADD8U_LANE(2),
                 HADD8U_LANE(1), HADD8U_LANE(0) );
#  undef HADD8U_LANE
}
1320
/* Combine corresponding bytes of xx and yy with hadd8S and repack the
   four results with mk8x4. */
UInt h_generic_calc_HAdd8Sx4 ( UInt xx, UInt yy )
{
#  define HADD8S_LANE(ix)  hadd8S( sel8x4_##ix(xx), sel8x4_##ix(yy) )
   return mk8x4( HADD8S_LANE(3), HADD8S_LANE(2),
                 HADD8S_LANE(1), HADD8S_LANE(0) );
#  undef HADD8S_LANE
}
1330
/* Combine corresponding bytes of xx and yy with hsub8U and repack the
   four results with mk8x4. */
UInt h_generic_calc_HSub8Ux4 ( UInt xx, UInt yy )
{
#  define HSUB8U_LANE(ix)  hsub8U( sel8x4_##ix(xx), sel8x4_##ix(yy) )
   return mk8x4( HSUB8U_LANE(3), HSUB8U_LANE(2),
                 HSUB8U_LANE(1), HSUB8U_LANE(0) );
#  undef HSUB8U_LANE
}
1340
/* Combine corresponding bytes of xx and yy with hsub8S and repack the
   four results with mk8x4. */
UInt h_generic_calc_HSub8Sx4 ( UInt xx, UInt yy )
{
#  define HSUB8S_LANE(ix)  hsub8S( sel8x4_##ix(xx), sel8x4_##ix(yy) )
   return mk8x4( HSUB8S_LANE(3), HSUB8S_LANE(2),
                 HSUB8S_LANE(1), HSUB8S_LANE(0) );
#  undef HSUB8S_LANE
}
1350
/* Combine corresponding bytes of xx and yy with qadd8U and repack the
   four results with mk8x4. */
UInt h_generic_calc_QAdd8Ux4 ( UInt xx, UInt yy )
{
#  define QADD8U_LANE(ix)  qadd8U( sel8x4_##ix(xx), sel8x4_##ix(yy) )
   return mk8x4( QADD8U_LANE(3), QADD8U_LANE(2),
                 QADD8U_LANE(1), QADD8U_LANE(0) );
#  undef QADD8U_LANE
}
1360
/* Combine corresponding bytes of xx and yy with qadd8S and repack the
   four results with mk8x4. */
UInt h_generic_calc_QAdd8Sx4 ( UInt xx, UInt yy )
{
#  define QADD8S_LANE(ix)  qadd8S( sel8x4_##ix(xx), sel8x4_##ix(yy) )
   return mk8x4( QADD8S_LANE(3), QADD8S_LANE(2),
                 QADD8S_LANE(1), QADD8S_LANE(0) );
#  undef QADD8S_LANE
}
1370
/* Combine corresponding bytes of xx and yy with qsub8U and repack the
   four results with mk8x4. */
UInt h_generic_calc_QSub8Ux4 ( UInt xx, UInt yy )
{
#  define QSUB8U_LANE(ix)  qsub8U( sel8x4_##ix(xx), sel8x4_##ix(yy) )
   return mk8x4( QSUB8U_LANE(3), QSUB8U_LANE(2),
                 QSUB8U_LANE(1), QSUB8U_LANE(0) );
#  undef QSUB8U_LANE
}
1380
/* Combine corresponding bytes of xx and yy with qsub8S and repack the
   four results with mk8x4. */
UInt h_generic_calc_QSub8Sx4 ( UInt xx, UInt yy )
{
#  define QSUB8S_LANE(ix)  qsub8S( sel8x4_##ix(xx), sel8x4_##ix(yy) )
   return mk8x4( QSUB8S_LANE(3), QSUB8S_LANE(2),
                 QSUB8S_LANE(1), QSUB8S_LANE(0) );
#  undef QSUB8S_LANE
}
1390
/* Apply cmpnez16 to each 16-bit lane of xx and repack the two results
   (low 16 bits of each) with mk16x2. */
UInt h_generic_calc_CmpNEZ16x2 ( UInt xx )
{
   const UInt hi = cmpnez16( sel16x2_1(xx) );
   const UInt lo = cmpnez16( sel16x2_0(xx) );
   return mk16x2( hi, lo );
}
1398
/* Apply cmpnez8 to each byte of xx and repack the four results with
   mk8x4. */
UInt h_generic_calc_CmpNEZ8x4 ( UInt xx )
{
#  define CMPNEZ8_LANE(ix)  cmpnez8( sel8x4_##ix(xx) )
   return mk8x4( CMPNEZ8_LANE(3), CMPNEZ8_LANE(2),
                 CMPNEZ8_LANE(1), CMPNEZ8_LANE(0) );
#  undef CMPNEZ8_LANE
}
1408
/* Sum, over the four byte lanes, of absdiff8U applied to the
   corresponding bytes of xx and yy.  The result is a single scalar,
   not a packed vector. */
UInt h_generic_calc_Sad8Ux4 ( UInt xx, UInt yy )
{
#  define ABSDIFF8U_LANE(ix)  absdiff8U( sel8x4_##ix(xx), sel8x4_##ix(yy) )
   return ABSDIFF8U_LANE(3) + ABSDIFF8U_LANE(2)
          + ABSDIFF8U_LANE(1) + ABSDIFF8U_LANE(0);
#  undef ABSDIFF8U_LANE
}
1416
/* Externally visible wrapper: forwards both operands to qadd32S and
   returns its result as a UInt. */
UInt h_generic_calc_QAdd32S ( UInt xx, UInt yy )
{
   const UInt res = qadd32S( xx, yy );
   return res;
}
1421
/* Externally visible wrapper: forwards both operands to qsub32S and
   returns its result as a UInt. */
UInt h_generic_calc_QSub32S ( UInt xx, UInt yy )
{
   const UInt res = qsub32S( xx, yy );
   return res;
}
1426
1427
1428 /*------------------------------------------------------------------*/
1429 /* Decimal Floating Point (DFP) externally visible helper functions */
1430 /* that implement Iop_BCDtoDPB and Iop_DPBtoBCD */
1431 /*------------------------------------------------------------------*/
1432
/* Single-bit helpers for the DPD <-> BCD conversion equations.
     NOT(x)    -- 1 if x is zero, otherwise 0.
     GET(x, y) -- bit number y of x, as the value 0 or 1.  The mask is
                  built from an unsigned long constant, so y must be
                  smaller than the width of unsigned long.
     PUT(x, y) -- x shifted left by y bit positions. */
#define NOT( x ) ( ( ( x ) == 0) ? 1 : 0)
#define GET( x, y ) ( ( ( x ) & ( 0x1UL << ( y ) ) ) >> ( y ) )
#define PUT( x, y ) ( ( x )<< ( y ) )
1436
/* Convert one 10-bit densely packed group, held in bits 9..0 of
   chunk, into a 12-bit BCD value (three 4-bit digits).  Higher bits
   of chunk are ignored.  The input bits 9..0 are named
   p,q,r,s,t,u,v,w,x,y and the output bits 11..0 are named
   a,b,c,d,e,f,g,h,i,j,k,m.  Every one of these variables holds a
   single bit, i.e. the value 0 or 1. */
static ULong dpb_to_bcd( ULong chunk )
{
   Short a, b, c, d, e, f, g, h, i, j, k, m;
   Short p, q, r, s, t, u, v, w, x, y;
   ULong value;

   /* convert 10 bit densely packed BCD to BCD */
   /* Split the input into its ten individual bits, p being bit 9. */
   p = GET( chunk, 9 );
   q = GET( chunk, 8 );
   r = GET( chunk, 7 );
   s = GET( chunk, 6 );
   t = GET( chunk, 5 );
   u = GET( chunk, 4 );
   v = GET( chunk, 3 );
   w = GET( chunk, 2 );
   x = GET( chunk, 1 );
   y = GET( chunk, 0 );

   /* The BCD bit values are given by the following boolean equations.
      Note that d, h and m are straight copies of r, u and y. */
   a = ( NOT(s) & v & w ) | ( t & v & w & s ) | ( v & w & NOT(x) );
   b = ( p & s & x & NOT(t) ) | ( p & NOT(w) ) | ( p & NOT(v) );
   c = ( q & s & x & NOT(t) ) | ( q & NOT(w) ) | ( q & NOT(v) );
   d = r;
   e = ( v & NOT(w) & x ) | ( s & v & w & x ) | ( NOT(t) & v & x & w );
   f = ( p & t & v & w & x & NOT(s) ) | ( s & NOT(x) & v ) | ( s & NOT(v) );
   g = ( q & t & w & v & x & NOT(s) ) | ( t & NOT(x) & v ) | ( t & NOT(v) );
   h = u;
   i = ( t & v & w & x ) | ( s & v & w & x ) | ( v & NOT(w) & NOT(x) );
   j = ( p & NOT(s) & NOT(t) & w & v ) | ( s & v & NOT(w) & x )
      | ( p & w & NOT(x) & v ) | ( w & NOT(v) );
   k = ( q & NOT(s) & NOT(t) & v & w ) | ( t & v & NOT(w) & x )
      | ( q & v & w & NOT(x) ) | ( x & NOT(v) );
   m = y;

   /* Reassemble the twelve output bits, a being bit 11 and m bit 0. */
   value = PUT(a, 11) | PUT(b, 10) | PUT(c, 9) | PUT(d, 8) | PUT(e, 7)
         | PUT(f, 6) | PUT(g, 5) | PUT(h, 4) | PUT(i, 3) | PUT(j, 2)
         | PUT(k, 1) | PUT(m, 0);
   return value;
}
1476
/* Convert a 12-bit BCD value (three 4-bit digits), held in bits 11..0
   of chunk, into a 10-bit densely packed group.  Higher bits of chunk
   are ignored.  Every single-letter variable holds one bit, i.e. the
   value 0 or 1. */
static ULong bcd_to_dpb( ULong chunk )
{
   Short a, b, c, d, e, f, g, h, i, j, k, m;
   Short p, q, r, s, t, u, v, w, x, y;
   ULong value;
   /* Convert a 3 digit BCD value to a 10 bit Densely Packed Decimal (DPD)
      value.  The boolean equations to calculate the value of each of the
      DPD bits are given in Appendix B of Book 1: Power ISA User
      Instruction set.  The bits for the BCD value are [abcdefghijkm]
      (input bits 11..0).  The bits for the DPD number are [pqrstuvwxy]
      (output bits 9..0).  The boolean logic equations in pseudo C code
      are:
   */
   a = GET( chunk, 11 );
   b = GET( chunk, 10 );
   c = GET( chunk, 9 );
   d = GET( chunk, 8 );
   e = GET( chunk, 7 );
   f = GET( chunk, 6 );
   g = GET( chunk, 5 );
   h = GET( chunk, 4 );
   i = GET( chunk, 3 );
   j = GET( chunk, 2 );
   k = GET( chunk, 1 );
   m = GET( chunk, 0 );

   /* Note that r, u and y are straight copies of d, h and m. */
   p = ( f & a & i & NOT(e) ) | ( j & a & NOT(i) ) | ( b & NOT(a) );
   q = ( g & a & i & NOT(e) ) | ( k & a & NOT(i) ) | ( c & NOT(a) );
   r = d;
   s = ( j & NOT(a) & e & NOT(i) ) | ( f & NOT(i) & NOT(e) )
      | ( f & NOT(a) & NOT(e) ) | ( e & i );
   t = ( k & NOT(a) & e & NOT(i) ) | ( g & NOT(i) & NOT(e) )
      | ( g & NOT(a) & NOT(e) ) | ( a & i );
   u = h;
   v = a | e | i;
   w = ( NOT(e) & j & NOT(i) ) | ( e & i ) | a;
   x = ( NOT(a) & k & NOT(i) ) | ( a & i ) | e;
   y = m;

   /* Reassemble the ten output bits, p being bit 9 and y bit 0. */
   value = PUT(p, 9) | PUT(q, 8) | PUT(r, 7) | PUT(s, 6) | PUT(t, 5)
         | PUT(u, 4) | PUT(v, 3) | PUT(w, 2) | PUT(x, 1) | y;

   return value;
}
1519
/* Convert the low 50 bits of dpb, viewed as five 10-bit densely
   packed groups, into five 12-bit BCD groups (60 bits in total).
   Groups are translated one at a time by dpb_to_bcd and keep their
   relative order: the group in bits 49:40 of dpb ends up in bits
   59:48 of the result, the group in bits 9:0 in bits 11:0. */
ULong h_calc_DPBtoBCD( ULong dpb )
{
   ULong result = 0;
   Int   shift;

   /* Walk the groups from most significant to least significant. */
   for (shift = 40; shift >= 0; shift -= 10) {
      const ULong group = (dpb >> shift) & 0x3FF;
      result = (result << 12) | dpb_to_bcd( group );
   }
   return result;
}
1534
/* Convert the low 60 bits of bcd, viewed as five 12-bit BCD groups,
   into five 10-bit densely packed groups (50 bits in total).  Groups
   are translated one at a time by bcd_to_dpb and keep their relative
   order: the group in bits 59:48 of bcd ends up in bits 49:40 of the
   result, the group in bits 11:0 in bits 9:0. */
ULong h_calc_BCDtoDPB( ULong bcd )
{
   ULong result = 0;
   Int   shift;

   /* Walk the groups from most significant to least significant. */
   for (shift = 48; shift >= 0; shift -= 12) {
      const ULong group = (bcd >> shift) & 0xFFF;
      result = (result << 10) | bcd_to_dpb( group );
   }
   return result;
}
/* The single-bit helper macros are only needed by the DFP conversion
   code; remove them so they do not leak into the rest of the file. */
#undef NOT
#undef GET
#undef PUT
1552
1553
1554 /* ----------------------------------------------------- */
1555 /* Signed and unsigned integer division, that behave like
   the ARMv7 UDIV and SDIV instructions.
1557
1558 sdiv32 also behaves like 64-bit v8 SDIV on w-regs.
1559 udiv32 also behaves like 64-bit v8 UDIV on w-regs.
1560 */
1561 /* ----------------------------------------------------- */
1562
/* Unsigned 32-bit division in which a zero divisor yields zero.
   Otherwise the quotient is the ordinary C unsigned quotient, which
   rounds towards zero as required. */
UInt h_calc_udiv32_w_arm_semantics ( UInt x, UInt y )
{
   // Division by zero --> zero; the hint marks that case as rare.
   return UNLIKELY(y == 0) ? 0 : x / y;
}
1570
/* Unsigned 64-bit division in which a zero divisor yields zero.
   Otherwise the quotient is the ordinary C unsigned quotient, which
   rounds towards zero as required. */
ULong h_calc_udiv64_w_arm_semantics ( ULong x, ULong y )
{
   // Division by zero --> zero; the hint marks that case as rare.
   return UNLIKELY(y == 0) ? 0 : x / y;
}
1578
/* Signed 32-bit division with two special cases handled before the
   C division operator is used: a zero divisor yields zero, and
   0x80000000 divided by -1 (whose true quotient is unrepresentable)
   yields 0x80000000. */
Int h_calc_sdiv32_w_arm_semantics ( Int x, Int y )
{
   /* Unsigned views of the operands, used for bit-pattern compares. */
   const UInt ux = (UInt)x;
   const UInt uy = (UInt)y;

   // Division by zero --> zero
   if (UNLIKELY(y == 0)) {
      return 0;
   }

   // The single case that produces an unrepresentable result
   if (UNLIKELY( ux == ((UInt)0x80000000) && uy == ((UInt)0xFFFFFFFF) )) {
      return (Int)(UInt)0x80000000;
   }

   // Everything else: the quotient rounded towards zero.  C89 leaves
   // the rounding direction of signed division implementation
   // defined, but gcc promises to round towards zero.  Nevertheless,
   // at startup, in main_main.c, do a check for that.
   return x / y;
}
1593
/* Signed 64-bit division with two special cases handled before the
   C division operator is used: a zero divisor yields zero, and
   0x8000000000000000 divided by -1 (whose true quotient is
   unrepresentable) yields 0x8000000000000000. */
Long h_calc_sdiv64_w_arm_semantics ( Long x, Long y )
{
   /* Unsigned views of the operands, used for bit-pattern compares. */
   const ULong ux = (ULong)x;
   const ULong uy = (ULong)y;

   // Division by zero --> zero
   if (UNLIKELY(y == 0)) {
      return 0;
   }

   // The single case that produces an unrepresentable result
   if (UNLIKELY( ux == ((ULong)0x8000000000000000ULL)
                 && uy == ((ULong)0xFFFFFFFFFFFFFFFFULL) )) {
      return (Long)(ULong)0x8000000000000000ULL;
   }

   // Everything else: the quotient rounded towards zero.  C89 leaves
   // the rounding direction of signed division implementation
   // defined, but gcc promises to round towards zero.  Nevertheless,
   // at startup, in main_main.c, do a check for that.
   return x / y;
}
1608
1609
1610 /*---------------------------------------------------------------*/
1611 /*--- end host_generic_simd64.c ---*/
1612 /*---------------------------------------------------------------*/
1613