1
2 /* A program to test SSE4.1/SSE4.2 instructions.
3 Revisions: Nov.208 - wrote this file
4 Apr.10.2010 - added PEXTR* tests
5 Apr.16.2010 - added PINS* tests
6 */
7
8 /* HOW TO COMPILE:
9 gcc -m64 -g -O -Wall -o sse4-64 sse4-64.c
10 */
11
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <assert.h>
15 #include "tests/malloc.h"
16 #include <string.h>
17
18
/* A 128-bit vector stored as 16 raw bytes. */
typedef unsigned char V128[16];

/* Short aliases for the integer types used by the tests. */
typedef unsigned           UInt;
typedef int                Int;
typedef unsigned char      UChar;
typedef unsigned long long ULong;

/* Byte-sized boolean together with its two values. */
typedef unsigned char Bool;
#define False ((Bool)0)
#define True  ((Bool)1)
28
29
30 typedef
31 struct {
32 V128 arg1;
33 V128 arg2;
34 V128 res;
35 }
36 RRArgs;
37
38 typedef
39 struct {
40 V128 arg1;
41 V128 res;
42 }
43 RMArgs;
44
/* Build a 128-bit vector out of two 64-bit words: wLo lands in bytes 0..7
   of *res and wHi in bytes 8..15. */
static void do64HLtoV128 ( /*OUT*/V128* res, ULong wHi, ULong wLo )
{
   /* Copy byte-wise with memcpy so that no strict-aliasing rule is
      broken by the type change. */
   UChar* bytes = (UChar*)res;
   memcpy(bytes,     (UChar*)&wLo, 8);
   memcpy(bytes + 8, (UChar*)&wHi, 8);
}
52
randUChar(void)53 static UChar randUChar ( void )
54 {
55 static UInt seed = 80021;
56 seed = 1103515245 * seed + 12345;
57 return (seed >> 17) & 0xFF;
58 }
59
randULong(void)60 static ULong randULong ( void )
61 {
62 Int i;
63 ULong r = 0;
64 for (i = 0; i < 8; i++) {
65 r = (r << 8) | (ULong)(0xFF & randUChar());
66 }
67 return r;
68 }
69
/* Fill all 16 bytes of *v with randUChar() values, byte 0 first. */
static void randV128 ( V128* v )
{
   UChar* cursor = &(*v)[0];
   UChar* limit  = cursor + 16;
   while (cursor != limit) {
      *cursor = randUChar();
      cursor++;
   }
}
76
/* Print *v as 32 hex digits, starting with byte 15 and ending with
   byte 0. */
static void showV128 ( V128* v )
{
   Int idx = 16;
   while (idx-- > 0) {
      printf("%02x", (Int)(*v)[idx]);
   }
}
83
/* Print *v like showV128 does, but AND each byte with the matching byte
   of *mask first, so that bits cleared in the mask always print as 0. */
static void showMaskedV128 ( V128* v, V128* mask )
{
   Int idx = 16;
   while (idx-- > 0) {
      printf("%02x", (Int)( ((*v)[idx]) & ((*mask)[idx]) ));
   }
}
90
/* Print one result line for an "immediate, general register, vector,
   vector" test: the rOrM tag, the opcode name, the immediate, the 64-bit
   scalar source, the vector before the operation and the vector after. */
static void showIGVV( char* rOrM, char* op, Int imm,
                      ULong src64, V128* dst, V128* res )
{
   printf("%s %10s $%d ", rOrM, op, imm);
   printf("%016llx", src64);
   fputs(" ", stdout);
   showV128(dst);
   fputs(" ", stdout);
   showV128(res);
   fputs("\n", stdout);
}
102
/* Print one result line for a test that extracts a scalar from a vector:
   the rOrM tag, the opcode name, the immediate, the vector operand, and
   two 64-bit scalars (argR, then res). */
static void showIAG ( char* rOrM, char* op, Int imm,
                      V128* argL, ULong argR, ULong res )
{
   printf("%s %10s $%d ", rOrM, op, imm);
   showV128(argL);
   fputs(" ", stdout);
   printf("%016llx", argR);
   fputs(" ", stdout);
   printf("%016llx", res);
   fputs("\n", stdout);
}
114
/* Print one result line for a vector,vector test that takes an
   immediate: tag, opcode name, immediate, both arguments in full, and the
   result filtered through rmask. */
static void showIAA ( char* rOrM, char* op, Int imm, RRArgs* rra, V128* rmask )
{
   V128* first  = &rra->arg1;
   V128* second = &rra->arg2;

   printf("%s %10s $%d ", rOrM, op, imm);
   showV128(first);
   fputs(" ", stdout);
   showV128(second);
   fputs(" ", stdout);
   showMaskedV128(&rra->res, rmask);
   fputs("\n", stdout);
}
125
/* Print one result line for a vector,vector test without an immediate:
   tag, opcode name, both arguments in full, and the result filtered
   through rmask. */
static void showAA ( char* rOrM, char* op, RRArgs* rra, V128* rmask )
{
   V128* first  = &rra->arg1;
   V128* second = &rra->arg2;

   printf("%s %10s ", rOrM, op);
   showV128(first);
   fputs(" ", stdout);
   showV128(second);
   fputs(" ", stdout);
   showMaskedV128(&rra->res, rmask);
   fputs("\n", stdout);
}
136
137 /* Note: these are little endian. Hence first byte is the least
138 significant byte of lane zero. */
139
140 /* Mask for insns where all result bits are non-approximated. */
141 static V128 AllMask = { 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
142 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF };
143
144 /* Mark for insns which produce approximated vector short results. */
145 __attribute__((unused))
146 static V128 ApproxPS = { 0x00,0x00,0x80,0xFF, 0x00,0x00,0x80,0xFF,
147 0x00,0x00,0x80,0xFF, 0x00,0x00,0x80,0xFF };
148
149 /* Mark for insns which produce approximated scalar short results. */
150 __attribute__((unused))
151 static V128 ApproxSS = { 0x00,0x00,0x80,0xFF, 0xFF,0xFF,0xFF,0xFF,
152 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF };
153
154 static V128 fives = { 0x55,0x55,0x55,0x55, 0x55,0x55,0x55,0x55,
155 0x55,0x55,0x55,0x55, 0x55,0x55,0x55,0x55 };
156
157 static V128 zeroes = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,
158 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 };
159
/* Positive infinity, obtained by dividing a non-zero value by zero. */
double mkPosInf ( void )
{
   return 1.0 / 0.0;
}

/* Negative infinity: the negation of mkPosInf(). */
double mkNegInf ( void )
{
   double inf = mkPosInf();
   return -inf;
}

/* A NaN, obtained by dividing zero by zero. */
double mkPosNan ( void )
{
   return 0.0 / 0.0;
}

/* The negation of mkPosNan(). */
double mkNegNan ( void )
{
   double nan = mkPosNan();
   return -nan;
}
164
165 __attribute__((noinline))
get_mxcsr(void)166 UInt get_mxcsr ( void )
167 {
168 ULong w64;
169 __asm__ __volatile__(
170 "subq $8, %%rsp" "\n\t"
171 "stmxcsr (%%rsp)" "\n\t"
172 "movq (%%rsp), %0" "\n"
173 "addq $8, %%rsp"
174 : /*OUT*/"=r"(w64) : /*IN*/ : "memory","cc"
175 );
176 if (0) printf("get %08x\n", (UInt)w64);
177 return (UInt)w64;
178 }
179
180 __attribute__((noinline))
set_mxcsr(UInt w32)181 void set_mxcsr ( UInt w32 )
182 {
183 if (0) printf("set %08x\n", w32);
184 ULong w64 = (ULong)w32;
185 __asm__ __volatile__(
186 "subq $8, %%rsp" "\n\t"
187 "movq %0, (%%rsp)" "\n\t"
188 "ldmxcsr (%%rsp)" "\n\t"
189 "addq $8, %%rsp"
190 : /*OUT*/ : /*IN*/"r"(w64) : "memory",/*"mxcsr",*/"cc"
191 );
192 }
193
get_sse_roundingmode(void)194 UInt get_sse_roundingmode ( void )
195 {
196 UInt w = get_mxcsr();
197 return (w >> 13) & 3;
198 }
199
/* Replace bits 13 and 14 of MXCSR with m, leaving every other bit as it
   was.  m must be in the range 0..3; anything else trips the assert. */
void set_sse_roundingmode ( UInt m )
{
   UInt csr;
   assert(0 == (m & ~3));
   csr = get_mxcsr();
   csr = (csr & ~(3 << 13)) | (m << 13);
   set_mxcsr(csr);
}
209
210
/* Test the register-source form of a vector insn that takes an
   immediate.  _src is loaded into %xmm2 and _dst into %xmm11, then
   "_opname $_imm, %xmm2, %xmm11" is executed and %xmm11 is stored to a
   temporary.  Both inputs and the result are printed via showIAA with
   the tag "r" and AllMask.
   _opname must be a string literal and _imm a literal token, because
   both are pasted into the asm template text.  The asm writes _tmp
   through a pointer passed as an input operand; the "memory" clobber is
   what tells the compiler about that store. */
#define DO_imm_r_r(_opname, _imm, _src, _dst) \
   { \
      V128 _tmp; \
      __asm__ __volatile__( \
         "movupd (%0), %%xmm2" "\n\t" \
         "movupd (%1), %%xmm11" "\n\t" \
         _opname " $" #_imm ", %%xmm2, %%xmm11" "\n\t" \
         "movupd %%xmm11, (%2)" "\n" \
         : /*out*/ : /*in*/ "r"(&(_src)), "r"(&(_dst)), "r"(&(_tmp)) \
         : "cc", "memory", "xmm2", "xmm11" \
      ); \
      RRArgs rra; \
      memcpy(&rra.arg1, &(_src), sizeof(V128)); \
      memcpy(&rra.arg2, &(_dst), sizeof(V128)); \
      memcpy(&rra.res, &(_tmp), sizeof(V128)); \
      showIAA("r", (_opname), (_imm), &rra, &AllMask); \
   }
228
/* Test the memory-source form of a vector insn that takes an immediate.
   _src is copied into a buffer obtained from memalign16() (declared
   outside this file; presumably 16-byte aligned -- confirm in
   tests/malloc.h), _dst is loaded into %xmm11, and
   "_opname $_imm, (buffer), %xmm11" is executed.  Both inputs and the
   result are printed via showIAA with the tag "m" and AllMask, and the
   buffer is freed.
   _opname must be a string literal and _imm a literal token, because
   both are pasted into the asm template text. */
#define DO_imm_m_r(_opname, _imm, _src, _dst) \
   { \
      V128 _tmp; \
      V128* _srcM = memalign16(sizeof(V128)); \
      memcpy(_srcM, &(_src), sizeof(V128)); \
      __asm__ __volatile__( \
         "movupd (%1), %%xmm11" "\n\t" \
         _opname " $" #_imm ", (%0), %%xmm11" "\n\t" \
         "movupd %%xmm11, (%2)" "\n" \
         : /*out*/ : /*in*/ "r"(_srcM), "r"(&(_dst)), "r"(&(_tmp)) \
         : "cc", "memory", "xmm11" \
      ); \
      RRArgs rra; \
      memcpy(&rra.arg1, &(_src), sizeof(V128)); \
      memcpy(&rra.arg2, &(_dst), sizeof(V128)); \
      memcpy(&rra.res, &(_tmp), sizeof(V128)); \
      showIAA("m", (_opname), (_imm), &rra, &AllMask); \
      free(_srcM); \
   }
248
/* Run an immediate-taking vector insn twice on the same operands: first
   with a register source (DO_imm_r_r), then with a memory source
   (DO_imm_m_r). */
#define DO_imm_mandr_r(opname_, imm_, src_, dst_) \
   DO_imm_r_r( opname_, imm_, src_, dst_ ) \
   DO_imm_m_r( opname_, imm_, src_, dst_ )
252
253
254
255
256
/* Test the register-source form of a two-operand vector insn without an
   immediate.  _src is loaded into %xmm2 and _dst into %xmm11, then
   "_opname %xmm2, %xmm11" is executed and %xmm11 is stored to a
   temporary.  Both inputs and the result are printed via showAA with the
   tag "r" and AllMask.
   _opname must be a string literal: it is pasted into the asm template
   text. */
#define DO_r_r(_opname, _src, _dst) \
   { \
      V128 _tmp; \
      __asm__ __volatile__( \
         "movupd (%0), %%xmm2" "\n\t" \
         "movupd (%1), %%xmm11" "\n\t" \
         _opname " %%xmm2, %%xmm11" "\n\t" \
         "movupd %%xmm11, (%2)" "\n" \
         : /*out*/ : /*in*/ "r"(&(_src)), "r"(&(_dst)), "r"(&(_tmp)) \
         : "cc", "memory", "xmm2", "xmm11" \
      ); \
      RRArgs rra; \
      memcpy(&rra.arg1, &(_src), sizeof(V128)); \
      memcpy(&rra.arg2, &(_dst), sizeof(V128)); \
      memcpy(&rra.res, &(_tmp), sizeof(V128)); \
      showAA("r", (_opname), &rra, &AllMask); \
   }
274
/* Test the memory-source form of a two-operand vector insn without an
   immediate.  _src is copied into a buffer obtained from memalign16()
   (declared outside this file; presumably 16-byte aligned -- confirm in
   tests/malloc.h), _dst is loaded into %xmm11, and
   "_opname (buffer), %xmm11" is executed.  Both inputs and the result
   are printed via showAA with the tag "m" and AllMask, and the buffer is
   freed.
   _opname must be a string literal: it is pasted into the asm template
   text. */
#define DO_m_r(_opname, _src, _dst) \
   { \
      V128 _tmp; \
      V128* _srcM = memalign16(sizeof(V128)); \
      memcpy(_srcM, &(_src), sizeof(V128)); \
      __asm__ __volatile__( \
         "movupd (%1), %%xmm11" "\n\t" \
         _opname " (%0), %%xmm11" "\n\t" \
         "movupd %%xmm11, (%2)" "\n" \
         : /*out*/ : /*in*/ "r"(_srcM), "r"(&(_dst)), "r"(&(_tmp)) \
         : "cc", "memory", "xmm11" \
      ); \
      RRArgs rra; \
      memcpy(&rra.arg1, &(_src), sizeof(V128)); \
      memcpy(&rra.arg2, &(_dst), sizeof(V128)); \
      memcpy(&rra.res, &(_tmp), sizeof(V128)); \
      showAA("m", (_opname), &rra, &AllMask); \
      free(_srcM); \
   }
294
/* Run a two-operand vector insn twice on the same operands: first with
   a register source (DO_r_r), then with a memory source (DO_m_r). */
#define DO_mandr_r(opname_, src_, dst_) \
   DO_r_r(opname_, src_, dst_) \
   DO_m_r(opname_, src_, dst_)
298
299
300
301
/* Test a vector-to-general-register insn that takes an immediate.
   _src is loaded into %xmm2 and %r11 is preloaded with
   0x5555555555555555, then "_opname $_imm, %xmm2, %r11<_dstsuffix>" is
   executed and the whole of %r11 is stored to _scafter.  The vector, the
   preload value and the final %r11 value are printed via showIAG with
   the tag "r".
   _opname and _dstsuffix must be string literals and _imm a literal
   token: all three are pasted into the asm template text.  _dstsuffix is
   appended directly to "%r11", so it selects which width of that
   register the insn names. */
#define DO_imm_r_to_rscalar(_opname, _imm, _src, _dstsuffix) \
   { \
      ULong _scbefore = 0x5555555555555555ULL; \
      ULong _scafter = 0xAAAAAAAAAAAAAAAAULL; \
      /* This assumes that gcc won't make any of %0, %1, %2 */ \
      /* be r11. That should be ensured (cough, cough) */ \
      /* by declaring r11 to be clobbered. */ \
      __asm__ __volatile__( \
         "movupd (%0), %%xmm2" "\n\t" \
         "movq (%1), %%r11" "\n\t" \
         _opname " $" #_imm ", %%xmm2, %%r11" _dstsuffix "\n\t" \
         "movq %%r11, (%2)" "\n" \
         : /*out*/ \
         : /*in*/ "r"(&(_src)), "r"(&(_scbefore)), "r"(&(_scafter)) \
         : "cc", "memory", "xmm2", "r11" \
      ); \
      showIAG("r", (_opname), (_imm), &(_src), (_scbefore), (_scafter)); \
   }
320
/* Test a vector-to-memory scalar store insn that takes an immediate.
   _src is loaded into %xmm2 and "_opname $_imm, %xmm2, (mem)" is
   executed with mem pointing at _scafter, which starts out equal to
   _scbefore (0x5555555555555555).  Bytes the insn does not write
   therefore keep the 0x55 pattern.  The vector, the initial value and
   the final memory contents are printed via showIAG with the tag "m".
   _opname must be a string literal and _imm a literal token, because
   both are pasted into the asm template text. */
#define DO_imm_r_to_mscalar(_opname, _imm, _src) \
   { \
      ULong _scbefore = 0x5555555555555555ULL; \
      ULong _scafter = _scbefore; \
      __asm__ __volatile__( \
         "movupd (%0), %%xmm2" "\n\t" \
         _opname " $" #_imm ", %%xmm2, (%1)" "\n\t" \
         : /*out*/ \
         : /*in*/ "r"(&(_src)), "r"(&(_scafter)) \
         : "cc", "memory", "xmm2" \
      ); \
      showIAG("m", (_opname), (_imm), &(_src), (_scbefore), (_scafter)); \
   }
334
/* Run a vector-to-scalar insn twice on the same vector: first with a
   general-register destination (DO_imm_r_to_rscalar, which takes the
   register-name suffix), then with a memory destination
   (DO_imm_r_to_mscalar). */
#define DO_imm_r_to_mandrscalar(opname_, imm_, src_, dstsuffix_) \
   DO_imm_r_to_rscalar( opname_, imm_, src_, dstsuffix_ ) \
   DO_imm_r_to_mscalar( opname_, imm_, src_ )
338
339
340
341
342
343
344
345
/* Test a general-register-to-vector insn that takes an immediate.
   The destination vector starts as `fives` and is loaded into %xmm2;
   _src (converted to a 64-bit value) is loaded into %r11; then
   "_opname $_imm, %r11<_srcsuffix>, %xmm2" is executed and %xmm2 is
   stored to `res`.  The scalar, the initial vector and the resulting
   vector are printed via showIGVV with the tag "r".
   _opname and _srcsuffix must be string literals and _imm a literal
   token: all three are pasted into the asm template text.  _srcsuffix is
   appended directly to "%r11", so it selects which width of that
   register the insn names.
   The macro declares locals named dstv, res and src64, so _src must not
   be an expression that mentions those names. */
#define DO_imm_rscalar_to_r(_opname, _imm, _src, _srcsuffix) \
   { \
      V128 dstv; \
      V128 res; \
      ULong src64 = (ULong)(_src); \
      memcpy(dstv, fives, sizeof(dstv)); \
      memcpy(res, zeroes, sizeof(res)); \
      /* This assumes that gcc won't make any of %0, %1, %2 */ \
      /* be r11. That should be ensured (cough, cough) */ \
      /* by declaring r11 to be clobbered. */ \
      __asm__ __volatile__( \
         "movupd (%0), %%xmm2" "\n\t" /*dstv*/ \
         "movq (%1), %%r11" "\n\t" /*src64*/ \
         _opname " $" #_imm ", %%r11" _srcsuffix ", %%xmm2" "\n\t" \
         "movupd %%xmm2, (%2)" "\n" /*res*/ \
         : /*out*/ \
         : /*in*/ "r"(&dstv), "r"(&src64), "r"(&res) \
         : "cc", "memory", "xmm2", "r11" \
      ); \
      showIGVV("r", (_opname), (_imm), src64, &dstv, &res); \
   }
/* Test a memory-scalar-to-vector insn that takes an immediate.
   The destination vector starts as `fives` and is loaded into %xmm2;
   _src (converted to a 64-bit value) is kept in the local src64; then
   "_opname $_imm, (&src64), %xmm2" is executed and %xmm2 is stored to
   `res`.  The scalar, the initial vector and the resulting vector are
   printed via showIGVV with the tag "m".
   _opname must be a string literal and _imm a literal token, because
   both are pasted into the asm template text.
   The macro declares locals named dstv, res and src64, so _src must not
   be an expression that mentions those names. */
#define DO_imm_mscalar_to_r(_opname, _imm, _src) \
   { \
      V128 dstv; \
      V128 res; \
      ULong src64 = (ULong)(_src); \
      memcpy(dstv, fives, sizeof(dstv)); \
      memcpy(res, zeroes, sizeof(res)); \
      __asm__ __volatile__( \
         "movupd (%0), %%xmm2" "\n\t" /*dstv*/ \
         _opname " $" #_imm ", (%1), %%xmm2" "\n\t" \
         "movupd %%xmm2, (%2)" "\n" /*res*/ \
         : /*out*/ \
         : /*in*/ "r"(&dstv), "r"(&src64), "r"(&res) \
         : "cc", "memory", "xmm2" \
      ); \
      showIGVV("m", (_opname), (_imm), src64, &dstv, &res); \
   }
384
/* Run a scalar-to-vector insn twice on the same scalar: first with a
   general-register source (DO_imm_rscalar_to_r, which takes the
   register-name suffix), then with a memory source
   (DO_imm_mscalar_to_r). */
#define DO_imm_mandrscalar_to_r(opname_, imm_, src_, srcsuffix_) \
   DO_imm_rscalar_to_r( opname_, imm_, src_, srcsuffix_ ) \
   DO_imm_mscalar_to_r( opname_, imm_, src_ )
388
389
390
391
392
test_BLENDPD(void)393 void test_BLENDPD ( void )
394 {
395 V128 src, dst;
396 Int i;
397 for (i = 0; i < 10; i++) {
398 randV128(&src);
399 randV128(&dst);
400 DO_imm_mandr_r("blendpd", 0, src, dst);
401 DO_imm_mandr_r("blendpd", 1, src, dst);
402 DO_imm_mandr_r("blendpd", 2, src, dst);
403 DO_imm_mandr_r("blendpd", 3, src, dst);
404 }
405 }
406
test_BLENDPS(void)407 void test_BLENDPS ( void )
408 {
409 V128 src, dst;
410 Int i;
411 for (i = 0; i < 10; i++) {
412 randV128(&src);
413 randV128(&dst);
414 DO_imm_mandr_r("blendps", 0, src, dst);
415 DO_imm_mandr_r("blendps", 1, src, dst);
416 DO_imm_mandr_r("blendps", 2, src, dst);
417 DO_imm_mandr_r("blendps", 3, src, dst);
418 DO_imm_mandr_r("blendps", 4, src, dst);
419 DO_imm_mandr_r("blendps", 5, src, dst);
420 DO_imm_mandr_r("blendps", 6, src, dst);
421 DO_imm_mandr_r("blendps", 7, src, dst);
422 DO_imm_mandr_r("blendps", 8, src, dst);
423 DO_imm_mandr_r("blendps", 9, src, dst);
424 DO_imm_mandr_r("blendps", 10, src, dst);
425 DO_imm_mandr_r("blendps", 11, src, dst);
426 DO_imm_mandr_r("blendps", 12, src, dst);
427 DO_imm_mandr_r("blendps", 13, src, dst);
428 DO_imm_mandr_r("blendps", 14, src, dst);
429 DO_imm_mandr_r("blendps", 15, src, dst);
430 }
431 }
432
test_DPPD(void)433 void test_DPPD ( void )
434 {
435 V128 src, dst;
436 {
437 *(double*)(&src[0]) = 1.2345;
438 *(double*)(&src[8]) = -6.78910;
439 *(double*)(&dst[0]) = -11.121314;
440 *(double*)(&dst[8]) = 15.161718;
441 DO_imm_mandr_r("dppd", 0, src, dst);
442 DO_imm_mandr_r("dppd", 1, src, dst);
443 DO_imm_mandr_r("dppd", 2, src, dst);
444 DO_imm_mandr_r("dppd", 3, src, dst);
445 DO_imm_mandr_r("dppd", 4, src, dst);
446 DO_imm_mandr_r("dppd", 5, src, dst);
447 DO_imm_mandr_r("dppd", 6, src, dst);
448 DO_imm_mandr_r("dppd", 7, src, dst);
449 DO_imm_mandr_r("dppd", 8, src, dst);
450 DO_imm_mandr_r("dppd", 9, src, dst);
451 DO_imm_mandr_r("dppd", 10, src, dst);
452 DO_imm_mandr_r("dppd", 11, src, dst);
453 DO_imm_mandr_r("dppd", 12, src, dst);
454 DO_imm_mandr_r("dppd", 13, src, dst);
455 DO_imm_mandr_r("dppd", 14, src, dst);
456 DO_imm_mandr_r("dppd", 15, src, dst);
457 DO_imm_mandr_r("dppd", 16, src, dst);
458 DO_imm_mandr_r("dppd", 17, src, dst);
459 DO_imm_mandr_r("dppd", 18, src, dst);
460 DO_imm_mandr_r("dppd", 19, src, dst);
461 DO_imm_mandr_r("dppd", 20, src, dst);
462 DO_imm_mandr_r("dppd", 21, src, dst);
463 DO_imm_mandr_r("dppd", 22, src, dst);
464 DO_imm_mandr_r("dppd", 23, src, dst);
465 DO_imm_mandr_r("dppd", 24, src, dst);
466 DO_imm_mandr_r("dppd", 25, src, dst);
467 DO_imm_mandr_r("dppd", 26, src, dst);
468 DO_imm_mandr_r("dppd", 27, src, dst);
469 DO_imm_mandr_r("dppd", 28, src, dst);
470 DO_imm_mandr_r("dppd", 29, src, dst);
471 DO_imm_mandr_r("dppd", 30, src, dst);
472 DO_imm_mandr_r("dppd", 31, src, dst);
473 DO_imm_mandr_r("dppd", 32, src, dst);
474 DO_imm_mandr_r("dppd", 33, src, dst);
475 DO_imm_mandr_r("dppd", 34, src, dst);
476 DO_imm_mandr_r("dppd", 35, src, dst);
477 DO_imm_mandr_r("dppd", 36, src, dst);
478 DO_imm_mandr_r("dppd", 37, src, dst);
479 DO_imm_mandr_r("dppd", 38, src, dst);
480 DO_imm_mandr_r("dppd", 39, src, dst);
481 DO_imm_mandr_r("dppd", 40, src, dst);
482 DO_imm_mandr_r("dppd", 41, src, dst);
483 DO_imm_mandr_r("dppd", 42, src, dst);
484 DO_imm_mandr_r("dppd", 43, src, dst);
485 DO_imm_mandr_r("dppd", 44, src, dst);
486 DO_imm_mandr_r("dppd", 45, src, dst);
487 DO_imm_mandr_r("dppd", 46, src, dst);
488 DO_imm_mandr_r("dppd", 47, src, dst);
489 DO_imm_mandr_r("dppd", 48, src, dst);
490 DO_imm_mandr_r("dppd", 49, src, dst);
491 DO_imm_mandr_r("dppd", 50, src, dst);
492 DO_imm_mandr_r("dppd", 51, src, dst);
493 DO_imm_mandr_r("dppd", 52, src, dst);
494 DO_imm_mandr_r("dppd", 53, src, dst);
495 DO_imm_mandr_r("dppd", 54, src, dst);
496 DO_imm_mandr_r("dppd", 55, src, dst);
497 DO_imm_mandr_r("dppd", 56, src, dst);
498 DO_imm_mandr_r("dppd", 57, src, dst);
499 DO_imm_mandr_r("dppd", 58, src, dst);
500 DO_imm_mandr_r("dppd", 59, src, dst);
501 DO_imm_mandr_r("dppd", 60, src, dst);
502 DO_imm_mandr_r("dppd", 61, src, dst);
503 DO_imm_mandr_r("dppd", 62, src, dst);
504 DO_imm_mandr_r("dppd", 63, src, dst);
505 DO_imm_mandr_r("dppd", 64, src, dst);
506 DO_imm_mandr_r("dppd", 65, src, dst);
507 DO_imm_mandr_r("dppd", 66, src, dst);
508 DO_imm_mandr_r("dppd", 67, src, dst);
509 DO_imm_mandr_r("dppd", 68, src, dst);
510 DO_imm_mandr_r("dppd", 69, src, dst);
511 DO_imm_mandr_r("dppd", 70, src, dst);
512 DO_imm_mandr_r("dppd", 71, src, dst);
513 DO_imm_mandr_r("dppd", 72, src, dst);
514 DO_imm_mandr_r("dppd", 73, src, dst);
515 DO_imm_mandr_r("dppd", 74, src, dst);
516 DO_imm_mandr_r("dppd", 75, src, dst);
517 DO_imm_mandr_r("dppd", 76, src, dst);
518 DO_imm_mandr_r("dppd", 77, src, dst);
519 DO_imm_mandr_r("dppd", 78, src, dst);
520 DO_imm_mandr_r("dppd", 79, src, dst);
521 DO_imm_mandr_r("dppd", 80, src, dst);
522 DO_imm_mandr_r("dppd", 81, src, dst);
523 DO_imm_mandr_r("dppd", 82, src, dst);
524 DO_imm_mandr_r("dppd", 83, src, dst);
525 DO_imm_mandr_r("dppd", 84, src, dst);
526 DO_imm_mandr_r("dppd", 85, src, dst);
527 DO_imm_mandr_r("dppd", 86, src, dst);
528 DO_imm_mandr_r("dppd", 87, src, dst);
529 DO_imm_mandr_r("dppd", 88, src, dst);
530 DO_imm_mandr_r("dppd", 89, src, dst);
531 DO_imm_mandr_r("dppd", 90, src, dst);
532 DO_imm_mandr_r("dppd", 91, src, dst);
533 DO_imm_mandr_r("dppd", 92, src, dst);
534 DO_imm_mandr_r("dppd", 93, src, dst);
535 DO_imm_mandr_r("dppd", 94, src, dst);
536 DO_imm_mandr_r("dppd", 95, src, dst);
537 DO_imm_mandr_r("dppd", 96, src, dst);
538 DO_imm_mandr_r("dppd", 97, src, dst);
539 DO_imm_mandr_r("dppd", 98, src, dst);
540 DO_imm_mandr_r("dppd", 99, src, dst);
541 DO_imm_mandr_r("dppd", 100, src, dst);
542 DO_imm_mandr_r("dppd", 101, src, dst);
543 DO_imm_mandr_r("dppd", 102, src, dst);
544 DO_imm_mandr_r("dppd", 103, src, dst);
545 DO_imm_mandr_r("dppd", 104, src, dst);
546 DO_imm_mandr_r("dppd", 105, src, dst);
547 DO_imm_mandr_r("dppd", 106, src, dst);
548 DO_imm_mandr_r("dppd", 107, src, dst);
549 DO_imm_mandr_r("dppd", 108, src, dst);
550 DO_imm_mandr_r("dppd", 109, src, dst);
551 DO_imm_mandr_r("dppd", 110, src, dst);
552 DO_imm_mandr_r("dppd", 111, src, dst);
553 DO_imm_mandr_r("dppd", 112, src, dst);
554 DO_imm_mandr_r("dppd", 113, src, dst);
555 DO_imm_mandr_r("dppd", 114, src, dst);
556 DO_imm_mandr_r("dppd", 115, src, dst);
557 DO_imm_mandr_r("dppd", 116, src, dst);
558 DO_imm_mandr_r("dppd", 117, src, dst);
559 DO_imm_mandr_r("dppd", 118, src, dst);
560 DO_imm_mandr_r("dppd", 119, src, dst);
561 DO_imm_mandr_r("dppd", 120, src, dst);
562 DO_imm_mandr_r("dppd", 121, src, dst);
563 DO_imm_mandr_r("dppd", 122, src, dst);
564 DO_imm_mandr_r("dppd", 123, src, dst);
565 DO_imm_mandr_r("dppd", 124, src, dst);
566 DO_imm_mandr_r("dppd", 125, src, dst);
567 DO_imm_mandr_r("dppd", 126, src, dst);
568 DO_imm_mandr_r("dppd", 127, src, dst);
569 DO_imm_mandr_r("dppd", 128, src, dst);
570 DO_imm_mandr_r("dppd", 129, src, dst);
571 DO_imm_mandr_r("dppd", 130, src, dst);
572 DO_imm_mandr_r("dppd", 131, src, dst);
573 DO_imm_mandr_r("dppd", 132, src, dst);
574 DO_imm_mandr_r("dppd", 133, src, dst);
575 DO_imm_mandr_r("dppd", 134, src, dst);
576 DO_imm_mandr_r("dppd", 135, src, dst);
577 DO_imm_mandr_r("dppd", 136, src, dst);
578 DO_imm_mandr_r("dppd", 137, src, dst);
579 DO_imm_mandr_r("dppd", 138, src, dst);
580 DO_imm_mandr_r("dppd", 139, src, dst);
581 DO_imm_mandr_r("dppd", 140, src, dst);
582 DO_imm_mandr_r("dppd", 141, src, dst);
583 DO_imm_mandr_r("dppd", 142, src, dst);
584 DO_imm_mandr_r("dppd", 143, src, dst);
585 DO_imm_mandr_r("dppd", 144, src, dst);
586 DO_imm_mandr_r("dppd", 145, src, dst);
587 DO_imm_mandr_r("dppd", 146, src, dst);
588 DO_imm_mandr_r("dppd", 147, src, dst);
589 DO_imm_mandr_r("dppd", 148, src, dst);
590 DO_imm_mandr_r("dppd", 149, src, dst);
591 DO_imm_mandr_r("dppd", 150, src, dst);
592 DO_imm_mandr_r("dppd", 151, src, dst);
593 DO_imm_mandr_r("dppd", 152, src, dst);
594 DO_imm_mandr_r("dppd", 153, src, dst);
595 DO_imm_mandr_r("dppd", 154, src, dst);
596 DO_imm_mandr_r("dppd", 155, src, dst);
597 DO_imm_mandr_r("dppd", 156, src, dst);
598 DO_imm_mandr_r("dppd", 157, src, dst);
599 DO_imm_mandr_r("dppd", 158, src, dst);
600 DO_imm_mandr_r("dppd", 159, src, dst);
601 DO_imm_mandr_r("dppd", 160, src, dst);
602 DO_imm_mandr_r("dppd", 161, src, dst);
603 DO_imm_mandr_r("dppd", 162, src, dst);
604 DO_imm_mandr_r("dppd", 163, src, dst);
605 DO_imm_mandr_r("dppd", 164, src, dst);
606 DO_imm_mandr_r("dppd", 165, src, dst);
607 DO_imm_mandr_r("dppd", 166, src, dst);
608 DO_imm_mandr_r("dppd", 167, src, dst);
609 DO_imm_mandr_r("dppd", 168, src, dst);
610 DO_imm_mandr_r("dppd", 169, src, dst);
611 DO_imm_mandr_r("dppd", 170, src, dst);
612 DO_imm_mandr_r("dppd", 171, src, dst);
613 DO_imm_mandr_r("dppd", 172, src, dst);
614 DO_imm_mandr_r("dppd", 173, src, dst);
615 DO_imm_mandr_r("dppd", 174, src, dst);
616 DO_imm_mandr_r("dppd", 175, src, dst);
617 DO_imm_mandr_r("dppd", 176, src, dst);
618 DO_imm_mandr_r("dppd", 177, src, dst);
619 DO_imm_mandr_r("dppd", 178, src, dst);
620 DO_imm_mandr_r("dppd", 179, src, dst);
621 DO_imm_mandr_r("dppd", 180, src, dst);
622 DO_imm_mandr_r("dppd", 181, src, dst);
623 DO_imm_mandr_r("dppd", 182, src, dst);
624 DO_imm_mandr_r("dppd", 183, src, dst);
625 DO_imm_mandr_r("dppd", 184, src, dst);
626 DO_imm_mandr_r("dppd", 185, src, dst);
627 DO_imm_mandr_r("dppd", 186, src, dst);
628 DO_imm_mandr_r("dppd", 187, src, dst);
629 DO_imm_mandr_r("dppd", 188, src, dst);
630 DO_imm_mandr_r("dppd", 189, src, dst);
631 DO_imm_mandr_r("dppd", 190, src, dst);
632 DO_imm_mandr_r("dppd", 191, src, dst);
633 DO_imm_mandr_r("dppd", 192, src, dst);
634 DO_imm_mandr_r("dppd", 193, src, dst);
635 DO_imm_mandr_r("dppd", 194, src, dst);
636 DO_imm_mandr_r("dppd", 195, src, dst);
637 DO_imm_mandr_r("dppd", 196, src, dst);
638 DO_imm_mandr_r("dppd", 197, src, dst);
639 DO_imm_mandr_r("dppd", 198, src, dst);
640 DO_imm_mandr_r("dppd", 199, src, dst);
641 DO_imm_mandr_r("dppd", 200, src, dst);
642 DO_imm_mandr_r("dppd", 201, src, dst);
643 DO_imm_mandr_r("dppd", 202, src, dst);
644 DO_imm_mandr_r("dppd", 203, src, dst);
645 DO_imm_mandr_r("dppd", 204, src, dst);
646 DO_imm_mandr_r("dppd", 205, src, dst);
647 DO_imm_mandr_r("dppd", 206, src, dst);
648 DO_imm_mandr_r("dppd", 207, src, dst);
649 DO_imm_mandr_r("dppd", 208, src, dst);
650 DO_imm_mandr_r("dppd", 209, src, dst);
651 DO_imm_mandr_r("dppd", 210, src, dst);
652 DO_imm_mandr_r("dppd", 211, src, dst);
653 DO_imm_mandr_r("dppd", 212, src, dst);
654 DO_imm_mandr_r("dppd", 213, src, dst);
655 DO_imm_mandr_r("dppd", 214, src, dst);
656 DO_imm_mandr_r("dppd", 215, src, dst);
657 DO_imm_mandr_r("dppd", 216, src, dst);
658 DO_imm_mandr_r("dppd", 217, src, dst);
659 DO_imm_mandr_r("dppd", 218, src, dst);
660 DO_imm_mandr_r("dppd", 219, src, dst);
661 DO_imm_mandr_r("dppd", 220, src, dst);
662 DO_imm_mandr_r("dppd", 221, src, dst);
663 DO_imm_mandr_r("dppd", 222, src, dst);
664 DO_imm_mandr_r("dppd", 223, src, dst);
665 DO_imm_mandr_r("dppd", 224, src, dst);
666 DO_imm_mandr_r("dppd", 225, src, dst);
667 DO_imm_mandr_r("dppd", 226, src, dst);
668 DO_imm_mandr_r("dppd", 227, src, dst);
669 DO_imm_mandr_r("dppd", 228, src, dst);
670 DO_imm_mandr_r("dppd", 229, src, dst);
671 DO_imm_mandr_r("dppd", 230, src, dst);
672 DO_imm_mandr_r("dppd", 231, src, dst);
673 DO_imm_mandr_r("dppd", 232, src, dst);
674 DO_imm_mandr_r("dppd", 233, src, dst);
675 DO_imm_mandr_r("dppd", 234, src, dst);
676 DO_imm_mandr_r("dppd", 235, src, dst);
677 DO_imm_mandr_r("dppd", 236, src, dst);
678 DO_imm_mandr_r("dppd", 237, src, dst);
679 DO_imm_mandr_r("dppd", 238, src, dst);
680 DO_imm_mandr_r("dppd", 239, src, dst);
681 DO_imm_mandr_r("dppd", 240, src, dst);
682 DO_imm_mandr_r("dppd", 241, src, dst);
683 DO_imm_mandr_r("dppd", 242, src, dst);
684 DO_imm_mandr_r("dppd", 243, src, dst);
685 DO_imm_mandr_r("dppd", 244, src, dst);
686 DO_imm_mandr_r("dppd", 245, src, dst);
687 DO_imm_mandr_r("dppd", 246, src, dst);
688 DO_imm_mandr_r("dppd", 247, src, dst);
689 DO_imm_mandr_r("dppd", 248, src, dst);
690 DO_imm_mandr_r("dppd", 249, src, dst);
691 DO_imm_mandr_r("dppd", 250, src, dst);
692 DO_imm_mandr_r("dppd", 251, src, dst);
693 DO_imm_mandr_r("dppd", 252, src, dst);
694 DO_imm_mandr_r("dppd", 253, src, dst);
695 DO_imm_mandr_r("dppd", 254, src, dst);
696 DO_imm_mandr_r("dppd", 255, src, dst);
697 }
698 }
699
test_DPPS(void)700 void test_DPPS ( void )
701 {
702 V128 src, dst;
703 {
704 *(float*)(&src[0]) = 1.2;
705 *(float*)(&src[4]) = -3.4;
706 *(float*)(&src[8]) = -6.7;
707 *(float*)(&src[12]) = 8.9;
708 *(float*)(&dst[0]) = -10.11;
709 *(float*)(&dst[4]) = 12.13;
710 *(float*)(&dst[8]) = 14.15;
711 *(float*)(&dst[12]) = -16.17;
712 DO_imm_mandr_r("dpps", 0, src, dst);
713 DO_imm_mandr_r("dpps", 1, src, dst);
714 DO_imm_mandr_r("dpps", 2, src, dst);
715 DO_imm_mandr_r("dpps", 3, src, dst);
716 DO_imm_mandr_r("dpps", 4, src, dst);
717 DO_imm_mandr_r("dpps", 5, src, dst);
718 DO_imm_mandr_r("dpps", 6, src, dst);
719 DO_imm_mandr_r("dpps", 7, src, dst);
720 DO_imm_mandr_r("dpps", 8, src, dst);
721 DO_imm_mandr_r("dpps", 9, src, dst);
722 DO_imm_mandr_r("dpps", 10, src, dst);
723 DO_imm_mandr_r("dpps", 11, src, dst);
724 DO_imm_mandr_r("dpps", 12, src, dst);
725 DO_imm_mandr_r("dpps", 13, src, dst);
726 DO_imm_mandr_r("dpps", 14, src, dst);
727 DO_imm_mandr_r("dpps", 15, src, dst);
728 DO_imm_mandr_r("dpps", 16, src, dst);
729 DO_imm_mandr_r("dpps", 17, src, dst);
730 DO_imm_mandr_r("dpps", 18, src, dst);
731 DO_imm_mandr_r("dpps", 19, src, dst);
732 DO_imm_mandr_r("dpps", 20, src, dst);
733 DO_imm_mandr_r("dpps", 21, src, dst);
734 DO_imm_mandr_r("dpps", 22, src, dst);
735 DO_imm_mandr_r("dpps", 23, src, dst);
736 DO_imm_mandr_r("dpps", 24, src, dst);
737 DO_imm_mandr_r("dpps", 25, src, dst);
738 DO_imm_mandr_r("dpps", 26, src, dst);
739 DO_imm_mandr_r("dpps", 27, src, dst);
740 DO_imm_mandr_r("dpps", 28, src, dst);
741 DO_imm_mandr_r("dpps", 29, src, dst);
742 DO_imm_mandr_r("dpps", 30, src, dst);
743 DO_imm_mandr_r("dpps", 31, src, dst);
744 DO_imm_mandr_r("dpps", 32, src, dst);
745 DO_imm_mandr_r("dpps", 33, src, dst);
746 DO_imm_mandr_r("dpps", 34, src, dst);
747 DO_imm_mandr_r("dpps", 35, src, dst);
748 DO_imm_mandr_r("dpps", 36, src, dst);
749 DO_imm_mandr_r("dpps", 37, src, dst);
750 DO_imm_mandr_r("dpps", 38, src, dst);
751 DO_imm_mandr_r("dpps", 39, src, dst);
752 DO_imm_mandr_r("dpps", 40, src, dst);
753 DO_imm_mandr_r("dpps", 41, src, dst);
754 DO_imm_mandr_r("dpps", 42, src, dst);
755 DO_imm_mandr_r("dpps", 43, src, dst);
756 DO_imm_mandr_r("dpps", 44, src, dst);
757 DO_imm_mandr_r("dpps", 45, src, dst);
758 DO_imm_mandr_r("dpps", 46, src, dst);
759 DO_imm_mandr_r("dpps", 47, src, dst);
760 DO_imm_mandr_r("dpps", 48, src, dst);
761 DO_imm_mandr_r("dpps", 49, src, dst);
762 DO_imm_mandr_r("dpps", 50, src, dst);
763 DO_imm_mandr_r("dpps", 51, src, dst);
764 DO_imm_mandr_r("dpps", 52, src, dst);
765 DO_imm_mandr_r("dpps", 53, src, dst);
766 DO_imm_mandr_r("dpps", 54, src, dst);
767 DO_imm_mandr_r("dpps", 55, src, dst);
768 DO_imm_mandr_r("dpps", 56, src, dst);
769 DO_imm_mandr_r("dpps", 57, src, dst);
770 DO_imm_mandr_r("dpps", 58, src, dst);
771 DO_imm_mandr_r("dpps", 59, src, dst);
772 DO_imm_mandr_r("dpps", 60, src, dst);
773 DO_imm_mandr_r("dpps", 61, src, dst);
774 DO_imm_mandr_r("dpps", 62, src, dst);
775 DO_imm_mandr_r("dpps", 63, src, dst);
776 DO_imm_mandr_r("dpps", 64, src, dst);
777 DO_imm_mandr_r("dpps", 65, src, dst);
778 DO_imm_mandr_r("dpps", 66, src, dst);
779 DO_imm_mandr_r("dpps", 67, src, dst);
780 DO_imm_mandr_r("dpps", 68, src, dst);
781 DO_imm_mandr_r("dpps", 69, src, dst);
782 DO_imm_mandr_r("dpps", 70, src, dst);
783 DO_imm_mandr_r("dpps", 71, src, dst);
784 DO_imm_mandr_r("dpps", 72, src, dst);
785 DO_imm_mandr_r("dpps", 73, src, dst);
786 DO_imm_mandr_r("dpps", 74, src, dst);
787 DO_imm_mandr_r("dpps", 75, src, dst);
788 DO_imm_mandr_r("dpps", 76, src, dst);
789 DO_imm_mandr_r("dpps", 77, src, dst);
790 DO_imm_mandr_r("dpps", 78, src, dst);
791 DO_imm_mandr_r("dpps", 79, src, dst);
792 DO_imm_mandr_r("dpps", 80, src, dst);
793 DO_imm_mandr_r("dpps", 81, src, dst);
794 DO_imm_mandr_r("dpps", 82, src, dst);
795 DO_imm_mandr_r("dpps", 83, src, dst);
796 DO_imm_mandr_r("dpps", 84, src, dst);
797 DO_imm_mandr_r("dpps", 85, src, dst);
798 DO_imm_mandr_r("dpps", 86, src, dst);
799 DO_imm_mandr_r("dpps", 87, src, dst);
800 DO_imm_mandr_r("dpps", 88, src, dst);
801 DO_imm_mandr_r("dpps", 89, src, dst);
802 DO_imm_mandr_r("dpps", 90, src, dst);
803 DO_imm_mandr_r("dpps", 91, src, dst);
804 DO_imm_mandr_r("dpps", 92, src, dst);
805 DO_imm_mandr_r("dpps", 93, src, dst);
806 DO_imm_mandr_r("dpps", 94, src, dst);
807 DO_imm_mandr_r("dpps", 95, src, dst);
808 DO_imm_mandr_r("dpps", 96, src, dst);
809 DO_imm_mandr_r("dpps", 97, src, dst);
810 DO_imm_mandr_r("dpps", 98, src, dst);
811 DO_imm_mandr_r("dpps", 99, src, dst);
812 DO_imm_mandr_r("dpps", 100, src, dst);
813 DO_imm_mandr_r("dpps", 101, src, dst);
814 DO_imm_mandr_r("dpps", 102, src, dst);
815 DO_imm_mandr_r("dpps", 103, src, dst);
816 DO_imm_mandr_r("dpps", 104, src, dst);
817 DO_imm_mandr_r("dpps", 105, src, dst);
818 DO_imm_mandr_r("dpps", 106, src, dst);
819 DO_imm_mandr_r("dpps", 107, src, dst);
820 DO_imm_mandr_r("dpps", 108, src, dst);
821 DO_imm_mandr_r("dpps", 109, src, dst);
822 DO_imm_mandr_r("dpps", 110, src, dst);
823 DO_imm_mandr_r("dpps", 111, src, dst);
824 DO_imm_mandr_r("dpps", 112, src, dst);
825 DO_imm_mandr_r("dpps", 113, src, dst);
826 DO_imm_mandr_r("dpps", 114, src, dst);
827 DO_imm_mandr_r("dpps", 115, src, dst);
828 DO_imm_mandr_r("dpps", 116, src, dst);
829 DO_imm_mandr_r("dpps", 117, src, dst);
830 DO_imm_mandr_r("dpps", 118, src, dst);
831 DO_imm_mandr_r("dpps", 119, src, dst);
832 DO_imm_mandr_r("dpps", 120, src, dst);
833 DO_imm_mandr_r("dpps", 121, src, dst);
834 DO_imm_mandr_r("dpps", 122, src, dst);
835 DO_imm_mandr_r("dpps", 123, src, dst);
836 DO_imm_mandr_r("dpps", 124, src, dst);
837 DO_imm_mandr_r("dpps", 125, src, dst);
838 DO_imm_mandr_r("dpps", 126, src, dst);
839 DO_imm_mandr_r("dpps", 127, src, dst);
840 DO_imm_mandr_r("dpps", 128, src, dst);
841 DO_imm_mandr_r("dpps", 129, src, dst);
842 DO_imm_mandr_r("dpps", 130, src, dst);
843 DO_imm_mandr_r("dpps", 131, src, dst);
844 DO_imm_mandr_r("dpps", 132, src, dst);
845 DO_imm_mandr_r("dpps", 133, src, dst);
846 DO_imm_mandr_r("dpps", 134, src, dst);
847 DO_imm_mandr_r("dpps", 135, src, dst);
848 DO_imm_mandr_r("dpps", 136, src, dst);
849 DO_imm_mandr_r("dpps", 137, src, dst);
850 DO_imm_mandr_r("dpps", 138, src, dst);
851 DO_imm_mandr_r("dpps", 139, src, dst);
852 DO_imm_mandr_r("dpps", 140, src, dst);
853 DO_imm_mandr_r("dpps", 141, src, dst);
854 DO_imm_mandr_r("dpps", 142, src, dst);
855 DO_imm_mandr_r("dpps", 143, src, dst);
856 DO_imm_mandr_r("dpps", 144, src, dst);
857 DO_imm_mandr_r("dpps", 145, src, dst);
858 DO_imm_mandr_r("dpps", 146, src, dst);
859 DO_imm_mandr_r("dpps", 147, src, dst);
860 DO_imm_mandr_r("dpps", 148, src, dst);
861 DO_imm_mandr_r("dpps", 149, src, dst);
862 DO_imm_mandr_r("dpps", 150, src, dst);
863 DO_imm_mandr_r("dpps", 151, src, dst);
864 DO_imm_mandr_r("dpps", 152, src, dst);
865 DO_imm_mandr_r("dpps", 153, src, dst);
866 DO_imm_mandr_r("dpps", 154, src, dst);
867 DO_imm_mandr_r("dpps", 155, src, dst);
868 DO_imm_mandr_r("dpps", 156, src, dst);
869 DO_imm_mandr_r("dpps", 157, src, dst);
870 DO_imm_mandr_r("dpps", 158, src, dst);
871 DO_imm_mandr_r("dpps", 159, src, dst);
872 DO_imm_mandr_r("dpps", 160, src, dst);
873 DO_imm_mandr_r("dpps", 161, src, dst);
874 DO_imm_mandr_r("dpps", 162, src, dst);
875 DO_imm_mandr_r("dpps", 163, src, dst);
876 DO_imm_mandr_r("dpps", 164, src, dst);
877 DO_imm_mandr_r("dpps", 165, src, dst);
878 DO_imm_mandr_r("dpps", 166, src, dst);
879 DO_imm_mandr_r("dpps", 167, src, dst);
880 DO_imm_mandr_r("dpps", 168, src, dst);
881 DO_imm_mandr_r("dpps", 169, src, dst);
882 DO_imm_mandr_r("dpps", 170, src, dst);
883 DO_imm_mandr_r("dpps", 171, src, dst);
884 DO_imm_mandr_r("dpps", 172, src, dst);
885 DO_imm_mandr_r("dpps", 173, src, dst);
886 DO_imm_mandr_r("dpps", 174, src, dst);
887 DO_imm_mandr_r("dpps", 175, src, dst);
888 DO_imm_mandr_r("dpps", 176, src, dst);
889 DO_imm_mandr_r("dpps", 177, src, dst);
890 DO_imm_mandr_r("dpps", 178, src, dst);
891 DO_imm_mandr_r("dpps", 179, src, dst);
892 DO_imm_mandr_r("dpps", 180, src, dst);
893 DO_imm_mandr_r("dpps", 181, src, dst);
894 DO_imm_mandr_r("dpps", 182, src, dst);
895 DO_imm_mandr_r("dpps", 183, src, dst);
896 DO_imm_mandr_r("dpps", 184, src, dst);
897 DO_imm_mandr_r("dpps", 185, src, dst);
898 DO_imm_mandr_r("dpps", 186, src, dst);
899 DO_imm_mandr_r("dpps", 187, src, dst);
900 DO_imm_mandr_r("dpps", 188, src, dst);
901 DO_imm_mandr_r("dpps", 189, src, dst);
902 DO_imm_mandr_r("dpps", 190, src, dst);
903 DO_imm_mandr_r("dpps", 191, src, dst);
904 DO_imm_mandr_r("dpps", 192, src, dst);
905 DO_imm_mandr_r("dpps", 193, src, dst);
906 DO_imm_mandr_r("dpps", 194, src, dst);
907 DO_imm_mandr_r("dpps", 195, src, dst);
908 DO_imm_mandr_r("dpps", 196, src, dst);
909 DO_imm_mandr_r("dpps", 197, src, dst);
910 DO_imm_mandr_r("dpps", 198, src, dst);
911 DO_imm_mandr_r("dpps", 199, src, dst);
912 DO_imm_mandr_r("dpps", 200, src, dst);
913 DO_imm_mandr_r("dpps", 201, src, dst);
914 DO_imm_mandr_r("dpps", 202, src, dst);
915 DO_imm_mandr_r("dpps", 203, src, dst);
916 DO_imm_mandr_r("dpps", 204, src, dst);
917 DO_imm_mandr_r("dpps", 205, src, dst);
918 DO_imm_mandr_r("dpps", 206, src, dst);
919 DO_imm_mandr_r("dpps", 207, src, dst);
920 DO_imm_mandr_r("dpps", 208, src, dst);
921 DO_imm_mandr_r("dpps", 209, src, dst);
922 DO_imm_mandr_r("dpps", 210, src, dst);
923 DO_imm_mandr_r("dpps", 211, src, dst);
924 DO_imm_mandr_r("dpps", 212, src, dst);
925 DO_imm_mandr_r("dpps", 213, src, dst);
926 DO_imm_mandr_r("dpps", 214, src, dst);
927 DO_imm_mandr_r("dpps", 215, src, dst);
928 DO_imm_mandr_r("dpps", 216, src, dst);
929 DO_imm_mandr_r("dpps", 217, src, dst);
930 DO_imm_mandr_r("dpps", 218, src, dst);
931 DO_imm_mandr_r("dpps", 219, src, dst);
932 DO_imm_mandr_r("dpps", 220, src, dst);
933 DO_imm_mandr_r("dpps", 221, src, dst);
934 DO_imm_mandr_r("dpps", 222, src, dst);
935 DO_imm_mandr_r("dpps", 223, src, dst);
936 DO_imm_mandr_r("dpps", 224, src, dst);
937 DO_imm_mandr_r("dpps", 225, src, dst);
938 DO_imm_mandr_r("dpps", 226, src, dst);
939 DO_imm_mandr_r("dpps", 227, src, dst);
940 DO_imm_mandr_r("dpps", 228, src, dst);
941 DO_imm_mandr_r("dpps", 229, src, dst);
942 DO_imm_mandr_r("dpps", 230, src, dst);
943 DO_imm_mandr_r("dpps", 231, src, dst);
944 DO_imm_mandr_r("dpps", 232, src, dst);
945 DO_imm_mandr_r("dpps", 233, src, dst);
946 DO_imm_mandr_r("dpps", 234, src, dst);
947 DO_imm_mandr_r("dpps", 235, src, dst);
948 DO_imm_mandr_r("dpps", 236, src, dst);
949 DO_imm_mandr_r("dpps", 237, src, dst);
950 DO_imm_mandr_r("dpps", 238, src, dst);
951 DO_imm_mandr_r("dpps", 239, src, dst);
952 DO_imm_mandr_r("dpps", 240, src, dst);
953 DO_imm_mandr_r("dpps", 241, src, dst);
954 DO_imm_mandr_r("dpps", 242, src, dst);
955 DO_imm_mandr_r("dpps", 243, src, dst);
956 DO_imm_mandr_r("dpps", 244, src, dst);
957 DO_imm_mandr_r("dpps", 245, src, dst);
958 DO_imm_mandr_r("dpps", 246, src, dst);
959 DO_imm_mandr_r("dpps", 247, src, dst);
960 DO_imm_mandr_r("dpps", 248, src, dst);
961 DO_imm_mandr_r("dpps", 249, src, dst);
962 DO_imm_mandr_r("dpps", 250, src, dst);
963 DO_imm_mandr_r("dpps", 251, src, dst);
964 DO_imm_mandr_r("dpps", 252, src, dst);
965 DO_imm_mandr_r("dpps", 253, src, dst);
966 DO_imm_mandr_r("dpps", 254, src, dst);
967 DO_imm_mandr_r("dpps", 255, src, dst);
968 }
969 }
970
test_INSERTPS(void)971 void test_INSERTPS ( void )
972 {
973 V128 src, dst;
974 {
975 *(float*)(&src[0]) = 1.2;
976 *(float*)(&src[4]) = -3.4;
977 *(float*)(&src[8]) = -6.7;
978 *(float*)(&src[12]) = 8.9;
979 *(float*)(&dst[0]) = -10.11;
980 *(float*)(&dst[4]) = 12.13;
981 *(float*)(&dst[8]) = 14.15;
982 *(float*)(&dst[12]) = -16.17;
983 DO_imm_mandr_r("insertps", 0, src, dst);
984 DO_imm_mandr_r("insertps", 1, src, dst);
985 DO_imm_mandr_r("insertps", 2, src, dst);
986 DO_imm_mandr_r("insertps", 3, src, dst);
987 DO_imm_mandr_r("insertps", 4, src, dst);
988 DO_imm_mandr_r("insertps", 5, src, dst);
989 DO_imm_mandr_r("insertps", 6, src, dst);
990 DO_imm_mandr_r("insertps", 7, src, dst);
991 DO_imm_mandr_r("insertps", 8, src, dst);
992 DO_imm_mandr_r("insertps", 9, src, dst);
993 DO_imm_mandr_r("insertps", 10, src, dst);
994 DO_imm_mandr_r("insertps", 11, src, dst);
995 DO_imm_mandr_r("insertps", 12, src, dst);
996 DO_imm_mandr_r("insertps", 13, src, dst);
997 DO_imm_mandr_r("insertps", 14, src, dst);
998 DO_imm_mandr_r("insertps", 15, src, dst);
999 DO_imm_mandr_r("insertps", 16, src, dst);
1000 DO_imm_mandr_r("insertps", 17, src, dst);
1001 DO_imm_mandr_r("insertps", 18, src, dst);
1002 DO_imm_mandr_r("insertps", 19, src, dst);
1003 DO_imm_mandr_r("insertps", 20, src, dst);
1004 DO_imm_mandr_r("insertps", 21, src, dst);
1005 DO_imm_mandr_r("insertps", 22, src, dst);
1006 DO_imm_mandr_r("insertps", 23, src, dst);
1007 DO_imm_mandr_r("insertps", 24, src, dst);
1008 DO_imm_mandr_r("insertps", 25, src, dst);
1009 DO_imm_mandr_r("insertps", 26, src, dst);
1010 DO_imm_mandr_r("insertps", 27, src, dst);
1011 DO_imm_mandr_r("insertps", 28, src, dst);
1012 DO_imm_mandr_r("insertps", 29, src, dst);
1013 DO_imm_mandr_r("insertps", 30, src, dst);
1014 DO_imm_mandr_r("insertps", 31, src, dst);
1015 DO_imm_mandr_r("insertps", 32, src, dst);
1016 DO_imm_mandr_r("insertps", 33, src, dst);
1017 DO_imm_mandr_r("insertps", 34, src, dst);
1018 DO_imm_mandr_r("insertps", 35, src, dst);
1019 DO_imm_mandr_r("insertps", 36, src, dst);
1020 DO_imm_mandr_r("insertps", 37, src, dst);
1021 DO_imm_mandr_r("insertps", 38, src, dst);
1022 DO_imm_mandr_r("insertps", 39, src, dst);
1023 DO_imm_mandr_r("insertps", 40, src, dst);
1024 DO_imm_mandr_r("insertps", 41, src, dst);
1025 DO_imm_mandr_r("insertps", 42, src, dst);
1026 DO_imm_mandr_r("insertps", 43, src, dst);
1027 DO_imm_mandr_r("insertps", 44, src, dst);
1028 DO_imm_mandr_r("insertps", 45, src, dst);
1029 DO_imm_mandr_r("insertps", 46, src, dst);
1030 DO_imm_mandr_r("insertps", 47, src, dst);
1031 DO_imm_mandr_r("insertps", 48, src, dst);
1032 DO_imm_mandr_r("insertps", 49, src, dst);
1033 DO_imm_mandr_r("insertps", 50, src, dst);
1034 DO_imm_mandr_r("insertps", 51, src, dst);
1035 DO_imm_mandr_r("insertps", 52, src, dst);
1036 DO_imm_mandr_r("insertps", 53, src, dst);
1037 DO_imm_mandr_r("insertps", 54, src, dst);
1038 DO_imm_mandr_r("insertps", 55, src, dst);
1039 DO_imm_mandr_r("insertps", 56, src, dst);
1040 DO_imm_mandr_r("insertps", 57, src, dst);
1041 DO_imm_mandr_r("insertps", 58, src, dst);
1042 DO_imm_mandr_r("insertps", 59, src, dst);
1043 DO_imm_mandr_r("insertps", 60, src, dst);
1044 DO_imm_mandr_r("insertps", 61, src, dst);
1045 DO_imm_mandr_r("insertps", 62, src, dst);
1046 DO_imm_mandr_r("insertps", 63, src, dst);
1047 DO_imm_mandr_r("insertps", 64, src, dst);
1048 DO_imm_mandr_r("insertps", 65, src, dst);
1049 DO_imm_mandr_r("insertps", 66, src, dst);
1050 DO_imm_mandr_r("insertps", 67, src, dst);
1051 DO_imm_mandr_r("insertps", 68, src, dst);
1052 DO_imm_mandr_r("insertps", 69, src, dst);
1053 DO_imm_mandr_r("insertps", 70, src, dst);
1054 DO_imm_mandr_r("insertps", 71, src, dst);
1055 DO_imm_mandr_r("insertps", 72, src, dst);
1056 DO_imm_mandr_r("insertps", 73, src, dst);
1057 DO_imm_mandr_r("insertps", 74, src, dst);
1058 DO_imm_mandr_r("insertps", 75, src, dst);
1059 DO_imm_mandr_r("insertps", 76, src, dst);
1060 DO_imm_mandr_r("insertps", 77, src, dst);
1061 DO_imm_mandr_r("insertps", 78, src, dst);
1062 DO_imm_mandr_r("insertps", 79, src, dst);
1063 DO_imm_mandr_r("insertps", 80, src, dst);
1064 DO_imm_mandr_r("insertps", 81, src, dst);
1065 DO_imm_mandr_r("insertps", 82, src, dst);
1066 DO_imm_mandr_r("insertps", 83, src, dst);
1067 DO_imm_mandr_r("insertps", 84, src, dst);
1068 DO_imm_mandr_r("insertps", 85, src, dst);
1069 DO_imm_mandr_r("insertps", 86, src, dst);
1070 DO_imm_mandr_r("insertps", 87, src, dst);
1071 DO_imm_mandr_r("insertps", 88, src, dst);
1072 DO_imm_mandr_r("insertps", 89, src, dst);
1073 DO_imm_mandr_r("insertps", 90, src, dst);
1074 DO_imm_mandr_r("insertps", 91, src, dst);
1075 DO_imm_mandr_r("insertps", 92, src, dst);
1076 DO_imm_mandr_r("insertps", 93, src, dst);
1077 DO_imm_mandr_r("insertps", 94, src, dst);
1078 DO_imm_mandr_r("insertps", 95, src, dst);
1079 DO_imm_mandr_r("insertps", 96, src, dst);
1080 DO_imm_mandr_r("insertps", 97, src, dst);
1081 DO_imm_mandr_r("insertps", 98, src, dst);
1082 DO_imm_mandr_r("insertps", 99, src, dst);
1083 DO_imm_mandr_r("insertps", 100, src, dst);
1084 DO_imm_mandr_r("insertps", 101, src, dst);
1085 DO_imm_mandr_r("insertps", 102, src, dst);
1086 DO_imm_mandr_r("insertps", 103, src, dst);
1087 DO_imm_mandr_r("insertps", 104, src, dst);
1088 DO_imm_mandr_r("insertps", 105, src, dst);
1089 DO_imm_mandr_r("insertps", 106, src, dst);
1090 DO_imm_mandr_r("insertps", 107, src, dst);
1091 DO_imm_mandr_r("insertps", 108, src, dst);
1092 DO_imm_mandr_r("insertps", 109, src, dst);
1093 DO_imm_mandr_r("insertps", 110, src, dst);
1094 DO_imm_mandr_r("insertps", 111, src, dst);
1095 DO_imm_mandr_r("insertps", 112, src, dst);
1096 DO_imm_mandr_r("insertps", 113, src, dst);
1097 DO_imm_mandr_r("insertps", 114, src, dst);
1098 DO_imm_mandr_r("insertps", 115, src, dst);
1099 DO_imm_mandr_r("insertps", 116, src, dst);
1100 DO_imm_mandr_r("insertps", 117, src, dst);
1101 DO_imm_mandr_r("insertps", 118, src, dst);
1102 DO_imm_mandr_r("insertps", 119, src, dst);
1103 DO_imm_mandr_r("insertps", 120, src, dst);
1104 DO_imm_mandr_r("insertps", 121, src, dst);
1105 DO_imm_mandr_r("insertps", 122, src, dst);
1106 DO_imm_mandr_r("insertps", 123, src, dst);
1107 DO_imm_mandr_r("insertps", 124, src, dst);
1108 DO_imm_mandr_r("insertps", 125, src, dst);
1109 DO_imm_mandr_r("insertps", 126, src, dst);
1110 DO_imm_mandr_r("insertps", 127, src, dst);
1111 DO_imm_mandr_r("insertps", 128, src, dst);
1112 DO_imm_mandr_r("insertps", 129, src, dst);
1113 DO_imm_mandr_r("insertps", 130, src, dst);
1114 DO_imm_mandr_r("insertps", 131, src, dst);
1115 DO_imm_mandr_r("insertps", 132, src, dst);
1116 DO_imm_mandr_r("insertps", 133, src, dst);
1117 DO_imm_mandr_r("insertps", 134, src, dst);
1118 DO_imm_mandr_r("insertps", 135, src, dst);
1119 DO_imm_mandr_r("insertps", 136, src, dst);
1120 DO_imm_mandr_r("insertps", 137, src, dst);
1121 DO_imm_mandr_r("insertps", 138, src, dst);
1122 DO_imm_mandr_r("insertps", 139, src, dst);
1123 DO_imm_mandr_r("insertps", 140, src, dst);
1124 DO_imm_mandr_r("insertps", 141, src, dst);
1125 DO_imm_mandr_r("insertps", 142, src, dst);
1126 DO_imm_mandr_r("insertps", 143, src, dst);
1127 DO_imm_mandr_r("insertps", 144, src, dst);
1128 DO_imm_mandr_r("insertps", 145, src, dst);
1129 DO_imm_mandr_r("insertps", 146, src, dst);
1130 DO_imm_mandr_r("insertps", 147, src, dst);
1131 DO_imm_mandr_r("insertps", 148, src, dst);
1132 DO_imm_mandr_r("insertps", 149, src, dst);
1133 DO_imm_mandr_r("insertps", 150, src, dst);
1134 DO_imm_mandr_r("insertps", 151, src, dst);
1135 DO_imm_mandr_r("insertps", 152, src, dst);
1136 DO_imm_mandr_r("insertps", 153, src, dst);
1137 DO_imm_mandr_r("insertps", 154, src, dst);
1138 DO_imm_mandr_r("insertps", 155, src, dst);
1139 DO_imm_mandr_r("insertps", 156, src, dst);
1140 DO_imm_mandr_r("insertps", 157, src, dst);
1141 DO_imm_mandr_r("insertps", 158, src, dst);
1142 DO_imm_mandr_r("insertps", 159, src, dst);
1143 DO_imm_mandr_r("insertps", 160, src, dst);
1144 DO_imm_mandr_r("insertps", 161, src, dst);
1145 DO_imm_mandr_r("insertps", 162, src, dst);
1146 DO_imm_mandr_r("insertps", 163, src, dst);
1147 DO_imm_mandr_r("insertps", 164, src, dst);
1148 DO_imm_mandr_r("insertps", 165, src, dst);
1149 DO_imm_mandr_r("insertps", 166, src, dst);
1150 DO_imm_mandr_r("insertps", 167, src, dst);
1151 DO_imm_mandr_r("insertps", 168, src, dst);
1152 DO_imm_mandr_r("insertps", 169, src, dst);
1153 DO_imm_mandr_r("insertps", 170, src, dst);
1154 DO_imm_mandr_r("insertps", 171, src, dst);
1155 DO_imm_mandr_r("insertps", 172, src, dst);
1156 DO_imm_mandr_r("insertps", 173, src, dst);
1157 DO_imm_mandr_r("insertps", 174, src, dst);
1158 DO_imm_mandr_r("insertps", 175, src, dst);
1159 DO_imm_mandr_r("insertps", 176, src, dst);
1160 DO_imm_mandr_r("insertps", 177, src, dst);
1161 DO_imm_mandr_r("insertps", 178, src, dst);
1162 DO_imm_mandr_r("insertps", 179, src, dst);
1163 DO_imm_mandr_r("insertps", 180, src, dst);
1164 DO_imm_mandr_r("insertps", 181, src, dst);
1165 DO_imm_mandr_r("insertps", 182, src, dst);
1166 DO_imm_mandr_r("insertps", 183, src, dst);
1167 DO_imm_mandr_r("insertps", 184, src, dst);
1168 DO_imm_mandr_r("insertps", 185, src, dst);
1169 DO_imm_mandr_r("insertps", 186, src, dst);
1170 DO_imm_mandr_r("insertps", 187, src, dst);
1171 DO_imm_mandr_r("insertps", 188, src, dst);
1172 DO_imm_mandr_r("insertps", 189, src, dst);
1173 DO_imm_mandr_r("insertps", 190, src, dst);
1174 DO_imm_mandr_r("insertps", 191, src, dst);
1175 DO_imm_mandr_r("insertps", 192, src, dst);
1176 DO_imm_mandr_r("insertps", 193, src, dst);
1177 DO_imm_mandr_r("insertps", 194, src, dst);
1178 DO_imm_mandr_r("insertps", 195, src, dst);
1179 DO_imm_mandr_r("insertps", 196, src, dst);
1180 DO_imm_mandr_r("insertps", 197, src, dst);
1181 DO_imm_mandr_r("insertps", 198, src, dst);
1182 DO_imm_mandr_r("insertps", 199, src, dst);
1183 DO_imm_mandr_r("insertps", 200, src, dst);
1184 DO_imm_mandr_r("insertps", 201, src, dst);
1185 DO_imm_mandr_r("insertps", 202, src, dst);
1186 DO_imm_mandr_r("insertps", 203, src, dst);
1187 DO_imm_mandr_r("insertps", 204, src, dst);
1188 DO_imm_mandr_r("insertps", 205, src, dst);
1189 DO_imm_mandr_r("insertps", 206, src, dst);
1190 DO_imm_mandr_r("insertps", 207, src, dst);
1191 DO_imm_mandr_r("insertps", 208, src, dst);
1192 DO_imm_mandr_r("insertps", 209, src, dst);
1193 DO_imm_mandr_r("insertps", 210, src, dst);
1194 DO_imm_mandr_r("insertps", 211, src, dst);
1195 DO_imm_mandr_r("insertps", 212, src, dst);
1196 DO_imm_mandr_r("insertps", 213, src, dst);
1197 DO_imm_mandr_r("insertps", 214, src, dst);
1198 DO_imm_mandr_r("insertps", 215, src, dst);
1199 DO_imm_mandr_r("insertps", 216, src, dst);
1200 DO_imm_mandr_r("insertps", 217, src, dst);
1201 DO_imm_mandr_r("insertps", 218, src, dst);
1202 DO_imm_mandr_r("insertps", 219, src, dst);
1203 DO_imm_mandr_r("insertps", 220, src, dst);
1204 DO_imm_mandr_r("insertps", 221, src, dst);
1205 DO_imm_mandr_r("insertps", 222, src, dst);
1206 DO_imm_mandr_r("insertps", 223, src, dst);
1207 DO_imm_mandr_r("insertps", 224, src, dst);
1208 DO_imm_mandr_r("insertps", 225, src, dst);
1209 DO_imm_mandr_r("insertps", 226, src, dst);
1210 DO_imm_mandr_r("insertps", 227, src, dst);
1211 DO_imm_mandr_r("insertps", 228, src, dst);
1212 DO_imm_mandr_r("insertps", 229, src, dst);
1213 DO_imm_mandr_r("insertps", 230, src, dst);
1214 DO_imm_mandr_r("insertps", 231, src, dst);
1215 DO_imm_mandr_r("insertps", 232, src, dst);
1216 DO_imm_mandr_r("insertps", 233, src, dst);
1217 DO_imm_mandr_r("insertps", 234, src, dst);
1218 DO_imm_mandr_r("insertps", 235, src, dst);
1219 DO_imm_mandr_r("insertps", 236, src, dst);
1220 DO_imm_mandr_r("insertps", 237, src, dst);
1221 DO_imm_mandr_r("insertps", 238, src, dst);
1222 DO_imm_mandr_r("insertps", 239, src, dst);
1223 DO_imm_mandr_r("insertps", 240, src, dst);
1224 DO_imm_mandr_r("insertps", 241, src, dst);
1225 DO_imm_mandr_r("insertps", 242, src, dst);
1226 DO_imm_mandr_r("insertps", 243, src, dst);
1227 DO_imm_mandr_r("insertps", 244, src, dst);
1228 DO_imm_mandr_r("insertps", 245, src, dst);
1229 DO_imm_mandr_r("insertps", 246, src, dst);
1230 DO_imm_mandr_r("insertps", 247, src, dst);
1231 DO_imm_mandr_r("insertps", 248, src, dst);
1232 DO_imm_mandr_r("insertps", 249, src, dst);
1233 DO_imm_mandr_r("insertps", 250, src, dst);
1234 DO_imm_mandr_r("insertps", 251, src, dst);
1235 DO_imm_mandr_r("insertps", 252, src, dst);
1236 DO_imm_mandr_r("insertps", 253, src, dst);
1237 DO_imm_mandr_r("insertps", 254, src, dst);
1238 DO_imm_mandr_r("insertps", 255, src, dst);
1239 }
1240 }
1241
test_MPSADBW(void)1242 void test_MPSADBW ( void )
1243 {
1244 V128 src, dst;
1245 Int i;
1246 for (i = 0; i < 50; i++) {
1247 randV128(&src);
1248 randV128(&dst);
1249 DO_imm_mandr_r("mpsadbw", 0, src, dst);
1250 DO_imm_mandr_r("mpsadbw", 1, src, dst);
1251 DO_imm_mandr_r("mpsadbw", 2, src, dst);
1252 DO_imm_mandr_r("mpsadbw", 3, src, dst);
1253 DO_imm_mandr_r("mpsadbw", 4, src, dst);
1254 DO_imm_mandr_r("mpsadbw", 5, src, dst);
1255 DO_imm_mandr_r("mpsadbw", 6, src, dst);
1256 DO_imm_mandr_r("mpsadbw", 7, src, dst);
1257 }
1258 }
1259
test_PACKUSDW(void)1260 void test_PACKUSDW ( void )
1261 {
1262 V128 src, dst;
1263 Int i;
1264 for (i = 0; i < 10; i++) {
1265 if (i < 9) {
1266 randV128(&src);
1267 randV128(&dst);
1268 } else {
1269 memset(&src, 0, sizeof(src));
1270 memset(&dst, 0, sizeof(src));
1271 src[0] = 0x11; src[1] = 0x22;
1272 src[4] = 0x33; src[5] = 0x44;
1273 src[8] = 0x55; src[9] = 0x66;
1274 src[12] = 0x77; src[13] = 0x88;
1275 dst[0] = 0xaa; dst[1] = 0xbb;
1276 dst[4] = 0xcc; dst[5] = 0xdd;
1277 dst[8] = 0xee; dst[9] = 0xff;
1278 dst[12] = 0xa1; dst[13] = 0xb2;
1279 }
1280 DO_mandr_r("packusdw", src, dst);
1281 }
1282 }
1283
test_PBLENDW(void)1284 void test_PBLENDW ( void )
1285 {
1286 V128 src, dst;
1287 randV128(&src);
1288 randV128(&dst);
1289 {
1290 DO_imm_mandr_r("pblendw", 0, src, dst);
1291 DO_imm_mandr_r("pblendw", 1, src, dst);
1292 DO_imm_mandr_r("pblendw", 2, src, dst);
1293 DO_imm_mandr_r("pblendw", 3, src, dst);
1294 DO_imm_mandr_r("pblendw", 4, src, dst);
1295 DO_imm_mandr_r("pblendw", 5, src, dst);
1296 DO_imm_mandr_r("pblendw", 6, src, dst);
1297 DO_imm_mandr_r("pblendw", 7, src, dst);
1298 DO_imm_mandr_r("pblendw", 8, src, dst);
1299 DO_imm_mandr_r("pblendw", 9, src, dst);
1300 DO_imm_mandr_r("pblendw", 10, src, dst);
1301 DO_imm_mandr_r("pblendw", 11, src, dst);
1302 DO_imm_mandr_r("pblendw", 12, src, dst);
1303 DO_imm_mandr_r("pblendw", 13, src, dst);
1304 DO_imm_mandr_r("pblendw", 14, src, dst);
1305 DO_imm_mandr_r("pblendw", 15, src, dst);
1306 DO_imm_mandr_r("pblendw", 16, src, dst);
1307 DO_imm_mandr_r("pblendw", 17, src, dst);
1308 DO_imm_mandr_r("pblendw", 18, src, dst);
1309 DO_imm_mandr_r("pblendw", 19, src, dst);
1310 DO_imm_mandr_r("pblendw", 20, src, dst);
1311 DO_imm_mandr_r("pblendw", 21, src, dst);
1312 DO_imm_mandr_r("pblendw", 22, src, dst);
1313 DO_imm_mandr_r("pblendw", 23, src, dst);
1314 DO_imm_mandr_r("pblendw", 24, src, dst);
1315 DO_imm_mandr_r("pblendw", 25, src, dst);
1316 DO_imm_mandr_r("pblendw", 26, src, dst);
1317 DO_imm_mandr_r("pblendw", 27, src, dst);
1318 DO_imm_mandr_r("pblendw", 28, src, dst);
1319 DO_imm_mandr_r("pblendw", 29, src, dst);
1320 DO_imm_mandr_r("pblendw", 30, src, dst);
1321 DO_imm_mandr_r("pblendw", 31, src, dst);
1322 DO_imm_mandr_r("pblendw", 32, src, dst);
1323 DO_imm_mandr_r("pblendw", 33, src, dst);
1324 DO_imm_mandr_r("pblendw", 34, src, dst);
1325 DO_imm_mandr_r("pblendw", 35, src, dst);
1326 DO_imm_mandr_r("pblendw", 36, src, dst);
1327 DO_imm_mandr_r("pblendw", 37, src, dst);
1328 DO_imm_mandr_r("pblendw", 38, src, dst);
1329 DO_imm_mandr_r("pblendw", 39, src, dst);
1330 DO_imm_mandr_r("pblendw", 40, src, dst);
1331 DO_imm_mandr_r("pblendw", 41, src, dst);
1332 DO_imm_mandr_r("pblendw", 42, src, dst);
1333 DO_imm_mandr_r("pblendw", 43, src, dst);
1334 DO_imm_mandr_r("pblendw", 44, src, dst);
1335 DO_imm_mandr_r("pblendw", 45, src, dst);
1336 DO_imm_mandr_r("pblendw", 46, src, dst);
1337 DO_imm_mandr_r("pblendw", 47, src, dst);
1338 DO_imm_mandr_r("pblendw", 48, src, dst);
1339 DO_imm_mandr_r("pblendw", 49, src, dst);
1340 DO_imm_mandr_r("pblendw", 50, src, dst);
1341 DO_imm_mandr_r("pblendw", 51, src, dst);
1342 DO_imm_mandr_r("pblendw", 52, src, dst);
1343 DO_imm_mandr_r("pblendw", 53, src, dst);
1344 DO_imm_mandr_r("pblendw", 54, src, dst);
1345 DO_imm_mandr_r("pblendw", 55, src, dst);
1346 DO_imm_mandr_r("pblendw", 56, src, dst);
1347 DO_imm_mandr_r("pblendw", 57, src, dst);
1348 DO_imm_mandr_r("pblendw", 58, src, dst);
1349 DO_imm_mandr_r("pblendw", 59, src, dst);
1350 DO_imm_mandr_r("pblendw", 60, src, dst);
1351 DO_imm_mandr_r("pblendw", 61, src, dst);
1352 DO_imm_mandr_r("pblendw", 62, src, dst);
1353 DO_imm_mandr_r("pblendw", 63, src, dst);
1354 DO_imm_mandr_r("pblendw", 64, src, dst);
1355 DO_imm_mandr_r("pblendw", 65, src, dst);
1356 DO_imm_mandr_r("pblendw", 66, src, dst);
1357 DO_imm_mandr_r("pblendw", 67, src, dst);
1358 DO_imm_mandr_r("pblendw", 68, src, dst);
1359 DO_imm_mandr_r("pblendw", 69, src, dst);
1360 DO_imm_mandr_r("pblendw", 70, src, dst);
1361 DO_imm_mandr_r("pblendw", 71, src, dst);
1362 DO_imm_mandr_r("pblendw", 72, src, dst);
1363 DO_imm_mandr_r("pblendw", 73, src, dst);
1364 DO_imm_mandr_r("pblendw", 74, src, dst);
1365 DO_imm_mandr_r("pblendw", 75, src, dst);
1366 DO_imm_mandr_r("pblendw", 76, src, dst);
1367 DO_imm_mandr_r("pblendw", 77, src, dst);
1368 DO_imm_mandr_r("pblendw", 78, src, dst);
1369 DO_imm_mandr_r("pblendw", 79, src, dst);
1370 DO_imm_mandr_r("pblendw", 80, src, dst);
1371 DO_imm_mandr_r("pblendw", 81, src, dst);
1372 DO_imm_mandr_r("pblendw", 82, src, dst);
1373 DO_imm_mandr_r("pblendw", 83, src, dst);
1374 DO_imm_mandr_r("pblendw", 84, src, dst);
1375 DO_imm_mandr_r("pblendw", 85, src, dst);
1376 DO_imm_mandr_r("pblendw", 86, src, dst);
1377 DO_imm_mandr_r("pblendw", 87, src, dst);
1378 DO_imm_mandr_r("pblendw", 88, src, dst);
1379 DO_imm_mandr_r("pblendw", 89, src, dst);
1380 DO_imm_mandr_r("pblendw", 90, src, dst);
1381 DO_imm_mandr_r("pblendw", 91, src, dst);
1382 DO_imm_mandr_r("pblendw", 92, src, dst);
1383 DO_imm_mandr_r("pblendw", 93, src, dst);
1384 DO_imm_mandr_r("pblendw", 94, src, dst);
1385 DO_imm_mandr_r("pblendw", 95, src, dst);
1386 DO_imm_mandr_r("pblendw", 96, src, dst);
1387 DO_imm_mandr_r("pblendw", 97, src, dst);
1388 DO_imm_mandr_r("pblendw", 98, src, dst);
1389 DO_imm_mandr_r("pblendw", 99, src, dst);
1390 DO_imm_mandr_r("pblendw", 100, src, dst);
1391 DO_imm_mandr_r("pblendw", 101, src, dst);
1392 DO_imm_mandr_r("pblendw", 102, src, dst);
1393 DO_imm_mandr_r("pblendw", 103, src, dst);
1394 DO_imm_mandr_r("pblendw", 104, src, dst);
1395 DO_imm_mandr_r("pblendw", 105, src, dst);
1396 DO_imm_mandr_r("pblendw", 106, src, dst);
1397 DO_imm_mandr_r("pblendw", 107, src, dst);
1398 DO_imm_mandr_r("pblendw", 108, src, dst);
1399 DO_imm_mandr_r("pblendw", 109, src, dst);
1400 DO_imm_mandr_r("pblendw", 110, src, dst);
1401 DO_imm_mandr_r("pblendw", 111, src, dst);
1402 DO_imm_mandr_r("pblendw", 112, src, dst);
1403 DO_imm_mandr_r("pblendw", 113, src, dst);
1404 DO_imm_mandr_r("pblendw", 114, src, dst);
1405 DO_imm_mandr_r("pblendw", 115, src, dst);
1406 DO_imm_mandr_r("pblendw", 116, src, dst);
1407 DO_imm_mandr_r("pblendw", 117, src, dst);
1408 DO_imm_mandr_r("pblendw", 118, src, dst);
1409 DO_imm_mandr_r("pblendw", 119, src, dst);
1410 DO_imm_mandr_r("pblendw", 120, src, dst);
1411 DO_imm_mandr_r("pblendw", 121, src, dst);
1412 DO_imm_mandr_r("pblendw", 122, src, dst);
1413 DO_imm_mandr_r("pblendw", 123, src, dst);
1414 DO_imm_mandr_r("pblendw", 124, src, dst);
1415 DO_imm_mandr_r("pblendw", 125, src, dst);
1416 DO_imm_mandr_r("pblendw", 126, src, dst);
1417 DO_imm_mandr_r("pblendw", 127, src, dst);
1418 DO_imm_mandr_r("pblendw", 128, src, dst);
1419 DO_imm_mandr_r("pblendw", 129, src, dst);
1420 DO_imm_mandr_r("pblendw", 130, src, dst);
1421 DO_imm_mandr_r("pblendw", 131, src, dst);
1422 DO_imm_mandr_r("pblendw", 132, src, dst);
1423 DO_imm_mandr_r("pblendw", 133, src, dst);
1424 DO_imm_mandr_r("pblendw", 134, src, dst);
1425 DO_imm_mandr_r("pblendw", 135, src, dst);
1426 DO_imm_mandr_r("pblendw", 136, src, dst);
1427 DO_imm_mandr_r("pblendw", 137, src, dst);
1428 DO_imm_mandr_r("pblendw", 138, src, dst);
1429 DO_imm_mandr_r("pblendw", 139, src, dst);
1430 DO_imm_mandr_r("pblendw", 140, src, dst);
1431 DO_imm_mandr_r("pblendw", 141, src, dst);
1432 DO_imm_mandr_r("pblendw", 142, src, dst);
1433 DO_imm_mandr_r("pblendw", 143, src, dst);
1434 DO_imm_mandr_r("pblendw", 144, src, dst);
1435 DO_imm_mandr_r("pblendw", 145, src, dst);
1436 DO_imm_mandr_r("pblendw", 146, src, dst);
1437 DO_imm_mandr_r("pblendw", 147, src, dst);
1438 DO_imm_mandr_r("pblendw", 148, src, dst);
1439 DO_imm_mandr_r("pblendw", 149, src, dst);
1440 DO_imm_mandr_r("pblendw", 150, src, dst);
1441 DO_imm_mandr_r("pblendw", 151, src, dst);
1442 DO_imm_mandr_r("pblendw", 152, src, dst);
1443 DO_imm_mandr_r("pblendw", 153, src, dst);
1444 DO_imm_mandr_r("pblendw", 154, src, dst);
1445 DO_imm_mandr_r("pblendw", 155, src, dst);
1446 DO_imm_mandr_r("pblendw", 156, src, dst);
1447 DO_imm_mandr_r("pblendw", 157, src, dst);
1448 DO_imm_mandr_r("pblendw", 158, src, dst);
1449 DO_imm_mandr_r("pblendw", 159, src, dst);
1450 DO_imm_mandr_r("pblendw", 160, src, dst);
1451 DO_imm_mandr_r("pblendw", 161, src, dst);
1452 DO_imm_mandr_r("pblendw", 162, src, dst);
1453 DO_imm_mandr_r("pblendw", 163, src, dst);
1454 DO_imm_mandr_r("pblendw", 164, src, dst);
1455 DO_imm_mandr_r("pblendw", 165, src, dst);
1456 DO_imm_mandr_r("pblendw", 166, src, dst);
1457 DO_imm_mandr_r("pblendw", 167, src, dst);
1458 DO_imm_mandr_r("pblendw", 168, src, dst);
1459 DO_imm_mandr_r("pblendw", 169, src, dst);
1460 DO_imm_mandr_r("pblendw", 170, src, dst);
1461 DO_imm_mandr_r("pblendw", 171, src, dst);
1462 DO_imm_mandr_r("pblendw", 172, src, dst);
1463 DO_imm_mandr_r("pblendw", 173, src, dst);
1464 DO_imm_mandr_r("pblendw", 174, src, dst);
1465 DO_imm_mandr_r("pblendw", 175, src, dst);
1466 DO_imm_mandr_r("pblendw", 176, src, dst);
1467 DO_imm_mandr_r("pblendw", 177, src, dst);
1468 DO_imm_mandr_r("pblendw", 178, src, dst);
1469 DO_imm_mandr_r("pblendw", 179, src, dst);
1470 DO_imm_mandr_r("pblendw", 180, src, dst);
1471 DO_imm_mandr_r("pblendw", 181, src, dst);
1472 DO_imm_mandr_r("pblendw", 182, src, dst);
1473 DO_imm_mandr_r("pblendw", 183, src, dst);
1474 DO_imm_mandr_r("pblendw", 184, src, dst);
1475 DO_imm_mandr_r("pblendw", 185, src, dst);
1476 DO_imm_mandr_r("pblendw", 186, src, dst);
1477 DO_imm_mandr_r("pblendw", 187, src, dst);
1478 DO_imm_mandr_r("pblendw", 188, src, dst);
1479 DO_imm_mandr_r("pblendw", 189, src, dst);
1480 DO_imm_mandr_r("pblendw", 190, src, dst);
1481 DO_imm_mandr_r("pblendw", 191, src, dst);
1482 DO_imm_mandr_r("pblendw", 192, src, dst);
1483 DO_imm_mandr_r("pblendw", 193, src, dst);
1484 DO_imm_mandr_r("pblendw", 194, src, dst);
1485 DO_imm_mandr_r("pblendw", 195, src, dst);
1486 DO_imm_mandr_r("pblendw", 196, src, dst);
1487 DO_imm_mandr_r("pblendw", 197, src, dst);
1488 DO_imm_mandr_r("pblendw", 198, src, dst);
1489 DO_imm_mandr_r("pblendw", 199, src, dst);
1490 DO_imm_mandr_r("pblendw", 200, src, dst);
1491 DO_imm_mandr_r("pblendw", 201, src, dst);
1492 DO_imm_mandr_r("pblendw", 202, src, dst);
1493 DO_imm_mandr_r("pblendw", 203, src, dst);
1494 DO_imm_mandr_r("pblendw", 204, src, dst);
1495 DO_imm_mandr_r("pblendw", 205, src, dst);
1496 DO_imm_mandr_r("pblendw", 206, src, dst);
1497 DO_imm_mandr_r("pblendw", 207, src, dst);
1498 DO_imm_mandr_r("pblendw", 208, src, dst);
1499 DO_imm_mandr_r("pblendw", 209, src, dst);
1500 DO_imm_mandr_r("pblendw", 210, src, dst);
1501 DO_imm_mandr_r("pblendw", 211, src, dst);
1502 DO_imm_mandr_r("pblendw", 212, src, dst);
1503 DO_imm_mandr_r("pblendw", 213, src, dst);
1504 DO_imm_mandr_r("pblendw", 214, src, dst);
1505 DO_imm_mandr_r("pblendw", 215, src, dst);
1506 DO_imm_mandr_r("pblendw", 216, src, dst);
1507 DO_imm_mandr_r("pblendw", 217, src, dst);
1508 DO_imm_mandr_r("pblendw", 218, src, dst);
1509 DO_imm_mandr_r("pblendw", 219, src, dst);
1510 DO_imm_mandr_r("pblendw", 220, src, dst);
1511 DO_imm_mandr_r("pblendw", 221, src, dst);
1512 DO_imm_mandr_r("pblendw", 222, src, dst);
1513 DO_imm_mandr_r("pblendw", 223, src, dst);
1514 DO_imm_mandr_r("pblendw", 224, src, dst);
1515 DO_imm_mandr_r("pblendw", 225, src, dst);
1516 DO_imm_mandr_r("pblendw", 226, src, dst);
1517 DO_imm_mandr_r("pblendw", 227, src, dst);
1518 DO_imm_mandr_r("pblendw", 228, src, dst);
1519 DO_imm_mandr_r("pblendw", 229, src, dst);
1520 DO_imm_mandr_r("pblendw", 230, src, dst);
1521 DO_imm_mandr_r("pblendw", 231, src, dst);
1522 DO_imm_mandr_r("pblendw", 232, src, dst);
1523 DO_imm_mandr_r("pblendw", 233, src, dst);
1524 DO_imm_mandr_r("pblendw", 234, src, dst);
1525 DO_imm_mandr_r("pblendw", 235, src, dst);
1526 DO_imm_mandr_r("pblendw", 236, src, dst);
1527 DO_imm_mandr_r("pblendw", 237, src, dst);
1528 DO_imm_mandr_r("pblendw", 238, src, dst);
1529 DO_imm_mandr_r("pblendw", 239, src, dst);
1530 DO_imm_mandr_r("pblendw", 240, src, dst);
1531 DO_imm_mandr_r("pblendw", 241, src, dst);
1532 DO_imm_mandr_r("pblendw", 242, src, dst);
1533 DO_imm_mandr_r("pblendw", 243, src, dst);
1534 DO_imm_mandr_r("pblendw", 244, src, dst);
1535 DO_imm_mandr_r("pblendw", 245, src, dst);
1536 DO_imm_mandr_r("pblendw", 246, src, dst);
1537 DO_imm_mandr_r("pblendw", 247, src, dst);
1538 DO_imm_mandr_r("pblendw", 248, src, dst);
1539 DO_imm_mandr_r("pblendw", 249, src, dst);
1540 DO_imm_mandr_r("pblendw", 250, src, dst);
1541 DO_imm_mandr_r("pblendw", 251, src, dst);
1542 DO_imm_mandr_r("pblendw", 252, src, dst);
1543 DO_imm_mandr_r("pblendw", 253, src, dst);
1544 DO_imm_mandr_r("pblendw", 254, src, dst);
1545 DO_imm_mandr_r("pblendw", 255, src, dst);
1546 }
1547 }
1548
1549
test_PCMPEQQ(void)1550 void test_PCMPEQQ ( void )
1551 {
1552 V128 src, dst;
1553 Int i;
1554 for (i = 0; i < 10; i++) {
1555 randV128(&src);
1556 randV128(&dst);
1557 switch (i - 6) {
1558 case 0: memset(&src[0], 0x55, 8);
1559 memset(&dst[0], 0x55, 8); break;
1560 case 1: memset(&src[8], 0x55, 8);
1561 memset(&dst[8], 0x55, 8); break;
1562 default:
1563 break;
1564 }
1565 DO_mandr_r("pcmpeqq", src, dst);
1566 }
1567 }
1568
1569
test_PEXTRB(void)1570 void test_PEXTRB ( void )
1571 {
1572 V128 src;
1573 randV128(&src);
1574 DO_imm_r_to_mandrscalar("pextrb", 0, src, "d");
1575 DO_imm_r_to_mandrscalar("pextrb", 1, src, "d");
1576 DO_imm_r_to_mandrscalar("pextrb", 2, src, "d");
1577 DO_imm_r_to_mandrscalar("pextrb", 3, src, "d");
1578 DO_imm_r_to_mandrscalar("pextrb", 4, src, "d");
1579 DO_imm_r_to_mandrscalar("pextrb", 5, src, "d");
1580 DO_imm_r_to_mandrscalar("pextrb", 6, src, "d");
1581 DO_imm_r_to_mandrscalar("pextrb", 7, src, "d");
1582 DO_imm_r_to_mandrscalar("pextrb", 8, src, "d");
1583 DO_imm_r_to_mandrscalar("pextrb", 9, src, "d");
1584 DO_imm_r_to_mandrscalar("pextrb", 10, src, "d");
1585 DO_imm_r_to_mandrscalar("pextrb", 11, src, "d");
1586 DO_imm_r_to_mandrscalar("pextrb", 12, src, "d");
1587 DO_imm_r_to_mandrscalar("pextrb", 13, src, "d");
1588 DO_imm_r_to_mandrscalar("pextrb", 14, src, "d");
1589 DO_imm_r_to_mandrscalar("pextrb", 15, src, "d");
1590 }
1591
test_PINSRB(void)1592 void test_PINSRB ( void )
1593 {
1594 ULong src;
1595 src = randULong();
1596 DO_imm_mandrscalar_to_r("pinsrb", 0, src, "d");
1597 src = randULong();
1598 DO_imm_mandrscalar_to_r("pinsrb", 1, src, "d");
1599 src = randULong();
1600 DO_imm_mandrscalar_to_r("pinsrb", 2, src, "d");
1601 src = randULong();
1602 DO_imm_mandrscalar_to_r("pinsrb", 3, src, "d");
1603 src = randULong();
1604 DO_imm_mandrscalar_to_r("pinsrb", 4, src, "d");
1605 src = randULong();
1606 DO_imm_mandrscalar_to_r("pinsrb", 5, src, "d");
1607 src = randULong();
1608 DO_imm_mandrscalar_to_r("pinsrb", 6, src, "d");
1609 src = randULong();
1610 DO_imm_mandrscalar_to_r("pinsrb", 7, src, "d");
1611 src = randULong();
1612 DO_imm_mandrscalar_to_r("pinsrb", 8, src, "d");
1613 src = randULong();
1614 DO_imm_mandrscalar_to_r("pinsrb", 9, src, "d");
1615 src = randULong();
1616 DO_imm_mandrscalar_to_r("pinsrb", 10, src, "d");
1617 src = randULong();
1618 DO_imm_mandrscalar_to_r("pinsrb", 11, src, "d");
1619 src = randULong();
1620 DO_imm_mandrscalar_to_r("pinsrb", 12, src, "d");
1621 src = randULong();
1622 DO_imm_mandrscalar_to_r("pinsrb", 13, src, "d");
1623 src = randULong();
1624 DO_imm_mandrscalar_to_r("pinsrb", 14, src, "d");
1625 src = randULong();
1626 DO_imm_mandrscalar_to_r("pinsrb", 15, src, "d");
1627 }
1628
1629
test_PEXTRW(void)1630 void test_PEXTRW ( void )
1631 {
1632 V128 src;
1633 randV128(&src);
1634 DO_imm_r_to_mandrscalar("pextrw", 0, src, "d");
1635 DO_imm_r_to_mandrscalar("pextrw", 1, src, "d");
1636 DO_imm_r_to_mandrscalar("pextrw", 2, src, "d");
1637 DO_imm_r_to_mandrscalar("pextrw", 3, src, "d");
1638 DO_imm_r_to_mandrscalar("pextrw", 4, src, "d");
1639 DO_imm_r_to_mandrscalar("pextrw", 5, src, "d");
1640 DO_imm_r_to_mandrscalar("pextrw", 6, src, "d");
1641 DO_imm_r_to_mandrscalar("pextrw", 7, src, "d");
1642 }
1643
test_PINSRW(void)1644 void test_PINSRW ( void )
1645 {
1646 ULong src;
1647 src = randULong();
1648 DO_imm_mandrscalar_to_r("pinsrw", 0, src, "d");
1649 src = randULong();
1650 DO_imm_mandrscalar_to_r("pinsrw", 1, src, "d");
1651 src = randULong();
1652 DO_imm_mandrscalar_to_r("pinsrw", 2, src, "d");
1653 src = randULong();
1654 DO_imm_mandrscalar_to_r("pinsrw", 3, src, "d");
1655 src = randULong();
1656 DO_imm_mandrscalar_to_r("pinsrw", 4, src, "d");
1657 src = randULong();
1658 DO_imm_mandrscalar_to_r("pinsrw", 5, src, "d");
1659 src = randULong();
1660 DO_imm_mandrscalar_to_r("pinsrw", 6, src, "d");
1661 src = randULong();
1662 DO_imm_mandrscalar_to_r("pinsrw", 7, src, "d");
1663 }
1664
1665
test_PEXTRD(void)1666 void test_PEXTRD ( void )
1667 {
1668 V128 src;
1669 randV128(&src);
1670 DO_imm_r_to_mandrscalar("pextrd", 0, src, "d");
1671 DO_imm_r_to_mandrscalar("pextrd", 1, src, "d");
1672 DO_imm_r_to_mandrscalar("pextrd", 2, src, "d");
1673 DO_imm_r_to_mandrscalar("pextrd", 3, src, "d");
1674 }
1675
test_PINSRD(void)1676 void test_PINSRD ( void )
1677 {
1678 ULong src;
1679 src = randULong();
1680 DO_imm_mandrscalar_to_r("pinsrd", 0, src, "d");
1681 src = randULong();
1682 DO_imm_mandrscalar_to_r("pinsrd", 1, src, "d");
1683 src = randULong();
1684 DO_imm_mandrscalar_to_r("pinsrd", 2, src, "d");
1685 src = randULong();
1686 DO_imm_mandrscalar_to_r("pinsrd", 3, src, "d");
1687 }
1688
1689
test_PEXTRQ(void)1690 void test_PEXTRQ ( void )
1691 {
1692 V128 src;
1693 randV128(&src);
1694 DO_imm_r_to_mandrscalar("pextrq", 0, src, "");
1695 DO_imm_r_to_mandrscalar("pextrq", 1, src, "");
1696 }
1697
test_PINSRQ(void)1698 void test_PINSRQ ( void )
1699 {
1700 ULong src;
1701 src = randULong();
1702 DO_imm_mandrscalar_to_r("pinsrq", 0, src, "");
1703 src = randULong();
1704 DO_imm_mandrscalar_to_r("pinsrq", 1, src, "");
1705 }
1706
1707
test_EXTRACTPS(void)1708 void test_EXTRACTPS ( void )
1709 {
1710 V128 src;
1711 randV128(&src);
1712 DO_imm_r_to_mandrscalar("extractps", 0, src, "d");
1713 DO_imm_r_to_mandrscalar("extractps", 1, src, "d");
1714 DO_imm_r_to_mandrscalar("extractps", 2, src, "d");
1715 DO_imm_r_to_mandrscalar("extractps", 3, src, "d");
1716 }
1717
1718
test_PHMINPOSUW(void)1719 void test_PHMINPOSUW ( void )
1720 {
1721 V128 src, dst;
1722 Int i;
1723 for (i = 0; i < 20; i++) {
1724 randV128(&src);
1725 randV128(&dst);
1726 DO_mandr_r("phminposuw", src, dst);
1727 }
1728 memset(src, 0x55, sizeof(src));
1729 memset(dst, 0xAA, sizeof(dst));
1730 DO_mandr_r("phminposuw", src, dst);
1731 }
1732
test_PMAXSB(void)1733 void test_PMAXSB ( void )
1734 {
1735 V128 src, dst;
1736 Int i;
1737 for (i = 0; i < 10; i++) {
1738 randV128(&src);
1739 randV128(&dst);
1740 DO_mandr_r("pmaxsb", src, dst);
1741 }
1742 }
1743
test_PMAXSD(void)1744 void test_PMAXSD ( void )
1745 {
1746 V128 src, dst;
1747 Int i;
1748 for (i = 0; i < 10; i++) {
1749 randV128(&src);
1750 randV128(&dst);
1751 DO_mandr_r("pmaxsd", src, dst);
1752 }
1753 }
1754
test_PMAXUD(void)1755 void test_PMAXUD ( void )
1756 {
1757 V128 src, dst;
1758 Int i;
1759 for (i = 0; i < 10; i++) {
1760 randV128(&src);
1761 randV128(&dst);
1762 DO_mandr_r("pmaxud", src, dst);
1763 }
1764 }
1765
test_PMAXUW(void)1766 void test_PMAXUW ( void )
1767 {
1768 V128 src, dst;
1769 Int i;
1770 for (i = 0; i < 10; i++) {
1771 randV128(&src);
1772 randV128(&dst);
1773 DO_mandr_r("pmaxuw", src, dst);
1774 }
1775 }
1776
test_PMINSB(void)1777 void test_PMINSB ( void )
1778 {
1779 V128 src, dst;
1780 Int i;
1781 for (i = 0; i < 10; i++) {
1782 randV128(&src);
1783 randV128(&dst);
1784 DO_mandr_r("pminsb", src, dst);
1785 }
1786 }
1787
test_PMINSD(void)1788 void test_PMINSD ( void )
1789 {
1790 V128 src, dst;
1791 Int i;
1792 for (i = 0; i < 10; i++) {
1793 randV128(&src);
1794 randV128(&dst);
1795 DO_mandr_r("pminsd", src, dst);
1796 }
1797 }
1798
test_PMINUD(void)1799 void test_PMINUD ( void )
1800 {
1801 V128 src, dst;
1802 Int i;
1803 for (i = 0; i < 10; i++) {
1804 randV128(&src);
1805 randV128(&dst);
1806 DO_mandr_r("pminud", src, dst);
1807 }
1808 }
1809
test_PMINUW(void)1810 void test_PMINUW ( void )
1811 {
1812 V128 src, dst;
1813 Int i;
1814 for (i = 0; i < 10; i++) {
1815 randV128(&src);
1816 randV128(&dst);
1817 DO_mandr_r("pminuw", src, dst);
1818 }
1819 }
1820
test_PMOVSXBW(void)1821 void test_PMOVSXBW ( void )
1822 {
1823 V128 src, dst;
1824 Int i;
1825 for (i = 0; i < 10; i++) {
1826 randV128(&src);
1827 randV128(&dst);
1828 DO_mandr_r("pmovsxbw", src, dst);
1829 }
1830 }
1831
test_PMOVSXBD(void)1832 void test_PMOVSXBD ( void )
1833 {
1834 V128 src, dst;
1835 Int i;
1836 for (i = 0; i < 10; i++) {
1837 randV128(&src);
1838 randV128(&dst);
1839 DO_mandr_r("pmovsxbd", src, dst);
1840 }
1841 }
1842
test_PMOVSXBQ(void)1843 void test_PMOVSXBQ ( void )
1844 {
1845 V128 src, dst;
1846 Int i;
1847 for (i = 0; i < 10; i++) {
1848 randV128(&src);
1849 randV128(&dst);
1850 DO_mandr_r("pmovsxbq", src, dst);
1851 }
1852 }
1853
test_PMOVSXWD(void)1854 void test_PMOVSXWD ( void )
1855 {
1856 V128 src, dst;
1857 Int i;
1858 for (i = 0; i < 10; i++) {
1859 randV128(&src);
1860 randV128(&dst);
1861 DO_mandr_r("pmovsxwd", src, dst);
1862 }
1863 }
1864
test_PMOVSXWQ(void)1865 void test_PMOVSXWQ ( void )
1866 {
1867 V128 src, dst;
1868 Int i;
1869 for (i = 0; i < 10; i++) {
1870 randV128(&src);
1871 randV128(&dst);
1872 DO_mandr_r("pmovsxwq", src, dst);
1873 }
1874 }
1875
test_PMOVSXDQ(void)1876 void test_PMOVSXDQ ( void )
1877 {
1878 V128 src, dst;
1879 Int i;
1880 for (i = 0; i < 10; i++) {
1881 randV128(&src);
1882 randV128(&dst);
1883 DO_mandr_r("pmovsxdq", src, dst);
1884 }
1885 }
1886
test_PMOVZXBW(void)1887 void test_PMOVZXBW ( void )
1888 {
1889 V128 src, dst;
1890 Int i;
1891 for (i = 0; i < 10; i++) {
1892 randV128(&src);
1893 randV128(&dst);
1894 DO_mandr_r("pmovzxbw", src, dst);
1895 }
1896 }
1897
test_PMOVZXBD(void)1898 void test_PMOVZXBD ( void )
1899 {
1900 V128 src, dst;
1901 Int i;
1902 for (i = 0; i < 10; i++) {
1903 randV128(&src);
1904 randV128(&dst);
1905 DO_mandr_r("pmovzxbd", src, dst);
1906 }
1907 }
1908
test_PMOVZXBQ(void)1909 void test_PMOVZXBQ ( void )
1910 {
1911 V128 src, dst;
1912 Int i;
1913 for (i = 0; i < 10; i++) {
1914 randV128(&src);
1915 randV128(&dst);
1916 DO_mandr_r("pmovzxbq", src, dst);
1917 }
1918 }
1919
test_PMOVZXWD(void)1920 void test_PMOVZXWD ( void )
1921 {
1922 V128 src, dst;
1923 Int i;
1924 for (i = 0; i < 10; i++) {
1925 randV128(&src);
1926 randV128(&dst);
1927 DO_mandr_r("pmovzxwd", src, dst);
1928 }
1929 }
1930
test_PMOVZXWQ(void)1931 void test_PMOVZXWQ ( void )
1932 {
1933 V128 src, dst;
1934 Int i;
1935 for (i = 0; i < 10; i++) {
1936 randV128(&src);
1937 randV128(&dst);
1938 DO_mandr_r("pmovzxwq", src, dst);
1939 }
1940 }
1941
test_PMOVZXDQ(void)1942 void test_PMOVZXDQ ( void )
1943 {
1944 V128 src, dst;
1945 Int i;
1946 for (i = 0; i < 10; i++) {
1947 randV128(&src);
1948 randV128(&dst);
1949 DO_mandr_r("pmovzxdq", src, dst);
1950 }
1951 }
1952
test_PMULDQ(void)1953 void test_PMULDQ ( void )
1954 {
1955 V128 src, dst;
1956 Int i;
1957 for (i = 0; i < 10; i++) {
1958 randV128(&src);
1959 randV128(&dst);
1960 DO_mandr_r("pmuldq", src, dst);
1961 }
1962 }
1963
1964
test_PMULLD(void)1965 void test_PMULLD ( void )
1966 {
1967 V128 src, dst;
1968 Int i;
1969 for (i = 0; i < 10; i++) {
1970 randV128(&src);
1971 randV128(&dst);
1972 DO_mandr_r("pmulld", src, dst);
1973 }
1974 }
1975
1976
test_POPCNTQ(void)1977 void test_POPCNTQ ( void )
1978 {
1979 ULong block[4];
1980 Int i;
1981 ULong oszacp_mask = 0x8D5;
1982 for (i = 0; i < 10; i++) {
1983 block[0] = i == 0 ? 0 : randULong();
1984 block[1] = randULong();
1985 block[2] = randULong();
1986 block[3] = randULong();
1987 __asm__ __volatile__(
1988 "movq %0, %%rax" "\n\t"
1989 "movq 0(%%rax), %%rdi" "\n\t"
1990 "movq 8(%%rax), %%r11" "\n\t"
1991 #ifndef VGP_amd64_darwin
1992 "popcntq %%rdi, %%r11" "\n\t"
1993 #else
1994 "popcnt %%rdi, %%r11" "\n\t"
1995 #endif
1996 "movq %%r11, 16(%%rax)" "\n\t"
1997 "pushfq" "\n\t"
1998 "popq %%r12" "\n\t"
1999 "movq %%r12, 24(%%rax)" "\n"
2000 : /*out*/
2001 : /*in*/"r"(&block[0])
2002 : /*trash*/ "cc", "memory", "rdi", "r11", "r12"
2003 );
2004 printf("r popcntq %016llx %016llx %016llx %016llx\n",
2005 block[0], block[1], block[2], block[3] & oszacp_mask);
2006
2007 block[0] = i == 0 ? 0 : randULong();
2008 block[1] = randULong();
2009 block[2] = randULong();
2010 block[3] = randULong();
2011 __asm__ __volatile__(
2012 "movq %0, %%rax" "\n\t"
2013 "movq 8(%%rax), %%r11" "\n\t"
2014 #ifndef VGP_amd64_darwin
2015 "popcntq 0(%%rax), %%r11" "\n\t"
2016 #else
2017 "popcnt 0(%%rax), %%r11" "\n\t"
2018 #endif
2019 "movq %%r11, 16(%%rax)" "\n\t"
2020 "pushfq" "\n\t"
2021 "popq %%r12" "\n\t"
2022 "movq %%r12, 24(%%rax)" "\n"
2023 : /*out*/
2024 : /*in*/"r"(&block[0])
2025 : /*trash*/ "cc", "memory", "r11", "r12"
2026 );
2027 printf("m popcntq %016llx %016llx %016llx %016llx\n",
2028 block[0], block[1], block[2], block[3] & oszacp_mask);
2029 }
2030 }
2031
2032
test_POPCNTL(void)2033 void test_POPCNTL ( void )
2034 {
2035 ULong block[4];
2036 Int i;
2037 ULong oszacp_mask = 0x8D5;
2038 for (i = 0; i < 10; i++) {
2039 block[0] = i == 0 ? 0 : randULong();
2040 block[1] = randULong();
2041 block[2] = randULong();
2042 block[3] = randULong();
2043 __asm__ __volatile__(
2044 "movq %0, %%rax" "\n\t"
2045 "movq 0(%%rax), %%rdi" "\n\t"
2046 "movq 8(%%rax), %%r11" "\n\t"
2047 #ifndef VGP_amd64_darwin
2048 "popcntl %%edi, %%r11d" "\n\t"
2049 #else
2050 "popcnt %%edi, %%r11d" "\n\t"
2051 #endif
2052 "movq %%r11, 16(%%rax)" "\n\t"
2053 "pushfq" "\n\t"
2054 "popq %%r12" "\n\t"
2055 "movq %%r12, 24(%%rax)" "\n"
2056 : /*out*/
2057 : /*in*/"r"(&block[0])
2058 : /*trash*/ "cc", "memory", "rdi", "r11", "r12"
2059 );
2060 printf("r popcntl %016llx %016llx %016llx %016llx\n",
2061 block[0], block[1], block[2], block[3] & oszacp_mask);
2062
2063 block[0] = i == 0 ? 0 : randULong();
2064 block[1] = randULong();
2065 block[2] = randULong();
2066 block[3] = randULong();
2067 __asm__ __volatile__(
2068 "movq %0, %%rax" "\n\t"
2069 "movq 8(%%rax), %%r11" "\n\t"
2070 #ifndef VGP_amd64_darwin
2071 "popcntl 0(%%rax), %%r11d" "\n\t"
2072 #else
2073 "popcnt 0(%%rax), %%r11d" "\n\t"
2074 #endif
2075 "movq %%r11, 16(%%rax)" "\n\t"
2076 "pushfq" "\n\t"
2077 "popq %%r12" "\n\t"
2078 "movq %%r12, 24(%%rax)" "\n"
2079 : /*out*/
2080 : /*in*/"r"(&block[0])
2081 : /*trash*/ "cc", "memory", "r11", "r12"
2082 );
2083 printf("m popcntl %016llx %016llx %016llx %016llx\n",
2084 block[0], block[1], block[2], block[3] & oszacp_mask);
2085 }
2086 }
2087
2088
test_POPCNTW(void)2089 void test_POPCNTW ( void )
2090 {
2091 ULong block[4];
2092 Int i;
2093 ULong oszacp_mask = 0x8D5;
2094 for (i = 0; i < 10; i++) {
2095 block[0] = i == 0 ? 0 : randULong();
2096 block[1] = randULong();
2097 block[2] = randULong();
2098 block[3] = randULong();
2099 __asm__ __volatile__(
2100 "movq %0, %%rax" "\n\t"
2101 "movq 0(%%rax), %%rdi" "\n\t"
2102 "movq 8(%%rax), %%r11" "\n\t"
2103 #ifndef VGP_amd64_darwin
2104 "popcntw %%di, %%r11w" "\n\t"
2105 #else
2106 "popcnt %%di, %%r11w" "\n\t"
2107 #endif
2108 "movq %%r11, 16(%%rax)" "\n\t"
2109 "pushfq" "\n\t"
2110 "popq %%r12" "\n\t"
2111 "movq %%r12, 24(%%rax)" "\n"
2112 : /*out*/
2113 : /*in*/"r"(&block[0])
2114 : /*trash*/ "cc", "memory", "rdi", "r11", "r12"
2115 );
2116 printf("r popcntw %016llx %016llx %016llx %016llx\n",
2117 block[0], block[1], block[2], block[3] & oszacp_mask);
2118
2119 block[0] = i == 0 ? 0 : randULong();
2120 block[1] = randULong();
2121 block[2] = randULong();
2122 block[3] = randULong();
2123 __asm__ __volatile__(
2124 "movq %0, %%rax" "\n\t"
2125 "movq 8(%%rax), %%r11" "\n\t"
2126 #ifndef VGP_amd64_darwin
2127 "popcntw 0(%%rax), %%r11w" "\n\t"
2128 #else
2129 "popcnt 0(%%rax), %%r11w" "\n\t"
2130 #endif
2131 "movq %%r11, 16(%%rax)" "\n\t"
2132 "pushfq" "\n\t"
2133 "popq %%r12" "\n\t"
2134 "movq %%r12, 24(%%rax)" "\n"
2135 : /*out*/
2136 : /*in*/"r"(&block[0])
2137 : /*trash*/ "cc", "memory", "r11", "r12"
2138 );
2139 printf("m popcntw %016llx %016llx %016llx %016llx\n",
2140 block[0], block[1], block[2], block[3] & oszacp_mask);
2141 }
2142 }
2143
2144
test_PCMPGTQ(void)2145 void test_PCMPGTQ ( void )
2146 {
2147 V128 spec[7];
2148 do64HLtoV128( &spec[0], 0x0000000000000000ULL, 0xffffffffffffffffULL );
2149 do64HLtoV128( &spec[1], 0x0000000000000001ULL, 0xfffffffffffffffeULL );
2150 do64HLtoV128( &spec[2], 0x7fffffffffffffffULL, 0x8000000000000001ULL );
2151 do64HLtoV128( &spec[3], 0x8000000000000000ULL, 0x8000000000000000ULL );
2152 do64HLtoV128( &spec[4], 0x8000000000000001ULL, 0x7fffffffffffffffULL );
2153 do64HLtoV128( &spec[5], 0xfffffffffffffffeULL, 0x0000000000000001ULL );
2154 do64HLtoV128( &spec[6], 0xffffffffffffffffULL, 0x0000000000000000ULL );
2155
2156 V128 src, dst;
2157 Int i, j;
2158 for (i = 0; i < 10; i++) {
2159 randV128(&src);
2160 randV128(&dst);
2161 DO_mandr_r("pcmpgtq", src, dst);
2162 }
2163 for (i = 0; i < 7; i++) {
2164 for (j = 0; j < 7; j++) {
2165 memcpy(&src, &spec[i], 16);
2166 memcpy(&dst, &spec[j], 16);
2167 DO_mandr_r("pcmpgtq", src, dst);
2168 }
2169 }
2170 }
2171
2172 /* ------------ ROUNDSD ------------ */
2173
/* Execute "roundsd $0" (imm8 = 0: round to nearest even, per the Intel
   SDM) with *dst as the destination register contents.

   mem  - true: the source operand is the memory at src;
          false: *src is first loaded into %xmm2 and the register form
          of the instruction is used.
   src  - 16-byte source vector (read only).
   dst  - 16-byte vector loaded into %xmm11 before the instruction and
          overwritten with %xmm11 afterwards.

   The asm reads and writes memory through pointers that are passed only
   as register inputs, so "memory" is listed as a clobber; otherwise the
   compiler may reorder or cache accesses to *src / *dst around it. */
void do_ROUNDSD_000 ( Bool mem, V128* src, /*OUT*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "roundsd $0, (%0), %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11", "memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "movupd (%0), %%xmm2" "\n\t"
         "roundsd $0, %%xmm2, %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11","xmm2", "memory"
      );
   }
}
2197
/* Execute "roundsd $1" (imm8 = 1: round toward minus infinity, per the
   Intel SDM) with *dst as the destination register contents.

   mem  - true: the source operand is the memory at src;
          false: *src is first loaded into %xmm2 and the register form
          of the instruction is used.
   src  - 16-byte source vector (read only).
   dst  - 16-byte vector loaded into %xmm11 before the instruction and
          overwritten with %xmm11 afterwards.

   The asm reads and writes memory through pointers that are passed only
   as register inputs, so "memory" is listed as a clobber; otherwise the
   compiler may reorder or cache accesses to *src / *dst around it. */
void do_ROUNDSD_001 ( Bool mem, V128* src, /*OUT*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "roundsd $1, (%0), %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11", "memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "movupd (%0), %%xmm2" "\n\t"
         "roundsd $1, %%xmm2, %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11","xmm2", "memory"
      );
   }
}
2221
/* Execute "roundsd $2" (imm8 = 2: round toward plus infinity, per the
   Intel SDM) with *dst as the destination register contents.

   mem  - true: the source operand is the memory at src;
          false: *src is first loaded into %xmm2 and the register form
          of the instruction is used.
   src  - 16-byte source vector (read only).
   dst  - 16-byte vector loaded into %xmm11 before the instruction and
          overwritten with %xmm11 afterwards.

   The asm reads and writes memory through pointers that are passed only
   as register inputs, so "memory" is listed as a clobber; otherwise the
   compiler may reorder or cache accesses to *src / *dst around it. */
void do_ROUNDSD_010 ( Bool mem, V128* src, /*OUT*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "roundsd $2, (%0), %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11", "memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "movupd (%0), %%xmm2" "\n\t"
         "roundsd $2, %%xmm2, %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11","xmm2", "memory"
      );
   }
}
2245
/* Execute "roundsd $3" (imm8 = 3: round toward zero, per the Intel SDM)
   with *dst as the destination register contents.

   mem  - true: the source operand is the memory at src;
          false: *src is first loaded into %xmm2 and the register form
          of the instruction is used.
   src  - 16-byte source vector (read only).
   dst  - 16-byte vector loaded into %xmm11 before the instruction and
          overwritten with %xmm11 afterwards.

   The asm reads and writes memory through pointers that are passed only
   as register inputs, so "memory" is listed as a clobber; otherwise the
   compiler may reorder or cache accesses to *src / *dst around it. */
void do_ROUNDSD_011 ( Bool mem, V128* src, /*OUT*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "roundsd $3, (%0), %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11", "memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "movupd (%0), %%xmm2" "\n\t"
         "roundsd $3, %%xmm2, %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11","xmm2", "memory"
      );
   }
}
2269
/* Execute "roundsd $4" (imm8 bit 2 set: the rounding mode is taken from
   MXCSR rather than from the immediate, per the Intel SDM) with *dst as
   the destination register contents.

   mem  - true: the source operand is the memory at src;
          false: *src is first loaded into %xmm2 and the register form
          of the instruction is used.
   src  - 16-byte source vector (read only).
   dst  - 16-byte vector loaded into %xmm11 before the instruction and
          overwritten with %xmm11 afterwards.

   The asm reads and writes memory through pointers that are passed only
   as register inputs, so "memory" is listed as a clobber; otherwise the
   compiler may reorder or cache accesses to *src / *dst around it. */
void do_ROUNDSD_1XX ( Bool mem, V128* src, /*OUT*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "roundsd $4, (%0), %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11", "memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "movupd (%0), %%xmm2" "\n\t"
         "roundsd $4, %%xmm2, %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11","xmm2", "memory"
      );
   }
}
2293
test_ROUNDSD_w_immediate_rounding(void)2294 void test_ROUNDSD_w_immediate_rounding ( void )
2295 {
2296 double vals[22];
2297 Int i = 0;
2298 vals[i++] = 0.0;
2299 vals[i++] = -0.0;
2300 vals[i++] = mkPosInf();
2301 vals[i++] = mkNegInf();
2302 vals[i++] = mkPosNan();
2303 vals[i++] = mkNegNan();
2304 vals[i++] = -1.3;
2305 vals[i++] = -1.1;
2306 vals[i++] = -0.9;
2307 vals[i++] = -0.7;
2308 vals[i++] = -0.50001;
2309 vals[i++] = -0.49999;
2310 vals[i++] = -0.3;
2311 vals[i++] = -0.1;
2312 vals[i++] = 0.1;
2313 vals[i++] = 0.3;
2314 vals[i++] = 0.49999;
2315 vals[i++] = 0.50001;
2316 vals[i++] = 0.7;
2317 vals[i++] = 0.9;
2318 vals[i++] = 1.1;
2319 vals[i++] = 1.3;
2320 assert(i == 22);
2321
2322 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
2323 V128 src, dst;
2324
2325 randV128(&src);
2326 randV128(&dst);
2327 memcpy(&src[0], &vals[i], 8);
2328 do_ROUNDSD_000(False/*reg*/, &src, &dst);
2329 printf("r roundsd_000 ");
2330 showV128(&src);
2331 printf(" ");
2332 showV128(&dst);
2333 printf(" %10f %10f", vals[i], *(double*)(&dst[0]));
2334 printf("\n");
2335
2336 randV128(&src);
2337 randV128(&dst);
2338 memcpy(&src[0], &vals[i], 8);
2339 do_ROUNDSD_000(True/*mem*/, &src, &dst);
2340 printf("m roundsd_000 ");
2341 showV128(&src);
2342 printf(" ");
2343 showV128(&dst);
2344 printf(" %10f %10f", vals[i], *(double*)(&dst[0]));
2345 printf("\n");
2346
2347
2348 randV128(&src);
2349 randV128(&dst);
2350 memcpy(&src[0], &vals[i], 8);
2351 do_ROUNDSD_001(False/*reg*/, &src, &dst);
2352 printf("r roundsd_001 ");
2353 showV128(&src);
2354 printf(" ");
2355 showV128(&dst);
2356 printf(" %10f %10f", vals[i], *(double*)(&dst[0]));
2357 printf("\n");
2358
2359 randV128(&src);
2360 randV128(&dst);
2361 memcpy(&src[0], &vals[i], 8);
2362 do_ROUNDSD_001(True/*mem*/, &src, &dst);
2363 printf("m roundsd_001 ");
2364 showV128(&src);
2365 printf(" ");
2366 showV128(&dst);
2367 printf(" %10f %10f", vals[i], *(double*)(&dst[0]));
2368 printf("\n");
2369
2370
2371 randV128(&src);
2372 randV128(&dst);
2373 memcpy(&src[0], &vals[i], 8);
2374 do_ROUNDSD_010(False/*reg*/, &src, &dst);
2375 printf("r roundsd_010 ");
2376 showV128(&src);
2377 printf(" ");
2378 showV128(&dst);
2379 printf(" %10f %10f", vals[i], *(double*)(&dst[0]));
2380 printf("\n");
2381
2382 randV128(&src);
2383 randV128(&dst);
2384 memcpy(&src[0], &vals[i], 8);
2385 do_ROUNDSD_010(True/*mem*/, &src, &dst);
2386 printf("m roundsd_010 ");
2387 showV128(&src);
2388 printf(" ");
2389 showV128(&dst);
2390 printf(" %10f %10f", vals[i], *(double*)(&dst[0]));
2391 printf("\n");
2392
2393
2394 randV128(&src);
2395 randV128(&dst);
2396 memcpy(&src[0], &vals[i], 8);
2397 do_ROUNDSD_011(False/*reg*/, &src, &dst);
2398 printf("r roundsd_011 ");
2399 showV128(&src);
2400 printf(" ");
2401 showV128(&dst);
2402 printf(" %10f %10f", vals[i], *(double*)(&dst[0]));
2403 printf("\n");
2404
2405 randV128(&src);
2406 randV128(&dst);
2407 memcpy(&src[0], &vals[i], 8);
2408 do_ROUNDSD_011(True/*mem*/, &src, &dst);
2409 printf("m roundsd_011 ");
2410 showV128(&src);
2411 printf(" ");
2412 showV128(&dst);
2413 printf(" %10f %10f", vals[i], *(double*)(&dst[0]));
2414 printf("\n");
2415 }
2416 }
2417
test_ROUNDSD_w_mxcsr_rounding(void)2418 void test_ROUNDSD_w_mxcsr_rounding ( void )
2419 {
2420 UInt rm;
2421 double vals[22];
2422 Int i = 0;
2423 vals[i++] = 0.0;
2424 vals[i++] = -0.0;
2425 vals[i++] = mkPosInf();
2426 vals[i++] = mkNegInf();
2427 vals[i++] = mkPosNan();
2428 vals[i++] = mkNegNan();
2429 vals[i++] = -1.3;
2430 vals[i++] = -1.1;
2431 vals[i++] = -0.9;
2432 vals[i++] = -0.7;
2433 vals[i++] = -0.50001;
2434 vals[i++] = -0.49999;
2435 vals[i++] = -0.3;
2436 vals[i++] = -0.1;
2437 vals[i++] = 0.1;
2438 vals[i++] = 0.3;
2439 vals[i++] = 0.49999;
2440 vals[i++] = 0.50001;
2441 vals[i++] = 0.7;
2442 vals[i++] = 0.9;
2443 vals[i++] = 1.1;
2444 vals[i++] = 1.3;
2445 assert(i == 22);
2446
2447 rm = get_sse_roundingmode();
2448 assert(rm == 0); // 0 == RN == default
2449
2450 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
2451 V128 src, dst;
2452
2453 for (rm = 0; rm <= 3; rm++) {
2454 set_sse_roundingmode(rm);
2455
2456 randV128(&src);
2457 randV128(&dst);
2458 memcpy(&src[0], &vals[i], 8);
2459 do_ROUNDSD_1XX(False/*reg*/, &src, &dst);
2460 printf("r (rm=%u) roundsd_1XX ", rm);
2461 showV128(&src);
2462 printf(" ");
2463 showV128(&dst);
2464 printf(" %10f %10f", vals[i], *(double*)(&dst[0]));
2465 printf("\n");
2466
2467 randV128(&src);
2468 randV128(&dst);
2469 memcpy(&src[0], &vals[i], 8);
2470 do_ROUNDSD_1XX(True/*mem*/, &src, &dst);
2471 printf("m (rm=%u) roundsd_1XX ", rm);
2472 showV128(&src);
2473 printf(" ");
2474 showV128(&dst);
2475 printf(" %10f %10f", vals[i], *(double*)(&dst[0]));
2476 printf("\n");
2477 }
2478 }
2479
2480 rm = get_sse_roundingmode();
2481 assert(rm == 3);
2482 set_sse_roundingmode(0);
2483 rm = get_sse_roundingmode();
2484 assert(rm == 0); // 0 == RN == default
2485 }
2486
2487
2488 /* ------------ ROUNDSS ------------ */
2489
/* Execute "roundss $0" (imm8 = 0: round to nearest even, per the Intel
   SDM) with *dst as the destination register contents.

   mem  - true: the source operand is the memory at src;
          false: *src is first loaded into %xmm2 and the register form
          of the instruction is used.
   src  - 16-byte source vector (read only).
   dst  - 16-byte vector loaded into %xmm11 before the instruction and
          overwritten with %xmm11 afterwards.

   The asm reads and writes memory through pointers that are passed only
   as register inputs, so "memory" is listed as a clobber; otherwise the
   compiler may reorder or cache accesses to *src / *dst around it. */
void do_ROUNDSS_000 ( Bool mem, V128* src, /*OUT*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "roundss $0, (%0), %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11", "memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "movupd (%0), %%xmm2" "\n\t"
         "roundss $0, %%xmm2, %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11","xmm2", "memory"
      );
   }
}
2513
/* Execute "roundss $1" (imm8 = 1: round toward minus infinity, per the
   Intel SDM) with *dst as the destination register contents.

   mem  - true: the source operand is the memory at src;
          false: *src is first loaded into %xmm2 and the register form
          of the instruction is used.
   src  - 16-byte source vector (read only).
   dst  - 16-byte vector loaded into %xmm11 before the instruction and
          overwritten with %xmm11 afterwards.

   The asm reads and writes memory through pointers that are passed only
   as register inputs, so "memory" is listed as a clobber; otherwise the
   compiler may reorder or cache accesses to *src / *dst around it. */
void do_ROUNDSS_001 ( Bool mem, V128* src, /*OUT*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "roundss $1, (%0), %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11", "memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "movupd (%0), %%xmm2" "\n\t"
         "roundss $1, %%xmm2, %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11","xmm2", "memory"
      );
   }
}
2537
/* Execute "roundss $2" (imm8 = 2: round toward plus infinity, per the
   Intel SDM) with *dst as the destination register contents.

   mem  - true: the source operand is the memory at src;
          false: *src is first loaded into %xmm2 and the register form
          of the instruction is used.
   src  - 16-byte source vector (read only).
   dst  - 16-byte vector loaded into %xmm11 before the instruction and
          overwritten with %xmm11 afterwards.

   The asm reads and writes memory through pointers that are passed only
   as register inputs, so "memory" is listed as a clobber; otherwise the
   compiler may reorder or cache accesses to *src / *dst around it. */
void do_ROUNDSS_010 ( Bool mem, V128* src, /*OUT*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "roundss $2, (%0), %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11", "memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "movupd (%0), %%xmm2" "\n\t"
         "roundss $2, %%xmm2, %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11","xmm2", "memory"
      );
   }
}
2561
/* Execute "roundss $3" (imm8 = 3: round toward zero, per the Intel SDM)
   with *dst as the destination register contents.

   mem  - true: the source operand is the memory at src;
          false: *src is first loaded into %xmm2 and the register form
          of the instruction is used.
   src  - 16-byte source vector (read only).
   dst  - 16-byte vector loaded into %xmm11 before the instruction and
          overwritten with %xmm11 afterwards.

   The asm reads and writes memory through pointers that are passed only
   as register inputs, so "memory" is listed as a clobber; otherwise the
   compiler may reorder or cache accesses to *src / *dst around it. */
void do_ROUNDSS_011 ( Bool mem, V128* src, /*OUT*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "roundss $3, (%0), %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11", "memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "movupd (%0), %%xmm2" "\n\t"
         "roundss $3, %%xmm2, %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11","xmm2", "memory"
      );
   }
}
2585
/* Execute "roundss $4" (imm8 bit 2 set: the rounding mode is taken from
   MXCSR rather than from the immediate, per the Intel SDM) with *dst as
   the destination register contents.

   mem  - true: the source operand is the memory at src;
          false: *src is first loaded into %xmm2 and the register form
          of the instruction is used.
   src  - 16-byte source vector (read only).
   dst  - 16-byte vector loaded into %xmm11 before the instruction and
          overwritten with %xmm11 afterwards.

   The asm reads and writes memory through pointers that are passed only
   as register inputs, so "memory" is listed as a clobber; otherwise the
   compiler may reorder or cache accesses to *src / *dst around it. */
void do_ROUNDSS_1XX ( Bool mem, V128* src, /*OUT*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "roundss $4, (%0), %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11", "memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "movupd (%0), %%xmm2" "\n\t"
         "roundss $4, %%xmm2, %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11","xmm2", "memory"
      );
   }
}
2609
test_ROUNDSS_w_immediate_rounding(void)2610 void test_ROUNDSS_w_immediate_rounding ( void )
2611 {
2612 float vals[22];
2613 Int i = 0;
2614 vals[i++] = 0.0;
2615 vals[i++] = -0.0;
2616 vals[i++] = mkPosInf();
2617 vals[i++] = mkNegInf();
2618 vals[i++] = mkPosNan();
2619 vals[i++] = mkNegNan();
2620 vals[i++] = -1.3;
2621 vals[i++] = -1.1;
2622 vals[i++] = -0.9;
2623 vals[i++] = -0.7;
2624 vals[i++] = -0.50001;
2625 vals[i++] = -0.49999;
2626 vals[i++] = -0.3;
2627 vals[i++] = -0.1;
2628 vals[i++] = 0.1;
2629 vals[i++] = 0.3;
2630 vals[i++] = 0.49999;
2631 vals[i++] = 0.50001;
2632 vals[i++] = 0.7;
2633 vals[i++] = 0.9;
2634 vals[i++] = 1.1;
2635 vals[i++] = 1.3;
2636 assert(i == 22);
2637
2638 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
2639 V128 src, dst;
2640
2641 randV128(&src);
2642 randV128(&dst);
2643 memcpy(&src[0], &vals[i], 4);
2644 do_ROUNDSS_000(False/*reg*/, &src, &dst);
2645 printf("r roundss_000 ");
2646 showV128(&src);
2647 printf(" ");
2648 showV128(&dst);
2649 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2650 printf("\n");
2651
2652 randV128(&src);
2653 randV128(&dst);
2654 memcpy(&src[0], &vals[i], 4);
2655 do_ROUNDSS_000(True/*mem*/, &src, &dst);
2656 printf("m roundss_000 ");
2657 showV128(&src);
2658 printf(" ");
2659 showV128(&dst);
2660 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2661 printf("\n");
2662
2663
2664 randV128(&src);
2665 randV128(&dst);
2666 memcpy(&src[0], &vals[i], 4);
2667 do_ROUNDSS_001(False/*reg*/, &src, &dst);
2668 printf("r roundss_001 ");
2669 showV128(&src);
2670 printf(" ");
2671 showV128(&dst);
2672 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2673 printf("\n");
2674
2675 randV128(&src);
2676 randV128(&dst);
2677 memcpy(&src[0], &vals[i], 4);
2678 do_ROUNDSS_001(True/*mem*/, &src, &dst);
2679 printf("m roundss_001 ");
2680 showV128(&src);
2681 printf(" ");
2682 showV128(&dst);
2683 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2684 printf("\n");
2685
2686
2687 randV128(&src);
2688 randV128(&dst);
2689 memcpy(&src[0], &vals[i], 4);
2690 do_ROUNDSS_010(False/*reg*/, &src, &dst);
2691 printf("r roundss_010 ");
2692 showV128(&src);
2693 printf(" ");
2694 showV128(&dst);
2695 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2696 printf("\n");
2697
2698 randV128(&src);
2699 randV128(&dst);
2700 memcpy(&src[0], &vals[i], 4);
2701 do_ROUNDSS_010(True/*mem*/, &src, &dst);
2702 printf("m roundss_010 ");
2703 showV128(&src);
2704 printf(" ");
2705 showV128(&dst);
2706 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2707 printf("\n");
2708
2709
2710 randV128(&src);
2711 randV128(&dst);
2712 memcpy(&src[0], &vals[i], 4);
2713 do_ROUNDSS_011(False/*reg*/, &src, &dst);
2714 printf("r roundss_011 ");
2715 showV128(&src);
2716 printf(" ");
2717 showV128(&dst);
2718 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2719 printf("\n");
2720
2721 randV128(&src);
2722 randV128(&dst);
2723 memcpy(&src[0], &vals[i], 4);
2724 do_ROUNDSS_011(True/*mem*/, &src, &dst);
2725 printf("m roundss_011 ");
2726 showV128(&src);
2727 printf(" ");
2728 showV128(&dst);
2729 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2730 printf("\n");
2731 }
2732 }
2733
test_ROUNDSS_w_mxcsr_rounding(void)2734 void test_ROUNDSS_w_mxcsr_rounding ( void )
2735 {
2736 UInt rm;
2737 float vals[22];
2738 Int i = 0;
2739 vals[i++] = 0.0;
2740 vals[i++] = -0.0;
2741 vals[i++] = mkPosInf();
2742 vals[i++] = mkNegInf();
2743 vals[i++] = mkPosNan();
2744 vals[i++] = mkNegNan();
2745 vals[i++] = -1.3;
2746 vals[i++] = -1.1;
2747 vals[i++] = -0.9;
2748 vals[i++] = -0.7;
2749 vals[i++] = -0.50001;
2750 vals[i++] = -0.49999;
2751 vals[i++] = -0.3;
2752 vals[i++] = -0.1;
2753 vals[i++] = 0.1;
2754 vals[i++] = 0.3;
2755 vals[i++] = 0.49999;
2756 vals[i++] = 0.50001;
2757 vals[i++] = 0.7;
2758 vals[i++] = 0.9;
2759 vals[i++] = 1.1;
2760 vals[i++] = 1.3;
2761 assert(i == 22);
2762
2763 rm = get_sse_roundingmode();
2764 assert(rm == 0); // 0 == RN == default
2765
2766 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
2767 V128 src, dst;
2768
2769 for (rm = 0; rm <= 3; rm++) {
2770 set_sse_roundingmode(rm);
2771
2772 randV128(&src);
2773 randV128(&dst);
2774 memcpy(&src[0], &vals[i], 4);
2775 do_ROUNDSS_1XX(False/*reg*/, &src, &dst);
2776 printf("r (rm=%u) roundss_1XX ", rm);
2777 showV128(&src);
2778 printf(" ");
2779 showV128(&dst);
2780 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2781 printf("\n");
2782
2783 randV128(&src);
2784 randV128(&dst);
2785 memcpy(&src[0], &vals[i], 4);
2786 do_ROUNDSS_1XX(True/*mem*/, &src, &dst);
2787 printf("m (rm=%u) roundss_1XX ", rm);
2788 showV128(&src);
2789 printf(" ");
2790 showV128(&dst);
2791 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2792 printf("\n");
2793 }
2794 }
2795
2796 rm = get_sse_roundingmode();
2797 assert(rm == 3);
2798 set_sse_roundingmode(0);
2799 rm = get_sse_roundingmode();
2800 assert(rm == 0); // 0 == RN == default
2801 }
2802
2803 /* ------------ ROUNDPD ------------ */
2804
/* Execute "roundpd $0" (imm8 = 0: round to nearest even, per the Intel
   SDM) with %xmm11 as the destination register.

   mem  - true: the source operand is the memory at src;
          false: *src is first loaded into %xmm2 and the register form
          of the instruction is used.
   src  - 16-byte source vector (read only).
   dst  - 16-byte vector loaded into %xmm11 before the instruction and
          overwritten with %xmm11 afterwards.

   The asm reads and writes memory through pointers that are passed only
   as register inputs, so "memory" is listed as a clobber; otherwise the
   compiler may reorder or cache accesses to *src / *dst around it. */
void do_ROUNDPD_000 ( Bool mem, V128* src, /*OUT*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "roundpd $0, (%0), %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11", "memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "movupd (%0), %%xmm2" "\n\t"
         "roundpd $0, %%xmm2, %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11","xmm2", "memory"
      );
   }
}
2828
/* Execute "roundpd $1" (imm8 = 1: round toward minus infinity, per the
   Intel SDM) with %xmm11 as the destination register.

   mem  - true: the source operand is the memory at src;
          false: *src is first loaded into %xmm2 and the register form
          of the instruction is used.
   src  - 16-byte source vector (read only).
   dst  - 16-byte vector loaded into %xmm11 before the instruction and
          overwritten with %xmm11 afterwards.

   The asm reads and writes memory through pointers that are passed only
   as register inputs, so "memory" is listed as a clobber; otherwise the
   compiler may reorder or cache accesses to *src / *dst around it. */
void do_ROUNDPD_001 ( Bool mem, V128* src, /*OUT*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "roundpd $1, (%0), %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11", "memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "movupd (%0), %%xmm2" "\n\t"
         "roundpd $1, %%xmm2, %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11","xmm2", "memory"
      );
   }
}
2852
/* Execute "roundpd $2" (imm8 = 2: round toward plus infinity, per the
   Intel SDM) with %xmm11 as the destination register.

   mem  - true: the source operand is the memory at src;
          false: *src is first loaded into %xmm2 and the register form
          of the instruction is used.
   src  - 16-byte source vector (read only).
   dst  - 16-byte vector loaded into %xmm11 before the instruction and
          overwritten with %xmm11 afterwards.

   The asm reads and writes memory through pointers that are passed only
   as register inputs, so "memory" is listed as a clobber; otherwise the
   compiler may reorder or cache accesses to *src / *dst around it. */
void do_ROUNDPD_010 ( Bool mem, V128* src, /*OUT*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "roundpd $2, (%0), %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11", "memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "movupd (%0), %%xmm2" "\n\t"
         "roundpd $2, %%xmm2, %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11","xmm2", "memory"
      );
   }
}
2876
/* Execute "roundpd $3" (imm8 = 3: round toward zero, per the Intel SDM)
   with %xmm11 as the destination register.

   mem  - true: the source operand is the memory at src;
          false: *src is first loaded into %xmm2 and the register form
          of the instruction is used.
   src  - 16-byte source vector (read only).
   dst  - 16-byte vector loaded into %xmm11 before the instruction and
          overwritten with %xmm11 afterwards.

   The asm reads and writes memory through pointers that are passed only
   as register inputs, so "memory" is listed as a clobber; otherwise the
   compiler may reorder or cache accesses to *src / *dst around it. */
void do_ROUNDPD_011 ( Bool mem, V128* src, /*OUT*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "roundpd $3, (%0), %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11", "memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "movupd (%0), %%xmm2" "\n\t"
         "roundpd $3, %%xmm2, %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11","xmm2", "memory"
      );
   }
}
2900
/* Execute ROUNDPD with imm8 = 4 (binary 100) on the 128-bit value at
   *src and leave the result in *dst.  Per the Intel SDM, imm8 bit 2 set
   means the rounding mode is taken from MXCSR rather than from the
   immediate, which is what the "w_mxcsr_rounding" test relies on.

   dst is first loaded into xmm11 and xmm11 is stored back to dst after
   the rounding instruction.

   mem == True  : the source operand is read directly from memory at src.
   mem == False : the source is first loaded into xmm2 (register form).

   The "memory" clobber tells the compiler that the asm reads *src and
   writes *dst behind its back (both are reached only via pointer
   operands). */
void do_ROUNDPD_1XX ( Bool mem, V128* src, /*OUT*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "roundpd $4, (%0), %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11","memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "movupd (%0), %%xmm2" "\n\t"
         "roundpd $4, %%xmm2, %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11","xmm2","memory"
      );
   }
}
2924
test_ROUNDPD_w_immediate_rounding(void)2925 void test_ROUNDPD_w_immediate_rounding ( void )
2926 {
2927 double vals[22];
2928 Int i = 0;
2929 vals[i++] = 0.0;
2930 vals[i++] = -0.0;
2931 vals[i++] = mkPosInf();
2932 vals[i++] = mkNegInf();
2933 vals[i++] = mkPosNan();
2934 vals[i++] = mkNegNan();
2935 vals[i++] = -1.3;
2936 vals[i++] = -1.1;
2937 vals[i++] = -0.9;
2938 vals[i++] = -0.7;
2939 vals[i++] = -0.50001;
2940 vals[i++] = -0.49999;
2941 vals[i++] = -0.3;
2942 vals[i++] = -0.1;
2943 vals[i++] = 0.1;
2944 vals[i++] = 0.3;
2945 vals[i++] = 0.49999;
2946 vals[i++] = 0.50001;
2947 vals[i++] = 0.7;
2948 vals[i++] = 0.9;
2949 vals[i++] = 1.1;
2950 vals[i++] = 1.3;
2951 assert(i == 22);
2952
2953 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
2954 V128 src, dst;
2955
2956 randV128(&src);
2957 randV128(&dst);
2958 memcpy(&src[0], &vals[i], 8);
2959 memcpy(&src[8], &vals[(i+11)%22], 8);
2960 do_ROUNDPD_000(False/*reg*/, &src, &dst);
2961 printf("r roundpd_000 ");
2962 showV128(&src);
2963 printf(" ");
2964 showV128(&dst);
2965 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0]));
2966 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
2967 printf("\n");
2968
2969 randV128(&src);
2970 randV128(&dst);
2971 memcpy(&src[0], &vals[i], 8);
2972 memcpy(&src[8], &vals[(i+11)%22], 8);
2973 do_ROUNDPD_000(True/*mem*/, &src, &dst);
2974 printf("m roundpd_000 ");
2975 showV128(&src);
2976 printf(" ");
2977 showV128(&dst);
2978 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0]));
2979 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
2980 printf("\n");
2981
2982
2983 randV128(&src);
2984 randV128(&dst);
2985 memcpy(&src[0], &vals[i], 8);
2986 memcpy(&src[8], &vals[(i+11)%22], 8);
2987 do_ROUNDPD_001(False/*reg*/, &src, &dst);
2988 printf("r roundpd_001 ");
2989 showV128(&src);
2990 printf(" ");
2991 showV128(&dst);
2992 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0]));
2993 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
2994 printf("\n");
2995
2996 randV128(&src);
2997 randV128(&dst);
2998 memcpy(&src[0], &vals[i], 8);
2999 memcpy(&src[8], &vals[(i+11)%22], 8);
3000 do_ROUNDPD_001(True/*mem*/, &src, &dst);
3001 printf("m roundpd_001 ");
3002 showV128(&src);
3003 printf(" ");
3004 showV128(&dst);
3005 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0]));
3006 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
3007 printf("\n");
3008
3009
3010 randV128(&src);
3011 randV128(&dst);
3012 memcpy(&src[0], &vals[i], 8);
3013 memcpy(&src[8], &vals[(i+11)%22], 8);
3014 do_ROUNDPD_010(False/*reg*/, &src, &dst);
3015 printf("r roundpd_010 ");
3016 showV128(&src);
3017 printf(" ");
3018 showV128(&dst);
3019 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0]));
3020 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
3021 printf("\n");
3022
3023 randV128(&src);
3024 randV128(&dst);
3025 memcpy(&src[0], &vals[i], 8);
3026 memcpy(&src[8], &vals[(i+11)%22], 8);
3027 do_ROUNDPD_010(True/*mem*/, &src, &dst);
3028 printf("m roundpd_010 ");
3029 showV128(&src);
3030 printf(" ");
3031 showV128(&dst);
3032 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0]));
3033 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
3034 printf("\n");
3035
3036
3037 randV128(&src);
3038 randV128(&dst);
3039 memcpy(&src[0], &vals[i], 8);
3040 memcpy(&src[8], &vals[(i+11)%22], 8);
3041 do_ROUNDPD_011(False/*reg*/, &src, &dst);
3042 printf("r roundpd_011 ");
3043 showV128(&src);
3044 printf(" ");
3045 showV128(&dst);
3046 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0]));
3047 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
3048 printf("\n");
3049
3050 randV128(&src);
3051 randV128(&dst);
3052 memcpy(&src[0], &vals[i], 8);
3053 memcpy(&src[8], &vals[(i+11)%22], 8);
3054 do_ROUNDPD_011(True/*mem*/, &src, &dst);
3055 printf("m roundpd_011 ");
3056 showV128(&src);
3057 printf(" ");
3058 showV128(&dst);
3059 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0]));
3060 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
3061 printf("\n");
3062 }
3063 }
3064
test_ROUNDPD_w_mxcsr_rounding(void)3065 void test_ROUNDPD_w_mxcsr_rounding ( void )
3066 {
3067 UInt rm;
3068 double vals[22];
3069 Int i = 0;
3070 vals[i++] = 0.0;
3071 vals[i++] = -0.0;
3072 vals[i++] = mkPosInf();
3073 vals[i++] = mkNegInf();
3074 vals[i++] = mkPosNan();
3075 vals[i++] = mkNegNan();
3076 vals[i++] = -1.3;
3077 vals[i++] = -1.1;
3078 vals[i++] = -0.9;
3079 vals[i++] = -0.7;
3080 vals[i++] = -0.50001;
3081 vals[i++] = -0.49999;
3082 vals[i++] = -0.3;
3083 vals[i++] = -0.1;
3084 vals[i++] = 0.1;
3085 vals[i++] = 0.3;
3086 vals[i++] = 0.49999;
3087 vals[i++] = 0.50001;
3088 vals[i++] = 0.7;
3089 vals[i++] = 0.9;
3090 vals[i++] = 1.1;
3091 vals[i++] = 1.3;
3092 assert(i == 22);
3093
3094 rm = get_sse_roundingmode();
3095 assert(rm == 0); // 0 == RN == default
3096
3097 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
3098 V128 src, dst;
3099
3100 for (rm = 0; rm <= 3; rm++) {
3101 set_sse_roundingmode(rm);
3102
3103 randV128(&src);
3104 randV128(&dst);
3105 memcpy(&src[0], &vals[i], 8);
3106 memcpy(&src[8], &vals[(i+11)%22], 8);
3107 do_ROUNDPD_1XX(False/*reg*/, &src, &dst);
3108 printf("r (rm=%u) roundpd_1XX ", rm);
3109 showV128(&src);
3110 printf(" ");
3111 showV128(&dst);
3112 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0]));
3113 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
3114 printf("\n");
3115
3116 randV128(&src);
3117 randV128(&dst);
3118 memcpy(&src[0], &vals[i], 8);
3119 memcpy(&src[8], &vals[(i+11)%22], 8);
3120 do_ROUNDPD_1XX(True/*mem*/, &src, &dst);
3121 printf("m (rm=%u) roundpd_1XX ", rm);
3122 showV128(&src);
3123 printf(" ");
3124 showV128(&dst);
3125 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0]));
3126 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
3127 printf("\n");
3128 }
3129 }
3130
3131 rm = get_sse_roundingmode();
3132 assert(rm == 3);
3133 set_sse_roundingmode(0);
3134 rm = get_sse_roundingmode();
3135 assert(rm == 0); // 0 == RN == default
3136 }
3137
3138 /* ------------ ROUNDPS ------------ */
3139
/* Execute ROUNDPS with imm8 = 0 (binary 000; per the Intel SDM,
   imm8[1:0] = 00 selects round-to-nearest) on the 128-bit value at *src
   and leave the result in *dst.

   dst is first loaded into xmm11 and xmm11 is stored back to dst after
   the rounding instruction (movupd is used purely as a 128-bit move).

   mem == True  : the source operand is read directly from memory at src.
   mem == False : the source is first loaded into xmm2 (register form).

   The asm touches *src and *dst only through pointer operands, which the
   compiler cannot see, hence the "memory" clobber.

   NOTE(review): the memory-operand form reads (src) directly; legacy SSE
   memory operands normally need 16-byte alignment -- confirm callers
   always pass suitably aligned V128s. */
void do_ROUNDPS_000 ( Bool mem, V128* src, /*OUT*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "roundps $0, (%0), %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11","memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "movupd (%0), %%xmm2" "\n\t"
         "roundps $0, %%xmm2, %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11","xmm2","memory"
      );
   }
}
3163
/* Execute ROUNDPS with imm8 = 1 (binary 001; per the Intel SDM,
   imm8[1:0] = 01 selects round-down) on the 128-bit value at *src and
   leave the result in *dst.

   dst is first loaded into xmm11 and xmm11 is stored back to dst after
   the rounding instruction.

   mem == True  : the source operand is read directly from memory at src.
   mem == False : the source is first loaded into xmm2 (register form).

   The "memory" clobber tells the compiler that the asm reads *src and
   writes *dst behind its back (both are reached only via pointer
   operands). */
void do_ROUNDPS_001 ( Bool mem, V128* src, /*OUT*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "roundps $1, (%0), %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11","memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "movupd (%0), %%xmm2" "\n\t"
         "roundps $1, %%xmm2, %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11","xmm2","memory"
      );
   }
}
3187
/* Execute ROUNDPS with imm8 = 2 (binary 010; per the Intel SDM,
   imm8[1:0] = 10 selects round-up) on the 128-bit value at *src and
   leave the result in *dst.

   dst is first loaded into xmm11 and xmm11 is stored back to dst after
   the rounding instruction.

   mem == True  : the source operand is read directly from memory at src.
   mem == False : the source is first loaded into xmm2 (register form).

   The "memory" clobber tells the compiler that the asm reads *src and
   writes *dst behind its back (both are reached only via pointer
   operands). */
void do_ROUNDPS_010 ( Bool mem, V128* src, /*OUT*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "roundps $2, (%0), %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11","memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "movupd (%0), %%xmm2" "\n\t"
         "roundps $2, %%xmm2, %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11","xmm2","memory"
      );
   }
}
3211
/* Execute ROUNDPS with imm8 = 3 (binary 011; per the Intel SDM,
   imm8[1:0] = 11 selects truncation toward zero) on the 128-bit value at
   *src and leave the result in *dst.

   dst is first loaded into xmm11 and xmm11 is stored back to dst after
   the rounding instruction.

   mem == True  : the source operand is read directly from memory at src.
   mem == False : the source is first loaded into xmm2 (register form).

   The "memory" clobber tells the compiler that the asm reads *src and
   writes *dst behind its back (both are reached only via pointer
   operands). */
void do_ROUNDPS_011 ( Bool mem, V128* src, /*OUT*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "roundps $3, (%0), %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11","memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "movupd (%0), %%xmm2" "\n\t"
         "roundps $3, %%xmm2, %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11","xmm2","memory"
      );
   }
}
3235
/* Execute ROUNDPS with imm8 = 4 (binary 100) on the 128-bit value at
   *src and leave the result in *dst.  Per the Intel SDM, imm8 bit 2 set
   means the rounding mode is taken from MXCSR rather than from the
   immediate, which is what the "w_mxcsr_rounding" test relies on.

   dst is first loaded into xmm11 and xmm11 is stored back to dst after
   the rounding instruction.

   mem == True  : the source operand is read directly from memory at src.
   mem == False : the source is first loaded into xmm2 (register form).

   The "memory" clobber tells the compiler that the asm reads *src and
   writes *dst behind its back (both are reached only via pointer
   operands). */
void do_ROUNDPS_1XX ( Bool mem, V128* src, /*OUT*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "roundps $4, (%0), %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11","memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "movupd (%0), %%xmm2" "\n\t"
         "roundps $4, %%xmm2, %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11","xmm2","memory"
      );
   }
}
3259
test_ROUNDPS_w_immediate_rounding(void)3260 void test_ROUNDPS_w_immediate_rounding ( void )
3261 {
3262 float vals[22];
3263 Int i = 0;
3264 vals[i++] = 0.0;
3265 vals[i++] = -0.0;
3266 vals[i++] = mkPosInf();
3267 vals[i++] = mkNegInf();
3268 vals[i++] = mkPosNan();
3269 vals[i++] = mkNegNan();
3270 vals[i++] = -1.3;
3271 vals[i++] = -1.1;
3272 vals[i++] = -0.9;
3273 vals[i++] = -0.7;
3274 vals[i++] = -0.50001;
3275 vals[i++] = -0.49999;
3276 vals[i++] = -0.3;
3277 vals[i++] = -0.1;
3278 vals[i++] = 0.1;
3279 vals[i++] = 0.3;
3280 vals[i++] = 0.49999;
3281 vals[i++] = 0.50001;
3282 vals[i++] = 0.7;
3283 vals[i++] = 0.9;
3284 vals[i++] = 1.1;
3285 vals[i++] = 1.3;
3286 assert(i == 22);
3287
3288 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
3289 V128 src, dst;
3290
3291 randV128(&src);
3292 randV128(&dst);
3293 memcpy(&src[0], &vals[i], 4);
3294 memcpy(&src[4], &vals[(i+5)%22], 4);
3295 memcpy(&src[8], &vals[(i+11)%22], 4);
3296 memcpy(&src[12], &vals[(i+17)%22], 4);
3297 do_ROUNDPS_000(False/*reg*/, &src, &dst);
3298 printf("r roundps_000 ");
3299 showV128(&src);
3300 printf(" ");
3301 showV128(&dst);
3302 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3303 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3304 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3305 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3306 printf("\n");
3307
3308 randV128(&src);
3309 randV128(&dst);
3310 memcpy(&src[0], &vals[i], 4);
3311 memcpy(&src[4], &vals[(i+5)%22], 4);
3312 memcpy(&src[8], &vals[(i+11)%22], 4);
3313 memcpy(&src[12], &vals[(i+17)%22], 4);
3314 do_ROUNDPS_000(True/*mem*/, &src, &dst);
3315 printf("m roundps_000 ");
3316 showV128(&src);
3317 printf(" ");
3318 showV128(&dst);
3319 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3320 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3321 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3322 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3323 printf("\n");
3324
3325
3326 randV128(&src);
3327 randV128(&dst);
3328 memcpy(&src[0], &vals[i], 4);
3329 memcpy(&src[4], &vals[(i+5)%22], 4);
3330 memcpy(&src[8], &vals[(i+11)%22], 4);
3331 memcpy(&src[12], &vals[(i+17)%22], 4);
3332 do_ROUNDPS_001(False/*reg*/, &src, &dst);
3333 printf("r roundps_001 ");
3334 showV128(&src);
3335 printf(" ");
3336 showV128(&dst);
3337 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3338 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3339 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3340 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3341 printf("\n");
3342
3343 randV128(&src);
3344 randV128(&dst);
3345 memcpy(&src[0], &vals[i], 4);
3346 memcpy(&src[4], &vals[(i+5)%22], 4);
3347 memcpy(&src[8], &vals[(i+11)%22], 4);
3348 memcpy(&src[12], &vals[(i+17)%22], 4);
3349 do_ROUNDPS_001(True/*mem*/, &src, &dst);
3350 printf("m roundps_001 ");
3351 showV128(&src);
3352 printf(" ");
3353 showV128(&dst);
3354 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3355 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3356 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3357 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3358 printf("\n");
3359
3360
3361 randV128(&src);
3362 randV128(&dst);
3363 memcpy(&src[0], &vals[i], 4);
3364 memcpy(&src[4], &vals[(i+5)%22], 4);
3365 memcpy(&src[8], &vals[(i+11)%22], 4);
3366 memcpy(&src[12], &vals[(i+17)%22], 4);
3367 do_ROUNDPS_010(False/*reg*/, &src, &dst);
3368 printf("r roundps_010 ");
3369 showV128(&src);
3370 printf(" ");
3371 showV128(&dst);
3372 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3373 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3374 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3375 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3376 printf("\n");
3377
3378 randV128(&src);
3379 randV128(&dst);
3380 memcpy(&src[0], &vals[i], 4);
3381 memcpy(&src[4], &vals[(i+5)%22], 4);
3382 memcpy(&src[8], &vals[(i+11)%22], 4);
3383 memcpy(&src[12], &vals[(i+17)%22], 4);
3384 do_ROUNDPS_010(True/*mem*/, &src, &dst);
3385 printf("m roundps_010 ");
3386 showV128(&src);
3387 printf(" ");
3388 showV128(&dst);
3389 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3390 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3391 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3392 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3393 printf("\n");
3394
3395
3396 randV128(&src);
3397 randV128(&dst);
3398 memcpy(&src[0], &vals[i], 4);
3399 memcpy(&src[4], &vals[(i+5)%22], 4);
3400 memcpy(&src[8], &vals[(i+11)%22], 4);
3401 memcpy(&src[12], &vals[(i+17)%22], 4);
3402 do_ROUNDPS_011(False/*reg*/, &src, &dst);
3403 printf("r roundps_011 ");
3404 showV128(&src);
3405 printf(" ");
3406 showV128(&dst);
3407 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3408 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3409 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3410 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3411 printf("\n");
3412
3413 randV128(&src);
3414 randV128(&dst);
3415 memcpy(&src[0], &vals[i], 4);
3416 memcpy(&src[4], &vals[(i+5)%22], 4);
3417 memcpy(&src[8], &vals[(i+11)%22], 4);
3418 memcpy(&src[12], &vals[(i+17)%22], 4);
3419 do_ROUNDPS_011(True/*mem*/, &src, &dst);
3420 printf("m roundps_011 ");
3421 showV128(&src);
3422 printf(" ");
3423 showV128(&dst);
3424 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3425 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3426 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3427 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3428 printf("\n");
3429 }
3430 }
3431
test_ROUNDPS_w_mxcsr_rounding(void)3432 void test_ROUNDPS_w_mxcsr_rounding ( void )
3433 {
3434 UInt rm;
3435 float vals[22];
3436 Int i = 0;
3437 vals[i++] = 0.0;
3438 vals[i++] = -0.0;
3439 vals[i++] = mkPosInf();
3440 vals[i++] = mkNegInf();
3441 vals[i++] = mkPosNan();
3442 vals[i++] = mkNegNan();
3443 vals[i++] = -1.3;
3444 vals[i++] = -1.1;
3445 vals[i++] = -0.9;
3446 vals[i++] = -0.7;
3447 vals[i++] = -0.50001;
3448 vals[i++] = -0.49999;
3449 vals[i++] = -0.3;
3450 vals[i++] = -0.1;
3451 vals[i++] = 0.1;
3452 vals[i++] = 0.3;
3453 vals[i++] = 0.49999;
3454 vals[i++] = 0.50001;
3455 vals[i++] = 0.7;
3456 vals[i++] = 0.9;
3457 vals[i++] = 1.1;
3458 vals[i++] = 1.3;
3459 assert(i == 22);
3460
3461 rm = get_sse_roundingmode();
3462 assert(rm == 0); // 0 == RN == default
3463
3464 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
3465 V128 src, dst;
3466
3467 for (rm = 0; rm <= 3; rm++) {
3468 set_sse_roundingmode(rm);
3469
3470 randV128(&src);
3471 randV128(&dst);
3472 memcpy(&src[0], &vals[i], 4);
3473 memcpy(&src[4], &vals[(i+5)%22], 4);
3474 memcpy(&src[8], &vals[(i+11)%22], 4);
3475 memcpy(&src[12], &vals[(i+17)%22], 4);
3476 do_ROUNDPS_1XX(False/*reg*/, &src, &dst);
3477 printf("r (rm=%u) roundps_1XX ", rm);
3478 showV128(&src);
3479 printf(" ");
3480 showV128(&dst);
3481 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3482 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3483 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3484 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3485 printf("\n");
3486
3487 randV128(&src);
3488 randV128(&dst);
3489 memcpy(&src[0], &vals[i], 4);
3490 memcpy(&src[4], &vals[(i+5)%22], 4);
3491 memcpy(&src[8], &vals[(i+11)%22], 4);
3492 memcpy(&src[12], &vals[(i+17)%22], 4);
3493 do_ROUNDPS_1XX(True/*mem*/, &src, &dst);
3494 printf("m (rm=%u) roundps_1XX ", rm);
3495 showV128(&src);
3496 printf(" ");
3497 showV128(&dst);
3498 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3499 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3500 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3501 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3502 printf("\n");
3503 }
3504 }
3505
3506 rm = get_sse_roundingmode();
3507 assert(rm == 3);
3508 set_sse_roundingmode(0);
3509 rm = get_sse_roundingmode();
3510 assert(rm == 0); // 0 == RN == default
3511 }
3512
3513 /* ------------ PTEST ------------ */
3514
test_PTEST(void)3515 void test_PTEST ( void )
3516 {
3517 const Int ntests = 8;
3518 V128 spec[ntests];
3519 do64HLtoV128( &spec[0], 0x0000000000000000ULL, 0x0000000000000000ULL );
3520 do64HLtoV128( &spec[1], 0x0000000000000000ULL, 0x0000000000000001ULL );
3521 do64HLtoV128( &spec[2], 0x0000000000000001ULL, 0x0000000000000000ULL );
3522 do64HLtoV128( &spec[3], 0x0000000000000001ULL, 0x0000000000000001ULL );
3523 do64HLtoV128( &spec[4], 0xffffffffffffffffULL, 0xffffffffffffffffULL );
3524 do64HLtoV128( &spec[5], 0xffffffffffffffffULL, 0xfffffffffffffffeULL );
3525 do64HLtoV128( &spec[6], 0xfffffffffffffffeULL, 0xffffffffffffffffULL );
3526 do64HLtoV128( &spec[7], 0xfffffffffffffffeULL, 0xfffffffffffffffeULL );
3527 V128 block[2];
3528 Int i, j;
3529 ULong flags;
3530 for (i = 0; i < ntests; i++) {
3531 for (j = 0; j < ntests; j++) {
3532 memcpy(&block[0], &spec[i], 16);
3533 memcpy(&block[1], &spec[j], 16);
3534 __asm__ __volatile__(
3535 "subq $256, %%rsp" "\n\t"
3536 "movupd 0(%1), %%xmm2" "\n\t"
3537 "ptest 16(%1), %%xmm2" "\n\t"
3538 "pushfq" "\n\t"
3539 "popq %0" "\n\t"
3540 "addq $256, %%rsp" "\n\t"
3541 : /*out*/"=r"(flags) : /*in*/ "r"(&block[0]) :
3542 "xmm2", "memory", "cc"
3543 );
3544 printf("r ptest ");
3545 showV128(&block[0]);
3546 printf(" ");
3547 showV128(&block[1]);
3548 printf(" -> eflags %04x\n", (UInt)flags & 0x8D5);
3549 }
3550 }
3551 }
3552
3553 /* ------------ PBLENDVB ------------ */
3554
/* Execute PBLENDVB, blending *src into *dst.

   xmm0 : 128-bit value loaded into %xmm0 before the instruction; the
          two-operand PBLENDVB form takes its selector from %xmm0
          implicitly (per the Intel SDM).
   src  : source operand.
   dst  : loaded into xmm11 beforehand, overwritten with the result.

   mem == True  : the source operand is read directly from memory at src.
   mem == False : the source is first loaded into xmm2 (register form).

   The asm reads *xmm0/*src and writes *dst only through pointer
   operands, which the compiler cannot see, hence the "memory" clobber.

   NOTE(review): the memory-operand form reads (src) directly; legacy SSE
   memory operands normally need 16-byte alignment -- confirm callers
   always pass suitably aligned V128s. */
void do_PBLENDVB ( Bool mem, V128* xmm0, V128* src, /*MOD*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd (%2), %%xmm0" "\n\t"
         "movupd (%1), %%xmm11" "\n\t"
         "pblendvb (%0), %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst), "r"(xmm0)
         : /*TRASH*/ "xmm11","xmm0","memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd (%2), %%xmm0" "\n\t"
         "movupd (%1), %%xmm11" "\n\t"
         "movupd (%0), %%xmm2" "\n\t"
         "pblendvb %%xmm2, %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst), "r"(xmm0)
         : /*TRASH*/ "xmm11","xmm2","xmm0","memory"
      );
   }
}
3580
test_PBLENDVB(void)3581 void test_PBLENDVB ( void )
3582 {
3583 V128 xmm0, src, dst, t_xmm0, t_src, t_dst;
3584 Int i;
3585 for (i = 0; i < 10; i++) {
3586 randV128(&t_xmm0);
3587 randV128(&t_src);
3588 randV128(&t_dst);
3589
3590 memcpy(&xmm0, &t_xmm0, 16);
3591 memcpy(&src, &t_src, 16);
3592 memcpy(&dst, &t_dst, 16);
3593 do_PBLENDVB(False/*reg*/, &xmm0, &src, &dst);
3594 printf("r pblendvb ");
3595 showV128(&t_xmm0);
3596 printf(" ");
3597 showV128(&t_src);
3598 printf(" ");
3599 showV128(&t_dst);
3600 printf(" -> ");
3601 showV128(&dst);
3602 printf("\n");
3603
3604 memcpy(&xmm0, &t_xmm0, 16);
3605 memcpy(&src, &t_src, 16);
3606 memcpy(&dst, &t_dst, 16);
3607 do_PBLENDVB(True/*mem*/, &xmm0, &src, &dst);
3608 printf("m pblendvb ");
3609 showV128(&t_xmm0);
3610 printf(" ");
3611 showV128(&t_src);
3612 printf(" ");
3613 showV128(&t_dst);
3614 printf(" -> ");
3615 showV128(&dst);
3616 printf("\n");
3617 }
3618 }
3619
3620 /* ------------ BLENDVPD ------------ */
3621
/* Execute BLENDVPD, blending *src into *dst.

   xmm0 : 128-bit value loaded into %xmm0 before the instruction; the
          two-operand BLENDVPD form takes its selector from %xmm0
          implicitly (per the Intel SDM).
   src  : source operand.
   dst  : loaded into xmm11 beforehand, overwritten with the result.

   mem == True  : the source operand is read directly from memory at src.
   mem == False : the source is first loaded into xmm2 (register form).

   The asm reads *xmm0/*src and writes *dst only through pointer
   operands, which the compiler cannot see, hence the "memory" clobber. */
void do_BLENDVPD ( Bool mem, V128* xmm0, V128* src, /*MOD*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd (%2), %%xmm0" "\n\t"
         "movupd (%1), %%xmm11" "\n\t"
         "blendvpd (%0), %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst), "r"(xmm0)
         : /*TRASH*/ "xmm11","xmm0","memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd (%2), %%xmm0" "\n\t"
         "movupd (%1), %%xmm11" "\n\t"
         "movupd (%0), %%xmm2" "\n\t"
         "blendvpd %%xmm2, %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst), "r"(xmm0)
         : /*TRASH*/ "xmm11","xmm2","xmm0","memory"
      );
   }
}
3647
test_BLENDVPD(void)3648 void test_BLENDVPD ( void )
3649 {
3650 V128 xmm0, src, dst, t_xmm0, t_src, t_dst;
3651 Int i;
3652 for (i = 0; i < 10; i++) {
3653 randV128(&t_xmm0);
3654 randV128(&t_src);
3655 randV128(&t_dst);
3656
3657 memcpy(&xmm0, &t_xmm0, 16);
3658 memcpy(&src, &t_src, 16);
3659 memcpy(&dst, &t_dst, 16);
3660 do_BLENDVPD(False/*reg*/, &xmm0, &src, &dst);
3661 printf("r blendvpd ");
3662 showV128(&t_xmm0);
3663 printf(" ");
3664 showV128(&t_src);
3665 printf(" ");
3666 showV128(&t_dst);
3667 printf(" -> ");
3668 showV128(&dst);
3669 printf("\n");
3670
3671 memcpy(&xmm0, &t_xmm0, 16);
3672 memcpy(&src, &t_src, 16);
3673 memcpy(&dst, &t_dst, 16);
3674 do_BLENDVPD(True/*mem*/, &xmm0, &src, &dst);
3675 printf("m blendvpd ");
3676 showV128(&t_xmm0);
3677 printf(" ");
3678 showV128(&t_src);
3679 printf(" ");
3680 showV128(&t_dst);
3681 printf(" -> ");
3682 showV128(&dst);
3683 printf("\n");
3684 }
3685 }
3686
3687 /* ------------ BLENDVPS ------------ */
3688
/* Execute BLENDVPS, blending *src into *dst.

   xmm0 : 128-bit value loaded into %xmm0 before the instruction; the
          two-operand BLENDVPS form takes its selector from %xmm0
          implicitly (per the Intel SDM).
   src  : source operand.
   dst  : loaded into xmm11 beforehand, overwritten with the result.

   mem == True  : the source operand is read directly from memory at src.
   mem == False : the source is first loaded into xmm2 (register form).

   The asm reads *xmm0/*src and writes *dst only through pointer
   operands, which the compiler cannot see, hence the "memory" clobber. */
void do_BLENDVPS ( Bool mem, V128* xmm0, V128* src, /*MOD*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd (%2), %%xmm0" "\n\t"
         "movupd (%1), %%xmm11" "\n\t"
         "blendvps (%0), %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst), "r"(xmm0)
         : /*TRASH*/ "xmm11","xmm0","memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd (%2), %%xmm0" "\n\t"
         "movupd (%1), %%xmm11" "\n\t"
         "movupd (%0), %%xmm2" "\n\t"
         "blendvps %%xmm2, %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst), "r"(xmm0)
         : /*TRASH*/ "xmm11","xmm2","xmm0","memory"
      );
   }
}
3714
test_BLENDVPS(void)3715 void test_BLENDVPS ( void )
3716 {
3717 V128 xmm0, src, dst, t_xmm0, t_src, t_dst;
3718 Int i;
3719 for (i = 0; i < 10; i++) {
3720 randV128(&t_xmm0);
3721 randV128(&t_src);
3722 randV128(&t_dst);
3723
3724 memcpy(&xmm0, &t_xmm0, 16);
3725 memcpy(&src, &t_src, 16);
3726 memcpy(&dst, &t_dst, 16);
3727 do_BLENDVPS(False/*reg*/, &xmm0, &src, &dst);
3728 printf("r blendvps ");
3729 showV128(&t_xmm0);
3730 printf(" ");
3731 showV128(&t_src);
3732 printf(" ");
3733 showV128(&t_dst);
3734 printf(" -> ");
3735 showV128(&dst);
3736 printf("\n");
3737
3738 memcpy(&xmm0, &t_xmm0, 16);
3739 memcpy(&src, &t_src, 16);
3740 memcpy(&dst, &t_dst, 16);
3741 do_BLENDVPS(True/*mem*/, &xmm0, &src, &dst);
3742 printf("m blendvps ");
3743 showV128(&t_xmm0);
3744 printf(" ");
3745 showV128(&t_src);
3746 printf(" ");
3747 showV128(&t_dst);
3748 printf(" -> ");
3749 showV128(&dst);
3750 printf("\n");
3751 }
3752 }
3753
test_MOVNTDQA(void)3754 void test_MOVNTDQA ( void )
3755 {
3756 V128 src, dst;
3757 Int i;
3758 for (i = 0; i < 10; i++) {
3759 randV128(&src);
3760 /* make sure the load actually happens */
3761 randV128(&dst);
3762 DO_m_r("movntdqa", src, dst);
3763 }
3764 }
3765
3766 /* ------------ main ------------ */
3767
/* Program entry point: runs every instruction test in a fixed order and
   returns 0.  argc and argv are not used.

   The order of the calls determines the order of the printed output, so
   it should not be changed casually.  With "#if 1" the full list runs;
   changing it to "#if 0" runs only test_MPSADBW, which is handy when
   working on a single test. */
int main ( int argc, char** argv )
{
#if 1
   // ------ SSE 4.1 ------
   test_BLENDPD(); // done Apr.01.2010
   test_BLENDPS(); // done Apr.02.2010
   test_PBLENDW();
   test_PBLENDVB();
   test_BLENDVPD();
   test_BLENDVPS();
   test_DPPD(); // done Apr.08.2010
   test_DPPS(); // done Apr.09.2010
   test_EXTRACTPS();
   test_INSERTPS(); // done Apr.01.2010
   test_PCMPEQQ();
   test_PEXTRB(); // done Apr.15.2010
   test_PEXTRD(); // done Apr.14.2010
   test_PEXTRQ(); // done Apr.14.2010
   test_PEXTRW(); // done Apr.14.2010
   test_PINSRQ(); // done Apr.16.2010
   test_PINSRD(); // todo
   test_PINSRW(); /* Umm, this is SSE2, not SSE4. Right? */
   test_PINSRB(); // todo
   test_PMAXSB();
   test_PMAXSD(); // done Apr.09.2010
   test_PMAXUD(); // done Apr.16.2010
   test_PMAXUW();
   test_PMINSB();
   test_PMINSD(); // done Apr.09.2010
   test_PMINUD();
   test_PMINUW();
   test_PMOVSXBW(); // done Apr.02.2010
   test_PMOVSXBD(); // done Mar.30.2010
   test_PMOVSXBQ(); // done Mar.30.2010
   test_PMOVSXWD(); // done Mar.31.2010
   test_PMOVSXWQ(); // done Mar.31.2010
   test_PMOVSXDQ(); // done Mar.31.2010
   test_PMOVZXBW(); // done Mar.28.2010
   test_PMOVZXBD(); // done Mar.29.2010
   test_PMOVZXBQ(); // done Mar.29.2010
   test_PMOVZXWD(); // done Mar.28.2010
   test_PMOVZXWQ(); // done Mar.29.2010
   test_PMOVZXDQ(); // done Mar.29.2010
   test_POPCNTW();
   test_POPCNTL();
   test_POPCNTQ();
   test_PMULDQ();
   test_PMULLD();
   test_PTEST();
   // immediate-selected rounding first, then MXCSR-selected rounding
   test_ROUNDSD_w_immediate_rounding();
   test_ROUNDSS_w_immediate_rounding();
   test_ROUNDPD_w_immediate_rounding();
   test_ROUNDPS_w_immediate_rounding();
   test_ROUNDSD_w_mxcsr_rounding();
   test_ROUNDSS_w_mxcsr_rounding();
   test_ROUNDPD_w_mxcsr_rounding();
   test_ROUNDPS_w_mxcsr_rounding();
   // ------ SSE 4.2 ------
   test_PCMPGTQ();
   // CRC32B,Q
   test_PACKUSDW();
   test_PHMINPOSUW();
   test_MPSADBW();
   /* NOTE(review): the Intel SDM lists MOVNTDQA as SSE4.1 -- confirm
      and regroup if desired. */
   test_MOVNTDQA(); /* not sure whether this is 4.1 or 4.2 */
#else
   // reduced run: a single test only
   test_MPSADBW();
#endif

   return 0;
}
3838
3839