1
2 /* Tests in detail the core arithmetic for pcmp{e,i}str{i,m} using
3 pcmpistri to drive it. Does not check the e-vs-i or i-vs-m
4 aspect. */
5
6 #include <string.h>
7 #include <stdio.h>
8 #include <assert.h>
9
/* Short aliases for the basic integer types used by this test. */
typedef unsigned char       UChar;
typedef unsigned short      UShort;
typedef unsigned int        UInt;
typedef int                 Int;
typedef unsigned long long  ULong;

/* Home-grown boolean: a single byte holding either False (0) or True (1). */
typedef UChar Bool;
#define True  ((Bool)1)
#define False ((Bool)0)
18
19 //typedef unsigned char V128[16];
/* One vector operand, viewable at several element widths.  All members
   overlay the same storage.  The name suggests 128 bits, which holds when
   UShort is 2 bytes and UInt is 4 bytes -- TODO confirm for the target. */
typedef
   union {
      UChar  uChar[16];   /* byte view */
      UShort uShort[8];   /* view as eight UShort lanes */
      UInt   uInt[4];     /* view as four UInt lanes */
      UInt   w32[4];      /* second name for the same four-UInt view */
   }
   V128;
28
/* Bit positions of the O, S, Z, A, P and C flags inside a flags word,
   listed from the highest bit down to the lowest. */
#define SHIFT_O 11
#define SHIFT_S 7
#define SHIFT_Z 6
#define SHIFT_A 4
#define SHIFT_P 2
#define SHIFT_C 0

/* Single-bit masks derived from the positions above.  They are
   unsigned long long constants. */
#define MASK_O (1ULL << SHIFT_O)
#define MASK_S (1ULL << SHIFT_S)
#define MASK_Z (1ULL << SHIFT_Z)
#define MASK_A (1ULL << SHIFT_A)
#define MASK_P (1ULL << SHIFT_P)
#define MASK_C (1ULL << SHIFT_C)
42
43
/* Count the leading zero bits of a 32-bit value.

   x: the value to inspect; only its low 32 bits are examined.

   Returns the number of zero bits above the most significant set bit
   (0 .. 31), or 32 when x is zero.

   Implemented as a binary search using only unsigned arithmetic, so the
   result does not depend on how the compiler shifts negative signed
   values or converts out-of-range unsigned values to signed ones. */
UInt clz32 ( UInt x )
{
   UInt n = 0;

   if (x == 0)
      return 32;

   /* Each step tests the top half of the remaining window; if it is
      empty, account for those zeroes and slide the lower half up. */
   if ((x & 0xFFFF0000u) == 0) { n += 16; x <<= 16; }
   if ((x & 0xFF000000u) == 0) { n +=  8; x <<=  8; }
   if ((x & 0xF0000000u) == 0) { n +=  4; x <<=  4; }
   if ((x & 0xC0000000u) == 0) { n +=  2; x <<=  2; }
   if ((x & 0x80000000u) == 0) { n +=  1; }

   return n;
}
67
/* Count the trailing zero bits of x, expressed via clz32.

   Returns 32 minus clz32 of a helper mask.  For x == 0 the mask is all
   ones, so the result is 32 provided clz32 of an all-ones 32-bit value
   is 0. */
UInt ctz32 ( UInt x )
{
   /* Ones exactly in the bit positions below the lowest set bit of x. */
   UInt belowLowest = (~x) & (x - 1);
   UInt leadingZeroes = clz32(belowLowest);
   return 32 - leadingZeroes;
}
72
/* Build a vector from a compact 16-character description.

   dst:     receives the 16 bytes.
   summary: exactly 16 hex digits (either case).  The last character
            describes byte 0 and the first character describes byte 15.

   Each hex digit d is expanded to the byte 0xdd, i.e. the digit is
   duplicated into both nibbles.  A wrong length or a non-hex character
   trips an assertion. */
void expand ( V128* dst, char* summary )
{
   Int lane;

   assert( strlen(summary) == 16 );

   for (lane = 0; lane < 16; lane++) {
      UChar ch     = summary[15 - lane];
      UChar nibble = 0;

      if ('0' <= ch && ch <= '9') {
         nibble = ch - '0';
      } else if ('A' <= ch && ch <= 'F') {
         nibble = ch - 'A' + 10;
      } else if ('a' <= ch && ch <= 'f') {
         nibble = ch - 'a' + 10;
      } else {
         assert(0);
      }
      assert(nibble < 16);

      /* Duplicate the digit into the high and low nibble. */
      UChar byte = (nibble << 4) | nibble;
      assert(byte < 256);
      dst->uChar[lane] = byte;
   }
}
91
/* Run one test vector through two implementations and print both results.

   which:  two-character label for the variant being tested (asserted).
   h_fn:   first implementation (presumably the hardware-backed one,
           going by the h_ naming used in this file).
   s_fn:   second implementation (presumably the C model, s_ naming).
   summL, summR: 16-digit hex summaries, expanded with expand().

   Prints one line with the label, both summaries and both results; the
   line ends in "!!!!" when the two results differ. */
void try_istri ( char* which,
                 UInt(*h_fn)(V128*,V128*),
                 UInt(*s_fn)(V128*,V128*),
                 char* summL, char* summR )
{
   V128 vecL, vecR;
   UInt resFromH, resFromS;
   const char* verdict;

   assert(strlen(which) == 2);

   expand(&vecL, summL);
   expand(&vecR, summR);

   /* h_fn is called before s_fn, on the same pair of vectors. */
   resFromH = h_fn(&vecL, &vecR);
   resFromS = s_fn(&vecL, &vecR);

   verdict = (resFromH != resFromS) ? "!!!!" : "";
   printf("istri %s %s %s -> %08x %08x %s\n",
          which, summL, summR, resFromH, resFromS, verdict);
}
106
zmask_from_V128(V128 * arg)107 UInt zmask_from_V128 ( V128* arg )
108 {
109 UInt i, res = 0;
110 for (i = 0; i < 8; i++) {
111 res |= ((arg->uShort[i] == 0) ? 1 : 0) << i;
112 }
113 return res;
114 }
115
116 //////////////////////////////////////////////////////////
117 // //
118 // GENERAL //
119 // //
120 //////////////////////////////////////////////////////////
121
122
123 /* Given partial results from a 16-bit pcmpXstrX operation (intRes1,
124 basically), generate an I- or M-format output value, also the new
125 OSZACP flags. */
126 static
PCMPxSTRx_WRK_gen_output_fmt_I_wide(V128 * resV,UInt * resOSZACP,UInt intRes1,UInt zmaskL,UInt zmaskR,UInt validL,UInt pol,UInt idx)127 void PCMPxSTRx_WRK_gen_output_fmt_I_wide ( /*OUT*/V128* resV,
128 /*OUT*/UInt* resOSZACP,
129 UInt intRes1,
130 UInt zmaskL, UInt zmaskR,
131 UInt validL,
132 UInt pol, UInt idx )
133 {
134 assert((pol >> 2) == 0);
135 assert((idx >> 1) == 0);
136
137 UInt intRes2 = 0;
138 switch (pol) {
139 case 0: intRes2 = intRes1; break; // pol +
140 case 1: intRes2 = ~intRes1; break; // pol -
141 case 2: intRes2 = intRes1; break; // pol m+
142 case 3: intRes2 = intRes1 ^ validL; break; // pol m-
143 }
144 intRes2 &= 0xFF;
145
146 // generate I-format output (an index in ECX)
147 // generate ecx value
148 UInt newECX = 0;
149 if (idx) {
150 // index of ms-1-bit
151 newECX = intRes2 == 0 ? 8 : (31 - clz32(intRes2));
152 } else {
153 // index of ls-1-bit
154 newECX = intRes2 == 0 ? 8 : ctz32(intRes2);
155 }
156
157 resV->w32[0] = newECX;
158 resV->w32[1] = 0;
159 resV->w32[2] = 0;
160 resV->w32[3] = 0;
161
162 // generate new flags, common to all ISTRI and ISTRM cases
163 *resOSZACP // A, P are zero
164 = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0
165 | ((zmaskL == 0) ? 0 : MASK_Z) // Z == 1 iff any in argL is 0
166 | ((zmaskR == 0) ? 0 : MASK_S) // S == 1 iff any in argR is 0
167 | ((intRes2 & 1) << SHIFT_O); // O == IntRes2[0]
168 }
169
170 /* Compute result and new OSZACP flags for all PCMP{E,I}STR{I,M}
171 variants on 16-bit characters.
172
173 For xSTRI variants, the new ECX value is placed in the 32 bits
174 pointed to by *resV, and the top 96 bits are zeroed. For xSTRM
175 variants, the result is a 128 bit value and is placed at *resV in
176 the obvious way.
177
178 For all variants, the new OSZACP value is placed at *resOSZACP.
179
180 argLV and argRV are the vector args. The caller must prepare a
181 8-bit mask for each, zmaskL and zmaskR. For ISTRx variants this
182 must be 1 for each zero byte of of the respective arg. For ESTRx
183 variants this is derived from the explicit length indication, and
184 must be 0 in all places except at the bit index corresponding to
185 the valid length (0 .. 8). If the valid length is 8 then the
186 mask must be all zeroes. In all cases, bits 31:8 must be zero.
187
188 imm8 is the original immediate from the instruction. isSTRM
189 indicates whether this is a xSTRM or xSTRI variant, which controls
190 how much of *res is written.
191
192 If the given imm8 case can be handled, the return value is True.
193 If not, False is returned, and neither *res not *resOSZACP are
194 altered.
195 */
196
pcmpXstrX_WRK_wide(V128 * resV,UInt * resOSZACP,V128 * argLV,V128 * argRV,UInt zmaskL,UInt zmaskR,UInt imm8,Bool isxSTRM)197 Bool pcmpXstrX_WRK_wide ( /*OUT*/V128* resV,
198 /*OUT*/UInt* resOSZACP,
199 V128* argLV, V128* argRV,
200 UInt zmaskL, UInt zmaskR,
201 UInt imm8, Bool isxSTRM )
202 {
203 assert(imm8 < 0x80);
204 assert((zmaskL >> 8) == 0);
205 assert((zmaskR >> 8) == 0);
206
207 /* Explicitly reject any imm8 values that haven't been validated,
208 even if they would probably work. Life is too short to have
209 unvalidated cases in the code base. */
210 switch (imm8) {
211 case 0x01: case 0x03: case 0x09: case 0x0B: case 0x0D:
212 case 0x13: case 0x1B:
213 case 0x39: case 0x3B:
214 case 0x45: case 0x4B:
215 break;
216 default:
217 return False;
218 }
219
220 UInt fmt = (imm8 >> 0) & 3; // imm8[1:0] data format
221 UInt agg = (imm8 >> 2) & 3; // imm8[3:2] aggregation fn
222 UInt pol = (imm8 >> 4) & 3; // imm8[5:4] polarity
223 UInt idx = (imm8 >> 6) & 1; // imm8[6] 1==msb/bytemask
224
225 /*----------------------------------------*/
226 /*-- strcmp on wide data --*/
227 /*----------------------------------------*/
228
229 if (agg == 2/*equal each, aka strcmp*/
230 && (fmt == 1/*uw*/ || fmt == 3/*sw*/)) {
231 Int i;
232 UShort* argL = (UShort*)argLV;
233 UShort* argR = (UShort*)argRV;
234 UInt boolResII = 0;
235 for (i = 7; i >= 0; i--) {
236 UShort cL = argL[i];
237 UShort cR = argR[i];
238 boolResII = (boolResII << 1) | (cL == cR ? 1 : 0);
239 }
240 UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL))
241 UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR))
242
243 // do invalidation, common to all equal-each cases
244 UInt intRes1
245 = (boolResII & validL & validR) // if both valid, use cmpres
246 | (~ (validL | validR)); // if both invalid, force 1
247 // else force 0
248 intRes1 &= 0xFF;
249
250 // generate I-format output
251 PCMPxSTRx_WRK_gen_output_fmt_I_wide(
252 resV, resOSZACP,
253 intRes1, zmaskL, zmaskR, validL, pol, idx
254 );
255
256 return True;
257 }
258
259 /*----------------------------------------*/
260 /*-- set membership on wide data --*/
261 /*----------------------------------------*/
262
263 if (agg == 0/*equal any, aka find chars in a set*/
264 && (fmt == 1/*uw*/ || fmt == 3/*sw*/)) {
265 /* argL: the string, argR: charset */
266 UInt si, ci;
267 UShort* argL = (UShort*)argLV;
268 UShort* argR = (UShort*)argRV;
269 UInt boolRes = 0;
270 UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL))
271 UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR))
272
273 for (si = 0; si < 8; si++) {
274 if ((validL & (1 << si)) == 0)
275 // run off the end of the string.
276 break;
277 UInt m = 0;
278 for (ci = 0; ci < 8; ci++) {
279 if ((validR & (1 << ci)) == 0) break;
280 if (argR[ci] == argL[si]) { m = 1; break; }
281 }
282 boolRes |= (m << si);
283 }
284
285 // boolRes is "pre-invalidated"
286 UInt intRes1 = boolRes & 0xFF;
287
288 // generate I-format output
289 PCMPxSTRx_WRK_gen_output_fmt_I_wide(
290 resV, resOSZACP,
291 intRes1, zmaskL, zmaskR, validL, pol, idx
292 );
293
294 return True;
295 }
296
297 /*----------------------------------------*/
298 /*-- substring search on wide data --*/
299 /*----------------------------------------*/
300
301 if (agg == 3/*equal ordered, aka substring search*/
302 && (fmt == 1/*uw*/ || fmt == 3/*sw*/)) {
303
304 /* argL: haystack, argR: needle */
305 UInt ni, hi;
306 UShort* argL = (UShort*)argLV;
307 UShort* argR = (UShort*)argRV;
308 UInt boolRes = 0;
309 UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL))
310 UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR))
311 for (hi = 0; hi < 8; hi++) {
312 UInt m = 1;
313 for (ni = 0; ni < 8; ni++) {
314 if ((validR & (1 << ni)) == 0) break;
315 UInt i = ni + hi;
316 if (i >= 8) break;
317 if (argL[i] != argR[ni]) { m = 0; break; }
318 }
319 boolRes |= (m << hi);
320 if ((validL & (1 << hi)) == 0)
321 // run off the end of the haystack
322 break;
323 }
324
325 // boolRes is "pre-invalidated"
326 UInt intRes1 = boolRes & 0xFF;
327
328 // generate I-format output
329 PCMPxSTRx_WRK_gen_output_fmt_I_wide(
330 resV, resOSZACP,
331 intRes1, zmaskL, zmaskR, validL, pol, idx
332 );
333
334 return True;
335 }
336
337 /*----------------------------------------*/
338 /*-- ranges, unsigned wide data --*/
339 /*----------------------------------------*/
340
341 if (agg == 1/*ranges*/
342 && fmt == 1/*uw*/) {
343
344 /* argL: string, argR: range-pairs */
345 UInt ri, si;
346 UShort* argL = (UShort*)argLV;
347 UShort* argR = (UShort*)argRV;
348 UInt boolRes = 0;
349 UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL))
350 UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR))
351 for (si = 0; si < 8; si++) {
352 if ((validL & (1 << si)) == 0)
353 // run off the end of the string
354 break;
355 UInt m = 0;
356 for (ri = 0; ri < 8; ri += 2) {
357 if ((validR & (3 << ri)) != (3 << ri)) break;
358 if (argR[ri] <= argL[si] && argL[si] <= argR[ri+1]) {
359 m = 1; break;
360 }
361 }
362 boolRes |= (m << si);
363 }
364
365 // boolRes is "pre-invalidated"
366 UInt intRes1 = boolRes & 0xFF;
367
368 // generate I-format output
369 PCMPxSTRx_WRK_gen_output_fmt_I_wide(
370 resV, resOSZACP,
371 intRes1, zmaskL, zmaskR, validL, pol, idx
372 );
373
374 return True;
375 }
376
377 return False;
378 }
379
380 //////////////////////////////////////////////////////////
381 // //
382 // ISTRI_4B //
383 // //
384 //////////////////////////////////////////////////////////
385
h_pcmpistri_4B(V128 * argL,V128 * argR)386 UInt h_pcmpistri_4B ( V128* argL, V128* argR )
387 {
388 V128 block[2];
389 memcpy(&block[0], argL, sizeof(V128));
390 memcpy(&block[1], argR, sizeof(V128));
391 ULong res, flags;
392 __asm__ __volatile__(
393 "subq $1024, %%rsp" "\n\t"
394 "movdqu 0(%2), %%xmm2" "\n\t"
395 "movdqu 16(%2), %%xmm11" "\n\t"
396 "pcmpistri $0x4B, %%xmm2, %%xmm11" "\n\t"
397 "pushfq" "\n\t"
398 "popq %%rdx" "\n\t"
399 "movq %%rcx, %0" "\n\t"
400 "movq %%rdx, %1" "\n\t"
401 "addq $1024, %%rsp" "\n\t"
402 : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
403 : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
404 );
405 return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
406 }
407
s_pcmpistri_4B(V128 * argLU,V128 * argRU)408 UInt s_pcmpistri_4B ( V128* argLU, V128* argRU )
409 {
410 V128 resV;
411 UInt resOSZACP, resECX;
412 Bool ok
413 = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
414 zmask_from_V128(argLU),
415 zmask_from_V128(argRU),
416 0x4B, False/*!isSTRM*/
417 );
418 assert(ok);
419 resECX = resV.uInt[0];
420 return (resOSZACP << 16) | resECX;
421 }
422
istri_4B(void)423 void istri_4B ( void )
424 {
425 char* wot = "4B";
426 UInt(*h)(V128*,V128*) = h_pcmpistri_4B;
427 UInt(*s)(V128*,V128*) = s_pcmpistri_4B;
428
429 try_istri(wot,h,s, "0000000000000000", "0000000000000000");
430
431 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
432 try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
433 try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
434 try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
435
436 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
437 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
438 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
439
440 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
441 try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
442 try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
443 try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
444
445 try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
446 try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
447 try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
448
449 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
450
451 try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
452 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
453 try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaa00aa");
454
455 try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaaaaaa");
456 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
457 try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaa00aa");
458
459 try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
460 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa00aaaaaa");
461 try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaa00aaaaaa");
462
463 try_istri(wot,h,s, "0000000000000000", "aaaaaaaa00aaaaaa");
464 try_istri(wot,h,s, "8000000000000000", "aaaaaaaa00aaaaaa");
465 try_istri(wot,h,s, "0000000000000001", "aaaaaaaa00aaaaaa");
466
467 try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
468 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
469 }
470
471 //////////////////////////////////////////////////////////
472 // //
473 // ISTRI_3B //
474 // //
475 //////////////////////////////////////////////////////////
476
h_pcmpistri_3B(V128 * argL,V128 * argR)477 UInt h_pcmpistri_3B ( V128* argL, V128* argR )
478 {
479 V128 block[2];
480 memcpy(&block[0], argL, sizeof(V128));
481 memcpy(&block[1], argR, sizeof(V128));
482 ULong res, flags;
483 __asm__ __volatile__(
484 "subq $1024, %%rsp" "\n\t"
485 "movdqu 0(%2), %%xmm2" "\n\t"
486 "movdqu 16(%2), %%xmm11" "\n\t"
487 "pcmpistri $0x3B, %%xmm2, %%xmm11" "\n\t"
488 "pushfq" "\n\t"
489 "popq %%rdx" "\n\t"
490 "movq %%rcx, %0" "\n\t"
491 "movq %%rdx, %1" "\n\t"
492 "addq $1024, %%rsp" "\n\t"
493 : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
494 : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
495 );
496 return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
497 }
498
s_pcmpistri_3B(V128 * argLU,V128 * argRU)499 UInt s_pcmpistri_3B ( V128* argLU, V128* argRU )
500 {
501 V128 resV;
502 UInt resOSZACP, resECX;
503 Bool ok
504 = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
505 zmask_from_V128(argLU),
506 zmask_from_V128(argRU),
507 0x3B, False/*!isSTRM*/
508 );
509 assert(ok);
510 resECX = resV.uInt[0];
511 return (resOSZACP << 16) | resECX;
512 }
513
istri_3B(void)514 void istri_3B ( void )
515 {
516 char* wot = "3B";
517 UInt(*h)(V128*,V128*) = h_pcmpistri_3B;
518 UInt(*s)(V128*,V128*) = s_pcmpistri_3B;
519
520 try_istri(wot,h,s, "0000000000000000", "0000000000000000");
521
522 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
523 try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
524 try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
525 try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
526
527 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
528 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
529 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
530
531 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
532 try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
533 try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
534 try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
535
536 try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
537 try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
538 try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
539
540 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
541
542 try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
543 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
544 try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaa00aa");
545
546 try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaaaaaa");
547 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
548 try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaa00aa");
549
550 try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
551 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa00aaaaaa");
552 try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaa00aaaaaa");
553
554 try_istri(wot,h,s, "0000000000000000", "aaaaaaaa00aaaaaa");
555 try_istri(wot,h,s, "8000000000000000", "aaaaaaaa00aaaaaa");
556 try_istri(wot,h,s, "0000000000000001", "aaaaaaaa00aaaaaa");
557
558 try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
559 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
560 }
561
562
563
564 //////////////////////////////////////////////////////////
565 // //
566 // ISTRI_0D //
567 // //
568 //////////////////////////////////////////////////////////
569
570 __attribute__((noinline))
h_pcmpistri_0D(V128 * argL,V128 * argR)571 UInt h_pcmpistri_0D ( V128* argL, V128* argR )
572 {
573 V128 block[2];
574 memcpy(&block[0], argL, sizeof(V128));
575 memcpy(&block[1], argR, sizeof(V128));
576 ULong res = 0, flags = 0;
577 __asm__ __volatile__(
578 "movdqu 0(%2), %%xmm2" "\n\t"
579 "movdqu 16(%2), %%xmm11" "\n\t"
580 "pcmpistri $0x0D, %%xmm2, %%xmm11" "\n\t"
581 //"pcmpistrm $0x0D, %%xmm2, %%xmm11" "\n\t"
582 //"movd %%xmm0, %%ecx" "\n\t"
583 "pushfq" "\n\t"
584 "popq %%rdx" "\n\t"
585 "movq %%rcx, %0" "\n\t"
586 "movq %%rdx, %1" "\n\t"
587 : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
588 : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
589 );
590 return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
591 }
592
s_pcmpistri_0D(V128 * argLU,V128 * argRU)593 UInt s_pcmpistri_0D ( V128* argLU, V128* argRU )
594 {
595 V128 resV;
596 UInt resOSZACP, resECX;
597 Bool ok
598 = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
599 zmask_from_V128(argLU),
600 zmask_from_V128(argRU),
601 0x0D, False/*!isSTRM*/
602 );
603 assert(ok);
604 resECX = resV.uInt[0];
605 return (resOSZACP << 16) | resECX;
606 }
607
istri_0D(void)608 void istri_0D ( void )
609 {
610 char* wot = "0D";
611 UInt(*h)(V128*,V128*) = h_pcmpistri_0D;
612 UInt(*s)(V128*,V128*) = s_pcmpistri_0D;
613
614 try_istri(wot,h,s, "11111111abcdef11", "0000000000abcdef");
615
616 try_istri(wot,h,s, "11111111abcdef11", "00abcdef00abcdef");
617
618 try_istri(wot,h,s, "11111111abcdef11", "0000000000abcdef");
619 try_istri(wot,h,s, "1111111111abcdef", "0000000000abcdef");
620 try_istri(wot,h,s, "111111111111abcd", "0000000000abcdef");
621
622 try_istri(wot,h,s, "1111abcd11abcd11", "000000000000abcd");
623
624 try_istri(wot,h,s, "11abcd1111abcd11", "000000000000abcd");
625 try_istri(wot,h,s, "abcd111111abcd11", "000000000000abcd");
626 try_istri(wot,h,s, "cd11111111abcd11", "000000000000abcd");
627
628 try_istri(wot,h,s, "01abcd11abcd1111", "000000000000abcd");
629 try_istri(wot,h,s, "00abcd11abcd1111", "000000000000abcd");
630 try_istri(wot,h,s, "0000cd11abcd1111", "000000000000abcd");
631
632 try_istri(wot,h,s, "00abcd1100abcd11", "000000000000abcd");
633 try_istri(wot,h,s, "00abcd110000cd11", "000000000000abcd");
634
635 try_istri(wot,h,s, "1111111111111234", "0000000000000000");
636 try_istri(wot,h,s, "1111111111111234", "0000000000000011");
637 try_istri(wot,h,s, "1111111111111234", "0000000000001111");
638
639 try_istri(wot,h,s, "1111111111111234", "1111111111111234");
640 try_istri(wot,h,s, "0a11111111111111", "000000000000000a");
641 try_istri(wot,h,s, "0b11111111111111", "000000000000000a");
642
643 try_istri(wot,h,s, "b111111111111111", "0000000000000000");
644 try_istri(wot,h,s, "0000000000000000", "0000000000000000");
645 try_istri(wot,h,s, "123456789abcdef1", "0000000000000000");
646 try_istri(wot,h,s, "0000000000000000", "123456789abcdef1");
647 }
648
649
650 //////////////////////////////////////////////////////////
651 // //
652 // ISTRI_09 //
653 // //
654 //////////////////////////////////////////////////////////
655
h_pcmpistri_09(V128 * argL,V128 * argR)656 UInt h_pcmpistri_09 ( V128* argL, V128* argR )
657 {
658 V128 block[2];
659 memcpy(&block[0], argL, sizeof(V128));
660 memcpy(&block[1], argR, sizeof(V128));
661 ULong res, flags;
662 __asm__ __volatile__(
663 "subq $1024, %%rsp" "\n\t"
664 "movdqu 0(%2), %%xmm2" "\n\t"
665 "movdqu 16(%2), %%xmm11" "\n\t"
666 "pcmpistri $0x09, %%xmm2, %%xmm11" "\n\t"
667 "pushfq" "\n\t"
668 "popq %%rdx" "\n\t"
669 "movq %%rcx, %0" "\n\t"
670 "movq %%rdx, %1" "\n\t"
671 "addq $1024, %%rsp" "\n\t"
672 : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
673 : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
674 );
675 return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
676 }
677
s_pcmpistri_09(V128 * argLU,V128 * argRU)678 UInt s_pcmpistri_09 ( V128* argLU, V128* argRU )
679 {
680 V128 resV;
681 UInt resOSZACP, resECX;
682 Bool ok
683 = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
684 zmask_from_V128(argLU),
685 zmask_from_V128(argRU),
686 0x09, False/*!isSTRM*/
687 );
688 assert(ok);
689 resECX = resV.uInt[0];
690 return (resOSZACP << 16) | resECX;
691 }
692
istri_09(void)693 void istri_09 ( void )
694 {
695 char* wot = "09";
696 UInt(*h)(V128*,V128*) = h_pcmpistri_09;
697 UInt(*s)(V128*,V128*) = s_pcmpistri_09;
698
699 try_istri(wot,h,s, "0000000000000000", "0000000000000000");
700
701 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
702 try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
703 try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
704 try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
705
706 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
707 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
708 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
709
710 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
711 try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
712 try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
713 try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
714
715 try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
716 try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
717 try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
718
719 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
720
721 try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
722 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
723 try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaa00aa");
724
725 try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaaaaaa");
726 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
727 try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaa00aa");
728
729 try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
730 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa00aaaaaa");
731 try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaa00aaaaaa");
732
733 try_istri(wot,h,s, "0000000000000000", "aaaaaaaa00aaaaaa");
734 try_istri(wot,h,s, "8000000000000000", "aaaaaaaa00aaaaaa");
735 try_istri(wot,h,s, "0000000000000001", "aaaaaaaa00aaaaaa");
736
737 try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
738 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
739 }
740
741
742
743 //////////////////////////////////////////////////////////
744 // //
745 // ISTRI_1B //
746 // //
747 //////////////////////////////////////////////////////////
748
h_pcmpistri_1B(V128 * argL,V128 * argR)749 UInt h_pcmpistri_1B ( V128* argL, V128* argR )
750 {
751 V128 block[2];
752 memcpy(&block[0], argL, sizeof(V128));
753 memcpy(&block[1], argR, sizeof(V128));
754 ULong res, flags;
755 __asm__ __volatile__(
756 "subq $1024, %%rsp" "\n\t"
757 "movdqu 0(%2), %%xmm2" "\n\t"
758 "movdqu 16(%2), %%xmm11" "\n\t"
759 "pcmpistri $0x1B, %%xmm2, %%xmm11" "\n\t"
760 "pushfq" "\n\t"
761 "popq %%rdx" "\n\t"
762 "movq %%rcx, %0" "\n\t"
763 "movq %%rdx, %1" "\n\t"
764 "addq $1024, %%rsp" "\n\t"
765 : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
766 : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
767 );
768 return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
769 }
770
s_pcmpistri_1B(V128 * argLU,V128 * argRU)771 UInt s_pcmpistri_1B ( V128* argLU, V128* argRU )
772 {
773 V128 resV;
774 UInt resOSZACP, resECX;
775 Bool ok
776 = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
777 zmask_from_V128(argLU),
778 zmask_from_V128(argRU),
779 0x1B, False/*!isSTRM*/
780 );
781 assert(ok);
782 resECX = resV.uInt[0];
783 return (resOSZACP << 16) | resECX;
784 }
785
istri_1B(void)786 void istri_1B ( void )
787 {
788 char* wot = "1B";
789 UInt(*h)(V128*,V128*) = h_pcmpistri_1B;
790 UInt(*s)(V128*,V128*) = s_pcmpistri_1B;
791
792 try_istri(wot,h,s, "0000000000000000", "0000000000000000");
793
794 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
795 try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
796 try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
797 try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
798
799 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
800 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
801 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
802
803 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
804 try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
805 try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
806 try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
807
808 try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
809 try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
810 try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
811
812 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
813
814 try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
815 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
816 try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaa00aa");
817
818 try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaaaaaa");
819 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
820 try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaa00aa");
821
822 try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
823 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa00aaaaaa");
824 try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaa00aaaaaa");
825
826 try_istri(wot,h,s, "0000000000000000", "aaaaaaaa00aaaaaa");
827 try_istri(wot,h,s, "8000000000000000", "aaaaaaaa00aaaaaa");
828 try_istri(wot,h,s, "0000000000000001", "aaaaaaaa00aaaaaa");
829
830 try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
831 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
832 }
833
834
835
836 //////////////////////////////////////////////////////////
837 // //
838 // ISTRI_03 //
839 // //
840 //////////////////////////////////////////////////////////
841
h_pcmpistri_03(V128 * argL,V128 * argR)842 UInt h_pcmpistri_03 ( V128* argL, V128* argR )
843 {
844 V128 block[2];
845 memcpy(&block[0], argL, sizeof(V128));
846 memcpy(&block[1], argR, sizeof(V128));
847 ULong res, flags;
848 __asm__ __volatile__(
849 "subq $1024, %%rsp" "\n\t"
850 "movdqu 0(%2), %%xmm2" "\n\t"
851 "movdqu 16(%2), %%xmm11" "\n\t"
852 "pcmpistri $0x03, %%xmm2, %%xmm11" "\n\t"
853 //"pcmpistrm $0x03, %%xmm2, %%xmm11" "\n\t"
854 //"movd %%xmm0, %%ecx" "\n\t"
855 "pushfq" "\n\t"
856 "popq %%rdx" "\n\t"
857 "movq %%rcx, %0" "\n\t"
858 "movq %%rdx, %1" "\n\t"
859 "addq $1024, %%rsp" "\n\t"
860 : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
861 : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
862 );
863 return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
864 }
865
s_pcmpistri_03(V128 * argLU,V128 * argRU)866 UInt s_pcmpistri_03 ( V128* argLU, V128* argRU )
867 {
868 V128 resV;
869 UInt resOSZACP, resECX;
870 Bool ok
871 = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
872 zmask_from_V128(argLU),
873 zmask_from_V128(argRU),
874 0x03, False/*!isSTRM*/
875 );
876 assert(ok);
877 resECX = resV.uInt[0];
878 return (resOSZACP << 16) | resECX;
879 }
880
istri_03(void)881 void istri_03 ( void )
882 {
883 char* wot = "03";
884 UInt(*h)(V128*,V128*) = h_pcmpistri_03;
885 UInt(*s)(V128*,V128*) = s_pcmpistri_03;
886
887 try_istri(wot,h,s, "aacdacbdaacdaacd", "00000000000000aa");
888 try_istri(wot,h,s, "aabbaabbaabbaabb", "00000000000000bb");
889 try_istri(wot,h,s, "aabbccddaabbccdd", "000000000000aabb");
890 try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
891
892 try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbccdd");
893 try_istri(wot,h,s, "00bbccddaabbccdd", "00000000aabbccdd");
894 try_istri(wot,h,s, "aabbccddaa00ccdd", "00000000aabbccdd");
895 try_istri(wot,h,s, "aabbccddaabb00dd", "00000000aabbccdd");
896 try_istri(wot,h,s, "aabbccddaabbcc00", "00000000aabbccdd");
897
898 try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbccdd");
899 try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aa00ccdd");
900 try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabb00dd");
901 try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbcc00");
902
903 try_istri(wot,h,s, "0000000000000000", "0000000000000000");
904 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
905
906 try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
907 try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
908 try_istri(wot,h,s, "0000aabbaabbaabb", "000000000000bbbb");
909 try_istri(wot,h,s, "0000ccddaabbccdd", "00000000bbaabbaa");
910
911 try_istri(wot,h,s, "0000ccddaabbccdd", "000000bbaabbaa00");
912
913 try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
914 try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
915 }
916
917
918 //////////////////////////////////////////////////////////
919 // //
920 // ISTRI_13 //
921 // //
922 //////////////////////////////////////////////////////////
923
h_pcmpistri_13(V128 * argL,V128 * argR)924 UInt h_pcmpistri_13 ( V128* argL, V128* argR )
925 {
926 V128 block[2];
927 memcpy(&block[0], argL, sizeof(V128));
928 memcpy(&block[1], argR, sizeof(V128));
929 ULong res, flags;
930 __asm__ __volatile__(
931 "subq $1024, %%rsp" "\n\t"
932 "movdqu 0(%2), %%xmm2" "\n\t"
933 "movdqu 16(%2), %%xmm11" "\n\t"
934 "pcmpistri $0x13, %%xmm2, %%xmm11" "\n\t"
935 //"pcmpistrm $0x13, %%xmm2, %%xmm11" "\n\t"
936 //"movd %%xmm0, %%ecx" "\n\t"
937 "pushfq" "\n\t"
938 "popq %%rdx" "\n\t"
939 "movq %%rcx, %0" "\n\t"
940 "movq %%rdx, %1" "\n\t"
941 "addq $1024, %%rsp" "\n\t"
942 : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
943 : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
944 );
945 return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
946 }
947
s_pcmpistri_13(V128 * argLU,V128 * argRU)948 UInt s_pcmpistri_13 ( V128* argLU, V128* argRU )
949 {
950 V128 resV;
951 UInt resOSZACP, resECX;
952 Bool ok
953 = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
954 zmask_from_V128(argLU),
955 zmask_from_V128(argRU),
956 0x13, False/*!isSTRM*/
957 );
958 assert(ok);
959 resECX = resV.uInt[0];
960 return (resOSZACP << 16) | resECX;
961 }
962
istri_13(void)963 void istri_13 ( void )
964 {
965 char* wot = "13";
966 UInt(*h)(V128*,V128*) = h_pcmpistri_13;
967 UInt(*s)(V128*,V128*) = s_pcmpistri_13;
968
969 try_istri(wot,h,s, "aacdacbdaacdaacd", "00000000000000aa");
970 try_istri(wot,h,s, "aabbaabbaabbaabb", "00000000000000bb");
971 try_istri(wot,h,s, "aabbccddaabbccdd", "000000000000aabb");
972 try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
973
974 try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbccdd");
975 try_istri(wot,h,s, "00bbccddaabbccdd", "00000000aabbccdd");
976 try_istri(wot,h,s, "aabbccddaa00ccdd", "00000000aabbccdd");
977 try_istri(wot,h,s, "aabbccddaabb00dd", "00000000aabbccdd");
978 try_istri(wot,h,s, "aabbccddaabbcc00", "00000000aabbccdd");
979
980 try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbccdd");
981 try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aa00ccdd");
982 try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabb00dd");
983 try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbcc00");
984
985 try_istri(wot,h,s, "0000000000000000", "0000000000000000");
986 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
987
988 try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
989 try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
990 try_istri(wot,h,s, "0000aabbaabbaabb", "000000000000bbbb");
991 try_istri(wot,h,s, "0000ccddaabbccdd", "00000000bbaabbaa");
992
993 try_istri(wot,h,s, "0000ccddaabbccdd", "000000bbaabbaa00");
994
995 try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
996 try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
997 }
998
999
1000
1001 //////////////////////////////////////////////////////////
1002 // //
1003 // ISTRI_45 //
1004 // //
1005 //////////////////////////////////////////////////////////
1006
h_pcmpistri_45(V128 * argL,V128 * argR)1007 UInt h_pcmpistri_45 ( V128* argL, V128* argR )
1008 {
1009 V128 block[2];
1010 memcpy(&block[0], argL, sizeof(V128));
1011 memcpy(&block[1], argR, sizeof(V128));
1012 ULong res, flags;
1013 __asm__ __volatile__(
1014 "subq $1024, %%rsp" "\n\t"
1015 "movdqu 0(%2), %%xmm2" "\n\t"
1016 "movdqu 16(%2), %%xmm11" "\n\t"
1017 "pcmpistri $0x45, %%xmm2, %%xmm11" "\n\t"
1018 //"pcmpistrm $0x04, %%xmm2, %%xmm11" "\n\t"
1019 //"movd %%xmm0, %%ecx" "\n\t"
1020 "pushfq" "\n\t"
1021 "popq %%rdx" "\n\t"
1022 "movq %%rcx, %0" "\n\t"
1023 "movq %%rdx, %1" "\n\t"
1024 "addq $1024, %%rsp" "\n\t"
1025 : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
1026 : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
1027 );
1028 return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
1029 }
1030
s_pcmpistri_45(V128 * argLU,V128 * argRU)1031 UInt s_pcmpistri_45 ( V128* argLU, V128* argRU )
1032 {
1033 V128 resV;
1034 UInt resOSZACP, resECX;
1035 Bool ok
1036 = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
1037 zmask_from_V128(argLU),
1038 zmask_from_V128(argRU),
1039 0x45, False/*!isSTRM*/
1040 );
1041 assert(ok);
1042 resECX = resV.uInt[0];
1043 return (resOSZACP << 16) | resECX;
1044 }
1045
istri_45(void)1046 void istri_45 ( void )
1047 {
1048 char* wot = "45";
1049 UInt(*h)(V128*,V128*) = h_pcmpistri_45;
1050 UInt(*s)(V128*,V128*) = s_pcmpistri_45;
1051
1052 try_istri(wot,h,s, "aaaabbbbccccdddd", "000000000000bbcc");
1053 try_istri(wot,h,s, "aaaabbbbccccdddd", "000000000000ccbb");
1054 try_istri(wot,h,s, "baaabbbbccccdddd", "000000000000ccbb");
1055 try_istri(wot,h,s, "baaabbbbccccdddc", "000000000000ccbb");
1056
1057 try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "000000000000ccbb");
1058 try_istri(wot,h,s, "bbbbbbbb00bbbbbb", "000000000000ccbb");
1059 try_istri(wot,h,s, "bbbbbbbbbbbb00bb", "000000000000ccbb");
1060 try_istri(wot,h,s, "bbbbbbbbbbbbbb00", "000000000000ccbb");
1061 try_istri(wot,h,s, "0000000000000000", "000000000000ccbb");
1062
1063 try_istri(wot,h,s, "0000000000000000", "0000000000000000");
1064
1065 try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "000000000000ccbb");
1066 try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000bb");
1067 try_istri(wot,h,s, "bb44bb44bb44bb44", "000000006622ccbb");
1068
1069 try_istri(wot,h,s, "bb44bb44bb44bb44", "000000000022ccbb");
1070 try_istri(wot,h,s, "bb44bb44bb44bb44", "000000000000ccbb");
1071 try_istri(wot,h,s, "bb44bb44bb44bb44", "00000000000000bb");
1072
1073 try_istri(wot,h,s, "0011223344556677", "0000997755442211");
1074 try_istri(wot,h,s, "1122334455667711", "0000997755442211");
1075
1076 try_istri(wot,h,s, "0011223344556677", "0000aa8866553322");
1077 try_istri(wot,h,s, "1122334455667711", "0000aa8866553322");
1078 }
1079
1080
1081 //////////////////////////////////////////////////////////
1082 // //
1083 // ISTRI_01 //
1084 // //
1085 //////////////////////////////////////////////////////////
1086
h_pcmpistri_01(V128 * argL,V128 * argR)1087 UInt h_pcmpistri_01 ( V128* argL, V128* argR )
1088 {
1089 V128 block[2];
1090 memcpy(&block[0], argL, sizeof(V128));
1091 memcpy(&block[1], argR, sizeof(V128));
1092 ULong res, flags;
1093 __asm__ __volatile__(
1094 "subq $1024, %%rsp" "\n\t"
1095 "movdqu 0(%2), %%xmm2" "\n\t"
1096 "movdqu 16(%2), %%xmm11" "\n\t"
1097 "pcmpistri $0x01, %%xmm2, %%xmm11" "\n\t"
1098 //"pcmpistrm $0x01, %%xmm2, %%xmm11" "\n\t"
1099 //"movd %%xmm0, %%ecx" "\n\t"
1100 "pushfq" "\n\t"
1101 "popq %%rdx" "\n\t"
1102 "movq %%rcx, %0" "\n\t"
1103 "movq %%rdx, %1" "\n\t"
1104 "addq $1024, %%rsp" "\n\t"
1105 : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
1106 : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
1107 );
1108 return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
1109 }
1110
s_pcmpistri_01(V128 * argLU,V128 * argRU)1111 UInt s_pcmpistri_01 ( V128* argLU, V128* argRU )
1112 {
1113 V128 resV;
1114 UInt resOSZACP, resECX;
1115 Bool ok
1116 = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
1117 zmask_from_V128(argLU),
1118 zmask_from_V128(argRU),
1119 0x01, False/*!isSTRM*/
1120 );
1121 assert(ok);
1122 resECX = resV.uInt[0];
1123 return (resOSZACP << 16) | resECX;
1124 }
1125
istri_01(void)1126 void istri_01 ( void )
1127 {
1128 char* wot = "01";
1129 UInt(*h)(V128*,V128*) = h_pcmpistri_01;
1130 UInt(*s)(V128*,V128*) = s_pcmpistri_01;
1131
1132 try_istri(wot,h,s, "aacdacbdaacdaacd", "00000000000000aa");
1133 try_istri(wot,h,s, "aabbaabbaabbaabb", "00000000000000bb");
1134 try_istri(wot,h,s, "aabbccddaabbccdd", "000000000000aabb");
1135 try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
1136
1137 try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbccdd");
1138 try_istri(wot,h,s, "00bbccddaabbccdd", "00000000aabbccdd");
1139 try_istri(wot,h,s, "aabbccddaa00ccdd", "00000000aabbccdd");
1140 try_istri(wot,h,s, "aabbccddaabb00dd", "00000000aabbccdd");
1141 try_istri(wot,h,s, "aabbccddaabbcc00", "00000000aabbccdd");
1142
1143 try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbccdd");
1144 try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aa00ccdd");
1145 try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabb00dd");
1146 try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbcc00");
1147
1148 try_istri(wot,h,s, "0000000000000000", "0000000000000000");
1149 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1150
1151 try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
1152 try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
1153 try_istri(wot,h,s, "0000aabbaabbaabb", "000000000000bbbb");
1154 try_istri(wot,h,s, "0000ccddaabbccdd", "00000000bbaabbaa");
1155
1156 try_istri(wot,h,s, "0000ccddaabbccdd", "000000bbaabbaa00");
1157
1158 try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
1159 try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
1160 }
1161
1162
1163 //////////////////////////////////////////////////////////
1164 // //
1165 // ISTRI_39 //
1166 // //
1167 //////////////////////////////////////////////////////////
1168
h_pcmpistri_39(V128 * argL,V128 * argR)1169 UInt h_pcmpistri_39 ( V128* argL, V128* argR )
1170 {
1171 V128 block[2];
1172 memcpy(&block[0], argL, sizeof(V128));
1173 memcpy(&block[1], argR, sizeof(V128));
1174 ULong res, flags;
1175 __asm__ __volatile__(
1176 "subq $1024, %%rsp" "\n\t"
1177 "movdqu 0(%2), %%xmm2" "\n\t"
1178 "movdqu 16(%2), %%xmm11" "\n\t"
1179 "pcmpistri $0x39, %%xmm2, %%xmm11" "\n\t"
1180 "pushfq" "\n\t"
1181 "popq %%rdx" "\n\t"
1182 "movq %%rcx, %0" "\n\t"
1183 "movq %%rdx, %1" "\n\t"
1184 "addq $1024, %%rsp" "\n\t"
1185 : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
1186 : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
1187 );
1188 return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
1189 }
1190
s_pcmpistri_39(V128 * argLU,V128 * argRU)1191 UInt s_pcmpistri_39 ( V128* argLU, V128* argRU )
1192 {
1193 V128 resV;
1194 UInt resOSZACP, resECX;
1195 Bool ok
1196 = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
1197 zmask_from_V128(argLU),
1198 zmask_from_V128(argRU),
1199 0x39, False/*!isSTRM*/
1200 );
1201 assert(ok);
1202 resECX = resV.uInt[0];
1203 return (resOSZACP << 16) | resECX;
1204 }
1205
istri_39(void)1206 void istri_39 ( void )
1207 {
1208 char* wot = "39";
1209 UInt(*h)(V128*,V128*) = h_pcmpistri_39;
1210 UInt(*s)(V128*,V128*) = s_pcmpistri_39;
1211
1212 try_istri(wot,h,s, "0000000000000000", "0000000000000000");
1213
1214 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1215 try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1216 try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
1217 try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
1218
1219 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
1220 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
1221 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
1222
1223 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1224 try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1225 try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1226 try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1227
1228 try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1229 try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
1230 try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
1231
1232 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1233
1234 try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
1235 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
1236 try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaa00aa");
1237
1238 try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaaaaaa");
1239 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
1240 try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaa00aa");
1241
1242 try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
1243 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa00aaaaaa");
1244 try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaa00aaaaaa");
1245
1246 try_istri(wot,h,s, "0000000000000000", "aaaaaaaa00aaaaaa");
1247 try_istri(wot,h,s, "8000000000000000", "aaaaaaaa00aaaaaa");
1248 try_istri(wot,h,s, "0000000000000001", "aaaaaaaa00aaaaaa");
1249
1250 try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
1251 try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
1252 }
1253
1254
1255
1256 //////////////////////////////////////////////////////////
1257 // //
1258 // main //
1259 // //
1260 //////////////////////////////////////////////////////////
1261
/* Program entry point: runs each per-immediate test group once, in a
   fixed order, and then exits with status 0. */
int main ( void )
{
   /* Groups defined earlier in the file. */
   istri_4B();
   istri_3B();
   istri_09();
   istri_1B();
   istri_03();
   istri_0D();

   /* Groups for immediates 0x13, 0x45, 0x01 and 0x39. */
   istri_13();
   istri_45();
   istri_01();
   istri_39();

   return 0;
}
1276