1 
2 /* Tests in detail the core arithmetic for pcmp{e,i}str{i,m} using
3    pcmpistri to drive it.  Does not check the e-vs-i or i-vs-m
4    aspect. */
5 
6 #include <string.h>
7 #include <stdio.h>
8 #include <assert.h>
9 
/* Short aliases for the integer types used throughout this file. */
typedef  unsigned int   UInt;
typedef  signed int     Int;
typedef  unsigned char  UChar;
typedef  unsigned short UShort;
typedef  unsigned long long int ULong;
/* Bool is stored in a single unsigned char; False and True are its
   only intended values. */
typedef  UChar          Bool;
#define False ((Bool)0)
#define True  ((Bool)1)
18 
//typedef  unsigned char  V128[16];
/* A vector operand, viewable as 16 bytes, 8 UShort lanes or 4 UInt
   lanes.  uInt and w32 are two names for the same UInt view. */
typedef
   union {
      UChar  uChar[16];
      UShort uShort[8];
      UInt   uInt[4];
      UInt   w32[4];
   }
   V128;
28 
/* Bit positions of the O, S, Z, A, C and P flags.  The software model
   builds its flag word with these, and the result is compared against
   the value read from the hardware flags register via pushfq. */
#define SHIFT_O   11
#define SHIFT_S   7
#define SHIFT_Z   6
#define SHIFT_A   4
#define SHIFT_C   0
#define SHIFT_P   2

/* Single-bit masks for the same flags.  OR-ing all six gives 0x8D5,
   the mask the h_pcmpistri_* functions apply to the hardware flags. */
#define MASK_O    (1ULL << SHIFT_O)
#define MASK_S    (1ULL << SHIFT_S)
#define MASK_Z    (1ULL << SHIFT_Z)
#define MASK_A    (1ULL << SHIFT_A)
#define MASK_C    (1ULL << SHIFT_C)
#define MASK_P    (1ULL << SHIFT_P)
42 
43 
/* Count the leading zero bits of a 32-bit value.

   Returns a number in 0 .. 32; clz32(0) is 32.

   Implemented as a binary search using unsigned arithmetic only, so
   the result does not depend on how the compiler right-shifts
   negative signed values or converts out-of-range unsigned values to
   signed ones. */
UInt clz32 ( UInt x )
{
   UInt n = 0;

   if (x == 0)
      return 32;

   /* At each step: if the top half of the remaining window is empty,
      count it and move the lower half up into its place. */
   if ((x & 0xFFFF0000u) == 0) { n += 16; x <<= 16; }
   if ((x & 0xFF000000u) == 0) { n +=  8; x <<=  8; }
   if ((x & 0xF0000000u) == 0) { n +=  4; x <<=  4; }
   if ((x & 0xC0000000u) == 0) { n +=  2; x <<=  2; }
   if ((x & 0x80000000u) == 0) { n +=  1; }

   return n;
}
67 
/* Count the trailing zero bits of a 32-bit value.  Returns 32 when x
   is zero. */
UInt ctz32 ( UInt x )
{
   /* Ones in exactly the bit positions below the lowest set bit of x
      (all 32 positions when x is zero). */
   UInt belowLowest = ~x & (x - 1);
   UInt leading     = clz32(belowLowest);
   return 32 - leading;
}
72 
/* Build a vector from a 16-character hex summary string.

   Each hex digit describes one byte: the digit's value is replicated
   into both nibbles, so 'a' becomes 0xAA and '0' becomes 0x00.  The
   LAST character of the string describes byte 0 and the first
   character describes byte 15.

   Asserts that the string is exactly 16 characters long and contains
   only hex digits (either case). */
void expand ( V128* dst, char* summary )
{
   assert( strlen(summary) == 16 );

   Int lane = 0;
   while (lane < 16) {
      UChar digit  = summary[15 - lane];
      UChar nibble = 0;

      if (digit >= '0' && digit <= '9')
         nibble = digit - '0';
      else if (digit >= 'A' && digit <= 'F')
         nibble = digit - 'A' + 10;
      else if (digit >= 'a' && digit <= 'f')
         nibble = digit - 'a' + 10;
      else
         assert(0);

      assert(nibble < 16);

      /* 0x11 * n places n in both the high and the low nibble. */
      dst->uChar[lane] = (UChar)(nibble * 0x11);
      lane++;
   }
}
91 
/* Run one test vector through both implementations and print the
   outcome.

   which  two-character tag naming the imm8 under test (printed only)
   h_fn   first implementation to call (callers pass the hardware one)
   s_fn   second implementation to call (callers pass the simulation)
   summL, summR  16-character hex summaries, expanded with expand()

   The output line shows both 32-bit results and ends with "!!!!" if
   they differ. */
void try_istri ( char* which,
                 UInt(*h_fn)(V128*,V128*),
                 UInt(*s_fn)(V128*,V128*),
                 char* summL, char* summR )
{
   V128 vecL, vecR;
   UInt hwResult, simResult;
   const char* verdict;

   assert(strlen(which) == 2);

   expand(&vecL, summL);
   expand(&vecR, summR);

   hwResult  = h_fn(&vecL, &vecR);
   simResult = s_fn(&vecL, &vecR);

   if (hwResult == simResult)
      verdict = "";
   else
      verdict = "!!!!";

   printf("istri %s  %s %s -> %08x %08x %s\n",
          which, summL, summR, hwResult, simResult, verdict);
}
106 
zmask_from_V128(V128 * arg)107 UInt zmask_from_V128 ( V128* arg )
108 {
109    UInt i, res = 0;
110    for (i = 0; i < 8; i++) {
111       res |=  ((arg->uShort[i] == 0) ? 1 : 0) << i;
112    }
113    return res;
114 }
115 
116 //////////////////////////////////////////////////////////
117 //                                                      //
118 //                       GENERAL                        //
119 //                                                      //
120 //////////////////////////////////////////////////////////
121 
122 
123 /* Given partial results from a 16-bit pcmpXstrX operation (intRes1,
124    basically), generate an I- or M-format output value, also the new
125    OSZACP flags.  */
126 static
PCMPxSTRx_WRK_gen_output_fmt_I_wide(V128 * resV,UInt * resOSZACP,UInt intRes1,UInt zmaskL,UInt zmaskR,UInt validL,UInt pol,UInt idx)127 void PCMPxSTRx_WRK_gen_output_fmt_I_wide ( /*OUT*/V128* resV,
128 					   /*OUT*/UInt* resOSZACP,
129 					   UInt intRes1,
130 					   UInt zmaskL, UInt zmaskR,
131 					   UInt validL,
132 					   UInt pol, UInt idx )
133 {
134    assert((pol >> 2) == 0);
135    assert((idx >> 1) == 0);
136 
137    UInt intRes2 = 0;
138    switch (pol) {
139       case 0: intRes2 = intRes1;          break; // pol +
140       case 1: intRes2 = ~intRes1;         break; // pol -
141       case 2: intRes2 = intRes1;          break; // pol m+
142       case 3: intRes2 = intRes1 ^ validL; break; // pol m-
143    }
144    intRes2 &= 0xFF;
145 
146    // generate I-format output (an index in ECX)
147    // generate ecx value
148    UInt newECX = 0;
149    if (idx) {
150      // index of ms-1-bit
151      newECX = intRes2 == 0 ? 8 : (31 - clz32(intRes2));
152    } else {
153      // index of ls-1-bit
154      newECX = intRes2 == 0 ? 8 : ctz32(intRes2);
155    }
156 
157    resV->w32[0] = newECX;
158    resV->w32[1] = 0;
159    resV->w32[2] = 0;
160    resV->w32[3] = 0;
161 
162    // generate new flags, common to all ISTRI and ISTRM cases
163    *resOSZACP    // A, P are zero
164      = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0
165      | ((zmaskL == 0)  ? 0 : MASK_Z) // Z == 1 iff any in argL is 0
166      | ((zmaskR == 0)  ? 0 : MASK_S) // S == 1 iff any in argR is 0
167      | ((intRes2 & 1) << SHIFT_O);   // O == IntRes2[0]
168 }
169 
170 /* Compute result and new OSZACP flags for all PCMP{E,I}STR{I,M}
171    variants on 16-bit characters.
172 
173    For xSTRI variants, the new ECX value is placed in the 32 bits
174    pointed to by *resV, and the top 96 bits are zeroed.  For xSTRM
175    variants, the result is a 128 bit value and is placed at *resV in
176    the obvious way.
177 
178    For all variants, the new OSZACP value is placed at *resOSZACP.
179 
180    argLV and argRV are the vector args.  The caller must prepare a
181    8-bit mask for each, zmaskL and zmaskR.  For ISTRx variants this
182    must be 1 for each zero byte of of the respective arg.  For ESTRx
183    variants this is derived from the explicit length indication, and
184    must be 0 in all places except at the bit index corresponding to
185    the valid length (0 .. 8).  If the valid length is 8 then the
186    mask must be all zeroes.  In all cases, bits 31:8 must be zero.
187 
188    imm8 is the original immediate from the instruction.  isSTRM
189    indicates whether this is a xSTRM or xSTRI variant, which controls
190    how much of *res is written.
191 
192    If the given imm8 case can be handled, the return value is True.
193    If not, False is returned, and neither *res not *resOSZACP are
194    altered.
195 */
196 
pcmpXstrX_WRK_wide(V128 * resV,UInt * resOSZACP,V128 * argLV,V128 * argRV,UInt zmaskL,UInt zmaskR,UInt imm8,Bool isxSTRM)197 Bool pcmpXstrX_WRK_wide ( /*OUT*/V128* resV,
198 			  /*OUT*/UInt* resOSZACP,
199 			  V128* argLV,  V128* argRV,
200 			  UInt zmaskL, UInt zmaskR,
201 			  UInt imm8,   Bool isxSTRM )
202 {
203    assert(imm8 < 0x80);
204    assert((zmaskL >> 8) == 0);
205    assert((zmaskR >> 8) == 0);
206 
207    /* Explicitly reject any imm8 values that haven't been validated,
208       even if they would probably work.  Life is too short to have
209       unvalidated cases in the code base. */
210    switch (imm8) {
211       case 0x01: case 0x03: case 0x09: case 0x0B: case 0x0D:
212       case 0x13:            case 0x1B:
213                             case 0x39: case 0x3B:
214                  case 0x45:            case 0x4B:
215          break;
216       default:
217          return False;
218    }
219 
220    UInt fmt = (imm8 >> 0) & 3; // imm8[1:0]  data format
221    UInt agg = (imm8 >> 2) & 3; // imm8[3:2]  aggregation fn
222    UInt pol = (imm8 >> 4) & 3; // imm8[5:4]  polarity
223    UInt idx = (imm8 >> 6) & 1; // imm8[6]    1==msb/bytemask
224 
225    /*----------------------------------------*/
226    /*-- strcmp on wide data                --*/
227    /*----------------------------------------*/
228 
229    if (agg == 2/*equal each, aka strcmp*/
230        && (fmt == 1/*uw*/ || fmt == 3/*sw*/)) {
231       Int    i;
232       UShort* argL = (UShort*)argLV;
233       UShort* argR = (UShort*)argRV;
234       UInt boolResII = 0;
235       for (i = 7; i >= 0; i--) {
236          UShort cL  = argL[i];
237          UShort cR  = argR[i];
238          boolResII = (boolResII << 1) | (cL == cR ? 1 : 0);
239       }
240       UInt validL = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
241       UInt validR = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
242 
243       // do invalidation, common to all equal-each cases
244       UInt intRes1
245          = (boolResII & validL & validR)  // if both valid, use cmpres
246            | (~ (validL | validR));       // if both invalid, force 1
247                                           // else force 0
248       intRes1 &= 0xFF;
249 
250       // generate I-format output
251       PCMPxSTRx_WRK_gen_output_fmt_I_wide(
252          resV, resOSZACP,
253          intRes1, zmaskL, zmaskR, validL, pol, idx
254       );
255 
256       return True;
257    }
258 
259    /*----------------------------------------*/
260    /*-- set membership on wide data        --*/
261    /*----------------------------------------*/
262 
263    if (agg == 0/*equal any, aka find chars in a set*/
264        && (fmt == 1/*uw*/ || fmt == 3/*sw*/)) {
265       /* argL: the string,  argR: charset */
266       UInt   si, ci;
267       UShort* argL    = (UShort*)argLV;
268       UShort* argR    = (UShort*)argRV;
269       UInt   boolRes = 0;
270       UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
271       UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
272 
273       for (si = 0; si < 8; si++) {
274          if ((validL & (1 << si)) == 0)
275             // run off the end of the string.
276             break;
277          UInt m = 0;
278          for (ci = 0; ci < 8; ci++) {
279             if ((validR & (1 << ci)) == 0) break;
280             if (argR[ci] == argL[si]) { m = 1; break; }
281          }
282          boolRes |= (m << si);
283       }
284 
285       // boolRes is "pre-invalidated"
286       UInt intRes1 = boolRes & 0xFF;
287 
288       // generate I-format output
289       PCMPxSTRx_WRK_gen_output_fmt_I_wide(
290          resV, resOSZACP,
291          intRes1, zmaskL, zmaskR, validL, pol, idx
292       );
293 
294       return True;
295    }
296 
297    /*----------------------------------------*/
298    /*-- substring search on wide data      --*/
299    /*----------------------------------------*/
300 
301    if (agg == 3/*equal ordered, aka substring search*/
302        && (fmt == 1/*uw*/ || fmt == 3/*sw*/)) {
303 
304       /* argL: haystack,  argR: needle */
305       UInt   ni, hi;
306       UShort* argL    = (UShort*)argLV;
307       UShort* argR    = (UShort*)argRV;
308       UInt   boolRes = 0;
309       UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
310       UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
311       for (hi = 0; hi < 8; hi++) {
312          UInt m = 1;
313          for (ni = 0; ni < 8; ni++) {
314             if ((validR & (1 << ni)) == 0) break;
315             UInt i = ni + hi;
316             if (i >= 8) break;
317             if (argL[i] != argR[ni]) { m = 0; break; }
318          }
319          boolRes |= (m << hi);
320          if ((validL & (1 << hi)) == 0)
321             // run off the end of the haystack
322             break;
323       }
324 
325       // boolRes is "pre-invalidated"
326       UInt intRes1 = boolRes & 0xFF;
327 
328       // generate I-format output
329       PCMPxSTRx_WRK_gen_output_fmt_I_wide(
330          resV, resOSZACP,
331          intRes1, zmaskL, zmaskR, validL, pol, idx
332       );
333 
334       return True;
335    }
336 
337    /*----------------------------------------*/
338    /*-- ranges, unsigned wide data         --*/
339    /*----------------------------------------*/
340 
341    if (agg == 1/*ranges*/
342        && fmt == 1/*uw*/) {
343 
344       /* argL: string,  argR: range-pairs */
345       UInt   ri, si;
346       UShort* argL    = (UShort*)argLV;
347       UShort* argR    = (UShort*)argRV;
348       UInt   boolRes = 0;
349       UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
350       UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
351       for (si = 0; si < 8; si++) {
352          if ((validL & (1 << si)) == 0)
353             // run off the end of the string
354             break;
355          UInt m = 0;
356          for (ri = 0; ri < 8; ri += 2) {
357             if ((validR & (3 << ri)) != (3 << ri)) break;
358             if (argR[ri] <= argL[si] && argL[si] <= argR[ri+1]) {
359                m = 1; break;
360             }
361          }
362          boolRes |= (m << si);
363       }
364 
365       // boolRes is "pre-invalidated"
366       UInt intRes1 = boolRes & 0xFF;
367 
368       // generate I-format output
369       PCMPxSTRx_WRK_gen_output_fmt_I_wide(
370          resV, resOSZACP,
371          intRes1, zmaskL, zmaskR, validL, pol, idx
372       );
373 
374       return True;
375    }
376 
377    return False;
378 }
379 
380 //////////////////////////////////////////////////////////
381 //                                                      //
382 //                       ISTRI_4B                       //
383 //                                                      //
384 //////////////////////////////////////////////////////////
385 
/* Hardware reference for pcmpistri with imm8 = 0x4B.

   Copies both operands into one contiguous buffer, loads argL into
   %xmm2 and argR into %xmm11, executes the instruction, and reads
   back %rcx and the flags register.

   Returns (flags & 0x8D5) << 16, OR-ed with the low 16 bits of %rcx.
   0x8D5 is the union of the MASK_O/S/Z/A/P/C bits. */
UInt h_pcmpistri_4B ( V128* argL, V128* argR )
{
   V128 block[2];
   memcpy(&block[0], argL, sizeof(V128));
   memcpy(&block[1], argR, sizeof(V128));
   ULong res, flags;
   __asm__ __volatile__(
      /* Move %rsp down for the duration of the sequence; presumably
         so that pushfq cannot overwrite data the compiler keeps
         just below the stack pointer -- TODO confirm. */
      "subq      $1024,  %%rsp"             "\n\t"
      "movdqu    0(%2),  %%xmm2"            "\n\t"
      "movdqu    16(%2), %%xmm11"           "\n\t"
      "pcmpistri $0x4B,  %%xmm2, %%xmm11"   "\n\t"
      /* Capture the flags into %rdx via the stack. */
      "pushfq"                              "\n\t"
      "popq      %%rdx"                     "\n\t"
      "movq      %%rcx,  %0"                "\n\t"
      "movq      %%rdx,  %1"                "\n\t"
      "addq      $1024,  %%rsp"             "\n\t"
      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   );
   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
}
407 
s_pcmpistri_4B(V128 * argLU,V128 * argRU)408 UInt s_pcmpistri_4B ( V128* argLU, V128* argRU )
409 {
410    V128 resV;
411    UInt resOSZACP, resECX;
412    Bool ok
413       = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
414 			    zmask_from_V128(argLU),
415 			    zmask_from_V128(argRU),
416 			    0x4B, False/*!isSTRM*/
417         );
418    assert(ok);
419    resECX = resV.uInt[0];
420    return (resOSZACP << 16) | resECX;
421 }
422 
istri_4B(void)423 void istri_4B ( void )
424 {
425    char* wot = "4B";
426    UInt(*h)(V128*,V128*) = h_pcmpistri_4B;
427    UInt(*s)(V128*,V128*) = s_pcmpistri_4B;
428 
429    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
430 
431    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
432    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
433    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
434    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
435 
436    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
437    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
438    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
439 
440    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
441    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
442    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
443    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
444 
445    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
446    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
447    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
448 
449    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
450 
451    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
452    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
453    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaa00aa");
454 
455    try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaaaaaa");
456    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
457    try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaa00aa");
458 
459    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
460    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa00aaaaaa");
461    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaa00aaaaaa");
462 
463    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa00aaaaaa");
464    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa00aaaaaa");
465    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa00aaaaaa");
466 
467    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
468    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
469 }
470 
471 //////////////////////////////////////////////////////////
472 //                                                      //
473 //                       ISTRI_3B                       //
474 //                                                      //
475 //////////////////////////////////////////////////////////
476 
/* Hardware reference for pcmpistri with imm8 = 0x3B.

   Copies both operands into one contiguous buffer, loads argL into
   %xmm2 and argR into %xmm11, executes the instruction, and reads
   back %rcx and the flags register.

   Returns (flags & 0x8D5) << 16, OR-ed with the low 16 bits of %rcx.
   0x8D5 is the union of the MASK_O/S/Z/A/P/C bits. */
UInt h_pcmpistri_3B ( V128* argL, V128* argR )
{
   V128 block[2];
   memcpy(&block[0], argL, sizeof(V128));
   memcpy(&block[1], argR, sizeof(V128));
   ULong res, flags;
   __asm__ __volatile__(
      /* Move %rsp down for the duration of the sequence; presumably
         so that pushfq cannot overwrite data the compiler keeps
         just below the stack pointer -- TODO confirm. */
      "subq      $1024,  %%rsp"             "\n\t"
      "movdqu    0(%2),  %%xmm2"            "\n\t"
      "movdqu    16(%2), %%xmm11"           "\n\t"
      "pcmpistri $0x3B,  %%xmm2, %%xmm11"   "\n\t"
      /* Capture the flags into %rdx via the stack. */
      "pushfq"                              "\n\t"
      "popq      %%rdx"                     "\n\t"
      "movq      %%rcx,  %0"                "\n\t"
      "movq      %%rdx,  %1"                "\n\t"
      "addq      $1024,  %%rsp"             "\n\t"
      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   );
   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
}
498 
s_pcmpistri_3B(V128 * argLU,V128 * argRU)499 UInt s_pcmpistri_3B ( V128* argLU, V128* argRU )
500 {
501    V128 resV;
502    UInt resOSZACP, resECX;
503    Bool ok
504       = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
505 			    zmask_from_V128(argLU),
506 			    zmask_from_V128(argRU),
507 			    0x3B, False/*!isSTRM*/
508         );
509    assert(ok);
510    resECX = resV.uInt[0];
511    return (resOSZACP << 16) | resECX;
512 }
513 
istri_3B(void)514 void istri_3B ( void )
515 {
516    char* wot = "3B";
517    UInt(*h)(V128*,V128*) = h_pcmpistri_3B;
518    UInt(*s)(V128*,V128*) = s_pcmpistri_3B;
519 
520    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
521 
522    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
523    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
524    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
525    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
526 
527    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
528    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
529    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
530 
531    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
532    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
533    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
534    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
535 
536    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
537    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
538    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
539 
540    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
541 
542    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
543    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
544    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaa00aa");
545 
546    try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaaaaaa");
547    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
548    try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaa00aa");
549 
550    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
551    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa00aaaaaa");
552    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaa00aaaaaa");
553 
554    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa00aaaaaa");
555    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa00aaaaaa");
556    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa00aaaaaa");
557 
558    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
559    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
560 }
561 
562 
563 
564 //////////////////////////////////////////////////////////
565 //                                                      //
566 //                       ISTRI_0D                       //
567 //                                                      //
568 //////////////////////////////////////////////////////////
569 
570 __attribute__((noinline))
h_pcmpistri_0D(V128 * argL,V128 * argR)571 UInt h_pcmpistri_0D ( V128* argL, V128* argR )
572 {
573    V128 block[2];
574    memcpy(&block[0], argL, sizeof(V128));
575    memcpy(&block[1], argR, sizeof(V128));
576    ULong res = 0, flags = 0;
577    __asm__ __volatile__(
578       "movdqu    0(%2),  %%xmm2"            "\n\t"
579       "movdqu    16(%2), %%xmm11"           "\n\t"
580       "pcmpistri $0x0D,  %%xmm2, %%xmm11"   "\n\t"
581       //"pcmpistrm $0x0D,  %%xmm2, %%xmm11"   "\n\t"
582       //"movd %%xmm0, %%ecx" "\n\t"
583       "pushfq"                              "\n\t"
584       "popq      %%rdx"                     "\n\t"
585       "movq      %%rcx,  %0"                "\n\t"
586       "movq      %%rdx,  %1"                "\n\t"
587       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
588       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
589    );
590    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
591 }
592 
s_pcmpistri_0D(V128 * argLU,V128 * argRU)593 UInt s_pcmpistri_0D ( V128* argLU, V128* argRU )
594 {
595    V128 resV;
596    UInt resOSZACP, resECX;
597    Bool ok
598       = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
599 			    zmask_from_V128(argLU),
600 			    zmask_from_V128(argRU),
601 			    0x0D, False/*!isSTRM*/
602         );
603    assert(ok);
604    resECX = resV.uInt[0];
605    return (resOSZACP << 16) | resECX;
606 }
607 
istri_0D(void)608 void istri_0D ( void )
609 {
610    char* wot = "0D";
611    UInt(*h)(V128*,V128*) = h_pcmpistri_0D;
612    UInt(*s)(V128*,V128*) = s_pcmpistri_0D;
613 
614    try_istri(wot,h,s, "11111111abcdef11", "0000000000abcdef");
615 
616    try_istri(wot,h,s, "11111111abcdef11", "00abcdef00abcdef");
617 
618    try_istri(wot,h,s, "11111111abcdef11", "0000000000abcdef");
619    try_istri(wot,h,s, "1111111111abcdef", "0000000000abcdef");
620    try_istri(wot,h,s, "111111111111abcd", "0000000000abcdef");
621 
622    try_istri(wot,h,s, "1111abcd11abcd11", "000000000000abcd");
623 
624    try_istri(wot,h,s, "11abcd1111abcd11", "000000000000abcd");
625    try_istri(wot,h,s, "abcd111111abcd11", "000000000000abcd");
626    try_istri(wot,h,s, "cd11111111abcd11", "000000000000abcd");
627 
628    try_istri(wot,h,s, "01abcd11abcd1111", "000000000000abcd");
629    try_istri(wot,h,s, "00abcd11abcd1111", "000000000000abcd");
630    try_istri(wot,h,s, "0000cd11abcd1111", "000000000000abcd");
631 
632    try_istri(wot,h,s, "00abcd1100abcd11", "000000000000abcd");
633    try_istri(wot,h,s, "00abcd110000cd11", "000000000000abcd");
634 
635    try_istri(wot,h,s, "1111111111111234", "0000000000000000");
636    try_istri(wot,h,s, "1111111111111234", "0000000000000011");
637    try_istri(wot,h,s, "1111111111111234", "0000000000001111");
638 
639    try_istri(wot,h,s, "1111111111111234", "1111111111111234");
640    try_istri(wot,h,s, "0a11111111111111", "000000000000000a");
641    try_istri(wot,h,s, "0b11111111111111", "000000000000000a");
642 
643    try_istri(wot,h,s, "b111111111111111", "0000000000000000");
644    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
645    try_istri(wot,h,s, "123456789abcdef1", "0000000000000000");
646    try_istri(wot,h,s, "0000000000000000", "123456789abcdef1");
647 }
648 
649 
650 //////////////////////////////////////////////////////////
651 //                                                      //
652 //                       ISTRI_09                       //
653 //                                                      //
654 //////////////////////////////////////////////////////////
655 
/* Hardware reference for pcmpistri with imm8 = 0x09.

   Copies both operands into one contiguous buffer, loads argL into
   %xmm2 and argR into %xmm11, executes the instruction, and reads
   back %rcx and the flags register.

   Returns (flags & 0x8D5) << 16, OR-ed with the low 16 bits of %rcx.
   0x8D5 is the union of the MASK_O/S/Z/A/P/C bits. */
UInt h_pcmpistri_09 ( V128* argL, V128* argR )
{
   V128 block[2];
   memcpy(&block[0], argL, sizeof(V128));
   memcpy(&block[1], argR, sizeof(V128));
   ULong res, flags;
   __asm__ __volatile__(
      /* Move %rsp down for the duration of the sequence; presumably
         so that pushfq cannot overwrite data the compiler keeps
         just below the stack pointer -- TODO confirm. */
      "subq      $1024,  %%rsp"             "\n\t"
      "movdqu    0(%2),  %%xmm2"            "\n\t"
      "movdqu    16(%2), %%xmm11"           "\n\t"
      "pcmpistri $0x09,  %%xmm2, %%xmm11"   "\n\t"
      /* Capture the flags into %rdx via the stack. */
      "pushfq"                              "\n\t"
      "popq      %%rdx"                     "\n\t"
      "movq      %%rcx,  %0"                "\n\t"
      "movq      %%rdx,  %1"                "\n\t"
      "addq      $1024,  %%rsp"             "\n\t"
      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   );
   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
}
677 
s_pcmpistri_09(V128 * argLU,V128 * argRU)678 UInt s_pcmpistri_09 ( V128* argLU, V128* argRU )
679 {
680    V128 resV;
681    UInt resOSZACP, resECX;
682    Bool ok
683       = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
684 			    zmask_from_V128(argLU),
685 			    zmask_from_V128(argRU),
686 			    0x09, False/*!isSTRM*/
687         );
688    assert(ok);
689    resECX = resV.uInt[0];
690    return (resOSZACP << 16) | resECX;
691 }
692 
istri_09(void)693 void istri_09 ( void )
694 {
695    char* wot = "09";
696    UInt(*h)(V128*,V128*) = h_pcmpistri_09;
697    UInt(*s)(V128*,V128*) = s_pcmpistri_09;
698 
699    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
700 
701    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
702    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
703    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
704    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
705 
706    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
707    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
708    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
709 
710    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
711    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
712    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
713    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
714 
715    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
716    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
717    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
718 
719    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
720 
721    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
722    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
723    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaa00aa");
724 
725    try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaaaaaa");
726    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
727    try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaa00aa");
728 
729    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
730    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa00aaaaaa");
731    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaa00aaaaaa");
732 
733    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa00aaaaaa");
734    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa00aaaaaa");
735    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa00aaaaaa");
736 
737    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
738    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
739 }
740 
741 
742 
743 //////////////////////////////////////////////////////////
744 //                                                      //
745 //                       ISTRI_1B                       //
746 //                                                      //
747 //////////////////////////////////////////////////////////
748 
/* Hardware reference for pcmpistri with imm8 = 0x1B.

   Copies both operands into one contiguous buffer, loads argL into
   %xmm2 and argR into %xmm11, executes the instruction, and reads
   back %rcx and the flags register.

   Returns (flags & 0x8D5) << 16, OR-ed with the low 16 bits of %rcx.
   0x8D5 is the union of the MASK_O/S/Z/A/P/C bits. */
UInt h_pcmpistri_1B ( V128* argL, V128* argR )
{
   V128 block[2];
   memcpy(&block[0], argL, sizeof(V128));
   memcpy(&block[1], argR, sizeof(V128));
   ULong res, flags;
   __asm__ __volatile__(
      /* Move %rsp down for the duration of the sequence; presumably
         so that pushfq cannot overwrite data the compiler keeps
         just below the stack pointer -- TODO confirm. */
      "subq      $1024,  %%rsp"             "\n\t"
      "movdqu    0(%2),  %%xmm2"            "\n\t"
      "movdqu    16(%2), %%xmm11"           "\n\t"
      "pcmpistri $0x1B,  %%xmm2, %%xmm11"   "\n\t"
      /* Capture the flags into %rdx via the stack. */
      "pushfq"                              "\n\t"
      "popq      %%rdx"                     "\n\t"
      "movq      %%rcx,  %0"                "\n\t"
      "movq      %%rdx,  %1"                "\n\t"
      "addq      $1024,  %%rsp"             "\n\t"
      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   );
   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
}
770 
s_pcmpistri_1B(V128 * argLU,V128 * argRU)771 UInt s_pcmpistri_1B ( V128* argLU, V128* argRU )
772 {
773    V128 resV;
774    UInt resOSZACP, resECX;
775    Bool ok
776       = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
777 			    zmask_from_V128(argLU),
778 			    zmask_from_V128(argRU),
779 			    0x1B, False/*!isSTRM*/
780         );
781    assert(ok);
782    resECX = resV.uInt[0];
783    return (resOSZACP << 16) | resECX;
784 }
785 
istri_1B(void)786 void istri_1B ( void )
787 {
788    char* wot = "1B";
789    UInt(*h)(V128*,V128*) = h_pcmpistri_1B;
790    UInt(*s)(V128*,V128*) = s_pcmpistri_1B;
791 
792    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
793 
794    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
795    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
796    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
797    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
798 
799    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
800    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
801    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
802 
803    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
804    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
805    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
806    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
807 
808    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
809    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
810    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
811 
812    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
813 
814    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
815    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
816    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaa00aa");
817 
818    try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaaaaaa");
819    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
820    try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaa00aa");
821 
822    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
823    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa00aaaaaa");
824    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaa00aaaaaa");
825 
826    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa00aaaaaa");
827    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa00aaaaaa");
828    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa00aaaaaa");
829 
830    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
831    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
832 }
833 
834 
835 
836 //////////////////////////////////////////////////////////
837 //                                                      //
838 //                       ISTRI_03                       //
839 //                                                      //
840 //////////////////////////////////////////////////////////
841 
h_pcmpistri_03(V128 * argL,V128 * argR)842 UInt h_pcmpistri_03 ( V128* argL, V128* argR )
843 {
844    V128 block[2];
845    memcpy(&block[0], argL, sizeof(V128));
846    memcpy(&block[1], argR, sizeof(V128));
847    ULong res, flags;
848    __asm__ __volatile__(
849       "subq      $1024,  %%rsp"             "\n\t"
850       "movdqu    0(%2),  %%xmm2"            "\n\t"
851       "movdqu    16(%2), %%xmm11"           "\n\t"
852       "pcmpistri $0x03,  %%xmm2, %%xmm11"   "\n\t"
853 //"pcmpistrm $0x03, %%xmm2, %%xmm11"   "\n\t"
854 //"movd %%xmm0, %%ecx" "\n\t"
855       "pushfq"                              "\n\t"
856       "popq      %%rdx"                     "\n\t"
857       "movq      %%rcx,  %0"                "\n\t"
858       "movq      %%rdx,  %1"                "\n\t"
859       "addq      $1024,  %%rsp"             "\n\t"
860       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
861       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
862    );
863    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
864 }
865 
s_pcmpistri_03(V128 * argLU,V128 * argRU)866 UInt s_pcmpistri_03 ( V128* argLU, V128* argRU )
867 {
868    V128 resV;
869    UInt resOSZACP, resECX;
870    Bool ok
871       = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
872 			    zmask_from_V128(argLU),
873 			    zmask_from_V128(argRU),
874 			    0x03, False/*!isSTRM*/
875         );
876    assert(ok);
877    resECX = resV.uInt[0];
878    return (resOSZACP << 16) | resECX;
879 }
880 
istri_03(void)881 void istri_03 ( void )
882 {
883    char* wot = "03";
884    UInt(*h)(V128*,V128*) = h_pcmpistri_03;
885    UInt(*s)(V128*,V128*) = s_pcmpistri_03;
886 
887    try_istri(wot,h,s, "aacdacbdaacdaacd", "00000000000000aa");
888    try_istri(wot,h,s, "aabbaabbaabbaabb", "00000000000000bb");
889    try_istri(wot,h,s, "aabbccddaabbccdd", "000000000000aabb");
890    try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
891 
892    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbccdd");
893    try_istri(wot,h,s, "00bbccddaabbccdd", "00000000aabbccdd");
894    try_istri(wot,h,s, "aabbccddaa00ccdd", "00000000aabbccdd");
895    try_istri(wot,h,s, "aabbccddaabb00dd", "00000000aabbccdd");
896    try_istri(wot,h,s, "aabbccddaabbcc00", "00000000aabbccdd");
897 
898    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbccdd");
899    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aa00ccdd");
900    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabb00dd");
901    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbcc00");
902 
903    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
904    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
905 
906    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
907    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
908    try_istri(wot,h,s, "0000aabbaabbaabb", "000000000000bbbb");
909    try_istri(wot,h,s, "0000ccddaabbccdd", "00000000bbaabbaa");
910 
911    try_istri(wot,h,s, "0000ccddaabbccdd", "000000bbaabbaa00");
912 
913    try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
914    try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
915 }
916 
917 
918 //////////////////////////////////////////////////////////
919 //                                                      //
920 //                       ISTRI_13                       //
921 //                                                      //
922 //////////////////////////////////////////////////////////
923 
h_pcmpistri_13(V128 * argL,V128 * argR)924 UInt h_pcmpistri_13 ( V128* argL, V128* argR )
925 {
926    V128 block[2];
927    memcpy(&block[0], argL, sizeof(V128));
928    memcpy(&block[1], argR, sizeof(V128));
929    ULong res, flags;
930    __asm__ __volatile__(
931       "subq      $1024,  %%rsp"             "\n\t"
932       "movdqu    0(%2),  %%xmm2"            "\n\t"
933       "movdqu    16(%2), %%xmm11"           "\n\t"
934       "pcmpistri $0x13,  %%xmm2, %%xmm11"   "\n\t"
935 //"pcmpistrm $0x13, %%xmm2, %%xmm11"   "\n\t"
936 //"movd %%xmm0, %%ecx" "\n\t"
937       "pushfq"                              "\n\t"
938       "popq      %%rdx"                     "\n\t"
939       "movq      %%rcx,  %0"                "\n\t"
940       "movq      %%rdx,  %1"                "\n\t"
941       "addq      $1024,  %%rsp"             "\n\t"
942       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
943       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
944    );
945    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
946 }
947 
s_pcmpistri_13(V128 * argLU,V128 * argRU)948 UInt s_pcmpistri_13 ( V128* argLU, V128* argRU )
949 {
950    V128 resV;
951    UInt resOSZACP, resECX;
952    Bool ok
953       = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
954 			    zmask_from_V128(argLU),
955 			    zmask_from_V128(argRU),
956 			    0x13, False/*!isSTRM*/
957         );
958    assert(ok);
959    resECX = resV.uInt[0];
960    return (resOSZACP << 16) | resECX;
961 }
962 
istri_13(void)963 void istri_13 ( void )
964 {
965    char* wot = "13";
966    UInt(*h)(V128*,V128*) = h_pcmpistri_13;
967    UInt(*s)(V128*,V128*) = s_pcmpistri_13;
968 
969    try_istri(wot,h,s, "aacdacbdaacdaacd", "00000000000000aa");
970    try_istri(wot,h,s, "aabbaabbaabbaabb", "00000000000000bb");
971    try_istri(wot,h,s, "aabbccddaabbccdd", "000000000000aabb");
972    try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
973 
974    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbccdd");
975    try_istri(wot,h,s, "00bbccddaabbccdd", "00000000aabbccdd");
976    try_istri(wot,h,s, "aabbccddaa00ccdd", "00000000aabbccdd");
977    try_istri(wot,h,s, "aabbccddaabb00dd", "00000000aabbccdd");
978    try_istri(wot,h,s, "aabbccddaabbcc00", "00000000aabbccdd");
979 
980    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbccdd");
981    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aa00ccdd");
982    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabb00dd");
983    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbcc00");
984 
985    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
986    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
987 
988    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
989    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
990    try_istri(wot,h,s, "0000aabbaabbaabb", "000000000000bbbb");
991    try_istri(wot,h,s, "0000ccddaabbccdd", "00000000bbaabbaa");
992 
993    try_istri(wot,h,s, "0000ccddaabbccdd", "000000bbaabbaa00");
994 
995    try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
996    try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
997 }
998 
999 
1000 
1001 //////////////////////////////////////////////////////////
1002 //                                                      //
1003 //                       ISTRI_45                       //
1004 //                                                      //
1005 //////////////////////////////////////////////////////////
1006 
h_pcmpistri_45(V128 * argL,V128 * argR)1007 UInt h_pcmpistri_45 ( V128* argL, V128* argR )
1008 {
1009    V128 block[2];
1010    memcpy(&block[0], argL, sizeof(V128));
1011    memcpy(&block[1], argR, sizeof(V128));
1012    ULong res, flags;
1013    __asm__ __volatile__(
1014       "subq      $1024,  %%rsp"             "\n\t"
1015       "movdqu    0(%2),  %%xmm2"            "\n\t"
1016       "movdqu    16(%2), %%xmm11"           "\n\t"
1017       "pcmpistri $0x45,  %%xmm2, %%xmm11"   "\n\t"
1018 //"pcmpistrm $0x04, %%xmm2, %%xmm11"   "\n\t"
1019 //"movd %%xmm0, %%ecx" "\n\t"
1020       "pushfq"                              "\n\t"
1021       "popq      %%rdx"                     "\n\t"
1022       "movq      %%rcx,  %0"                "\n\t"
1023       "movq      %%rdx,  %1"                "\n\t"
1024       "addq      $1024,  %%rsp"             "\n\t"
1025       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
1026       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
1027    );
1028    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
1029 }
1030 
s_pcmpistri_45(V128 * argLU,V128 * argRU)1031 UInt s_pcmpistri_45 ( V128* argLU, V128* argRU )
1032 {
1033    V128 resV;
1034    UInt resOSZACP, resECX;
1035    Bool ok
1036       = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
1037 			    zmask_from_V128(argLU),
1038 			    zmask_from_V128(argRU),
1039 			    0x45, False/*!isSTRM*/
1040         );
1041    assert(ok);
1042    resECX = resV.uInt[0];
1043    return (resOSZACP << 16) | resECX;
1044 }
1045 
istri_45(void)1046 void istri_45 ( void )
1047 {
1048    char* wot = "45";
1049    UInt(*h)(V128*,V128*) = h_pcmpistri_45;
1050    UInt(*s)(V128*,V128*) = s_pcmpistri_45;
1051 
1052    try_istri(wot,h,s, "aaaabbbbccccdddd", "000000000000bbcc");
1053    try_istri(wot,h,s, "aaaabbbbccccdddd", "000000000000ccbb");
1054    try_istri(wot,h,s, "baaabbbbccccdddd", "000000000000ccbb");
1055    try_istri(wot,h,s, "baaabbbbccccdddc", "000000000000ccbb");
1056 
1057    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "000000000000ccbb");
1058    try_istri(wot,h,s, "bbbbbbbb00bbbbbb", "000000000000ccbb");
1059    try_istri(wot,h,s, "bbbbbbbbbbbb00bb", "000000000000ccbb");
1060    try_istri(wot,h,s, "bbbbbbbbbbbbbb00", "000000000000ccbb");
1061    try_istri(wot,h,s, "0000000000000000", "000000000000ccbb");
1062 
1063    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
1064 
1065    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "000000000000ccbb");
1066    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000bb");
1067    try_istri(wot,h,s, "bb44bb44bb44bb44", "000000006622ccbb");
1068 
1069    try_istri(wot,h,s, "bb44bb44bb44bb44", "000000000022ccbb");
1070    try_istri(wot,h,s, "bb44bb44bb44bb44", "000000000000ccbb");
1071    try_istri(wot,h,s, "bb44bb44bb44bb44", "00000000000000bb");
1072 
1073    try_istri(wot,h,s, "0011223344556677", "0000997755442211");
1074    try_istri(wot,h,s, "1122334455667711", "0000997755442211");
1075 
1076    try_istri(wot,h,s, "0011223344556677", "0000aa8866553322");
1077    try_istri(wot,h,s, "1122334455667711", "0000aa8866553322");
1078 }
1079 
1080 
1081 //////////////////////////////////////////////////////////
1082 //                                                      //
1083 //                       ISTRI_01                       //
1084 //                                                      //
1085 //////////////////////////////////////////////////////////
1086 
h_pcmpistri_01(V128 * argL,V128 * argR)1087 UInt h_pcmpistri_01 ( V128* argL, V128* argR )
1088 {
1089    V128 block[2];
1090    memcpy(&block[0], argL, sizeof(V128));
1091    memcpy(&block[1], argR, sizeof(V128));
1092    ULong res, flags;
1093    __asm__ __volatile__(
1094       "subq      $1024,  %%rsp"             "\n\t"
1095       "movdqu    0(%2),  %%xmm2"            "\n\t"
1096       "movdqu    16(%2), %%xmm11"           "\n\t"
1097       "pcmpistri $0x01,  %%xmm2, %%xmm11"   "\n\t"
1098 //"pcmpistrm $0x01, %%xmm2, %%xmm11"   "\n\t"
1099 //"movd %%xmm0, %%ecx" "\n\t"
1100       "pushfq"                              "\n\t"
1101       "popq      %%rdx"                     "\n\t"
1102       "movq      %%rcx,  %0"                "\n\t"
1103       "movq      %%rdx,  %1"                "\n\t"
1104       "addq      $1024,  %%rsp"             "\n\t"
1105       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
1106       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
1107    );
1108    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
1109 }
1110 
s_pcmpistri_01(V128 * argLU,V128 * argRU)1111 UInt s_pcmpistri_01 ( V128* argLU, V128* argRU )
1112 {
1113    V128 resV;
1114    UInt resOSZACP, resECX;
1115    Bool ok
1116       = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
1117 			    zmask_from_V128(argLU),
1118 			    zmask_from_V128(argRU),
1119 			    0x01, False/*!isSTRM*/
1120         );
1121    assert(ok);
1122    resECX = resV.uInt[0];
1123    return (resOSZACP << 16) | resECX;
1124 }
1125 
istri_01(void)1126 void istri_01 ( void )
1127 {
1128    char* wot = "01";
1129    UInt(*h)(V128*,V128*) = h_pcmpistri_01;
1130    UInt(*s)(V128*,V128*) = s_pcmpistri_01;
1131 
1132    try_istri(wot,h,s, "aacdacbdaacdaacd", "00000000000000aa");
1133    try_istri(wot,h,s, "aabbaabbaabbaabb", "00000000000000bb");
1134    try_istri(wot,h,s, "aabbccddaabbccdd", "000000000000aabb");
1135    try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
1136 
1137    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbccdd");
1138    try_istri(wot,h,s, "00bbccddaabbccdd", "00000000aabbccdd");
1139    try_istri(wot,h,s, "aabbccddaa00ccdd", "00000000aabbccdd");
1140    try_istri(wot,h,s, "aabbccddaabb00dd", "00000000aabbccdd");
1141    try_istri(wot,h,s, "aabbccddaabbcc00", "00000000aabbccdd");
1142 
1143    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbccdd");
1144    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aa00ccdd");
1145    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabb00dd");
1146    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbcc00");
1147 
1148    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
1149    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1150 
1151    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
1152    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
1153    try_istri(wot,h,s, "0000aabbaabbaabb", "000000000000bbbb");
1154    try_istri(wot,h,s, "0000ccddaabbccdd", "00000000bbaabbaa");
1155 
1156    try_istri(wot,h,s, "0000ccddaabbccdd", "000000bbaabbaa00");
1157 
1158    try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
1159    try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
1160 }
1161 
1162 
1163 //////////////////////////////////////////////////////////
1164 //                                                      //
1165 //                       ISTRI_39                       //
1166 //                                                      //
1167 //////////////////////////////////////////////////////////
1168 
h_pcmpistri_39(V128 * argL,V128 * argR)1169 UInt h_pcmpistri_39 ( V128* argL, V128* argR )
1170 {
1171    V128 block[2];
1172    memcpy(&block[0], argL, sizeof(V128));
1173    memcpy(&block[1], argR, sizeof(V128));
1174    ULong res, flags;
1175    __asm__ __volatile__(
1176       "subq      $1024,  %%rsp"             "\n\t"
1177       "movdqu    0(%2),  %%xmm2"            "\n\t"
1178       "movdqu    16(%2), %%xmm11"           "\n\t"
1179       "pcmpistri $0x39,  %%xmm2, %%xmm11"   "\n\t"
1180       "pushfq"                              "\n\t"
1181       "popq      %%rdx"                     "\n\t"
1182       "movq      %%rcx,  %0"                "\n\t"
1183       "movq      %%rdx,  %1"                "\n\t"
1184       "addq      $1024,  %%rsp"             "\n\t"
1185       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
1186       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
1187    );
1188    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
1189 }
1190 
s_pcmpistri_39(V128 * argLU,V128 * argRU)1191 UInt s_pcmpistri_39 ( V128* argLU, V128* argRU )
1192 {
1193    V128 resV;
1194    UInt resOSZACP, resECX;
1195    Bool ok
1196       = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
1197 			    zmask_from_V128(argLU),
1198 			    zmask_from_V128(argRU),
1199 			    0x39, False/*!isSTRM*/
1200         );
1201    assert(ok);
1202    resECX = resV.uInt[0];
1203    return (resOSZACP << 16) | resECX;
1204 }
1205 
istri_39(void)1206 void istri_39 ( void )
1207 {
1208    char* wot = "39";
1209    UInt(*h)(V128*,V128*) = h_pcmpistri_39;
1210    UInt(*s)(V128*,V128*) = s_pcmpistri_39;
1211 
1212    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
1213 
1214    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1215    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1216    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
1217    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
1218 
1219    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
1220    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
1221    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
1222 
1223    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1224    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1225    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1226    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1227 
1228    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1229    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
1230    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
1231 
1232    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1233 
1234    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
1235    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
1236    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaa00aa");
1237 
1238    try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaaaaaa");
1239    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
1240    try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaa00aa");
1241 
1242    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
1243    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa00aaaaaa");
1244    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaa00aaaaaa");
1245 
1246    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa00aaaaaa");
1247    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa00aaaaaa");
1248    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa00aaaaaa");
1249 
1250    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
1251    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
1252 }
1253 
1254 
1255 
1256 //////////////////////////////////////////////////////////
1257 //                                                      //
1258 //                         main                         //
1259 //                                                      //
1260 //////////////////////////////////////////////////////////
1261 
main(void)1262 int main ( void )
1263 {
1264    istri_4B();
1265    istri_3B();
1266    istri_09();
1267    istri_1B();
1268    istri_03();
1269    istri_0D();
1270    istri_13();
1271    istri_45();
1272    istri_01();
1273    istri_39();
1274    return 0;
1275 }
1276