1 
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <assert.h>
5 
6 #define VERBOSE 0
7 
/* Short aliases for the basic integer and character types used by the
   rest of this file. */
typedef  unsigned int            UInt;
typedef  unsigned char           UChar;
typedef  unsigned long long int  ULong;
typedef  signed long long int    Long;
typedef  signed int              Int;
typedef  unsigned short          UShort;
typedef  unsigned long           UWord;
typedef  char                    HChar;
16 
17 /////////////////////////////////////////////////////////////////
18 // BEGIN crc32 stuff                                           //
19 /////////////////////////////////////////////////////////////////
20 
21 static const UInt crc32Table[256] = {
22 
23    /*-- Ugly, innit? --*/
24 
25    0x00000000L, 0x04c11db7L, 0x09823b6eL, 0x0d4326d9L,
26    0x130476dcL, 0x17c56b6bL, 0x1a864db2L, 0x1e475005L,
27    0x2608edb8L, 0x22c9f00fL, 0x2f8ad6d6L, 0x2b4bcb61L,
28    0x350c9b64L, 0x31cd86d3L, 0x3c8ea00aL, 0x384fbdbdL,
29    0x4c11db70L, 0x48d0c6c7L, 0x4593e01eL, 0x4152fda9L,
30    0x5f15adacL, 0x5bd4b01bL, 0x569796c2L, 0x52568b75L,
31    0x6a1936c8L, 0x6ed82b7fL, 0x639b0da6L, 0x675a1011L,
32    0x791d4014L, 0x7ddc5da3L, 0x709f7b7aL, 0x745e66cdL,
33    0x9823b6e0L, 0x9ce2ab57L, 0x91a18d8eL, 0x95609039L,
34    0x8b27c03cL, 0x8fe6dd8bL, 0x82a5fb52L, 0x8664e6e5L,
35    0xbe2b5b58L, 0xbaea46efL, 0xb7a96036L, 0xb3687d81L,
36    0xad2f2d84L, 0xa9ee3033L, 0xa4ad16eaL, 0xa06c0b5dL,
37    0xd4326d90L, 0xd0f37027L, 0xddb056feL, 0xd9714b49L,
38    0xc7361b4cL, 0xc3f706fbL, 0xceb42022L, 0xca753d95L,
39    0xf23a8028L, 0xf6fb9d9fL, 0xfbb8bb46L, 0xff79a6f1L,
40    0xe13ef6f4L, 0xe5ffeb43L, 0xe8bccd9aL, 0xec7dd02dL,
41    0x34867077L, 0x30476dc0L, 0x3d044b19L, 0x39c556aeL,
42    0x278206abL, 0x23431b1cL, 0x2e003dc5L, 0x2ac12072L,
43    0x128e9dcfL, 0x164f8078L, 0x1b0ca6a1L, 0x1fcdbb16L,
44    0x018aeb13L, 0x054bf6a4L, 0x0808d07dL, 0x0cc9cdcaL,
45    0x7897ab07L, 0x7c56b6b0L, 0x71159069L, 0x75d48ddeL,
46    0x6b93dddbL, 0x6f52c06cL, 0x6211e6b5L, 0x66d0fb02L,
47    0x5e9f46bfL, 0x5a5e5b08L, 0x571d7dd1L, 0x53dc6066L,
48    0x4d9b3063L, 0x495a2dd4L, 0x44190b0dL, 0x40d816baL,
49    0xaca5c697L, 0xa864db20L, 0xa527fdf9L, 0xa1e6e04eL,
50    0xbfa1b04bL, 0xbb60adfcL, 0xb6238b25L, 0xb2e29692L,
51    0x8aad2b2fL, 0x8e6c3698L, 0x832f1041L, 0x87ee0df6L,
52    0x99a95df3L, 0x9d684044L, 0x902b669dL, 0x94ea7b2aL,
53    0xe0b41de7L, 0xe4750050L, 0xe9362689L, 0xedf73b3eL,
54    0xf3b06b3bL, 0xf771768cL, 0xfa325055L, 0xfef34de2L,
55    0xc6bcf05fL, 0xc27dede8L, 0xcf3ecb31L, 0xcbffd686L,
56    0xd5b88683L, 0xd1799b34L, 0xdc3abdedL, 0xd8fba05aL,
57    0x690ce0eeL, 0x6dcdfd59L, 0x608edb80L, 0x644fc637L,
58    0x7a089632L, 0x7ec98b85L, 0x738aad5cL, 0x774bb0ebL,
59    0x4f040d56L, 0x4bc510e1L, 0x46863638L, 0x42472b8fL,
60    0x5c007b8aL, 0x58c1663dL, 0x558240e4L, 0x51435d53L,
61    0x251d3b9eL, 0x21dc2629L, 0x2c9f00f0L, 0x285e1d47L,
62    0x36194d42L, 0x32d850f5L, 0x3f9b762cL, 0x3b5a6b9bL,
63    0x0315d626L, 0x07d4cb91L, 0x0a97ed48L, 0x0e56f0ffL,
64    0x1011a0faL, 0x14d0bd4dL, 0x19939b94L, 0x1d528623L,
65    0xf12f560eL, 0xf5ee4bb9L, 0xf8ad6d60L, 0xfc6c70d7L,
66    0xe22b20d2L, 0xe6ea3d65L, 0xeba91bbcL, 0xef68060bL,
67    0xd727bbb6L, 0xd3e6a601L, 0xdea580d8L, 0xda649d6fL,
68    0xc423cd6aL, 0xc0e2d0ddL, 0xcda1f604L, 0xc960ebb3L,
69    0xbd3e8d7eL, 0xb9ff90c9L, 0xb4bcb610L, 0xb07daba7L,
70    0xae3afba2L, 0xaafbe615L, 0xa7b8c0ccL, 0xa379dd7bL,
71    0x9b3660c6L, 0x9ff77d71L, 0x92b45ba8L, 0x9675461fL,
72    0x8832161aL, 0x8cf30badL, 0x81b02d74L, 0x857130c3L,
73    0x5d8a9099L, 0x594b8d2eL, 0x5408abf7L, 0x50c9b640L,
74    0x4e8ee645L, 0x4a4ffbf2L, 0x470cdd2bL, 0x43cdc09cL,
75    0x7b827d21L, 0x7f436096L, 0x7200464fL, 0x76c15bf8L,
76    0x68860bfdL, 0x6c47164aL, 0x61043093L, 0x65c52d24L,
77    0x119b4be9L, 0x155a565eL, 0x18197087L, 0x1cd86d30L,
78    0x029f3d35L, 0x065e2082L, 0x0b1d065bL, 0x0fdc1becL,
79    0x3793a651L, 0x3352bbe6L, 0x3e119d3fL, 0x3ad08088L,
80    0x2497d08dL, 0x2056cd3aL, 0x2d15ebe3L, 0x29d4f654L,
81    0xc5a92679L, 0xc1683bceL, 0xcc2b1d17L, 0xc8ea00a0L,
82    0xd6ad50a5L, 0xd26c4d12L, 0xdf2f6bcbL, 0xdbee767cL,
83    0xe3a1cbc1L, 0xe760d676L, 0xea23f0afL, 0xeee2ed18L,
84    0xf0a5bd1dL, 0xf464a0aaL, 0xf9278673L, 0xfde69bc4L,
85    0x89b8fd09L, 0x8d79e0beL, 0x803ac667L, 0x84fbdbd0L,
86    0x9abc8bd5L, 0x9e7d9662L, 0x933eb0bbL, 0x97ffad0cL,
87    0xafb010b1L, 0xab710d06L, 0xa6322bdfL, 0xa2f33668L,
88    0xbcb4666dL, 0xb8757bdaL, 0xb5365d03L, 0xb1f740b4L
89 };
90 
/* Fold one input byte into a running CRC.

   crcVar  modifiable UInt lvalue holding the running CRC.  It is
           evaluated more than once, so it must have no side effects.
   cha     the next input byte; the whole expression is converted to
           UChar before being combined with the top byte of the CRC to
           form the crc32Table index.

   Written as do { ... } while (0) so that "UPDATE_CRC(c, b);" is a single
   statement and is safe inside an unbraced if/else. */
#define UPDATE_CRC(crcVar,cha)                       \
   do {                                              \
      (crcVar) = ((crcVar) << 8) ^                   \
                 crc32Table[((crcVar) >> 24) ^       \
                            ((UChar)(cha))];         \
   } while (0)
97 
/* Feed nBytes bytes, starting at bytes, through UPDATE_CRC in order.
   crcIn is the CRC accumulated so far; the updated (not yet finalised)
   CRC is returned.  With nBytes == 0 the input CRC is returned unchanged
   and bytes is never dereferenced. */
static UInt crcBytes ( UChar* bytes, UWord nBytes, UInt crcIn )
{
   UInt  running = crcIn;
   UWord idx;

   for (idx = 0; idx < nBytes; idx++) {
      UPDATE_CRC(running, bytes[idx]);
   }
   return running;
}
116 
/* Final step of the checksum: return crc with every bit inverted. */
static UInt crcFinalise ( UInt crc )
{
   const UInt allOnes = ~(UInt)0;
   return crc ^ allOnes;
}
120 
121 ////////
122 
123 static UInt theCRC = 0xFFFFFFFF;
124 
125 static HChar outBuf[1024];
126 // take output that's in outBuf, length as specified, and
127 // update the running crc.
send(int nbytes)128 static void send ( int nbytes )
129 {
130    assert( ((unsigned int)nbytes) < sizeof(outBuf)-1);
131    assert(outBuf[nbytes] == 0);
132    theCRC = crcBytes( (UChar*)&outBuf[0], nbytes, theCRC );
133    if (VERBOSE) printf("SEND %08x %s", theCRC, outBuf);
134 }
135 
136 
137 /////////////////////////////////////////////////////////////////
138 // END crc32 stuff                                             //
139 /////////////////////////////////////////////////////////////////
140 
141 #if 0
142 
143 // full version
144 #define NVALS 76
145 
146 static ULong val[NVALS]
147     = { 0x00ULL, 0x01ULL, 0x02ULL, 0x03ULL,
148         0x3FULL, 0x40ULL, 0x41ULL,
149         0x7EULL, 0x7FULL, 0x80ULL, 0x81ULL, 0x82ULL,
150         0xBFULL, 0xC0ULL, 0xC1ULL,
151         0xFCULL, 0xFDULL, 0xFEULL, 0xFFULL,
152 
153         0xFF00ULL, 0xFF01ULL, 0xFF02ULL, 0xFF03ULL,
154         0xFF3FULL, 0xFF40ULL, 0xFF41ULL,
155         0xFF7EULL, 0xFF7FULL, 0xFF80ULL, 0xFF81ULL, 0xFF82ULL,
156         0xFFBFULL, 0xFFC0ULL, 0xFFC1ULL,
157         0xFFFCULL, 0xFFFDULL, 0xFFFEULL, 0xFFFFULL,
158 
159         0xFFFFFF00ULL, 0xFFFFFF01ULL, 0xFFFFFF02ULL, 0xFFFFFF03ULL,
160         0xFFFFFF3FULL, 0xFFFFFF40ULL, 0xFFFFFF41ULL,
161         0xFFFFFF7EULL, 0xFFFFFF7FULL, 0xFFFFFF80ULL, 0xFFFFFF81ULL, 0xFFFFFF82ULL,
162         0xFFFFFFBFULL, 0xFFFFFFC0ULL, 0xFFFFFFC1ULL,
163         0xFFFFFFFCULL, 0xFFFFFFFDULL, 0xFFFFFFFEULL, 0xFFFFFFFFULL,
164 
165         0xFFFFFFFFFFFFFF00ULL, 0xFFFFFFFFFFFFFF01ULL, 0xFFFFFFFFFFFFFF02ULL,
166                                0xFFFFFFFFFFFFFF03ULL,
167         0xFFFFFFFFFFFFFF3FULL, 0xFFFFFFFFFFFFFF40ULL, 0xFFFFFFFFFFFFFF41ULL,
168         0xFFFFFFFFFFFFFF7EULL, 0xFFFFFFFFFFFFFF7FULL, 0xFFFFFFFFFFFFFF80ULL,
169                                0xFFFFFFFFFFFFFF81ULL, 0xFFFFFFFFFFFFFF82ULL,
170         0xFFFFFFFFFFFFFFBFULL, 0xFFFFFFFFFFFFFFC0ULL, 0xFFFFFFFFFFFFFFC1ULL,
171         0xFFFFFFFFFFFFFFFCULL, 0xFFFFFFFFFFFFFFFDULL, 0xFFFFFFFFFFFFFFFEULL,
172                                0xFFFFFFFFFFFFFFFFULL
173       };
174 
175 #else
176 
177 // shortened version, for use as valgrind regtest
178 #define NVALS 36
179 
180 static ULong val[NVALS]
181     = { 0x00ULL, 0x01ULL,
182         0x3FULL, 0x40ULL,
183         0x7FULL, 0x80ULL,
184         0xBFULL, 0xC0ULL,
185         0xFFULL,
186 
187         0xFF00ULL, 0xFF01ULL,
188         0xFF3FULL, 0xFF40ULL,
189         0xFF7FULL, 0xFF80ULL,
190         0xFFBFULL, 0xFFC0ULL,
191         0xFFFFULL,
192 
193         0xFFFFFF00ULL, 0xFFFFFF01ULL,
194         0xFFFFFF3FULL, 0xFFFFFF40ULL,
195         0xFFFFFF7EULL, 0xFFFFFF7FULL,
196         0xFFFFFFBFULL, 0xFFFFFFC0ULL,
197         0xFFFFFFFFULL,
198 
199         0xFFFFFFFFFFFFFF00ULL, 0xFFFFFFFFFFFFFF01ULL,
200         0xFFFFFFFFFFFFFF3FULL, 0xFFFFFFFFFFFFFF40ULL,
201         0xFFFFFFFFFFFFFF7FULL, 0xFFFFFFFFFFFFFF80ULL,
202         0xFFFFFFFFFFFFFFBFULL, 0xFFFFFFFFFFFFFFC0ULL,
203         0xFFFFFFFFFFFFFFFFULL
204       };
205 
206 #endif
207 
208 /////////////////////////////////////
209 
/* Masks for the six arithmetic status flags, at the bit positions they
   occupy in the flags word that the tests load with popfq and read back
   with pushfq. */
#define CC_C    (1 << 0)     /* carry           */
#define CC_P    (1 << 2)     /* parity          */
#define CC_A    (1 << 4)     /* auxiliary carry */
#define CC_Z    (1 << 6)     /* zero            */
#define CC_S    (1 << 7)     /* sign            */
#define CC_O    (1 << 11)    /* overflow        */

/* All six flags together; used to strip unrelated bits from results. */
#define CC_MASK (CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C)
218 
/* GEN_do_locked_G_E(_name,_eax) defines do_locked_G_E_<_name>().
   That function executes "lock; <_name> %<_eax>,(%rbx)" once for every
   pair of entries in val[] (register operand G from the outer index,
   memory operand E from the inner index) and every combination of the six
   input flags.  After each execution it formats the inputs, the resulting
   E value and the resulting flags (masked with CC_MASK) into outBuf and
   passes the line to send(). */
#define GEN_do_locked_G_E(_name,_eax)   \
  \
  __attribute__((noinline)) void do_locked_G_E_##_name ( void )  \
  {   \
    volatile Long e_val, g_val, e_val_before;   \
    Long gi, ei, combo, flags_in;   \
    Long block[4];   \
    int  len;   \
    \
    for (gi = 0; gi < NVALS; gi++) {   \
    for (ei = 0; ei < NVALS; ei++) {   \
    /* combo walks all 64 flag settings; bit 5 selects O and bit 0   \
       selects P, so P changes fastest and O slowest. */   \
    for (combo = 0; combo < 64; combo++) {   \
      \
      flags_in = 0;   \
      if (combo & 0x20) flags_in |= CC_O;   \
      if (combo & 0x10) flags_in |= CC_S;   \
      if (combo & 0x08) flags_in |= CC_Z;   \
      if (combo & 0x04) flags_in |= CC_A;   \
      if (combo & 0x02) flags_in |= CC_C;   \
      if (combo & 0x01) flags_in |= CC_P;   \
      \
      g_val = val[gi];   \
      e_val = val[ei];   \
      e_val_before = e_val;   \
      \
      /* block[0]: flags to load, block[1]: G value, block[2]: address   \
         of E, block[3]: flags read back after the instruction. */   \
      block[0] = flags_in;   \
      block[1] = g_val;   \
      block[2] = (long)&e_val;   \
      block[3] = 0;   \
      __asm__ __volatile__(   \
          "movq 0(%0), %%rax\n\t"   \
          "pushq %%rax\n\t"   \
          "popfq\n\t"   \
          "movq 8(%0), %%rax\n\t"   \
          "movq 16(%0), %%rbx\n\t"   \
          "lock; " #_name " %%" #_eax ",(%%rbx)\n\t"   \
          "pushfq\n\t"   \
          "popq %%rax\n\t"   \
          "movq %%rax, 24(%0)\n\t"   \
          : : "r"(&block[0]) : "rax","rbx","cc","memory"   \
      );   \
      \
      len = sprintf(outBuf,   \
             "%s G=%016llx E=%016llx CCIN=%08llx -> E=%016llx CCOUT=%08llx\n",   \
             #_name, g_val, e_val_before, flags_in,   \
             e_val, block[3] & CC_MASK);   \
      send(len);   \
      \
    }   \
    }}   \
  }
275 
/* Instantiate the register-to-memory tests: one function per operand size
   (b, w, l, q, paired with al, ax, eax, rax) for each of add, or, adc,
   sbb, and, sub and xor -- 28 functions in total. */
GEN_do_locked_G_E(addb,al)
GEN_do_locked_G_E(addw,ax)
GEN_do_locked_G_E(addl,eax)
GEN_do_locked_G_E(addq,rax)

GEN_do_locked_G_E(orb, al)
GEN_do_locked_G_E(orw, ax)
GEN_do_locked_G_E(orl, eax)
GEN_do_locked_G_E(orq, rax)

GEN_do_locked_G_E(adcb,al)
GEN_do_locked_G_E(adcw,ax)
GEN_do_locked_G_E(adcl,eax)
GEN_do_locked_G_E(adcq,rax)

GEN_do_locked_G_E(sbbb,al)
GEN_do_locked_G_E(sbbw,ax)
GEN_do_locked_G_E(sbbl,eax)
GEN_do_locked_G_E(sbbq,rax)

GEN_do_locked_G_E(andb,al)
GEN_do_locked_G_E(andw,ax)
GEN_do_locked_G_E(andl,eax)
GEN_do_locked_G_E(andq,rax)

GEN_do_locked_G_E(subb,al)
GEN_do_locked_G_E(subw,ax)
GEN_do_locked_G_E(subl,eax)
GEN_do_locked_G_E(subq,rax)

GEN_do_locked_G_E(xorb,al)
GEN_do_locked_G_E(xorw,ax)
GEN_do_locked_G_E(xorl,eax)
GEN_do_locked_G_E(xorq,rax)
310 
311 
312 
313 
/* GEN_do_locked_imm_E(_name,_eax,_imm) defines
   do_locked_imm_E_<_name>_<_imm>().  That function executes
   "lock; <_name> $<_imm>,(%rbx)" once for every entry of val[] (used as
   the memory operand E) and every combination of the six input flags,
   then formats the inputs, the resulting E value and the resulting flags
   (masked with CC_MASK) into outBuf and passes the line to send().
   The _eax argument is not referenced by the macro body. */
#define GEN_do_locked_imm_E(_name,_eax,_imm)        \
  \
  __attribute__((noinline)) void do_locked_imm_E_##_name##_##_imm ( void )  \
  {   \
    volatile Long e_val, e_val_before;   \
    Long ei, combo, flags_in;   \
    Long block[3];   \
    int  len;   \
    \
    for (ei = 0; ei < NVALS; ei++) {   \
    /* combo walks all 64 flag settings; bit 5 selects O and bit 0   \
       selects P, so P changes fastest and O slowest. */   \
    for (combo = 0; combo < 64; combo++) {   \
      \
      flags_in = 0;   \
      if (combo & 0x20) flags_in |= CC_O;   \
      if (combo & 0x10) flags_in |= CC_S;   \
      if (combo & 0x08) flags_in |= CC_Z;   \
      if (combo & 0x04) flags_in |= CC_A;   \
      if (combo & 0x02) flags_in |= CC_C;   \
      if (combo & 0x01) flags_in |= CC_P;   \
      \
      e_val = val[ei];   \
      e_val_before = e_val;   \
      \
      /* block[0]: flags to load, block[1]: address of E,   \
         block[2]: flags read back after the instruction. */   \
      block[0] = flags_in;   \
      block[1] = (long)&e_val;   \
      block[2] = 0;   \
      __asm__ __volatile__(   \
          "movq 0(%0), %%rax\n\t"   \
          "pushq %%rax\n\t"   \
          "popfq\n\t"   \
          "movq 8(%0), %%rbx\n\t"   \
          "lock; " #_name " $" #_imm ",(%%rbx)\n\t"   \
          "pushfq\n\t"   \
          "popq %%rax\n\t"   \
          "movq %%rax, 16(%0)\n\t"   \
          : : "r"(&block[0]) : "rax","rbx","cc","memory"   \
      );   \
      \
      len = sprintf(outBuf,   \
           "%s I=%s E=%016llx CCIN=%08llx -> E=%016llx CCOUT=%08llx\n",    \
             #_name, #_imm, e_val_before, flags_in,   \
             e_val, block[2] & CC_MASK);   \
      send(len);   \
      \
    }   \
    }   \
  }
366 
/* Instantiate the immediate-to-memory tests: for each of add, or, adc,
   sbb, and, sub and xor, two immediates per operand size (b, w, l, q) --
   56 functions in total. */
GEN_do_locked_imm_E(addb,al,0x7F)
GEN_do_locked_imm_E(addb,al,0xF1)
GEN_do_locked_imm_E(addw,ax,0x7E)
GEN_do_locked_imm_E(addw,ax,0x9325)
GEN_do_locked_imm_E(addl,eax,0x7D)
GEN_do_locked_imm_E(addl,eax,0x31415927)
GEN_do_locked_imm_E(addq,rax,0x7D)
GEN_do_locked_imm_E(addq,rax,0x31415927)

GEN_do_locked_imm_E(orb,al,0x7F)
GEN_do_locked_imm_E(orb,al,0xF1)
GEN_do_locked_imm_E(orw,ax,0x7E)
GEN_do_locked_imm_E(orw,ax,0x9325)
GEN_do_locked_imm_E(orl,eax,0x7D)
GEN_do_locked_imm_E(orl,eax,0x31415927)
GEN_do_locked_imm_E(orq,rax,0x7D)
GEN_do_locked_imm_E(orq,rax,0x31415927)

GEN_do_locked_imm_E(adcb,al,0x7F)
GEN_do_locked_imm_E(adcb,al,0xF1)
GEN_do_locked_imm_E(adcw,ax,0x7E)
GEN_do_locked_imm_E(adcw,ax,0x9325)
GEN_do_locked_imm_E(adcl,eax,0x7D)
GEN_do_locked_imm_E(adcl,eax,0x31415927)
GEN_do_locked_imm_E(adcq,rax,0x7D)
GEN_do_locked_imm_E(adcq,rax,0x31415927)

GEN_do_locked_imm_E(sbbb,al,0x7F)
GEN_do_locked_imm_E(sbbb,al,0xF1)
GEN_do_locked_imm_E(sbbw,ax,0x7E)
GEN_do_locked_imm_E(sbbw,ax,0x9325)
GEN_do_locked_imm_E(sbbl,eax,0x7D)
GEN_do_locked_imm_E(sbbl,eax,0x31415927)
GEN_do_locked_imm_E(sbbq,rax,0x7D)
GEN_do_locked_imm_E(sbbq,rax,0x31415927)

GEN_do_locked_imm_E(andb,al,0x7F)
GEN_do_locked_imm_E(andb,al,0xF1)
GEN_do_locked_imm_E(andw,ax,0x7E)
GEN_do_locked_imm_E(andw,ax,0x9325)
GEN_do_locked_imm_E(andl,eax,0x7D)
GEN_do_locked_imm_E(andl,eax,0x31415927)
GEN_do_locked_imm_E(andq,rax,0x7D)
GEN_do_locked_imm_E(andq,rax,0x31415927)

GEN_do_locked_imm_E(subb,al,0x7F)
GEN_do_locked_imm_E(subb,al,0xF1)
GEN_do_locked_imm_E(subw,ax,0x7E)
GEN_do_locked_imm_E(subw,ax,0x9325)
GEN_do_locked_imm_E(subl,eax,0x7D)
GEN_do_locked_imm_E(subl,eax,0x31415927)
GEN_do_locked_imm_E(subq,rax,0x7D)
GEN_do_locked_imm_E(subq,rax,0x31415927)

GEN_do_locked_imm_E(xorb,al,0x7F)
GEN_do_locked_imm_E(xorb,al,0xF1)
GEN_do_locked_imm_E(xorw,ax,0x7E)
GEN_do_locked_imm_E(xorw,ax,0x9325)
GEN_do_locked_imm_E(xorl,eax,0x7D)
GEN_do_locked_imm_E(xorl,eax,0x31415927)
GEN_do_locked_imm_E(xorq,rax,0x7D)
GEN_do_locked_imm_E(xorq,rax,0x31415927)
429 
/* GEN_do_locked_unary_E(_name,_eax) defines do_locked_unary_E_<_name>().
   That function executes "lock; <_name> (%rbx)" once for every entry of
   val[] (used as the memory operand E) and every combination of the six
   input flags, then formats the inputs, the resulting E value and the
   resulting flags (masked with CC_MASK) into outBuf and passes the line
   to send().  The _eax argument is not referenced by the macro body. */
#define GEN_do_locked_unary_E(_name,_eax)        \
  \
  __attribute__((noinline)) void do_locked_unary_E_##_name ( void )  \
  {   \
    volatile Long e_val, e_val_before;   \
    Long ei, combo, flags_in;   \
    Long block[3];   \
    int  len;   \
    \
    for (ei = 0; ei < NVALS; ei++) {   \
    /* combo walks all 64 flag settings; bit 5 selects O and bit 0   \
       selects P, so P changes fastest and O slowest. */   \
    for (combo = 0; combo < 64; combo++) {   \
      \
      flags_in = 0;   \
      if (combo & 0x20) flags_in |= CC_O;   \
      if (combo & 0x10) flags_in |= CC_S;   \
      if (combo & 0x08) flags_in |= CC_Z;   \
      if (combo & 0x04) flags_in |= CC_A;   \
      if (combo & 0x02) flags_in |= CC_C;   \
      if (combo & 0x01) flags_in |= CC_P;   \
      \
      e_val = val[ei];   \
      e_val_before = e_val;   \
      \
      /* block[0]: flags to load, block[1]: address of E,   \
         block[2]: flags read back after the instruction. */   \
      block[0] = flags_in;   \
      block[1] = (long)&e_val;   \
      block[2] = 0;   \
      __asm__ __volatile__(   \
          "movq 0(%0), %%rax\n\t"   \
          "pushq %%rax\n\t"   \
          "popfq\n\t"   \
          "movq 8(%0), %%rbx\n\t"   \
          "lock; " #_name " (%%rbx)\n\t"   \
          "pushfq\n\t"   \
          "popq %%rax\n\t"   \
          "movq %%rax, 16(%0)\n\t"   \
          : : "r"(&block[0]) : "rax","rbx","cc","memory"   \
      );   \
      \
      len = sprintf(outBuf,   \
            "%s E=%016llx CCIN=%08llx -> E=%016llx CCOUT=%08llx\n", \
             #_name, e_val_before, flags_in,   \
             e_val, block[2] & CC_MASK);   \
      send(len);   \
      \
    }   \
    }   \
  }
482 
/* Instantiate the single-operand memory tests: dec, inc, neg and not in
   each operand size (b, w, l, q) -- 16 functions in total. */
GEN_do_locked_unary_E(decb,al)
GEN_do_locked_unary_E(decw,ax)
GEN_do_locked_unary_E(decl,eax)
GEN_do_locked_unary_E(decq,rax)

GEN_do_locked_unary_E(incb,al)
GEN_do_locked_unary_E(incw,ax)
GEN_do_locked_unary_E(incl,eax)
GEN_do_locked_unary_E(incq,rax)

GEN_do_locked_unary_E(negb,al)
GEN_do_locked_unary_E(negw,ax)
GEN_do_locked_unary_E(negl,eax)
GEN_do_locked_unary_E(negq,rax)

GEN_do_locked_unary_E(notb,al)
GEN_do_locked_unary_E(notw,ax)
GEN_do_locked_unary_E(notl,eax)
GEN_do_locked_unary_E(notq,rax)
502 
503 
504 /////////////////////////////////////////////////////////////////
505 
506 ULong btsq_mem ( UChar* base, int bitno )
507 {
508    ULong res;
509    __asm__
510    __volatile__("lock; btsq\t%2, %0\n\t"
511                 "setc   %%dl\n\t"
512                 "movzbq %%dl,%1\n"
513                 : "=m" (*base), "=r" (res)
514                 : "r" ((ULong)bitno) : "rdx","cc","memory" );
515    /* Pretty meaningless to dereference base here, but that's what you
516       have to do to get a btsl insn which refers to memory starting at
517       base. */
518    return res;
519 }
btsl_mem(UChar * base,int bitno)520 ULong btsl_mem ( UChar* base, int bitno )
521 {
522    ULong res;
523    __asm__
524    __volatile__("lock; btsl\t%2, %0\n\t"
525                 "setc   %%dl\n\t"
526                 "movzbq %%dl,%1\n"
527                 : "=m" (*base), "=r" (res)
528                 : "r" ((UInt)bitno));
529    return res;
530 }
btsw_mem(UChar * base,int bitno)531 ULong btsw_mem ( UChar* base, int bitno )
532 {
533    ULong res;
534    __asm__
535    __volatile__("lock; btsw\t%w2, %0\n\t"
536                 "setc   %%dl\n\t"
537                 "movzbq %%dl,%1\n"
538                 : "=m" (*base), "=r" (res)
539                 : "r" ((ULong)bitno));
540    return res;
541 }
542 
btrq_mem(UChar * base,int bitno)543 ULong btrq_mem ( UChar* base, int bitno )
544 {
545    ULong res;
546    __asm__
547    __volatile__("lock; btrq\t%2, %0\n\t"
548                 "setc   %%dl\n\t"
549                 "movzbq %%dl,%1\n"
550                 : "=m" (*base), "=r" (res)
551                 : "r" ((ULong)bitno));
552    return res;
553 }
btrl_mem(UChar * base,int bitno)554 ULong btrl_mem ( UChar* base, int bitno )
555 {
556    ULong res;
557    __asm__
558    __volatile__("lock; btrl\t%2, %0\n\t"
559                 "setc   %%dl\n\t"
560                 "movzbq %%dl,%1\n"
561                 : "=m" (*base), "=r" (res)
562                 : "r" ((UInt)bitno));
563    return res;
564 }
btrw_mem(UChar * base,int bitno)565 ULong btrw_mem ( UChar* base, int bitno )
566 {
567    ULong res;
568    __asm__
569    __volatile__("lock; btrw\t%w2, %0\n\t"
570                 "setc   %%dl\n\t"
571                 "movzbq %%dl,%1\n"
572                 : "=m" (*base), "=r" (res)
573                 : "r" ((ULong)bitno));
574    return res;
575 }
576 
btcq_mem(UChar * base,int bitno)577 ULong btcq_mem ( UChar* base, int bitno )
578 {
579    ULong res;
580    __asm__
581    __volatile__("lock; btcq\t%2, %0\n\t"
582                 "setc   %%dl\n\t"
583                 "movzbq %%dl,%1\n"
584                 : "=m" (*base), "=r" (res)
585                 : "r" ((ULong)bitno));
586    return res;
587 }
btcl_mem(UChar * base,int bitno)588 ULong btcl_mem ( UChar* base, int bitno )
589 {
590    ULong res;
591    __asm__
592    __volatile__("lock; btcl\t%2, %0\n\t"
593                 "setc   %%dl\n\t"
594                 "movzbq %%dl,%1\n"
595                 : "=m" (*base), "=r" (res)
596                 : "r" ((UInt)bitno));
597    return res;
598 }
btcw_mem(UChar * base,int bitno)599 ULong btcw_mem ( UChar* base, int bitno )
600 {
601    ULong res;
602    __asm__
603    __volatile__("lock; btcw\t%w2, %0\n\t"
604                 "setc   %%dl\n\t"
605                 "movzbq %%dl,%1\n"
606                 : "=m" (*base), "=r" (res)
607                 : "r" ((ULong)bitno));
608    return res;
609 }
610 
btq_mem(UChar * base,int bitno)611 ULong btq_mem ( UChar* base, int bitno )
612 {
613    ULong res;
614    __asm__
615    __volatile__("btq\t%2, %0\n\t"
616                 "setc   %%dl\n\t"
617                 "movzbq %%dl,%1\n"
618                 : "=m" (*base), "=r" (res)
619                 : "r" ((ULong)bitno)
620                 : "cc", "memory");
621    return res;
622 }
btl_mem(UChar * base,int bitno)623 ULong btl_mem ( UChar* base, int bitno )
624 {
625    ULong res;
626    __asm__
627    __volatile__("btl\t%2, %0\n\t"
628                 "setc   %%dl\n\t"
629                 "movzbq %%dl,%1\n"
630                 : "=m" (*base), "=r" (res)
631                 : "r" ((UInt)bitno)
632                 : "cc", "memory");
633    return res;
634 }
btw_mem(UChar * base,int bitno)635 ULong btw_mem ( UChar* base, int bitno )
636 {
637    ULong res;
638    __asm__
639    __volatile__("btw\t%w2, %0\n\t"
640                 "setc   %%dl\n\t"
641                 "movzbq %%dl,%1\n"
642                 : "=m" (*base), "=r" (res)
643                 : "r" ((ULong)bitno));
644    return res;
645 }
646 
/* Rotate the 64-bit value x left by one bit position and return it. */
ULong rol1 ( ULong x )
{
  const ULong wrapped = x >> 63;   /* old top bit, now at bit 0 */
  const ULong shifted = x << 1;
  return shifted | wrapped;
}
651 
do_bt_G_E_tests(void)652 void do_bt_G_E_tests ( void )
653 {
654    ULong  n, bitoff, op;
655    ULong  c;
656    UChar* block;
657    ULong  carrydep, res;;
658 
659    /*------------------------ MEM-Q -----------------------*/
660 
661    carrydep = 0;
662    block = calloc(200,1);
663    block += 100;
664    /* Valid bit offsets are -800 .. 799 inclusive. */
665 
666    for (n = 0; n < 10000; n++) {
667       bitoff = (random() % 1600) - 800;
668       op = random() % 4;
669       c = 2;
670       switch (op) {
671          case 0: c = btsq_mem(block, bitoff); break;
672          case 1: c = btrq_mem(block, bitoff); break;
673          case 2: c = btcq_mem(block, bitoff); break;
674          case 3: c = btq_mem(block, bitoff); break;
675       }
676       c &= 255;
677       assert(c == 0 || c == 1);
678       carrydep = c ? (rol1(carrydep) ^ (Long)bitoff) : carrydep;
679    }
680 
681    /* Compute final result */
682    block -= 100;
683    res = 0;
684    for (n = 0; n < 200; n++) {
685       UChar ch = block[n];
686       /* printf("%d ", (int)block[n]); */
687       res = rol1(res) ^ (ULong)ch;
688    }
689 
690    send( sprintf(outBuf,
691                  "bt{s,r,c}q: final res 0x%llx, carrydep 0x%llx\n",
692                  res, carrydep));
693    free(block);
694 
695    /*------------------------ MEM-L -----------------------*/
696 
697    carrydep = 0;
698    block = calloc(200,1);
699    block += 100;
700    /* Valid bit offsets are -800 .. 799 inclusive. */
701 
702    for (n = 0; n < 10000; n++) {
703       bitoff = (random() % 1600) - 800;
704       op = random() % 4;
705       c = 2;
706       switch (op) {
707          case 0: c = btsl_mem(block, bitoff); break;
708          case 1: c = btrl_mem(block, bitoff); break;
709          case 2: c = btcl_mem(block, bitoff); break;
710          case 3: c = btl_mem(block, bitoff); break;
711       }
712       c &= 255;
713       assert(c == 0 || c == 1);
714       carrydep = c ? (rol1(carrydep) ^ (Long)(Int)bitoff) : carrydep;
715    }
716 
717    /* Compute final result */
718    block -= 100;
719    res = 0;
720    for (n = 0; n < 200; n++) {
721       UChar ch = block[n];
722       /* printf("%d ", (int)block[n]); */
723       res = rol1(res) ^ (ULong)ch;
724    }
725 
726    send( sprintf(outBuf,
727                  "bt{s,r,c}l: final res 0x%llx, carrydep 0x%llx\n",
728                  res, carrydep));
729    free(block);
730 
731    /*------------------------ MEM-W -----------------------*/
732 
733    carrydep = 0;
734    block = calloc(200,1);
735    block += 100;
736    /* Valid bit offsets are -800 .. 799 inclusive. */
737 
738    for (n = 0; n < 10000; n++) {
739       bitoff = (random() % 1600) - 800;
740       op = random() % 4;
741       c = 2;
742       switch (op) {
743          case 0: c = btsw_mem(block, bitoff); break;
744          case 1: c = btrw_mem(block, bitoff); break;
745          case 2: c = btcw_mem(block, bitoff); break;
746          case 3: c = btw_mem(block, bitoff); break;
747       }
748       c &= 255;
749       assert(c == 0 || c == 1);
750       carrydep = c ? (rol1(carrydep) ^ (Long)(Int)bitoff) : carrydep;
751    }
752 
753    /* Compute final result */
754    block -= 100;
755    res = 0;
756    for (n = 0; n < 200; n++) {
757       UChar ch = block[n];
758       /* printf("%d ", (int)block[n]); */
759       res = rol1(res) ^ (ULong)ch;
760    }
761 
762    send(sprintf(outBuf,
763                 "bt{s,r,c}w: final res 0x%llx, carrydep 0x%llx\n",
764                 res, carrydep));
765    free(block);
766 }
767 
768 
769 /////////////////////////////////////////////////////////////////
770 
771 /* Given a word, do bt/bts/btr/btc on bits 0, 1, 2 and 3 of it, and
772    also reconstruct the original bits 0, 1, 2, 3 by looking at the
773    carry flag.  Returned result has mashed bits 0-3 at the bottom and
774    the reconstructed original bits 0-3 as 4-7. */
775 
mash_mem_Q(ULong * origp)776 ULong mash_mem_Q ( ULong* origp )
777 {
778   ULong reconstructed, mashed;
779   __asm__ __volatile__ (
780      "movq %2, %%rdx\n\t"
781      ""
782      "movq $0, %%rax\n\t"
783      "\n\t"
784      "btq  $0, (%%rdx)\n\t"
785      "setb %%cl\n\t"
786      "movzbq %%cl, %%rcx\n\t"
787      "orq %%rcx, %%rax\n\t"
788      "\n\t"
789      "lock; btsq $1, (%%rdx)\n\t"
790      "setb %%cl\n\t"
791      "movzbq %%cl, %%rcx\n\t"
792      "shlq $1, %%rcx\n\t"
793      "orq %%rcx, %%rax\n\t"
794      "\n\t"
795      "lock; btrq $2, (%%rdx)\n\t"
796      "setb %%cl\n\t"
797      "movzbq %%cl, %%rcx\n\t"
798      "shlq $2, %%rcx\n\t"
799      "orq %%rcx, %%rax\n\t"
800      "\n\t"
801      "lock; btcq $3, (%%rdx)\n\t"
802      "setb %%cl\n\t"
803      "movzbq %%cl, %%rcx\n\t"
804      "shlq $3, %%rcx\n\t"
805      "orq %%rcx, %%rax\n\t"
806      "\n\t"
807      "movq %%rax, %0\n\t"
808      "movq (%%rdx), %1"
809      : "=r" (reconstructed), "=r" (mashed)
810      : "r" (origp)
811      : "rax", "rcx", "rdx", "cc");
812   return (mashed & 0xF) | ((reconstructed & 0xF) << 4);
813 }
814 
mash_mem_L(UInt * origp)815 ULong mash_mem_L ( UInt* origp )
816 {
817   ULong reconstructed; UInt mashed;
818   __asm__ __volatile__ (
819      "movq %2, %%rdx\n\t"
820      ""
821      "movq $0, %%rax\n\t"
822      "\n\t"
823      "btl  $0, (%%rdx)\n\t"
824      "setb %%cl\n\t"
825      "movzbq %%cl, %%rcx\n\t"
826      "orq %%rcx, %%rax\n\t"
827      "\n\t"
828      "lock; btsl $1, (%%rdx)\n\t"
829      "setb %%cl\n\t"
830      "movzbq %%cl, %%rcx\n\t"
831      "shlq $1, %%rcx\n\t"
832      "orq %%rcx, %%rax\n\t"
833      "\n\t"
834      "lock; btrl $2, (%%rdx)\n\t"
835      "setb %%cl\n\t"
836      "movzbq %%cl, %%rcx\n\t"
837      "shlq $2, %%rcx\n\t"
838      "orq %%rcx, %%rax\n\t"
839      "\n\t"
840      "lock; btcl $3, (%%rdx)\n\t"
841      "setb %%cl\n\t"
842      "movzbq %%cl, %%rcx\n\t"
843      "shlq $3, %%rcx\n\t"
844      "orq %%rcx, %%rax\n\t"
845      "\n\t"
846      "movq %%rax, %0\n\t"
847      "movl (%%rdx), %1"
848      : "=r" (reconstructed), "=r" (mashed)
849      : "r" (origp)
850      : "rax", "rcx", "rdx", "cc");
851   return (mashed & 0xF) | ((reconstructed & 0xF) << 4);
852 }
853 
mash_mem_W(UShort * origp)854 ULong mash_mem_W ( UShort* origp )
855 {
856   ULong reconstructed, mashed;
857   __asm__ __volatile__ (
858      "movq %2, %%rdx\n\t"
859      ""
860      "movq $0, %%rax\n\t"
861      "\n\t"
862      "btw  $0, (%%rdx)\n\t"
863      "setb %%cl\n\t"
864      "movzbq %%cl, %%rcx\n\t"
865      "orq %%rcx, %%rax\n\t"
866      "\n\t"
867      "lock; btsw $1, (%%rdx)\n\t"
868      "setb %%cl\n\t"
869      "movzbq %%cl, %%rcx\n\t"
870      "shlq $1, %%rcx\n\t"
871      "orq %%rcx, %%rax\n\t"
872      "\n\t"
873      "lock; btrw $2, (%%rdx)\n\t"
874      "setb %%cl\n\t"
875      "movzbq %%cl, %%rcx\n\t"
876      "shlq $2, %%rcx\n\t"
877      "orq %%rcx, %%rax\n\t"
878      "\n\t"
879      "lock; btcw $3, (%%rdx)\n\t"
880      "setb %%cl\n\t"
881      "movzbq %%cl, %%rcx\n\t"
882      "shlq $3, %%rcx\n\t"
883      "orq %%rcx, %%rax\n\t"
884      "\n\t"
885      "movq %%rax, %0\n\t"
886      "movzwq (%%rdx), %1"
887      : "=r" (reconstructed), "=r" (mashed)
888      : "r" (origp)
889      : "rax", "rcx", "rdx", "cc");
890   return (mashed & 0xF) | ((reconstructed & 0xF) << 4);
891 }
892 
893 
do_bt_imm_E_tests(void)894 void do_bt_imm_E_tests( void )
895 {
896   ULong i;
897   ULong*  iiq = malloc(sizeof(ULong));
898   UInt*   iil = malloc(sizeof(UInt));
899   UShort* iiw = malloc(sizeof(UShort));
900   for (i = 0; i < 0x10; i++) {
901     *iiq = i;
902     *iil = i;
903     *iiw = i;
904     send(sprintf(outBuf,"0x%llx -> 0x%02llx 0x%02llx 0x%02llx\n", i,
905                  mash_mem_Q(iiq), mash_mem_L(iil), mash_mem_W(iiw)));
906   }
907   free(iiq);
908   free(iil);
909   free(iiw);
910 }
911 
912 
913 /////////////////////////////////////////////////////////////////
914 
main(void)915 int main ( void )
916 {
917   do_locked_G_E_addb();
918   do_locked_G_E_addw();
919   do_locked_G_E_addl();
920   do_locked_G_E_addq();
921 
922   do_locked_G_E_orb();
923   do_locked_G_E_orw();
924   do_locked_G_E_orl();
925   do_locked_G_E_orq();
926 
927   do_locked_G_E_adcb();
928   do_locked_G_E_adcw();
929   do_locked_G_E_adcl();
930   do_locked_G_E_adcq();
931 
932   do_locked_G_E_sbbb();
933   do_locked_G_E_sbbw();
934   do_locked_G_E_sbbl();
935   do_locked_G_E_sbbq();
936 
937   do_locked_G_E_andb();
938   do_locked_G_E_andw();
939   do_locked_G_E_andl();
940   do_locked_G_E_andq();
941 
942   do_locked_G_E_subb();
943   do_locked_G_E_subw();
944   do_locked_G_E_subl();
945   do_locked_G_E_subq();
946 
947   do_locked_G_E_xorb();
948   do_locked_G_E_xorw();
949   do_locked_G_E_xorl();
950   do_locked_G_E_xorq();
951   // 4 * 7
952 
953   do_locked_imm_E_addb_0x7F();
954   do_locked_imm_E_addb_0xF1();
955   do_locked_imm_E_addw_0x7E();
956   do_locked_imm_E_addw_0x9325();
957   do_locked_imm_E_addl_0x7D();
958   do_locked_imm_E_addl_0x31415927();
959   do_locked_imm_E_addq_0x7D();
960   do_locked_imm_E_addq_0x31415927();
961 
962   do_locked_imm_E_orb_0x7F();
963   do_locked_imm_E_orb_0xF1();
964   do_locked_imm_E_orw_0x7E();
965   do_locked_imm_E_orw_0x9325();
966   do_locked_imm_E_orl_0x7D();
967   do_locked_imm_E_orl_0x31415927();
968   do_locked_imm_E_orq_0x7D();
969   do_locked_imm_E_orq_0x31415927();
970 
971   do_locked_imm_E_adcb_0x7F();
972   do_locked_imm_E_adcb_0xF1();
973   do_locked_imm_E_adcw_0x7E();
974   do_locked_imm_E_adcw_0x9325();
975   do_locked_imm_E_adcl_0x7D();
976   do_locked_imm_E_adcl_0x31415927();
977   do_locked_imm_E_adcq_0x7D();
978   do_locked_imm_E_adcq_0x31415927();
979 
980   do_locked_imm_E_sbbb_0x7F();
981   do_locked_imm_E_sbbb_0xF1();
982   do_locked_imm_E_sbbw_0x7E();
983   do_locked_imm_E_sbbw_0x9325();
984   do_locked_imm_E_sbbl_0x7D();
985   do_locked_imm_E_sbbl_0x31415927();
986   do_locked_imm_E_sbbq_0x7D();
987   do_locked_imm_E_sbbq_0x31415927();
988 
989   do_locked_imm_E_andb_0x7F();
990   do_locked_imm_E_andb_0xF1();
991   do_locked_imm_E_andw_0x7E();
992   do_locked_imm_E_andw_0x9325();
993   do_locked_imm_E_andl_0x7D();
994   do_locked_imm_E_andl_0x31415927();
995   do_locked_imm_E_andq_0x7D();
996   do_locked_imm_E_andq_0x31415927();
997 
998   do_locked_imm_E_subb_0x7F();
999   do_locked_imm_E_subb_0xF1();
1000   do_locked_imm_E_subw_0x7E();
1001   do_locked_imm_E_subw_0x9325();
1002   do_locked_imm_E_subl_0x7D();
1003   do_locked_imm_E_subl_0x31415927();
1004   do_locked_imm_E_subq_0x7D();
1005   do_locked_imm_E_subq_0x31415927();
1006 
1007   do_locked_imm_E_xorb_0x7F();
1008   do_locked_imm_E_xorb_0xF1();
1009   do_locked_imm_E_xorw_0x7E();
1010   do_locked_imm_E_xorw_0x9325();
1011   do_locked_imm_E_xorl_0x7D();
1012   do_locked_imm_E_xorl_0x31415927();
1013   do_locked_imm_E_xorq_0x7D();
1014   do_locked_imm_E_xorq_0x31415927();
1015   // 4 * 7 + 8 * 7 == 84
1016 
1017   do_locked_unary_E_decb();
1018   do_locked_unary_E_decw();
1019   do_locked_unary_E_decl();
1020   do_locked_unary_E_decq();
1021 
1022   do_locked_unary_E_incb();
1023   do_locked_unary_E_incw();
1024   do_locked_unary_E_incl();
1025   do_locked_unary_E_incq();
1026 
1027   do_locked_unary_E_negb();
1028   do_locked_unary_E_negw();
1029   do_locked_unary_E_negl();
1030   do_locked_unary_E_negq();
1031 
1032   do_locked_unary_E_notb();
1033   do_locked_unary_E_notw();
1034   do_locked_unary_E_notl();
1035   do_locked_unary_E_notq();
1036   // 100
1037 
1038   do_bt_G_E_tests();
1039   // 109
1040   do_bt_imm_E_tests();
1041   // 118
1042 
1043   // So there should be 118 lock-prefixed instructions in the
1044   // disassembly of this compilation unit.
1045   // confirm with
1046   // objdump -d ./amd64locked | grep lock | grep -v do_lock | grep -v elf64 | wc
1047 
1048 
1049   { UInt crcExpd = 0x1F677629;
1050     theCRC = crcFinalise( theCRC );
1051     if (theCRC == crcExpd) {
1052        printf("amd64locked: PASS: CRCs actual 0x%08X expected 0x%08X\n",
1053               theCRC, crcExpd);
1054     } else {
1055        printf("amd64locked: FAIL: CRCs actual 0x%08X expected 0x%08X\n",
1056               theCRC, crcExpd);
1057        printf("amd64locked: set #define VERBOSE 1 to diagnose\n");
1058     }
1059   }
1060 
1061   return 0;
1062 }
1063