1
2#include <stdio.h>
3#include <stdlib.h>
4#include <assert.h>
5
/* When non-zero, send() echoes every hashed line together with the
   running CRC, which helps locate the first diverging line. */
#define VERBOSE 0

/* Short integer aliases used throughout this file.  The per-type notes
   describe how each alias is used below.
   NOTE(review): the code assumes an amd64/LP64 target (32-bit int,
   64-bit long and long long); nothing here enforces that. */
typedef  unsigned int            UInt;    /* CRC state, 32-bit operands   */
typedef  unsigned char           UChar;   /* raw bytes                    */
typedef  unsigned long long int  ULong;   /* test values and results      */
typedef  signed long long int    Long;    /* loop counters, asm arg block */
typedef  signed int              Int;     /* used to sign-extend offsets  */
typedef  unsigned short          UShort;  /* 16-bit memory operand        */
typedef  unsigned long           UWord;   /* byte counts                  */
typedef  char                    HChar;   /* text (output buffer)         */
16
unsigned myrandom(void)
{
   /* Deterministic multiply-with-carry generator built from two
      independent 16-bit lanes.  The state is private to this function
      and always starts from the same seeds, so the produced sequence is
      identical on every run. */
   static unsigned lane_lo = 11;
   static unsigned lane_hi = 13;

   const unsigned carry_hi = lane_hi >> 16;
   const unsigned carry_lo = lane_lo >> 16;

   lane_hi = 36969 * (lane_hi & 0xFFFF) + carry_hi;
   lane_lo = 18000 * (lane_lo & 0xFFFF) + carry_lo;

   /* High lane supplies the upper half, low lane is added on top. */
   return (lane_hi << 16) + lane_lo;
}
28
29/////////////////////////////////////////////////////////////////
30// BEGIN crc32 stuff                                           //
31/////////////////////////////////////////////////////////////////
32
/* 256-entry lookup table consumed by UPDATE_CRC: the table is indexed by
   (top byte of the running CRC) XOR (next input byte).
   NOTE(review): entry 1 is 0x04c11db7, which suggests the usual
   MSB-first CRC-32 table for that polynomial; the entries have not been
   re-derived here. */
static const UInt crc32Table[256] = {

   /*-- Ugly, innit? --*/

   0x00000000L, 0x04c11db7L, 0x09823b6eL, 0x0d4326d9L,
   0x130476dcL, 0x17c56b6bL, 0x1a864db2L, 0x1e475005L,
   0x2608edb8L, 0x22c9f00fL, 0x2f8ad6d6L, 0x2b4bcb61L,
   0x350c9b64L, 0x31cd86d3L, 0x3c8ea00aL, 0x384fbdbdL,
   0x4c11db70L, 0x48d0c6c7L, 0x4593e01eL, 0x4152fda9L,
   0x5f15adacL, 0x5bd4b01bL, 0x569796c2L, 0x52568b75L,
   0x6a1936c8L, 0x6ed82b7fL, 0x639b0da6L, 0x675a1011L,
   0x791d4014L, 0x7ddc5da3L, 0x709f7b7aL, 0x745e66cdL,
   0x9823b6e0L, 0x9ce2ab57L, 0x91a18d8eL, 0x95609039L,
   0x8b27c03cL, 0x8fe6dd8bL, 0x82a5fb52L, 0x8664e6e5L,
   0xbe2b5b58L, 0xbaea46efL, 0xb7a96036L, 0xb3687d81L,
   0xad2f2d84L, 0xa9ee3033L, 0xa4ad16eaL, 0xa06c0b5dL,
   0xd4326d90L, 0xd0f37027L, 0xddb056feL, 0xd9714b49L,
   0xc7361b4cL, 0xc3f706fbL, 0xceb42022L, 0xca753d95L,
   0xf23a8028L, 0xf6fb9d9fL, 0xfbb8bb46L, 0xff79a6f1L,
   0xe13ef6f4L, 0xe5ffeb43L, 0xe8bccd9aL, 0xec7dd02dL,
   0x34867077L, 0x30476dc0L, 0x3d044b19L, 0x39c556aeL,
   0x278206abL, 0x23431b1cL, 0x2e003dc5L, 0x2ac12072L,
   0x128e9dcfL, 0x164f8078L, 0x1b0ca6a1L, 0x1fcdbb16L,
   0x018aeb13L, 0x054bf6a4L, 0x0808d07dL, 0x0cc9cdcaL,
   0x7897ab07L, 0x7c56b6b0L, 0x71159069L, 0x75d48ddeL,
   0x6b93dddbL, 0x6f52c06cL, 0x6211e6b5L, 0x66d0fb02L,
   0x5e9f46bfL, 0x5a5e5b08L, 0x571d7dd1L, 0x53dc6066L,
   0x4d9b3063L, 0x495a2dd4L, 0x44190b0dL, 0x40d816baL,
   0xaca5c697L, 0xa864db20L, 0xa527fdf9L, 0xa1e6e04eL,
   0xbfa1b04bL, 0xbb60adfcL, 0xb6238b25L, 0xb2e29692L,
   0x8aad2b2fL, 0x8e6c3698L, 0x832f1041L, 0x87ee0df6L,
   0x99a95df3L, 0x9d684044L, 0x902b669dL, 0x94ea7b2aL,
   0xe0b41de7L, 0xe4750050L, 0xe9362689L, 0xedf73b3eL,
   0xf3b06b3bL, 0xf771768cL, 0xfa325055L, 0xfef34de2L,
   0xc6bcf05fL, 0xc27dede8L, 0xcf3ecb31L, 0xcbffd686L,
   0xd5b88683L, 0xd1799b34L, 0xdc3abdedL, 0xd8fba05aL,
   0x690ce0eeL, 0x6dcdfd59L, 0x608edb80L, 0x644fc637L,
   0x7a089632L, 0x7ec98b85L, 0x738aad5cL, 0x774bb0ebL,
   0x4f040d56L, 0x4bc510e1L, 0x46863638L, 0x42472b8fL,
   0x5c007b8aL, 0x58c1663dL, 0x558240e4L, 0x51435d53L,
   0x251d3b9eL, 0x21dc2629L, 0x2c9f00f0L, 0x285e1d47L,
   0x36194d42L, 0x32d850f5L, 0x3f9b762cL, 0x3b5a6b9bL,
   0x0315d626L, 0x07d4cb91L, 0x0a97ed48L, 0x0e56f0ffL,
   0x1011a0faL, 0x14d0bd4dL, 0x19939b94L, 0x1d528623L,
   0xf12f560eL, 0xf5ee4bb9L, 0xf8ad6d60L, 0xfc6c70d7L,
   0xe22b20d2L, 0xe6ea3d65L, 0xeba91bbcL, 0xef68060bL,
   0xd727bbb6L, 0xd3e6a601L, 0xdea580d8L, 0xda649d6fL,
   0xc423cd6aL, 0xc0e2d0ddL, 0xcda1f604L, 0xc960ebb3L,
   0xbd3e8d7eL, 0xb9ff90c9L, 0xb4bcb610L, 0xb07daba7L,
   0xae3afba2L, 0xaafbe615L, 0xa7b8c0ccL, 0xa379dd7bL,
   0x9b3660c6L, 0x9ff77d71L, 0x92b45ba8L, 0x9675461fL,
   0x8832161aL, 0x8cf30badL, 0x81b02d74L, 0x857130c3L,
   0x5d8a9099L, 0x594b8d2eL, 0x5408abf7L, 0x50c9b640L,
   0x4e8ee645L, 0x4a4ffbf2L, 0x470cdd2bL, 0x43cdc09cL,
   0x7b827d21L, 0x7f436096L, 0x7200464fL, 0x76c15bf8L,
   0x68860bfdL, 0x6c47164aL, 0x61043093L, 0x65c52d24L,
   0x119b4be9L, 0x155a565eL, 0x18197087L, 0x1cd86d30L,
   0x029f3d35L, 0x065e2082L, 0x0b1d065bL, 0x0fdc1becL,
   0x3793a651L, 0x3352bbe6L, 0x3e119d3fL, 0x3ad08088L,
   0x2497d08dL, 0x2056cd3aL, 0x2d15ebe3L, 0x29d4f654L,
   0xc5a92679L, 0xc1683bceL, 0xcc2b1d17L, 0xc8ea00a0L,
   0xd6ad50a5L, 0xd26c4d12L, 0xdf2f6bcbL, 0xdbee767cL,
   0xe3a1cbc1L, 0xe760d676L, 0xea23f0afL, 0xeee2ed18L,
   0xf0a5bd1dL, 0xf464a0aaL, 0xf9278673L, 0xfde69bc4L,
   0x89b8fd09L, 0x8d79e0beL, 0x803ac667L, 0x84fbdbd0L,
   0x9abc8bd5L, 0x9e7d9662L, 0x933eb0bbL, 0x97ffad0cL,
   0xafb010b1L, 0xab710d06L, 0xa6322bdfL, 0xa2f33668L,
   0xbcb4666dL, 0xb8757bdaL, 0xb5365d03L, 0xb1f740b4L
};
102
/* Fold one input byte `cha` into the running CRC held in `crcVar`.

   crcVar : a 32-bit unsigned lvalue; it is evaluated more than once, so
            it must not have side effects.
   cha    : the next input value; it is truncated to UChar before use.

   The body is wrapped in do { } while (0) so that the macro acts as a
   single statement (safe in an unbraced if/else), and both arguments are
   parenthesised so that compound expressions are handled as a whole. */
#define UPDATE_CRC(crcVar,cha)                      \
do {                                                \
   (crcVar) = ((crcVar) << 8) ^                     \
              crc32Table[((crcVar) >> 24) ^         \
                         ((UChar)(cha))];           \
} while (0)
109
110static UInt crcBytes ( UChar* bytes, UWord nBytes, UInt crcIn )
111{
112   UInt crc = crcIn;
113   while (nBytes >= 4) {
114      UPDATE_CRC(crc, bytes[0]);
115      UPDATE_CRC(crc, bytes[1]);
116      UPDATE_CRC(crc, bytes[2]);
117      UPDATE_CRC(crc, bytes[3]);
118      bytes += 4;
119      nBytes -= 4;
120   }
121   while (nBytes >= 1) {
122      UPDATE_CRC(crc, bytes[0]);
123      bytes += 1;
124      nBytes -= 1;
125   }
126   return crc;
127}
128
129static UInt crcFinalise ( UInt crc ) {
130   return ~crc;
131}
132
133////////
134
135static UInt theCRC = 0xFFFFFFFF;
136
137static HChar outBuf[1024];
138// take output that's in outBuf, length as specified, and
139// update the running crc.
140static void send ( int nbytes )
141{
142   assert( ((unsigned int)nbytes) < sizeof(outBuf)-1);
143   assert(outBuf[nbytes] == 0);
144   theCRC = crcBytes( (UChar*)&outBuf[0], nbytes, theCRC );
145   if (VERBOSE) printf("SEND %08x %s", theCRC, outBuf);
146}
147
148
149/////////////////////////////////////////////////////////////////
150// END crc32 stuff                                             //
151/////////////////////////////////////////////////////////////////
152
/* Operand values fed to the locked-instruction tests.  Exactly one of the
   two tables below is compiled; NVALS is the element count of val[]. */
#if 0

// full version (compiled out by the "#if 0" above)
#define NVALS 76

static ULong val[NVALS]
    = { 0x00ULL, 0x01ULL, 0x02ULL, 0x03ULL,
        0x3FULL, 0x40ULL, 0x41ULL,
        0x7EULL, 0x7FULL, 0x80ULL, 0x81ULL, 0x82ULL,
        0xBFULL, 0xC0ULL, 0xC1ULL,
        0xFCULL, 0xFDULL, 0xFEULL, 0xFFULL,

        0xFF00ULL, 0xFF01ULL, 0xFF02ULL, 0xFF03ULL,
        0xFF3FULL, 0xFF40ULL, 0xFF41ULL,
        0xFF7EULL, 0xFF7FULL, 0xFF80ULL, 0xFF81ULL, 0xFF82ULL,
        0xFFBFULL, 0xFFC0ULL, 0xFFC1ULL,
        0xFFFCULL, 0xFFFDULL, 0xFFFEULL, 0xFFFFULL,

        0xFFFFFF00ULL, 0xFFFFFF01ULL, 0xFFFFFF02ULL, 0xFFFFFF03ULL,
        0xFFFFFF3FULL, 0xFFFFFF40ULL, 0xFFFFFF41ULL,
        0xFFFFFF7EULL, 0xFFFFFF7FULL, 0xFFFFFF80ULL, 0xFFFFFF81ULL, 0xFFFFFF82ULL,
        0xFFFFFFBFULL, 0xFFFFFFC0ULL, 0xFFFFFFC1ULL,
        0xFFFFFFFCULL, 0xFFFFFFFDULL, 0xFFFFFFFEULL, 0xFFFFFFFFULL,

        0xFFFFFFFFFFFFFF00ULL, 0xFFFFFFFFFFFFFF01ULL, 0xFFFFFFFFFFFFFF02ULL,
                               0xFFFFFFFFFFFFFF03ULL,
        0xFFFFFFFFFFFFFF3FULL, 0xFFFFFFFFFFFFFF40ULL, 0xFFFFFFFFFFFFFF41ULL,
        0xFFFFFFFFFFFFFF7EULL, 0xFFFFFFFFFFFFFF7FULL, 0xFFFFFFFFFFFFFF80ULL,
                               0xFFFFFFFFFFFFFF81ULL, 0xFFFFFFFFFFFFFF82ULL,
        0xFFFFFFFFFFFFFFBFULL, 0xFFFFFFFFFFFFFFC0ULL, 0xFFFFFFFFFFFFFFC1ULL,
        0xFFFFFFFFFFFFFFFCULL, 0xFFFFFFFFFFFFFFFDULL, 0xFFFFFFFFFFFFFFFEULL,
                               0xFFFFFFFFFFFFFFFFULL
      };

#else

// shortened version, for use as valgrind regtest
// Four groups of nine values: small boundary values on their own, then
// the same idea with 0xFF.., 0xFFFFFF.. and 0xFFFFFFFFFFFFFF.. prefixes.
// NOTE(review): the third group contains ..7E/..7F where the other three
// groups contain ..7F/..80.  Every value is printed into the text hashed
// by send(), so changing an entry presumably changes the CRC that main()
// expects -- confirm before "fixing" it.
#define NVALS 36

static ULong val[NVALS]
    = { 0x00ULL, 0x01ULL,
        0x3FULL, 0x40ULL,
        0x7FULL, 0x80ULL,
        0xBFULL, 0xC0ULL,
        0xFFULL,

        0xFF00ULL, 0xFF01ULL,
        0xFF3FULL, 0xFF40ULL,
        0xFF7FULL, 0xFF80ULL,
        0xFFBFULL, 0xFFC0ULL,
        0xFFFFULL,

        0xFFFFFF00ULL, 0xFFFFFF01ULL,
        0xFFFFFF3FULL, 0xFFFFFF40ULL,
        0xFFFFFF7EULL, 0xFFFFFF7FULL,
        0xFFFFFFBFULL, 0xFFFFFFC0ULL,
        0xFFFFFFFFULL,

        0xFFFFFFFFFFFFFF00ULL, 0xFFFFFFFFFFFFFF01ULL,
        0xFFFFFFFFFFFFFF3FULL, 0xFFFFFFFFFFFFFF40ULL,
        0xFFFFFFFFFFFFFF7FULL, 0xFFFFFFFFFFFFFF80ULL,
        0xFFFFFFFFFFFFFFBFULL, 0xFFFFFFFFFFFFFFC0ULL,
        0xFFFFFFFFFFFFFFFFULL
      };

#endif
219
220/////////////////////////////////////
221
/* Bit masks for the six arithmetic status flags as they appear in the
   flags word moved around with pushfq/popfq. */
#define CC_C    (1 << 0)     /* carry            */
#define CC_P    (1 << 2)     /* parity           */
#define CC_A    (1 << 4)     /* auxiliary carry  */
#define CC_Z    (1 << 6)     /* zero             */
#define CC_S    (1 << 7)     /* sign             */
#define CC_O    (1 << 11)    /* overflow         */

/* All six flags together; used to strip every other flags bit before a
   result is printed. */
#define CC_MASK (CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C)
230
/* GEN_do_locked_G_E(_name,_eax) defines

      void do_locked_G_E_<_name>(void)

   which exercises the lock-prefixed instruction "_name %<_eax>,(mem)".

   For every pair (val[v1], val[v2]) and for each of the 64 combinations
   of the six CC_* input flags, the generated function:
     - installs the input flags (pushq/popfq),
     - loads val[v1] into %rax and the address of e_val (which was set to
       val[v2]) into %rbx,
     - executes "lock; _name %_eax,(%rbx)",
     - reads the flags back (pushfq/popq) into block[3],
     - formats operands, flags in, result and masked flags out into
       outBuf and passes the text to send().

   `block` is the parameter area given to the asm:
     [0] flags in, [1] register operand, [2] address of the memory
     operand, [3] flags out.
   e_val is volatile and the asm lists "memory" as clobbered, so the value
   printed after the asm is the one the instruction left in memory. */
#define GEN_do_locked_G_E(_name,_eax)   \
  \
  __attribute__((noinline)) void do_locked_G_E_##_name ( void )  \
  {   \
    volatile Long e_val, g_val, e_val_before;   \
    Long o, s, z, a, c, p, v1, v2, flags_in;       \
    Long block[4];   \
    \
    for (v1 = 0; v1 < NVALS; v1++) {   \
    for (v2 = 0; v2 < NVALS; v2++) {   \
    \
    for (o = 0; o < 2; o++) {   \
    for (s = 0; s < 2; s++) {   \
    for (z = 0; z < 2; z++) {   \
    for (a = 0; a < 2; a++) {   \
    for (c = 0; c < 2; c++) {   \
    for (p = 0; p < 2; p++) {   \
      \
      flags_in = (o ? CC_O : 0)   \
               | (s ? CC_S : 0)   \
               | (z ? CC_Z : 0)   \
               | (a ? CC_A : 0)   \
               | (c ? CC_C : 0)   \
               | (p ? CC_P : 0);   \
      \
      g_val = val[v1];   \
      e_val = val[v2];   \
      e_val_before = e_val;   \
      \
      block[0] = flags_in;   \
      block[1] = g_val;   \
      block[2] = (long)&e_val;   \
      block[3] = 0;   \
      __asm__ __volatile__(   \
          "movq 0(%0), %%rax\n\t"   \
          "pushq %%rax\n\t"   \
          "popfq\n\t"   \
          "movq 8(%0), %%rax\n\t"   \
          "movq 16(%0), %%rbx\n\t"   \
          "lock; " #_name " %%" #_eax ",(%%rbx)\n\t"   \
          "pushfq\n\t"   \
          "popq %%rax\n\t"   \
          "movq %%rax, 24(%0)\n\t"   \
          : : "r"(&block[0]) : "rax","rbx","cc","memory"   \
      );   \
      \
      send( \
      sprintf(outBuf, \
             "%s G=%016llx E=%016llx CCIN=%08llx -> E=%016llx CCOUT=%08llx\n",       \
             #_name, g_val, e_val_before, flags_in,   \
              e_val, block[3] & CC_MASK));            \
      \
    }}}}}}   \
    \
    }}   \
  }

/* One generated test function per operation and operand width
   (b/w/l/q paired with al/ax/eax/rax). */
GEN_do_locked_G_E(addb,al)
GEN_do_locked_G_E(addw,ax)
GEN_do_locked_G_E(addl,eax)
GEN_do_locked_G_E(addq,rax)

GEN_do_locked_G_E(orb, al)
GEN_do_locked_G_E(orw, ax)
GEN_do_locked_G_E(orl, eax)
GEN_do_locked_G_E(orq, rax)

GEN_do_locked_G_E(adcb,al)
GEN_do_locked_G_E(adcw,ax)
GEN_do_locked_G_E(adcl,eax)
GEN_do_locked_G_E(adcq,rax)

GEN_do_locked_G_E(sbbb,al)
GEN_do_locked_G_E(sbbw,ax)
GEN_do_locked_G_E(sbbl,eax)
GEN_do_locked_G_E(sbbq,rax)

GEN_do_locked_G_E(andb,al)
GEN_do_locked_G_E(andw,ax)
GEN_do_locked_G_E(andl,eax)
GEN_do_locked_G_E(andq,rax)

GEN_do_locked_G_E(subb,al)
GEN_do_locked_G_E(subw,ax)
GEN_do_locked_G_E(subl,eax)
GEN_do_locked_G_E(subq,rax)

GEN_do_locked_G_E(xorb,al)
GEN_do_locked_G_E(xorw,ax)
GEN_do_locked_G_E(xorl,eax)
GEN_do_locked_G_E(xorq,rax)
322
323
324
325
/* GEN_do_locked_imm_E(_name,_eax,_imm) defines

      void do_locked_imm_E_<_name>_<_imm>(void)

   which exercises the lock-prefixed instruction "_name $<_imm>,(mem)".
   The immediate is pasted textually into both the function name and the
   instruction.  The `_eax` argument is not referenced by the macro body.

   For every val[v2] and for each of the 64 combinations of the six CC_*
   input flags, the generated function installs the flags, points %rbx at
   e_val (set to val[v2]), executes the locked instruction, captures the
   resulting flags into block[2], and sends one formatted line to send().

   `block` layout: [0] flags in, [1] address of the memory operand,
   [2] flags out. */
#define GEN_do_locked_imm_E(_name,_eax,_imm)        \
  \
  __attribute__((noinline)) void do_locked_imm_E_##_name##_##_imm ( void )  \
  {   \
    volatile Long e_val, e_val_before;   \
    Long o, s, z, a, c, p, v2, flags_in;   \
    Long block[3];   \
    \
    for (v2 = 0; v2 < NVALS; v2++) {   \
    \
    for (o = 0; o < 2; o++) {   \
    for (s = 0; s < 2; s++) {   \
    for (z = 0; z < 2; z++) {   \
    for (a = 0; a < 2; a++) {   \
    for (c = 0; c < 2; c++) {   \
    for (p = 0; p < 2; p++) {   \
      \
      flags_in = (o ? CC_O : 0)   \
               | (s ? CC_S : 0)   \
               | (z ? CC_Z : 0)   \
               | (a ? CC_A : 0)   \
               | (c ? CC_C : 0)   \
               | (p ? CC_P : 0);   \
      \
      e_val = val[v2];   \
      e_val_before = e_val;   \
      \
      block[0] = flags_in;   \
      block[1] = (long)&e_val;   \
      block[2] = 0;   \
      __asm__ __volatile__(   \
          "movq 0(%0), %%rax\n\t"   \
          "pushq %%rax\n\t"   \
          "popfq\n\t"   \
          "movq 8(%0), %%rbx\n\t"   \
          "lock; " #_name " $" #_imm ",(%%rbx)\n\t"   \
          "pushfq\n\t"   \
          "popq %%rax\n\t"   \
          "movq %%rax, 16(%0)\n\t"   \
          : : "r"(&block[0]) : "rax","rbx","cc","memory"   \
      );   \
      \
      send( \
           sprintf(outBuf, \
           "%s I=%s E=%016llx CCIN=%08llx -> E=%016llx CCOUT=%08llx\n",    \
             #_name, #_imm, e_val_before, flags_in,         \
                   e_val, block[2] & CC_MASK));             \
      \
    }}}}}}   \
    \
    }   \
  }

/* Eight instantiations per operation: two immediates for each of the
   four operand widths. */
GEN_do_locked_imm_E(addb,al,0x7F)
GEN_do_locked_imm_E(addb,al,0xF1)
GEN_do_locked_imm_E(addw,ax,0x7E)
GEN_do_locked_imm_E(addw,ax,0x9325)
GEN_do_locked_imm_E(addl,eax,0x7D)
GEN_do_locked_imm_E(addl,eax,0x31415927)
GEN_do_locked_imm_E(addq,rax,0x7D)
GEN_do_locked_imm_E(addq,rax,0x31415927)

GEN_do_locked_imm_E(orb,al,0x7F)
GEN_do_locked_imm_E(orb,al,0xF1)
GEN_do_locked_imm_E(orw,ax,0x7E)
GEN_do_locked_imm_E(orw,ax,0x9325)
GEN_do_locked_imm_E(orl,eax,0x7D)
GEN_do_locked_imm_E(orl,eax,0x31415927)
GEN_do_locked_imm_E(orq,rax,0x7D)
GEN_do_locked_imm_E(orq,rax,0x31415927)

GEN_do_locked_imm_E(adcb,al,0x7F)
GEN_do_locked_imm_E(adcb,al,0xF1)
GEN_do_locked_imm_E(adcw,ax,0x7E)
GEN_do_locked_imm_E(adcw,ax,0x9325)
GEN_do_locked_imm_E(adcl,eax,0x7D)
GEN_do_locked_imm_E(adcl,eax,0x31415927)
GEN_do_locked_imm_E(adcq,rax,0x7D)
GEN_do_locked_imm_E(adcq,rax,0x31415927)

GEN_do_locked_imm_E(sbbb,al,0x7F)
GEN_do_locked_imm_E(sbbb,al,0xF1)
GEN_do_locked_imm_E(sbbw,ax,0x7E)
GEN_do_locked_imm_E(sbbw,ax,0x9325)
GEN_do_locked_imm_E(sbbl,eax,0x7D)
GEN_do_locked_imm_E(sbbl,eax,0x31415927)
GEN_do_locked_imm_E(sbbq,rax,0x7D)
GEN_do_locked_imm_E(sbbq,rax,0x31415927)

GEN_do_locked_imm_E(andb,al,0x7F)
GEN_do_locked_imm_E(andb,al,0xF1)
GEN_do_locked_imm_E(andw,ax,0x7E)
GEN_do_locked_imm_E(andw,ax,0x9325)
GEN_do_locked_imm_E(andl,eax,0x7D)
GEN_do_locked_imm_E(andl,eax,0x31415927)
GEN_do_locked_imm_E(andq,rax,0x7D)
GEN_do_locked_imm_E(andq,rax,0x31415927)

GEN_do_locked_imm_E(subb,al,0x7F)
GEN_do_locked_imm_E(subb,al,0xF1)
GEN_do_locked_imm_E(subw,ax,0x7E)
GEN_do_locked_imm_E(subw,ax,0x9325)
GEN_do_locked_imm_E(subl,eax,0x7D)
GEN_do_locked_imm_E(subl,eax,0x31415927)
GEN_do_locked_imm_E(subq,rax,0x7D)
GEN_do_locked_imm_E(subq,rax,0x31415927)

GEN_do_locked_imm_E(xorb,al,0x7F)
GEN_do_locked_imm_E(xorb,al,0xF1)
GEN_do_locked_imm_E(xorw,ax,0x7E)
GEN_do_locked_imm_E(xorw,ax,0x9325)
GEN_do_locked_imm_E(xorl,eax,0x7D)
GEN_do_locked_imm_E(xorl,eax,0x31415927)
GEN_do_locked_imm_E(xorq,rax,0x7D)
GEN_do_locked_imm_E(xorq,rax,0x31415927)
441
/* GEN_do_locked_unary_E(_name,_eax) defines

      void do_locked_unary_E_<_name>(void)

   which exercises the lock-prefixed single-operand instruction
   "_name (mem)".  The `_eax` argument is not referenced by the macro
   body.

   For every val[v2] and for each of the 64 combinations of the six CC_*
   input flags, the generated function installs the flags, points %rbx at
   e_val (set to val[v2]), executes the locked instruction, captures the
   resulting flags into block[2], and sends one formatted line to send().

   `block` layout: [0] flags in, [1] address of the memory operand,
   [2] flags out. */
#define GEN_do_locked_unary_E(_name,_eax)        \
  \
  __attribute__((noinline)) void do_locked_unary_E_##_name ( void )  \
  {   \
    volatile Long e_val, e_val_before;   \
    Long o, s, z, a, c, p, v2, flags_in;     \
    Long block[3];   \
    \
    for (v2 = 0; v2 < NVALS; v2++) {   \
    \
    for (o = 0; o < 2; o++) {   \
    for (s = 0; s < 2; s++) {   \
    for (z = 0; z < 2; z++) {   \
    for (a = 0; a < 2; a++) {   \
    for (c = 0; c < 2; c++) {   \
    for (p = 0; p < 2; p++) {   \
      \
      flags_in = (o ? CC_O : 0)   \
               | (s ? CC_S : 0)   \
               | (z ? CC_Z : 0)   \
               | (a ? CC_A : 0)   \
               | (c ? CC_C : 0)   \
               | (p ? CC_P : 0);   \
      \
      e_val = val[v2];   \
      e_val_before = e_val;   \
      \
      block[0] = flags_in;   \
      block[1] = (long)&e_val;   \
      block[2] = 0;   \
      __asm__ __volatile__(   \
          "movq 0(%0), %%rax\n\t"   \
          "pushq %%rax\n\t"   \
          "popfq\n\t"   \
          "movq 8(%0), %%rbx\n\t"   \
          "lock; " #_name " (%%rbx)\n\t"   \
          "pushfq\n\t"   \
          "popq %%rax\n\t"   \
          "movq %%rax, 16(%0)\n\t"   \
          : : "r"(&block[0]) : "rax","rbx","cc","memory"   \
      );   \
      \
      send( \
           sprintf(outBuf, \
            "%s E=%016llx CCIN=%08llx -> E=%016llx CCOUT=%08llx\n", \
             #_name, e_val_before, flags_in,         \
            e_val, block[2] & CC_MASK));                       \
      \
    }}}}}}   \
    \
    }   \
  }

/* dec / inc / neg / not, each at the four operand widths. */
GEN_do_locked_unary_E(decb,al)
GEN_do_locked_unary_E(decw,ax)
GEN_do_locked_unary_E(decl,eax)
GEN_do_locked_unary_E(decq,rax)

GEN_do_locked_unary_E(incb,al)
GEN_do_locked_unary_E(incw,ax)
GEN_do_locked_unary_E(incl,eax)
GEN_do_locked_unary_E(incq,rax)

GEN_do_locked_unary_E(negb,al)
GEN_do_locked_unary_E(negw,ax)
GEN_do_locked_unary_E(negl,eax)
GEN_do_locked_unary_E(negq,rax)

GEN_do_locked_unary_E(notb,al)
GEN_do_locked_unary_E(notw,ax)
GEN_do_locked_unary_E(notl,eax)
GEN_do_locked_unary_E(notq,rax)
514
515
516/////////////////////////////////////////////////////////////////
517
/* Execute "lock; btsq" on the bit at offset `bitno` relative to `base`
   and return the carry flag it produced, as 0 or 1.  The asm writes %dl
   itself, hence the "rdx" clobber; "memory" is listed because the bit
   addressed by `bitno` need not live in the byte *base. */
ULong btsq_mem ( UChar* base, int bitno )
{
   ULong res;
   __asm__
   __volatile__("lock; btsq\t%2, %0\n\t"
                "setc   %%dl\n\t"
                "movzbq %%dl,%1\n"
                : "=m" (*base), "=r" (res)
                : "r" ((ULong)bitno) : "rdx","cc","memory" );
   /* Pretty meaningless to dereference base here, but that's what you
      have to do to get a btsq insn which refers to memory starting at
      base. */
   return res;
}
532ULong btsl_mem ( UChar* base, int bitno )
533{
534   ULong res;
535   __asm__
536   __volatile__("lock; btsl\t%2, %0\n\t"
537                "setc   %%dl\n\t"
538                "movzbq %%dl,%1\n"
539                : "=m" (*base), "=r" (res)
540                : "r" ((UInt)bitno));
541   return res;
542}
543ULong btsw_mem ( UChar* base, int bitno )
544{
545   ULong res;
546   __asm__
547   __volatile__("lock; btsw\t%w2, %0\n\t"
548                "setc   %%dl\n\t"
549                "movzbq %%dl,%1\n"
550                : "=m" (*base), "=r" (res)
551                : "r" ((ULong)bitno));
552   return res;
553}
554
555ULong btrq_mem ( UChar* base, int bitno )
556{
557   ULong res;
558   __asm__
559   __volatile__("lock; btrq\t%2, %0\n\t"
560                "setc   %%dl\n\t"
561                "movzbq %%dl,%1\n"
562                : "=m" (*base), "=r" (res)
563                : "r" ((ULong)bitno));
564   return res;
565}
566ULong btrl_mem ( UChar* base, int bitno )
567{
568   ULong res;
569   __asm__
570   __volatile__("lock; btrl\t%2, %0\n\t"
571                "setc   %%dl\n\t"
572                "movzbq %%dl,%1\n"
573                : "=m" (*base), "=r" (res)
574                : "r" ((UInt)bitno));
575   return res;
576}
577ULong btrw_mem ( UChar* base, int bitno )
578{
579   ULong res;
580   __asm__
581   __volatile__("lock; btrw\t%w2, %0\n\t"
582                "setc   %%dl\n\t"
583                "movzbq %%dl,%1\n"
584                : "=m" (*base), "=r" (res)
585                : "r" ((ULong)bitno));
586   return res;
587}
588
589ULong btcq_mem ( UChar* base, int bitno )
590{
591   ULong res;
592   __asm__
593   __volatile__("lock; btcq\t%2, %0\n\t"
594                "setc   %%dl\n\t"
595                "movzbq %%dl,%1\n"
596                : "=m" (*base), "=r" (res)
597                : "r" ((ULong)bitno));
598   return res;
599}
600ULong btcl_mem ( UChar* base, int bitno )
601{
602   ULong res;
603   __asm__
604   __volatile__("lock; btcl\t%2, %0\n\t"
605                "setc   %%dl\n\t"
606                "movzbq %%dl,%1\n"
607                : "=m" (*base), "=r" (res)
608                : "r" ((UInt)bitno));
609   return res;
610}
611ULong btcw_mem ( UChar* base, int bitno )
612{
613   ULong res;
614   __asm__
615   __volatile__("lock; btcw\t%w2, %0\n\t"
616                "setc   %%dl\n\t"
617                "movzbq %%dl,%1\n"
618                : "=m" (*base), "=r" (res)
619                : "r" ((ULong)bitno));
620   return res;
621}
622
623ULong btq_mem ( UChar* base, int bitno )
624{
625   ULong res;
626   __asm__
627   __volatile__("btq\t%2, %0\n\t"
628                "setc   %%dl\n\t"
629                "movzbq %%dl,%1\n"
630                : "=m" (*base), "=r" (res)
631                : "r" ((ULong)bitno)
632                : "cc", "memory");
633   return res;
634}
635ULong btl_mem ( UChar* base, int bitno )
636{
637   ULong res;
638   __asm__
639   __volatile__("btl\t%2, %0\n\t"
640                "setc   %%dl\n\t"
641                "movzbq %%dl,%1\n"
642                : "=m" (*base), "=r" (res)
643                : "r" ((UInt)bitno)
644                : "cc", "memory");
645   return res;
646}
647ULong btw_mem ( UChar* base, int bitno )
648{
649   ULong res;
650   __asm__
651   __volatile__("btw\t%w2, %0\n\t"
652                "setc   %%dl\n\t"
653                "movzbq %%dl,%1\n"
654                : "=m" (*base), "=r" (res)
655                : "r" ((ULong)bitno));
656   return res;
657}
658
659ULong rol1 ( ULong x )
660{
661  return (x << 1) | (x >> 63);
662}
663
664void do_bt_G_E_tests ( void )
665{
666   ULong  n, bitoff, op;
667   ULong  c;
668   UChar* block;
669   ULong  carrydep, res;;
670
671   /*------------------------ MEM-Q -----------------------*/
672
673   carrydep = 0;
674   block = calloc(200,1);
675   block += 100;
676   /* Valid bit offsets are -800 .. 799 inclusive. */
677
678   for (n = 0; n < 10000; n++) {
679      bitoff = (myrandom() % 1600) - 800;
680      op = myrandom() % 4;
681      c = 2;
682      switch (op) {
683         case 0: c = btsq_mem(block, bitoff); break;
684         case 1: c = btrq_mem(block, bitoff); break;
685         case 2: c = btcq_mem(block, bitoff); break;
686         case 3: c = btq_mem(block, bitoff); break;
687      }
688      c &= 255;
689      assert(c == 0 || c == 1);
690      carrydep = c ? (rol1(carrydep) ^ (Long)bitoff) : carrydep;
691   }
692
693   /* Compute final result */
694   block -= 100;
695   res = 0;
696   for (n = 0; n < 200; n++) {
697      UChar ch = block[n];
698      /* printf("%d ", (int)block[n]); */
699      res = rol1(res) ^ (ULong)ch;
700   }
701
702   send( sprintf(outBuf,
703                 "bt{s,r,c}q: final res 0x%llx, carrydep 0x%llx\n",
704                 res, carrydep));
705   free(block);
706
707   /*------------------------ MEM-L -----------------------*/
708
709   carrydep = 0;
710   block = calloc(200,1);
711   block += 100;
712   /* Valid bit offsets are -800 .. 799 inclusive. */
713
714   for (n = 0; n < 10000; n++) {
715      bitoff = (myrandom() % 1600) - 800;
716      op = myrandom() % 4;
717      c = 2;
718      switch (op) {
719         case 0: c = btsl_mem(block, bitoff); break;
720         case 1: c = btrl_mem(block, bitoff); break;
721         case 2: c = btcl_mem(block, bitoff); break;
722         case 3: c = btl_mem(block, bitoff); break;
723      }
724      c &= 255;
725      assert(c == 0 || c == 1);
726      carrydep = c ? (rol1(carrydep) ^ (Long)(Int)bitoff) : carrydep;
727   }
728
729   /* Compute final result */
730   block -= 100;
731   res = 0;
732   for (n = 0; n < 200; n++) {
733      UChar ch = block[n];
734      /* printf("%d ", (int)block[n]); */
735      res = rol1(res) ^ (ULong)ch;
736   }
737
738   send( sprintf(outBuf,
739                 "bt{s,r,c}l: final res 0x%llx, carrydep 0x%llx\n",
740                 res, carrydep));
741   free(block);
742
743   /*------------------------ MEM-W -----------------------*/
744
745   carrydep = 0;
746   block = calloc(200,1);
747   block += 100;
748   /* Valid bit offsets are -800 .. 799 inclusive. */
749
750   for (n = 0; n < 10000; n++) {
751      bitoff = (myrandom() % 1600) - 800;
752      op = myrandom() % 4;
753      c = 2;
754      switch (op) {
755         case 0: c = btsw_mem(block, bitoff); break;
756         case 1: c = btrw_mem(block, bitoff); break;
757         case 2: c = btcw_mem(block, bitoff); break;
758         case 3: c = btw_mem(block, bitoff); break;
759      }
760      c &= 255;
761      assert(c == 0 || c == 1);
762      carrydep = c ? (rol1(carrydep) ^ (Long)(Int)bitoff) : carrydep;
763   }
764
765   /* Compute final result */
766   block -= 100;
767   res = 0;
768   for (n = 0; n < 200; n++) {
769      UChar ch = block[n];
770      /* printf("%d ", (int)block[n]); */
771      res = rol1(res) ^ (ULong)ch;
772   }
773
774   send(sprintf(outBuf,
775                "bt{s,r,c}w: final res 0x%llx, carrydep 0x%llx\n",
776                res, carrydep));
777   free(block);
778}
779
780
781/////////////////////////////////////////////////////////////////
782
783/* Given a word, do bt/bts/btr/btc on bits 0, 1, 2 and 3 of it, and
784   also reconstruct the original bits 0, 1, 2, 3 by looking at the
785   carry flag.  Returned result has mashed bits 0-3 at the bottom and
786   the reconstructed original bits 0-3 as 4-7. */
787
788ULong mash_mem_Q ( ULong* origp )
789{
790  ULong reconstructed, mashed;
791  __asm__ __volatile__ (
792     "movq %2, %%rdx\n\t"
793     ""
794     "movq $0, %%rax\n\t"
795     "\n\t"
796     "btq  $0, (%%rdx)\n\t"
797     "setb %%cl\n\t"
798     "movzbq %%cl, %%rcx\n\t"
799     "orq %%rcx, %%rax\n\t"
800     "\n\t"
801     "lock; btsq $1, (%%rdx)\n\t"
802     "setb %%cl\n\t"
803     "movzbq %%cl, %%rcx\n\t"
804     "shlq $1, %%rcx\n\t"
805     "orq %%rcx, %%rax\n\t"
806     "\n\t"
807     "lock; btrq $2, (%%rdx)\n\t"
808     "setb %%cl\n\t"
809     "movzbq %%cl, %%rcx\n\t"
810     "shlq $2, %%rcx\n\t"
811     "orq %%rcx, %%rax\n\t"
812     "\n\t"
813     "lock; btcq $3, (%%rdx)\n\t"
814     "setb %%cl\n\t"
815     "movzbq %%cl, %%rcx\n\t"
816     "shlq $3, %%rcx\n\t"
817     "orq %%rcx, %%rax\n\t"
818     "\n\t"
819     "movq %%rax, %0\n\t"
820     "movq (%%rdx), %1"
821     : "=r" (reconstructed), "=r" (mashed)
822     : "r" (origp)
823     : "rax", "rcx", "rdx", "cc");
824  return (mashed & 0xF) | ((reconstructed & 0xF) << 4);
825}
826
827ULong mash_mem_L ( UInt* origp )
828{
829  ULong reconstructed; UInt mashed;
830  __asm__ __volatile__ (
831     "movq %2, %%rdx\n\t"
832     ""
833     "movq $0, %%rax\n\t"
834     "\n\t"
835     "btl  $0, (%%rdx)\n\t"
836     "setb %%cl\n\t"
837     "movzbq %%cl, %%rcx\n\t"
838     "orq %%rcx, %%rax\n\t"
839     "\n\t"
840     "lock; btsl $1, (%%rdx)\n\t"
841     "setb %%cl\n\t"
842     "movzbq %%cl, %%rcx\n\t"
843     "shlq $1, %%rcx\n\t"
844     "orq %%rcx, %%rax\n\t"
845     "\n\t"
846     "lock; btrl $2, (%%rdx)\n\t"
847     "setb %%cl\n\t"
848     "movzbq %%cl, %%rcx\n\t"
849     "shlq $2, %%rcx\n\t"
850     "orq %%rcx, %%rax\n\t"
851     "\n\t"
852     "lock; btcl $3, (%%rdx)\n\t"
853     "setb %%cl\n\t"
854     "movzbq %%cl, %%rcx\n\t"
855     "shlq $3, %%rcx\n\t"
856     "orq %%rcx, %%rax\n\t"
857     "\n\t"
858     "movq %%rax, %0\n\t"
859     "movl (%%rdx), %1"
860     : "=r" (reconstructed), "=r" (mashed)
861     : "r" (origp)
862     : "rax", "rcx", "rdx", "cc");
863  return (mashed & 0xF) | ((reconstructed & 0xF) << 4);
864}
865
866ULong mash_mem_W ( UShort* origp )
867{
868  ULong reconstructed, mashed;
869  __asm__ __volatile__ (
870     "movq %2, %%rdx\n\t"
871     ""
872     "movq $0, %%rax\n\t"
873     "\n\t"
874     "btw  $0, (%%rdx)\n\t"
875     "setb %%cl\n\t"
876     "movzbq %%cl, %%rcx\n\t"
877     "orq %%rcx, %%rax\n\t"
878     "\n\t"
879     "lock; btsw $1, (%%rdx)\n\t"
880     "setb %%cl\n\t"
881     "movzbq %%cl, %%rcx\n\t"
882     "shlq $1, %%rcx\n\t"
883     "orq %%rcx, %%rax\n\t"
884     "\n\t"
885     "lock; btrw $2, (%%rdx)\n\t"
886     "setb %%cl\n\t"
887     "movzbq %%cl, %%rcx\n\t"
888     "shlq $2, %%rcx\n\t"
889     "orq %%rcx, %%rax\n\t"
890     "\n\t"
891     "lock; btcw $3, (%%rdx)\n\t"
892     "setb %%cl\n\t"
893     "movzbq %%cl, %%rcx\n\t"
894     "shlq $3, %%rcx\n\t"
895     "orq %%rcx, %%rax\n\t"
896     "\n\t"
897     "movq %%rax, %0\n\t"
898     "movzwq (%%rdx), %1"
899     : "=r" (reconstructed), "=r" (mashed)
900     : "r" (origp)
901     : "rax", "rcx", "rdx", "cc");
902  return (mashed & 0xF) | ((reconstructed & 0xF) << 4);
903}
904
905
906void do_bt_imm_E_tests( void )
907{
908  ULong i;
909  ULong*  iiq = malloc(sizeof(ULong));
910  UInt*   iil = malloc(sizeof(UInt));
911  UShort* iiw = malloc(sizeof(UShort));
912  for (i = 0; i < 0x10; i++) {
913    *iiq = i;
914    *iil = i;
915    *iiw = i;
916    send(sprintf(outBuf,"0x%llx -> 0x%02llx 0x%02llx 0x%02llx\n", i,
917                 mash_mem_Q(iiq), mash_mem_L(iil), mash_mem_W(iiw)));
918  }
919  free(iiq);
920  free(iil);
921  free(iiw);
922}
923
924
925/////////////////////////////////////////////////////////////////
926
927int main ( void )
928{
929  do_locked_G_E_addb();
930  do_locked_G_E_addw();
931  do_locked_G_E_addl();
932  do_locked_G_E_addq();
933
934  do_locked_G_E_orb();
935  do_locked_G_E_orw();
936  do_locked_G_E_orl();
937  do_locked_G_E_orq();
938
939  do_locked_G_E_adcb();
940  do_locked_G_E_adcw();
941  do_locked_G_E_adcl();
942  do_locked_G_E_adcq();
943
944  do_locked_G_E_sbbb();
945  do_locked_G_E_sbbw();
946  do_locked_G_E_sbbl();
947  do_locked_G_E_sbbq();
948
949  do_locked_G_E_andb();
950  do_locked_G_E_andw();
951  do_locked_G_E_andl();
952  do_locked_G_E_andq();
953
954  do_locked_G_E_subb();
955  do_locked_G_E_subw();
956  do_locked_G_E_subl();
957  do_locked_G_E_subq();
958
959  do_locked_G_E_xorb();
960  do_locked_G_E_xorw();
961  do_locked_G_E_xorl();
962  do_locked_G_E_xorq();
963  // 4 * 7
964
965  do_locked_imm_E_addb_0x7F();
966  do_locked_imm_E_addb_0xF1();
967  do_locked_imm_E_addw_0x7E();
968  do_locked_imm_E_addw_0x9325();
969  do_locked_imm_E_addl_0x7D();
970  do_locked_imm_E_addl_0x31415927();
971  do_locked_imm_E_addq_0x7D();
972  do_locked_imm_E_addq_0x31415927();
973
974  do_locked_imm_E_orb_0x7F();
975  do_locked_imm_E_orb_0xF1();
976  do_locked_imm_E_orw_0x7E();
977  do_locked_imm_E_orw_0x9325();
978  do_locked_imm_E_orl_0x7D();
979  do_locked_imm_E_orl_0x31415927();
980  do_locked_imm_E_orq_0x7D();
981  do_locked_imm_E_orq_0x31415927();
982
983  do_locked_imm_E_adcb_0x7F();
984  do_locked_imm_E_adcb_0xF1();
985  do_locked_imm_E_adcw_0x7E();
986  do_locked_imm_E_adcw_0x9325();
987  do_locked_imm_E_adcl_0x7D();
988  do_locked_imm_E_adcl_0x31415927();
989  do_locked_imm_E_adcq_0x7D();
990  do_locked_imm_E_adcq_0x31415927();
991
992  do_locked_imm_E_sbbb_0x7F();
993  do_locked_imm_E_sbbb_0xF1();
994  do_locked_imm_E_sbbw_0x7E();
995  do_locked_imm_E_sbbw_0x9325();
996  do_locked_imm_E_sbbl_0x7D();
997  do_locked_imm_E_sbbl_0x31415927();
998  do_locked_imm_E_sbbq_0x7D();
999  do_locked_imm_E_sbbq_0x31415927();
1000
1001  do_locked_imm_E_andb_0x7F();
1002  do_locked_imm_E_andb_0xF1();
1003  do_locked_imm_E_andw_0x7E();
1004  do_locked_imm_E_andw_0x9325();
1005  do_locked_imm_E_andl_0x7D();
1006  do_locked_imm_E_andl_0x31415927();
1007  do_locked_imm_E_andq_0x7D();
1008  do_locked_imm_E_andq_0x31415927();
1009
1010  do_locked_imm_E_subb_0x7F();
1011  do_locked_imm_E_subb_0xF1();
1012  do_locked_imm_E_subw_0x7E();
1013  do_locked_imm_E_subw_0x9325();
1014  do_locked_imm_E_subl_0x7D();
1015  do_locked_imm_E_subl_0x31415927();
1016  do_locked_imm_E_subq_0x7D();
1017  do_locked_imm_E_subq_0x31415927();
1018
1019  do_locked_imm_E_xorb_0x7F();
1020  do_locked_imm_E_xorb_0xF1();
1021  do_locked_imm_E_xorw_0x7E();
1022  do_locked_imm_E_xorw_0x9325();
1023  do_locked_imm_E_xorl_0x7D();
1024  do_locked_imm_E_xorl_0x31415927();
1025  do_locked_imm_E_xorq_0x7D();
1026  do_locked_imm_E_xorq_0x31415927();
1027  // 4 * 7 + 8 * 7 == 84
1028
1029  do_locked_unary_E_decb();
1030  do_locked_unary_E_decw();
1031  do_locked_unary_E_decl();
1032  do_locked_unary_E_decq();
1033
1034  do_locked_unary_E_incb();
1035  do_locked_unary_E_incw();
1036  do_locked_unary_E_incl();
1037  do_locked_unary_E_incq();
1038
1039  do_locked_unary_E_negb();
1040  do_locked_unary_E_negw();
1041  do_locked_unary_E_negl();
1042  do_locked_unary_E_negq();
1043
1044  do_locked_unary_E_notb();
1045  do_locked_unary_E_notw();
1046  do_locked_unary_E_notl();
1047  do_locked_unary_E_notq();
1048  // 100
1049
1050  do_bt_G_E_tests();
1051  // 109
1052  do_bt_imm_E_tests();
1053  // 118
1054
1055  // So there should be 118 lock-prefixed instructions in the
1056  // disassembly of this compilation unit.
1057  // confirm with
1058  // objdump -d ./amd64locked | grep lock | grep -v do_lock | grep -v elf64 | wc
1059
1060
1061  { UInt crcExpd = 0xDF0656F1;
1062    theCRC = crcFinalise( theCRC );
1063    if (theCRC == crcExpd) {
1064       printf("amd64locked: PASS: CRCs actual 0x%08X expected 0x%08X\n",
1065              theCRC, crcExpd);
1066    } else {
1067       printf("amd64locked: FAIL: CRCs actual 0x%08X expected 0x%08X\n",
1068              theCRC, crcExpd);
1069       printf("amd64locked: set #define VERBOSE 1 to diagnose\n");
1070    }
1071  }
1072
1073  return 0;
1074}
1075