1
2#include <stdio.h>
3#include <stdlib.h>
4#include <assert.h>
5
6#define VERBOSE 0
7
/* Short integer aliases used throughout this test program, listed
   from the narrowest underlying type to the widest. */
typedef unsigned char       UChar;
typedef char                HChar;
typedef unsigned short      UShort;
typedef unsigned int        UInt;
typedef signed int          Int;
typedef unsigned long       UWord;
typedef unsigned long long  ULong;
typedef signed long long    Long;
16
17/////////////////////////////////////////////////////////////////
18// BEGIN crc32 stuff                                           //
19/////////////////////////////////////////////////////////////////
20
/* Lookup table of 256 precomputed 32-bit values used by UPDATE_CRC.
   The macro indexes it with the top byte of the running CRC XORed
   with the next input byte.
   NOTE(review): entry [1] is 0x04c11db7, which is what an MSB-first
   table for the usual CRC-32 generator polynomial would contain --
   confirm against the generator before reusing the table elsewhere. */
static const UInt crc32Table[256] = {

   /*-- Ugly, innit? --*/

   0x00000000L, 0x04c11db7L, 0x09823b6eL, 0x0d4326d9L,
   0x130476dcL, 0x17c56b6bL, 0x1a864db2L, 0x1e475005L,
   0x2608edb8L, 0x22c9f00fL, 0x2f8ad6d6L, 0x2b4bcb61L,
   0x350c9b64L, 0x31cd86d3L, 0x3c8ea00aL, 0x384fbdbdL,
   0x4c11db70L, 0x48d0c6c7L, 0x4593e01eL, 0x4152fda9L,
   0x5f15adacL, 0x5bd4b01bL, 0x569796c2L, 0x52568b75L,
   0x6a1936c8L, 0x6ed82b7fL, 0x639b0da6L, 0x675a1011L,
   0x791d4014L, 0x7ddc5da3L, 0x709f7b7aL, 0x745e66cdL,
   0x9823b6e0L, 0x9ce2ab57L, 0x91a18d8eL, 0x95609039L,
   0x8b27c03cL, 0x8fe6dd8bL, 0x82a5fb52L, 0x8664e6e5L,
   0xbe2b5b58L, 0xbaea46efL, 0xb7a96036L, 0xb3687d81L,
   0xad2f2d84L, 0xa9ee3033L, 0xa4ad16eaL, 0xa06c0b5dL,
   0xd4326d90L, 0xd0f37027L, 0xddb056feL, 0xd9714b49L,
   0xc7361b4cL, 0xc3f706fbL, 0xceb42022L, 0xca753d95L,
   0xf23a8028L, 0xf6fb9d9fL, 0xfbb8bb46L, 0xff79a6f1L,
   0xe13ef6f4L, 0xe5ffeb43L, 0xe8bccd9aL, 0xec7dd02dL,
   0x34867077L, 0x30476dc0L, 0x3d044b19L, 0x39c556aeL,
   0x278206abL, 0x23431b1cL, 0x2e003dc5L, 0x2ac12072L,
   0x128e9dcfL, 0x164f8078L, 0x1b0ca6a1L, 0x1fcdbb16L,
   0x018aeb13L, 0x054bf6a4L, 0x0808d07dL, 0x0cc9cdcaL,
   0x7897ab07L, 0x7c56b6b0L, 0x71159069L, 0x75d48ddeL,
   0x6b93dddbL, 0x6f52c06cL, 0x6211e6b5L, 0x66d0fb02L,
   0x5e9f46bfL, 0x5a5e5b08L, 0x571d7dd1L, 0x53dc6066L,
   0x4d9b3063L, 0x495a2dd4L, 0x44190b0dL, 0x40d816baL,
   0xaca5c697L, 0xa864db20L, 0xa527fdf9L, 0xa1e6e04eL,
   0xbfa1b04bL, 0xbb60adfcL, 0xb6238b25L, 0xb2e29692L,
   0x8aad2b2fL, 0x8e6c3698L, 0x832f1041L, 0x87ee0df6L,
   0x99a95df3L, 0x9d684044L, 0x902b669dL, 0x94ea7b2aL,
   0xe0b41de7L, 0xe4750050L, 0xe9362689L, 0xedf73b3eL,
   0xf3b06b3bL, 0xf771768cL, 0xfa325055L, 0xfef34de2L,
   0xc6bcf05fL, 0xc27dede8L, 0xcf3ecb31L, 0xcbffd686L,
   0xd5b88683L, 0xd1799b34L, 0xdc3abdedL, 0xd8fba05aL,
   0x690ce0eeL, 0x6dcdfd59L, 0x608edb80L, 0x644fc637L,
   0x7a089632L, 0x7ec98b85L, 0x738aad5cL, 0x774bb0ebL,
   0x4f040d56L, 0x4bc510e1L, 0x46863638L, 0x42472b8fL,
   0x5c007b8aL, 0x58c1663dL, 0x558240e4L, 0x51435d53L,
   0x251d3b9eL, 0x21dc2629L, 0x2c9f00f0L, 0x285e1d47L,
   0x36194d42L, 0x32d850f5L, 0x3f9b762cL, 0x3b5a6b9bL,
   0x0315d626L, 0x07d4cb91L, 0x0a97ed48L, 0x0e56f0ffL,
   0x1011a0faL, 0x14d0bd4dL, 0x19939b94L, 0x1d528623L,
   0xf12f560eL, 0xf5ee4bb9L, 0xf8ad6d60L, 0xfc6c70d7L,
   0xe22b20d2L, 0xe6ea3d65L, 0xeba91bbcL, 0xef68060bL,
   0xd727bbb6L, 0xd3e6a601L, 0xdea580d8L, 0xda649d6fL,
   0xc423cd6aL, 0xc0e2d0ddL, 0xcda1f604L, 0xc960ebb3L,
   0xbd3e8d7eL, 0xb9ff90c9L, 0xb4bcb610L, 0xb07daba7L,
   0xae3afba2L, 0xaafbe615L, 0xa7b8c0ccL, 0xa379dd7bL,
   0x9b3660c6L, 0x9ff77d71L, 0x92b45ba8L, 0x9675461fL,
   0x8832161aL, 0x8cf30badL, 0x81b02d74L, 0x857130c3L,
   0x5d8a9099L, 0x594b8d2eL, 0x5408abf7L, 0x50c9b640L,
   0x4e8ee645L, 0x4a4ffbf2L, 0x470cdd2bL, 0x43cdc09cL,
   0x7b827d21L, 0x7f436096L, 0x7200464fL, 0x76c15bf8L,
   0x68860bfdL, 0x6c47164aL, 0x61043093L, 0x65c52d24L,
   0x119b4be9L, 0x155a565eL, 0x18197087L, 0x1cd86d30L,
   0x029f3d35L, 0x065e2082L, 0x0b1d065bL, 0x0fdc1becL,
   0x3793a651L, 0x3352bbe6L, 0x3e119d3fL, 0x3ad08088L,
   0x2497d08dL, 0x2056cd3aL, 0x2d15ebe3L, 0x29d4f654L,
   0xc5a92679L, 0xc1683bceL, 0xcc2b1d17L, 0xc8ea00a0L,
   0xd6ad50a5L, 0xd26c4d12L, 0xdf2f6bcbL, 0xdbee767cL,
   0xe3a1cbc1L, 0xe760d676L, 0xea23f0afL, 0xeee2ed18L,
   0xf0a5bd1dL, 0xf464a0aaL, 0xf9278673L, 0xfde69bc4L,
   0x89b8fd09L, 0x8d79e0beL, 0x803ac667L, 0x84fbdbd0L,
   0x9abc8bd5L, 0x9e7d9662L, 0x933eb0bbL, 0x97ffad0cL,
   0xafb010b1L, 0xab710d06L, 0xa6322bdfL, 0xa2f33668L,
   0xbcb4666dL, 0xb8757bdaL, 0xb5365d03L, 0xb1f740b4L
};
90
/* Fold one input byte into a running CRC, in place.
   crcVar: an lvalue holding the running CRC.  It is expected to be a
           32-bit unsigned value, since the table index is taken from
           crcVar >> 24.  It is evaluated more than once, so it must
           not have side effects.
   cha:    the byte to add; the whole argument is converted to UChar.
   Both arguments are parenthesised so that compound expressions such
   as "a + b" are converted as a unit (otherwise the cast would bind
   to the first operand only and the table index could exceed 255).
   The body is wrapped in do { } while (0) so the macro acts as a
   single statement, including in front of an "else". */
#define UPDATE_CRC(crcVar,cha)                      \
do {                                                \
   (crcVar) = ((crcVar) << 8) ^                     \
              crc32Table[((crcVar) >> 24) ^         \
                         ((UChar)(cha))];           \
} while (0)
97
98static UInt crcBytes ( UChar* bytes, UWord nBytes, UInt crcIn )
99{
100   UInt crc = crcIn;
101   while (nBytes >= 4) {
102      UPDATE_CRC(crc, bytes[0]);
103      UPDATE_CRC(crc, bytes[1]);
104      UPDATE_CRC(crc, bytes[2]);
105      UPDATE_CRC(crc, bytes[3]);
106      bytes += 4;
107      nBytes -= 4;
108   }
109   while (nBytes >= 1) {
110      UPDATE_CRC(crc, bytes[0]);
111      bytes += 1;
112      nBytes -= 1;
113   }
114   return crc;
115}
116
117static UInt crcFinalise ( UInt crc ) {
118   return ~crc;
119}
120
121////////
122
123static UInt theCRC = 0xFFFFFFFF;
124
125static HChar outBuf[1024];
126// take output that's in outBuf, length as specified, and
127// update the running crc.
128static void send ( int nbytes )
129{
130   assert( ((unsigned int)nbytes) < sizeof(outBuf)-1);
131   assert(outBuf[nbytes] == 0);
132   theCRC = crcBytes( (UChar*)&outBuf[0], nbytes, theCRC );
133   if (VERBOSE) printf("SEND %08x %s", theCRC, outBuf);
134}
135
136
137/////////////////////////////////////////////////////////////////
138// END crc32 stuff                                             //
139/////////////////////////////////////////////////////////////////
140
/* Operand values fed to the locked-instruction tests below, and
   NVALS, the number of entries in val[].  Two alternative lists are
   given; the "#if 0" selects the shortened one.  Each list has four
   groups of values whose upper bytes are 0x00, 0xFF, 0xFFFFFF and
   0xFFFFFFFFFFFFFF respectively.  The values end up in the text that
   is fed to the CRC, so editing either list changes the CRC that main
   compares against. */
#if 0

// full version
#define NVALS 76

static ULong val[NVALS]
    = { 0x00ULL, 0x01ULL, 0x02ULL, 0x03ULL,
        0x3FULL, 0x40ULL, 0x41ULL,
        0x7EULL, 0x7FULL, 0x80ULL, 0x81ULL, 0x82ULL,
        0xBFULL, 0xC0ULL, 0xC1ULL,
        0xFCULL, 0xFDULL, 0xFEULL, 0xFFULL,

        0xFF00ULL, 0xFF01ULL, 0xFF02ULL, 0xFF03ULL,
        0xFF3FULL, 0xFF40ULL, 0xFF41ULL,
        0xFF7EULL, 0xFF7FULL, 0xFF80ULL, 0xFF81ULL, 0xFF82ULL,
        0xFFBFULL, 0xFFC0ULL, 0xFFC1ULL,
        0xFFFCULL, 0xFFFDULL, 0xFFFEULL, 0xFFFFULL,

        0xFFFFFF00ULL, 0xFFFFFF01ULL, 0xFFFFFF02ULL, 0xFFFFFF03ULL,
        0xFFFFFF3FULL, 0xFFFFFF40ULL, 0xFFFFFF41ULL,
        0xFFFFFF7EULL, 0xFFFFFF7FULL, 0xFFFFFF80ULL, 0xFFFFFF81ULL, 0xFFFFFF82ULL,
        0xFFFFFFBFULL, 0xFFFFFFC0ULL, 0xFFFFFFC1ULL,
        0xFFFFFFFCULL, 0xFFFFFFFDULL, 0xFFFFFFFEULL, 0xFFFFFFFFULL,

        0xFFFFFFFFFFFFFF00ULL, 0xFFFFFFFFFFFFFF01ULL, 0xFFFFFFFFFFFFFF02ULL,
                               0xFFFFFFFFFFFFFF03ULL,
        0xFFFFFFFFFFFFFF3FULL, 0xFFFFFFFFFFFFFF40ULL, 0xFFFFFFFFFFFFFF41ULL,
        0xFFFFFFFFFFFFFF7EULL, 0xFFFFFFFFFFFFFF7FULL, 0xFFFFFFFFFFFFFF80ULL,
                               0xFFFFFFFFFFFFFF81ULL, 0xFFFFFFFFFFFFFF82ULL,
        0xFFFFFFFFFFFFFFBFULL, 0xFFFFFFFFFFFFFFC0ULL, 0xFFFFFFFFFFFFFFC1ULL,
        0xFFFFFFFFFFFFFFFCULL, 0xFFFFFFFFFFFFFFFDULL, 0xFFFFFFFFFFFFFFFEULL,
                               0xFFFFFFFFFFFFFFFFULL
      };

#else

// shortened version, for use as valgrind regtest
#define NVALS 36

static ULong val[NVALS]
    = { 0x00ULL, 0x01ULL,
        0x3FULL, 0x40ULL,
        0x7FULL, 0x80ULL,
        0xBFULL, 0xC0ULL,
        0xFFULL,

        0xFF00ULL, 0xFF01ULL,
        0xFF3FULL, 0xFF40ULL,
        0xFF7FULL, 0xFF80ULL,
        0xFFBFULL, 0xFFC0ULL,
        0xFFFFULL,

        /* NOTE(review): this group uses ..7E/..7F where the other
           three groups use ..7F/..80.  It is left untouched because
           the expected CRC in main was computed from these values. */
        0xFFFFFF00ULL, 0xFFFFFF01ULL,
        0xFFFFFF3FULL, 0xFFFFFF40ULL,
        0xFFFFFF7EULL, 0xFFFFFF7FULL,
        0xFFFFFFBFULL, 0xFFFFFFC0ULL,
        0xFFFFFFFFULL,

        0xFFFFFFFFFFFFFF00ULL, 0xFFFFFFFFFFFFFF01ULL,
        0xFFFFFFFFFFFFFF3FULL, 0xFFFFFFFFFFFFFF40ULL,
        0xFFFFFFFFFFFFFF7FULL, 0xFFFFFFFFFFFFFF80ULL,
        0xFFFFFFFFFFFFFFBFULL, 0xFFFFFFFFFFFFFFC0ULL,
        0xFFFFFFFFFFFFFFFFULL
      };

#endif
207
208/////////////////////////////////////
209
/* Bit masks for the six condition-code flags that the tests load
   before, and read back after, each instruction under test.  The
   positions are those of the corresponding bits in the x86 flags
   register. */
#define CC_C    (1 << 0)    /* carry            */
#define CC_P    (1 << 2)    /* parity           */
#define CC_A    (1 << 4)    /* auxiliary carry  */
#define CC_Z    (1 << 6)    /* zero             */
#define CC_S    (1 << 7)    /* sign             */
#define CC_O    (1 << 11)   /* overflow         */

/* All six flags together; used to filter the flags read back. */
#define CC_MASK (CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C)
218
/* GEN_do_locked_G_E(_name,_eax) defines a function named
   do_locked_G_E_<_name> that exercises the lock-prefixed
   register-to-memory form of instruction <_name>.

   _name: instruction mnemonic, including its size suffix (e.g. addb).
   _eax:  name of the register used as the source (G) operand; the
          instantiations below pass al, ax, eax or rax to match the
          size suffix.

   For every pair (val[v1], val[v2]) and each of the 64 combinations
   of the O/S/Z/A/C/P flags, the generated function:
     - puts the input flags, the G value and the address of the E
       value into block[0], block[1] and block[2];
     - in a single asm statement, loads the flags via pushq/popfq,
       loads %rax from block[1] and %rbx from block[2], executes
       "lock; <_name> %<_eax>,(%rbx)", then captures the resulting
       flags via pushfq/popq into block[3];
     - formats the inputs, the updated E value and the output flags
       (masked with CC_MASK) into outBuf and passes the line to send().

   %rax and %rbx are named explicitly in the asm text, which is why
   they are listed as clobbers together with "cc" and "memory". */
#define GEN_do_locked_G_E(_name,_eax)   \
  \
  __attribute__((noinline)) void do_locked_G_E_##_name ( void )  \
  {   \
    volatile Long e_val, g_val, e_val_before;   \
    Long o, s, z, a, c, p, v1, v2, flags_in;       \
    Long block[4];   \
    \
    for (v1 = 0; v1 < NVALS; v1++) {   \
    for (v2 = 0; v2 < NVALS; v2++) {   \
    \
    for (o = 0; o < 2; o++) {   \
    for (s = 0; s < 2; s++) {   \
    for (z = 0; z < 2; z++) {   \
    for (a = 0; a < 2; a++) {   \
    for (c = 0; c < 2; c++) {   \
    for (p = 0; p < 2; p++) {   \
      \
      flags_in = (o ? CC_O : 0)   \
               | (s ? CC_S : 0)   \
               | (z ? CC_Z : 0)   \
               | (a ? CC_A : 0)   \
               | (c ? CC_C : 0)   \
               | (p ? CC_P : 0);   \
      \
      g_val = val[v1];   \
      e_val = val[v2];   \
      e_val_before = e_val;   \
      \
      block[0] = flags_in;   \
      block[1] = g_val;   \
      block[2] = (long)&e_val;   \
      block[3] = 0;   \
      __asm__ __volatile__(   \
          "movq 0(%0), %%rax\n\t"   \
          "pushq %%rax\n\t"   \
          "popfq\n\t"   \
          "movq 8(%0), %%rax\n\t"   \
          "movq 16(%0), %%rbx\n\t"   \
          "lock; " #_name " %%" #_eax ",(%%rbx)\n\t"   \
          "pushfq\n\t"   \
          "popq %%rax\n\t"   \
          "movq %%rax, 24(%0)\n\t"   \
          : : "r"(&block[0]) : "rax","rbx","cc","memory"   \
      );   \
      \
      send( \
      sprintf(outBuf, \
             "%s G=%016llx E=%016llx CCIN=%08llx -> E=%016llx CCOUT=%08llx\n",       \
             #_name, g_val, e_val_before, flags_in,   \
              e_val, block[3] & CC_MASK));            \
      \
    }}}}}}   \
    \
    }}   \
  }

/* One generated function per instruction and operand width. */
GEN_do_locked_G_E(addb,al)
GEN_do_locked_G_E(addw,ax)
GEN_do_locked_G_E(addl,eax)
GEN_do_locked_G_E(addq,rax)

GEN_do_locked_G_E(orb, al)
GEN_do_locked_G_E(orw, ax)
GEN_do_locked_G_E(orl, eax)
GEN_do_locked_G_E(orq, rax)

GEN_do_locked_G_E(adcb,al)
GEN_do_locked_G_E(adcw,ax)
GEN_do_locked_G_E(adcl,eax)
GEN_do_locked_G_E(adcq,rax)

GEN_do_locked_G_E(sbbb,al)
GEN_do_locked_G_E(sbbw,ax)
GEN_do_locked_G_E(sbbl,eax)
GEN_do_locked_G_E(sbbq,rax)

GEN_do_locked_G_E(andb,al)
GEN_do_locked_G_E(andw,ax)
GEN_do_locked_G_E(andl,eax)
GEN_do_locked_G_E(andq,rax)

GEN_do_locked_G_E(subb,al)
GEN_do_locked_G_E(subw,ax)
GEN_do_locked_G_E(subl,eax)
GEN_do_locked_G_E(subq,rax)

GEN_do_locked_G_E(xorb,al)
GEN_do_locked_G_E(xorw,ax)
GEN_do_locked_G_E(xorl,eax)
GEN_do_locked_G_E(xorq,rax)
310
311
312
313
/* GEN_do_locked_imm_E(_name,_eax,_imm) defines a function named
   do_locked_imm_E_<_name>_<_imm> that exercises the lock-prefixed
   immediate-to-memory form of instruction <_name>.

   _name: instruction mnemonic, including its size suffix (e.g. addb).
   _eax:  not referenced anywhere in the expansion; the instantiations
          pass a register name of matching width all the same.
   _imm:  immediate operand.  It is pasted into the function name,
          emitted as "$<_imm>" in the instruction, and printed as a
          string in the output line.

   For every val[v2] and each of the 64 combinations of the
   O/S/Z/A/C/P flags, the generated function:
     - puts the input flags and the address of the E value into
       block[0] and block[1];
     - in a single asm statement, loads the flags via pushq/popfq,
       loads %rbx from block[1], executes
       "lock; <_name> $<_imm>,(%rbx)", then captures the resulting
       flags via pushfq/popq into block[2];
     - formats the inputs, the updated E value and the output flags
       (masked with CC_MASK) into outBuf and passes the line to send(). */
#define GEN_do_locked_imm_E(_name,_eax,_imm)        \
  \
  __attribute__((noinline)) void do_locked_imm_E_##_name##_##_imm ( void )  \
  {   \
    volatile Long e_val, e_val_before;   \
    Long o, s, z, a, c, p, v2, flags_in;   \
    Long block[3];   \
    \
    for (v2 = 0; v2 < NVALS; v2++) {   \
    \
    for (o = 0; o < 2; o++) {   \
    for (s = 0; s < 2; s++) {   \
    for (z = 0; z < 2; z++) {   \
    for (a = 0; a < 2; a++) {   \
    for (c = 0; c < 2; c++) {   \
    for (p = 0; p < 2; p++) {   \
      \
      flags_in = (o ? CC_O : 0)   \
               | (s ? CC_S : 0)   \
               | (z ? CC_Z : 0)   \
               | (a ? CC_A : 0)   \
               | (c ? CC_C : 0)   \
               | (p ? CC_P : 0);   \
      \
      e_val = val[v2];   \
      e_val_before = e_val;   \
      \
      block[0] = flags_in;   \
      block[1] = (long)&e_val;   \
      block[2] = 0;   \
      __asm__ __volatile__(   \
          "movq 0(%0), %%rax\n\t"   \
          "pushq %%rax\n\t"   \
          "popfq\n\t"   \
          "movq 8(%0), %%rbx\n\t"   \
          "lock; " #_name " $" #_imm ",(%%rbx)\n\t"   \
          "pushfq\n\t"   \
          "popq %%rax\n\t"   \
          "movq %%rax, 16(%0)\n\t"   \
          : : "r"(&block[0]) : "rax","rbx","cc","memory"   \
      );   \
      \
      send( \
           sprintf(outBuf, \
           "%s I=%s E=%016llx CCIN=%08llx -> E=%016llx CCOUT=%08llx\n",    \
             #_name, #_imm, e_val_before, flags_in,         \
                   e_val, block[2] & CC_MASK));             \
      \
    }}}}}}   \
    \
    }   \
  }

/* Two immediates per instruction and operand width. */
GEN_do_locked_imm_E(addb,al,0x7F)
GEN_do_locked_imm_E(addb,al,0xF1)
GEN_do_locked_imm_E(addw,ax,0x7E)
GEN_do_locked_imm_E(addw,ax,0x9325)
GEN_do_locked_imm_E(addl,eax,0x7D)
GEN_do_locked_imm_E(addl,eax,0x31415927)
GEN_do_locked_imm_E(addq,rax,0x7D)
GEN_do_locked_imm_E(addq,rax,0x31415927)

GEN_do_locked_imm_E(orb,al,0x7F)
GEN_do_locked_imm_E(orb,al,0xF1)
GEN_do_locked_imm_E(orw,ax,0x7E)
GEN_do_locked_imm_E(orw,ax,0x9325)
GEN_do_locked_imm_E(orl,eax,0x7D)
GEN_do_locked_imm_E(orl,eax,0x31415927)
GEN_do_locked_imm_E(orq,rax,0x7D)
GEN_do_locked_imm_E(orq,rax,0x31415927)

GEN_do_locked_imm_E(adcb,al,0x7F)
GEN_do_locked_imm_E(adcb,al,0xF1)
GEN_do_locked_imm_E(adcw,ax,0x7E)
GEN_do_locked_imm_E(adcw,ax,0x9325)
GEN_do_locked_imm_E(adcl,eax,0x7D)
GEN_do_locked_imm_E(adcl,eax,0x31415927)
GEN_do_locked_imm_E(adcq,rax,0x7D)
GEN_do_locked_imm_E(adcq,rax,0x31415927)

GEN_do_locked_imm_E(sbbb,al,0x7F)
GEN_do_locked_imm_E(sbbb,al,0xF1)
GEN_do_locked_imm_E(sbbw,ax,0x7E)
GEN_do_locked_imm_E(sbbw,ax,0x9325)
GEN_do_locked_imm_E(sbbl,eax,0x7D)
GEN_do_locked_imm_E(sbbl,eax,0x31415927)
GEN_do_locked_imm_E(sbbq,rax,0x7D)
GEN_do_locked_imm_E(sbbq,rax,0x31415927)

GEN_do_locked_imm_E(andb,al,0x7F)
GEN_do_locked_imm_E(andb,al,0xF1)
GEN_do_locked_imm_E(andw,ax,0x7E)
GEN_do_locked_imm_E(andw,ax,0x9325)
GEN_do_locked_imm_E(andl,eax,0x7D)
GEN_do_locked_imm_E(andl,eax,0x31415927)
GEN_do_locked_imm_E(andq,rax,0x7D)
GEN_do_locked_imm_E(andq,rax,0x31415927)

GEN_do_locked_imm_E(subb,al,0x7F)
GEN_do_locked_imm_E(subb,al,0xF1)
GEN_do_locked_imm_E(subw,ax,0x7E)
GEN_do_locked_imm_E(subw,ax,0x9325)
GEN_do_locked_imm_E(subl,eax,0x7D)
GEN_do_locked_imm_E(subl,eax,0x31415927)
GEN_do_locked_imm_E(subq,rax,0x7D)
GEN_do_locked_imm_E(subq,rax,0x31415927)

GEN_do_locked_imm_E(xorb,al,0x7F)
GEN_do_locked_imm_E(xorb,al,0xF1)
GEN_do_locked_imm_E(xorw,ax,0x7E)
GEN_do_locked_imm_E(xorw,ax,0x9325)
GEN_do_locked_imm_E(xorl,eax,0x7D)
GEN_do_locked_imm_E(xorl,eax,0x31415927)
GEN_do_locked_imm_E(xorq,rax,0x7D)
GEN_do_locked_imm_E(xorq,rax,0x31415927)
429
/* GEN_do_locked_unary_E(_name,_eax) defines a function named
   do_locked_unary_E_<_name> that exercises the lock-prefixed
   single-operand memory form of instruction <_name>.

   _name: instruction mnemonic, including its size suffix (e.g. decb).
   _eax:  not referenced anywhere in the expansion; the instantiations
          pass a register name of matching width all the same.

   For every val[v2] and each of the 64 combinations of the
   O/S/Z/A/C/P flags, the generated function:
     - puts the input flags and the address of the E value into
       block[0] and block[1];
     - in a single asm statement, loads the flags via pushq/popfq,
       loads %rbx from block[1], executes "lock; <_name> (%rbx)",
       then captures the resulting flags via pushfq/popq into
       block[2];
     - formats the inputs, the updated E value and the output flags
       (masked with CC_MASK) into outBuf and passes the line to send(). */
#define GEN_do_locked_unary_E(_name,_eax)        \
  \
  __attribute__((noinline)) void do_locked_unary_E_##_name ( void )  \
  {   \
    volatile Long e_val, e_val_before;   \
    Long o, s, z, a, c, p, v2, flags_in;     \
    Long block[3];   \
    \
    for (v2 = 0; v2 < NVALS; v2++) {   \
    \
    for (o = 0; o < 2; o++) {   \
    for (s = 0; s < 2; s++) {   \
    for (z = 0; z < 2; z++) {   \
    for (a = 0; a < 2; a++) {   \
    for (c = 0; c < 2; c++) {   \
    for (p = 0; p < 2; p++) {   \
      \
      flags_in = (o ? CC_O : 0)   \
               | (s ? CC_S : 0)   \
               | (z ? CC_Z : 0)   \
               | (a ? CC_A : 0)   \
               | (c ? CC_C : 0)   \
               | (p ? CC_P : 0);   \
      \
      e_val = val[v2];   \
      e_val_before = e_val;   \
      \
      block[0] = flags_in;   \
      block[1] = (long)&e_val;   \
      block[2] = 0;   \
      __asm__ __volatile__(   \
          "movq 0(%0), %%rax\n\t"   \
          "pushq %%rax\n\t"   \
          "popfq\n\t"   \
          "movq 8(%0), %%rbx\n\t"   \
          "lock; " #_name " (%%rbx)\n\t"   \
          "pushfq\n\t"   \
          "popq %%rax\n\t"   \
          "movq %%rax, 16(%0)\n\t"   \
          : : "r"(&block[0]) : "rax","rbx","cc","memory"   \
      );   \
      \
      send( \
           sprintf(outBuf, \
            "%s E=%016llx CCIN=%08llx -> E=%016llx CCOUT=%08llx\n", \
             #_name, e_val_before, flags_in,         \
            e_val, block[2] & CC_MASK));                       \
      \
    }}}}}}   \
    \
    }   \
  }

/* One generated function per instruction and operand width. */
GEN_do_locked_unary_E(decb,al)
GEN_do_locked_unary_E(decw,ax)
GEN_do_locked_unary_E(decl,eax)
GEN_do_locked_unary_E(decq,rax)

GEN_do_locked_unary_E(incb,al)
GEN_do_locked_unary_E(incw,ax)
GEN_do_locked_unary_E(incl,eax)
GEN_do_locked_unary_E(incq,rax)

GEN_do_locked_unary_E(negb,al)
GEN_do_locked_unary_E(negw,ax)
GEN_do_locked_unary_E(negl,eax)
GEN_do_locked_unary_E(negq,rax)

GEN_do_locked_unary_E(notb,al)
GEN_do_locked_unary_E(notw,ax)
GEN_do_locked_unary_E(notl,eax)
GEN_do_locked_unary_E(notq,rax)
502
503
504/////////////////////////////////////////////////////////////////
505
506ULong btsq_mem ( UChar* base, int bitno )
507{
508   ULong res;
509   __asm__
510   __volatile__("lock; btsq\t%2, %0\n\t"
511                "setc   %%dl\n\t"
512                "movzbq %%dl,%1\n"
513                : "=m" (*base), "=r" (res)
514                : "r" ((ULong)bitno) : "rdx","cc","memory" );
515   /* Pretty meaningless to dereference base here, but that's what you
516      have to do to get a btsl insn which refers to memory starting at
517      base. */
518   return res;
519}
520ULong btsl_mem ( UChar* base, int bitno )
521{
522   ULong res;
523   __asm__
524   __volatile__("lock; btsl\t%2, %0\n\t"
525                "setc   %%dl\n\t"
526                "movzbq %%dl,%1\n"
527                : "=m" (*base), "=r" (res)
528                : "r" ((UInt)bitno));
529   return res;
530}
531ULong btsw_mem ( UChar* base, int bitno )
532{
533   ULong res;
534   __asm__
535   __volatile__("lock; btsw\t%w2, %0\n\t"
536                "setc   %%dl\n\t"
537                "movzbq %%dl,%1\n"
538                : "=m" (*base), "=r" (res)
539                : "r" ((ULong)bitno));
540   return res;
541}
542
543ULong btrq_mem ( UChar* base, int bitno )
544{
545   ULong res;
546   __asm__
547   __volatile__("lock; btrq\t%2, %0\n\t"
548                "setc   %%dl\n\t"
549                "movzbq %%dl,%1\n"
550                : "=m" (*base), "=r" (res)
551                : "r" ((ULong)bitno));
552   return res;
553}
554ULong btrl_mem ( UChar* base, int bitno )
555{
556   ULong res;
557   __asm__
558   __volatile__("lock; btrl\t%2, %0\n\t"
559                "setc   %%dl\n\t"
560                "movzbq %%dl,%1\n"
561                : "=m" (*base), "=r" (res)
562                : "r" ((UInt)bitno));
563   return res;
564}
565ULong btrw_mem ( UChar* base, int bitno )
566{
567   ULong res;
568   __asm__
569   __volatile__("lock; btrw\t%w2, %0\n\t"
570                "setc   %%dl\n\t"
571                "movzbq %%dl,%1\n"
572                : "=m" (*base), "=r" (res)
573                : "r" ((ULong)bitno));
574   return res;
575}
576
577ULong btcq_mem ( UChar* base, int bitno )
578{
579   ULong res;
580   __asm__
581   __volatile__("lock; btcq\t%2, %0\n\t"
582                "setc   %%dl\n\t"
583                "movzbq %%dl,%1\n"
584                : "=m" (*base), "=r" (res)
585                : "r" ((ULong)bitno));
586   return res;
587}
588ULong btcl_mem ( UChar* base, int bitno )
589{
590   ULong res;
591   __asm__
592   __volatile__("lock; btcl\t%2, %0\n\t"
593                "setc   %%dl\n\t"
594                "movzbq %%dl,%1\n"
595                : "=m" (*base), "=r" (res)
596                : "r" ((UInt)bitno));
597   return res;
598}
599ULong btcw_mem ( UChar* base, int bitno )
600{
601   ULong res;
602   __asm__
603   __volatile__("lock; btcw\t%w2, %0\n\t"
604                "setc   %%dl\n\t"
605                "movzbq %%dl,%1\n"
606                : "=m" (*base), "=r" (res)
607                : "r" ((ULong)bitno));
608   return res;
609}
610
611ULong btq_mem ( UChar* base, int bitno )
612{
613   ULong res;
614   __asm__
615   __volatile__("btq\t%2, %0\n\t"
616                "setc   %%dl\n\t"
617                "movzbq %%dl,%1\n"
618                : "=m" (*base), "=r" (res)
619                : "r" ((ULong)bitno)
620                : "cc", "memory");
621   return res;
622}
623ULong btl_mem ( UChar* base, int bitno )
624{
625   ULong res;
626   __asm__
627   __volatile__("btl\t%2, %0\n\t"
628                "setc   %%dl\n\t"
629                "movzbq %%dl,%1\n"
630                : "=m" (*base), "=r" (res)
631                : "r" ((UInt)bitno)
632                : "cc", "memory");
633   return res;
634}
635ULong btw_mem ( UChar* base, int bitno )
636{
637   ULong res;
638   __asm__
639   __volatile__("btw\t%w2, %0\n\t"
640                "setc   %%dl\n\t"
641                "movzbq %%dl,%1\n"
642                : "=m" (*base), "=r" (res)
643                : "r" ((ULong)bitno));
644   return res;
645}
646
647ULong rol1 ( ULong x )
648{
649  return (x << 1) | (x >> 63);
650}
651
652void do_bt_G_E_tests ( void )
653{
654   ULong  n, bitoff, op;
655   ULong  c;
656   UChar* block;
657   ULong  carrydep, res;;
658
659   /*------------------------ MEM-Q -----------------------*/
660
661   carrydep = 0;
662   block = calloc(200,1);
663   block += 100;
664   /* Valid bit offsets are -800 .. 799 inclusive. */
665
666   for (n = 0; n < 10000; n++) {
667      bitoff = (random() % 1600) - 800;
668      op = random() % 4;
669      c = 2;
670      switch (op) {
671         case 0: c = btsq_mem(block, bitoff); break;
672         case 1: c = btrq_mem(block, bitoff); break;
673         case 2: c = btcq_mem(block, bitoff); break;
674         case 3: c = btq_mem(block, bitoff); break;
675      }
676      c &= 255;
677      assert(c == 0 || c == 1);
678      carrydep = c ? (rol1(carrydep) ^ (Long)bitoff) : carrydep;
679   }
680
681   /* Compute final result */
682   block -= 100;
683   res = 0;
684   for (n = 0; n < 200; n++) {
685      UChar ch = block[n];
686      /* printf("%d ", (int)block[n]); */
687      res = rol1(res) ^ (ULong)ch;
688   }
689
690   send( sprintf(outBuf,
691                 "bt{s,r,c}q: final res 0x%llx, carrydep 0x%llx\n",
692                 res, carrydep));
693   free(block);
694
695   /*------------------------ MEM-L -----------------------*/
696
697   carrydep = 0;
698   block = calloc(200,1);
699   block += 100;
700   /* Valid bit offsets are -800 .. 799 inclusive. */
701
702   for (n = 0; n < 10000; n++) {
703      bitoff = (random() % 1600) - 800;
704      op = random() % 4;
705      c = 2;
706      switch (op) {
707         case 0: c = btsl_mem(block, bitoff); break;
708         case 1: c = btrl_mem(block, bitoff); break;
709         case 2: c = btcl_mem(block, bitoff); break;
710         case 3: c = btl_mem(block, bitoff); break;
711      }
712      c &= 255;
713      assert(c == 0 || c == 1);
714      carrydep = c ? (rol1(carrydep) ^ (Long)(Int)bitoff) : carrydep;
715   }
716
717   /* Compute final result */
718   block -= 100;
719   res = 0;
720   for (n = 0; n < 200; n++) {
721      UChar ch = block[n];
722      /* printf("%d ", (int)block[n]); */
723      res = rol1(res) ^ (ULong)ch;
724   }
725
726   send( sprintf(outBuf,
727                 "bt{s,r,c}l: final res 0x%llx, carrydep 0x%llx\n",
728                 res, carrydep));
729   free(block);
730
731   /*------------------------ MEM-W -----------------------*/
732
733   carrydep = 0;
734   block = calloc(200,1);
735   block += 100;
736   /* Valid bit offsets are -800 .. 799 inclusive. */
737
738   for (n = 0; n < 10000; n++) {
739      bitoff = (random() % 1600) - 800;
740      op = random() % 4;
741      c = 2;
742      switch (op) {
743         case 0: c = btsw_mem(block, bitoff); break;
744         case 1: c = btrw_mem(block, bitoff); break;
745         case 2: c = btcw_mem(block, bitoff); break;
746         case 3: c = btw_mem(block, bitoff); break;
747      }
748      c &= 255;
749      assert(c == 0 || c == 1);
750      carrydep = c ? (rol1(carrydep) ^ (Long)(Int)bitoff) : carrydep;
751   }
752
753   /* Compute final result */
754   block -= 100;
755   res = 0;
756   for (n = 0; n < 200; n++) {
757      UChar ch = block[n];
758      /* printf("%d ", (int)block[n]); */
759      res = rol1(res) ^ (ULong)ch;
760   }
761
762   send(sprintf(outBuf,
763                "bt{s,r,c}w: final res 0x%llx, carrydep 0x%llx\n",
764                res, carrydep));
765   free(block);
766}
767
768
769/////////////////////////////////////////////////////////////////
770
771/* Given a word, do bt/bts/btr/btc on bits 0, 1, 2 and 3 of it, and
772   also reconstruct the original bits 0, 1, 2, 3 by looking at the
773   carry flag.  Returned result has mashed bits 0-3 at the bottom and
774   the reconstructed original bits 0-3 as 4-7. */
775
776ULong mash_mem_Q ( ULong* origp )
777{
778  ULong reconstructed, mashed;
779  __asm__ __volatile__ (
780     "movq %2, %%rdx\n\t"
781     ""
782     "movq $0, %%rax\n\t"
783     "\n\t"
784     "btq  $0, (%%rdx)\n\t"
785     "setb %%cl\n\t"
786     "movzbq %%cl, %%rcx\n\t"
787     "orq %%rcx, %%rax\n\t"
788     "\n\t"
789     "lock; btsq $1, (%%rdx)\n\t"
790     "setb %%cl\n\t"
791     "movzbq %%cl, %%rcx\n\t"
792     "shlq $1, %%rcx\n\t"
793     "orq %%rcx, %%rax\n\t"
794     "\n\t"
795     "lock; btrq $2, (%%rdx)\n\t"
796     "setb %%cl\n\t"
797     "movzbq %%cl, %%rcx\n\t"
798     "shlq $2, %%rcx\n\t"
799     "orq %%rcx, %%rax\n\t"
800     "\n\t"
801     "lock; btcq $3, (%%rdx)\n\t"
802     "setb %%cl\n\t"
803     "movzbq %%cl, %%rcx\n\t"
804     "shlq $3, %%rcx\n\t"
805     "orq %%rcx, %%rax\n\t"
806     "\n\t"
807     "movq %%rax, %0\n\t"
808     "movq (%%rdx), %1"
809     : "=r" (reconstructed), "=r" (mashed)
810     : "r" (origp)
811     : "rax", "rcx", "rdx", "cc");
812  return (mashed & 0xF) | ((reconstructed & 0xF) << 4);
813}
814
815ULong mash_mem_L ( UInt* origp )
816{
817  ULong reconstructed; UInt mashed;
818  __asm__ __volatile__ (
819     "movq %2, %%rdx\n\t"
820     ""
821     "movq $0, %%rax\n\t"
822     "\n\t"
823     "btl  $0, (%%rdx)\n\t"
824     "setb %%cl\n\t"
825     "movzbq %%cl, %%rcx\n\t"
826     "orq %%rcx, %%rax\n\t"
827     "\n\t"
828     "lock; btsl $1, (%%rdx)\n\t"
829     "setb %%cl\n\t"
830     "movzbq %%cl, %%rcx\n\t"
831     "shlq $1, %%rcx\n\t"
832     "orq %%rcx, %%rax\n\t"
833     "\n\t"
834     "lock; btrl $2, (%%rdx)\n\t"
835     "setb %%cl\n\t"
836     "movzbq %%cl, %%rcx\n\t"
837     "shlq $2, %%rcx\n\t"
838     "orq %%rcx, %%rax\n\t"
839     "\n\t"
840     "lock; btcl $3, (%%rdx)\n\t"
841     "setb %%cl\n\t"
842     "movzbq %%cl, %%rcx\n\t"
843     "shlq $3, %%rcx\n\t"
844     "orq %%rcx, %%rax\n\t"
845     "\n\t"
846     "movq %%rax, %0\n\t"
847     "movl (%%rdx), %1"
848     : "=r" (reconstructed), "=r" (mashed)
849     : "r" (origp)
850     : "rax", "rcx", "rdx", "cc");
851  return (mashed & 0xF) | ((reconstructed & 0xF) << 4);
852}
853
854ULong mash_mem_W ( UShort* origp )
855{
856  ULong reconstructed, mashed;
857  __asm__ __volatile__ (
858     "movq %2, %%rdx\n\t"
859     ""
860     "movq $0, %%rax\n\t"
861     "\n\t"
862     "btw  $0, (%%rdx)\n\t"
863     "setb %%cl\n\t"
864     "movzbq %%cl, %%rcx\n\t"
865     "orq %%rcx, %%rax\n\t"
866     "\n\t"
867     "lock; btsw $1, (%%rdx)\n\t"
868     "setb %%cl\n\t"
869     "movzbq %%cl, %%rcx\n\t"
870     "shlq $1, %%rcx\n\t"
871     "orq %%rcx, %%rax\n\t"
872     "\n\t"
873     "lock; btrw $2, (%%rdx)\n\t"
874     "setb %%cl\n\t"
875     "movzbq %%cl, %%rcx\n\t"
876     "shlq $2, %%rcx\n\t"
877     "orq %%rcx, %%rax\n\t"
878     "\n\t"
879     "lock; btcw $3, (%%rdx)\n\t"
880     "setb %%cl\n\t"
881     "movzbq %%cl, %%rcx\n\t"
882     "shlq $3, %%rcx\n\t"
883     "orq %%rcx, %%rax\n\t"
884     "\n\t"
885     "movq %%rax, %0\n\t"
886     "movzwq (%%rdx), %1"
887     : "=r" (reconstructed), "=r" (mashed)
888     : "r" (origp)
889     : "rax", "rcx", "rdx", "cc");
890  return (mashed & 0xF) | ((reconstructed & 0xF) << 4);
891}
892
893
894void do_bt_imm_E_tests( void )
895{
896  ULong i;
897  ULong*  iiq = malloc(sizeof(ULong));
898  UInt*   iil = malloc(sizeof(UInt));
899  UShort* iiw = malloc(sizeof(UShort));
900  for (i = 0; i < 0x10; i++) {
901    *iiq = i;
902    *iil = i;
903    *iiw = i;
904    send(sprintf(outBuf,"0x%llx -> 0x%02llx 0x%02llx 0x%02llx\n", i,
905                 mash_mem_Q(iiq), mash_mem_L(iil), mash_mem_W(iiw)));
906  }
907  free(iiq);
908  free(iil);
909  free(iiw);
910}
911
912
913/////////////////////////////////////////////////////////////////
914
915int main ( void )
916{
917  do_locked_G_E_addb();
918  do_locked_G_E_addw();
919  do_locked_G_E_addl();
920  do_locked_G_E_addq();
921
922  do_locked_G_E_orb();
923  do_locked_G_E_orw();
924  do_locked_G_E_orl();
925  do_locked_G_E_orq();
926
927  do_locked_G_E_adcb();
928  do_locked_G_E_adcw();
929  do_locked_G_E_adcl();
930  do_locked_G_E_adcq();
931
932  do_locked_G_E_sbbb();
933  do_locked_G_E_sbbw();
934  do_locked_G_E_sbbl();
935  do_locked_G_E_sbbq();
936
937  do_locked_G_E_andb();
938  do_locked_G_E_andw();
939  do_locked_G_E_andl();
940  do_locked_G_E_andq();
941
942  do_locked_G_E_subb();
943  do_locked_G_E_subw();
944  do_locked_G_E_subl();
945  do_locked_G_E_subq();
946
947  do_locked_G_E_xorb();
948  do_locked_G_E_xorw();
949  do_locked_G_E_xorl();
950  do_locked_G_E_xorq();
951  // 4 * 7
952
953  do_locked_imm_E_addb_0x7F();
954  do_locked_imm_E_addb_0xF1();
955  do_locked_imm_E_addw_0x7E();
956  do_locked_imm_E_addw_0x9325();
957  do_locked_imm_E_addl_0x7D();
958  do_locked_imm_E_addl_0x31415927();
959  do_locked_imm_E_addq_0x7D();
960  do_locked_imm_E_addq_0x31415927();
961
962  do_locked_imm_E_orb_0x7F();
963  do_locked_imm_E_orb_0xF1();
964  do_locked_imm_E_orw_0x7E();
965  do_locked_imm_E_orw_0x9325();
966  do_locked_imm_E_orl_0x7D();
967  do_locked_imm_E_orl_0x31415927();
968  do_locked_imm_E_orq_0x7D();
969  do_locked_imm_E_orq_0x31415927();
970
971  do_locked_imm_E_adcb_0x7F();
972  do_locked_imm_E_adcb_0xF1();
973  do_locked_imm_E_adcw_0x7E();
974  do_locked_imm_E_adcw_0x9325();
975  do_locked_imm_E_adcl_0x7D();
976  do_locked_imm_E_adcl_0x31415927();
977  do_locked_imm_E_adcq_0x7D();
978  do_locked_imm_E_adcq_0x31415927();
979
980  do_locked_imm_E_sbbb_0x7F();
981  do_locked_imm_E_sbbb_0xF1();
982  do_locked_imm_E_sbbw_0x7E();
983  do_locked_imm_E_sbbw_0x9325();
984  do_locked_imm_E_sbbl_0x7D();
985  do_locked_imm_E_sbbl_0x31415927();
986  do_locked_imm_E_sbbq_0x7D();
987  do_locked_imm_E_sbbq_0x31415927();
988
989  do_locked_imm_E_andb_0x7F();
990  do_locked_imm_E_andb_0xF1();
991  do_locked_imm_E_andw_0x7E();
992  do_locked_imm_E_andw_0x9325();
993  do_locked_imm_E_andl_0x7D();
994  do_locked_imm_E_andl_0x31415927();
995  do_locked_imm_E_andq_0x7D();
996  do_locked_imm_E_andq_0x31415927();
997
998  do_locked_imm_E_subb_0x7F();
999  do_locked_imm_E_subb_0xF1();
1000  do_locked_imm_E_subw_0x7E();
1001  do_locked_imm_E_subw_0x9325();
1002  do_locked_imm_E_subl_0x7D();
1003  do_locked_imm_E_subl_0x31415927();
1004  do_locked_imm_E_subq_0x7D();
1005  do_locked_imm_E_subq_0x31415927();
1006
1007  do_locked_imm_E_xorb_0x7F();
1008  do_locked_imm_E_xorb_0xF1();
1009  do_locked_imm_E_xorw_0x7E();
1010  do_locked_imm_E_xorw_0x9325();
1011  do_locked_imm_E_xorl_0x7D();
1012  do_locked_imm_E_xorl_0x31415927();
1013  do_locked_imm_E_xorq_0x7D();
1014  do_locked_imm_E_xorq_0x31415927();
1015  // 4 * 7 + 8 * 7 == 84
1016
1017  do_locked_unary_E_decb();
1018  do_locked_unary_E_decw();
1019  do_locked_unary_E_decl();
1020  do_locked_unary_E_decq();
1021
1022  do_locked_unary_E_incb();
1023  do_locked_unary_E_incw();
1024  do_locked_unary_E_incl();
1025  do_locked_unary_E_incq();
1026
1027  do_locked_unary_E_negb();
1028  do_locked_unary_E_negw();
1029  do_locked_unary_E_negl();
1030  do_locked_unary_E_negq();
1031
1032  do_locked_unary_E_notb();
1033  do_locked_unary_E_notw();
1034  do_locked_unary_E_notl();
1035  do_locked_unary_E_notq();
1036  // 100
1037
1038  do_bt_G_E_tests();
1039  // 109
1040  do_bt_imm_E_tests();
1041  // 118
1042
1043  // So there should be 118 lock-prefixed instructions in the
1044  // disassembly of this compilation unit.
1045  // confirm with
1046  // objdump -d ./amd64locked | grep lock | grep -v do_lock | grep -v elf64 | wc
1047
1048
1049  { UInt crcExpd = 0x1F677629;
1050    theCRC = crcFinalise( theCRC );
1051    if (theCRC == crcExpd) {
1052       printf("amd64locked: PASS: CRCs actual 0x%08X expected 0x%08X\n",
1053              theCRC, crcExpd);
1054    } else {
1055       printf("amd64locked: FAIL: CRCs actual 0x%08X expected 0x%08X\n",
1056              theCRC, crcExpd);
1057       printf("amd64locked: set #define VERBOSE 1 to diagnose\n");
1058    }
1059  }
1060
1061  return 0;
1062}
1063