1
2/* This is an example of a program which does atomic memory operations
3   between two processes which share a page.  Valgrind 3.4.1 and
4   earlier produce incorrect answers because it does not preserve
5   atomicity of the relevant instructions in the generated code; but
6   the post-DCAS-merge versions of Valgrind do behave correctly. */
7
8/* On ARM, this can be compiled into either ARM or Thumb code, so as
9   to test both A and T encodings of LDREX/STREX et al.  Also on ARM,
10   it tests doubleword atomics (LDREXD, STREXD) which I don't think it
11   does on any other platform. */
12
13#include <stdlib.h>
14#include <stdio.h>
15#include <string.h>
16#include <assert.h>
17#include <unistd.h>
18#include <sys/wait.h>
19#include "tests/sys_mman.h"
20
21#define NNN 3456987
22
23#define IS_8_ALIGNED(_ptr)   (0 == (((unsigned long)(_ptr)) & 7))
24
25
26__attribute__((noinline)) void atomic_add_8bit ( char* p, int n )
27{
28#if defined(VGA_x86)
29   unsigned long block[2];
30   block[0] = (unsigned long)p;
31   block[1] = n;
32   __asm__ __volatile__(
33      "movl 0(%%esi),%%eax"      "\n\t"
34      "movl 4(%%esi),%%ebx"      "\n\t"
35      "lock; addb %%bl,(%%eax)"  "\n"
36      : : "S"(&block[0])/* S means "esi only" */ : "memory","cc","eax","ebx"
37   );
38#elif defined(VGA_amd64)
39   unsigned long block[2];
40   block[0] = (unsigned long)p;
41   block[1] = n;
42   __asm__ __volatile__(
43      "movq 0(%%rsi),%%rax"      "\n\t"
44      "movq 8(%%rsi),%%rbx"      "\n\t"
45      "lock; addb %%bl,(%%rax)"  "\n"
46      : : "S"(&block[0])/* S means "rsi only" */ : "memory","cc","rax","rbx"
47   );
48#elif defined(VGA_ppc32)
49   /* Nasty hack.  Does correctly atomically do *p += n, but only if p
50      is 4-aligned -- guaranteed by caller. */
51   unsigned long success;
52   do {
53      __asm__ __volatile__(
54         "lwarx  15,0,%1"    "\n\t"
55         "add    15,15,%2"   "\n\t"
56         "stwcx. 15,0,%1"    "\n\t"
57         "mfcr   %0"         "\n\t"
58         "srwi   %0,%0,29"   "\n\t"
59         "andi.  %0,%0,1"    "\n"
60         : /*out*/"=b"(success)
61         : /*in*/ "b"(p), "b"(((unsigned long)n) << 24)
62         : /*trash*/ "memory", "cc", "r15"
63      );
64   } while (success != 1);
65#elif defined(VGA_ppc64be)
66   /* Nasty hack.  Does correctly atomically do *p += n, but only if p
67      is 8-aligned -- guaranteed by caller. */
68   unsigned long success;
69   do {
70      __asm__ __volatile__(
71         "ldarx  15,0,%1"    "\n\t"
72         "add    15,15,%2"   "\n\t"
73         "stdcx. 15,0,%1"    "\n\t"
74         "mfcr   %0"         "\n\t"
75         "srwi   %0,%0,29"   "\n\t"
76         "andi.  %0,%0,1"    "\n"
77         : /*out*/"=b"(success)
78         : /*in*/ "b"(p), "b"(((unsigned long)n) << 56)
79         : /*trash*/ "memory", "cc", "r15"
80      );
81   } while (success != 1);
82#elif defined(VGA_ppc64le)
83   /* Nasty hack.  Does correctly atomically do *p += n, but only if p
84      is 8-aligned -- guaranteed by caller. */
85   unsigned long success;
86   do {
87      __asm__ __volatile__(
88         "ldarx  15,0,%1"    "\n\t"
89         "add    15,15,%2"   "\n\t"
90         "stdcx. 15,0,%1"    "\n\t"
91         "mfcr   %0"         "\n\t"
92         "srwi   %0,%0,29"   "\n\t"
93         "andi.  %0,%0,1"    "\n"
94         : /*out*/"=b"(success)
95         : /*in*/ "b"(p), "b"(((unsigned long)n))
96         : /*trash*/ "memory", "cc", "r15"
97      );
98   } while (success != 1);
99#elif defined(VGA_arm)
100   unsigned int block[3]
101      = { (unsigned int)p, (unsigned int)n, 0xFFFFFFFF };
102   do {
103      __asm__ __volatile__(
104         "mov    r5, %0"         "\n\t"
105         "ldr    r9, [r5, #0]"   "\n\t" // p
106         "ldr    r10, [r5, #4]"  "\n\t" // n
107         "ldrexb r8, [r9]"       "\n\t"
108         "add    r8, r8, r10"    "\n\t"
109         "strexb r4, r8, [r9]"   "\n\t"
110         "str    r4, [r5, #8]"   "\n\t"
111         : /*out*/
112         : /*in*/ "r"(&block[0])
113         : /*trash*/ "memory", "cc", "r5", "r8", "r9", "r10", "r4"
114      );
115   } while (block[2] != 0);
116#elif defined(VGA_arm64)
117   unsigned long long int block[3]
118      = { (unsigned long long int)p, (unsigned long long int)n,
119          0xFFFFFFFFFFFFFFFFULL};
120   do {
121      __asm__ __volatile__(
122         "mov   x5, %0"         "\n\t"
123         "ldr   x9, [x5, #0]"   "\n\t" // p
124         "ldr   x10, [x5, #8]"  "\n\t" // n
125         "ldxrb w8, [x9]"       "\n\t"
126         "add   x8, x8, x10"    "\n\t"
127         "stxrb w4, w8, [x9]"    "\n\t"
128         "str   x4, [x5, #16]"   "\n\t"
129         : /*out*/
130         : /*in*/ "r"(&block[0])
131         : /*trash*/ "memory", "cc", "x5", "x8", "x9", "x10", "x4"
132      );
133   } while (block[2] != 0);
134#elif defined(VGA_s390x)
135   int dummy;
136   __asm__ __volatile__(
137      "   l	0,%0\n\t"
138      "0: st	0,%1\n\t"
139      "   icm	1,1,%1\n\t"
140      "   ar	1,%2\n\t"
141      "   stcm  1,1,%1\n\t"
142      "   l     1,%1\n\t"
143      "   cs	0,1,%0\n\t"
144      "   jl    0b\n\t"
145      : "+m" (*p), "+m" (dummy)
146      : "d" (n)
147      : "cc", "memory", "0", "1");
148#elif defined(VGA_mips32)
149   /* We rely on the fact that p is 4-aligned. Otherwise 'll' may throw an
150      exception that can cause this function to fail. */
151#if defined (_MIPSEL)
152   unsigned int block[3]
153      = { (unsigned int)p, (unsigned int)n, 0x0 };
154   do {
155      __asm__ __volatile__(
156         "move $t0, %0"           "\n\t"
157         "lw   $t1, 0($t0)"       "\n\t"  // p
158         "lw   $t2, 4($t0)"       "\n\t"  // n
159         "andi $t2, $t2, 0xFF"    "\n\t"  // n = n and 0xFF
160         "li   $t4, 0xFF"         "\n\t"
161         "nor  $t4, $t4, $zero"   "\n\t"  // $t4 = 0xFFFFFF00
162         "ll   $t3, 0($t1)"       "\n\t"  // $t3 = old value
163         "and  $t4, $t4, $t3"     "\n\t"  // $t4 = $t3 and 0xFFFFFF00
164         "addu $t3, $t3, $t2"     "\n\t"  // $t3 = $t3 + n
165         "andi $t3, $t3, 0xFF"    "\n\t"  // $t3 = $t3 and 0xFF
166         "or   $t3, $t3, $t4"     "\n\t"  // $t3 = $t3 or $t4
167         "sc   $t3, 0($t1)"       "\n\t"
168         "sw   $t3, 8($t0)"       "\n\t"  // save result
169         : /*out*/
170         : /*in*/ "r"(&block[0])
171         : /*trash*/ "memory", "t0", "t1", "t2", "t3", "t4"
172      );
173   } while (block[2] != 1);
174#elif defined (_MIPSEB)
175   unsigned int block[3]
176      = { (unsigned int)p, (unsigned int)n << 24, 0x0 };
177   do {
178      __asm__ __volatile__(
179         "move $t0, %0"          "\n\t"
180         "lw   $t1, 0($t0)"      "\n\t"  // p
181         "lw   $t2, 4($t0)"      "\n\t"  // n
182         "ll   $t3, 0($t1)"      "\n\t"
183         "addu $t3, $t3, $t2"    "\n\t"
184         "sc   $t3, 0($t1)"      "\n\t"
185         "sw   $t3, 8($t0)"      "\n\t"
186         : /*out*/
187         : /*in*/ "r"(&block[0])
188         : /*trash*/ "memory", "t0", "t1", "t2", "t3"
189      );
190   } while (block[2] != 1);
191#endif
192#elif defined(VGA_mips64)
193   /* We rely on the fact that p is 4-aligned. Otherwise 'll' may throw an
194      exception that can cause this function to fail. */
195#if defined (_MIPSEL)
196   unsigned long block[3]
197      = { (unsigned long)p, (unsigned long)n, 0x0ULL };
198   do {
199      __asm__ __volatile__(
200         "move $t0, %0"           "\n\t"
201         "ld   $t1, 0($t0)"       "\n\t"  // p
202         "ld   $t2, 8($t0)"       "\n\t"  // n
203         "andi $t2, $t2, 0xFF"    "\n\t"  // n = n and 0xFF
204         "li   $s0, 0xFF"         "\n\t"
205         "nor  $s0, $s0, $zero"   "\n\t"  // $s0 = 0xFFFFFF00
206         "ll   $t3, 0($t1)"       "\n\t"  // $t3 = old value
207         "and  $s0, $s0, $t3"     "\n\t"  // $s0 = $t3 and 0xFFFFFF00
208         "addu $t3, $t3, $t2"     "\n\t"  // $t3 = $t3 + n
209         "andi $t3, $t3, 0xFF"    "\n\t"  // $t3 = $t3 and 0xFF
210         "or   $t3, $t3, $s0"     "\n\t"  // $t3 = $t3 or $s0
211         "sc   $t3, 0($t1)"       "\n\t"
212         "sw   $t3, 16($t0)"      "\n\t"  // save result
213         : /*out*/
214         : /*in*/ "r"(&block[0])
215         : /*trash*/ "memory", "t0", "t1", "t2", "t3", "s0"
216      );
217   } while (block[2] != 1);
218#elif defined (_MIPSEB)
219   unsigned long block[3]
220      = { (unsigned long)p, (unsigned long)n << 56, 0x0 };
221   do {
222      __asm__ __volatile__(
223         "move  $t0, %0"          "\n\t"
224         "ld    $t1, 0($t0)"      "\n\t"  // p
225         "ld    $t2, 8($t0)"      "\n\t"  // n
226         "lld   $t3, 0($t1)"      "\n\t"
227         "daddu $t3, $t3, $t2"    "\n\t"
228         "scd   $t3, 0($t1)"      "\n\t"
229         "sd    $t3, 16($t0)"     "\n\t"
230         : /*out*/
231         : /*in*/ "r"(&block[0])
232         : /*trash*/ "memory", "t0", "t1", "t2", "t3"
233      );
234   } while (block[2] != 1);
235#endif
236#elif defined(VGA_tilegx)
237   int i;
238   unsigned int *p4 = (unsigned int *)(((unsigned long long)p + 3) & (~3ULL));
239   unsigned int  mask = (0xff) << ((int)p & 3);
240   unsigned int  add = (n & 0xff) << ((int)p & 3);
241   unsigned int x, new;
242
243   while(1) {
244      x = *p4;
245      new = (x & (~mask)) | ((x + add) & mask);
246      __insn_mtspr(0x2780, x);
247      if ( __insn_cmpexch4(p4, new) == x)
248         break;
249   }
250#else
251# error "Unsupported arch"
252#endif
253}
254
255
256__attribute__((noinline)) void atomic_add_16bit ( short* p, int n )
257{
258#if defined(VGA_x86)
259   unsigned long block[2];
260   block[0] = (unsigned long)p;
261   block[1] = n;
262   __asm__ __volatile__(
263      "movl 0(%%esi),%%eax"      "\n\t"
264      "movl 4(%%esi),%%ebx"      "\n\t"
265      "lock; addw %%bx,(%%eax)"  "\n"
266      : : "S"(&block[0])/* S means "esi only" */ : "memory","cc","eax","ebx"
267   );
268#elif defined(VGA_amd64)
269   unsigned long block[2];
270   block[0] = (unsigned long)p;
271   block[1] = n;
272   __asm__ __volatile__(
273      "movq 0(%%rsi),%%rax"      "\n\t"
274      "movq 8(%%rsi),%%rbx"      "\n\t"
275      "lock; addw %%bx,(%%rax)"  "\n"
276      : : "S"(&block[0])/* S means "rsi only" */ : "memory","cc","rax","rbx"
277   );
278#elif defined(VGA_ppc32)
279   /* Nasty hack.  Does correctly atomically do *p += n, but only if p
280      is 8-aligned -- guaranteed by caller. */
281   unsigned long success;
282   do {
283      __asm__ __volatile__(
284         "lwarx  15,0,%1"    "\n\t"
285         "add    15,15,%2"   "\n\t"
286         "stwcx. 15,0,%1"    "\n\t"
287         "mfcr   %0"         "\n\t"
288         "srwi   %0,%0,29"   "\n\t"
289         "andi.  %0,%0,1"    "\n"
290         : /*out*/"=b"(success)
291         : /*in*/ "b"(p), "b"(((unsigned long)n) << 16)
292         : /*trash*/ "memory", "cc", "r15"
293      );
294   } while (success != 1);
295#elif defined(VGA_ppc64be)
296   /* Nasty hack.  Does correctly atomically do *p += n, but only if p
297      is 8-aligned -- guaranteed by caller. */
298   unsigned long success;
299   do {
300      __asm__ __volatile__(
301         "ldarx  15,0,%1"    "\n\t"
302         "add    15,15,%2"   "\n\t"
303         "stdcx. 15,0,%1"    "\n\t"
304         "mfcr   %0"         "\n\t"
305         "srwi   %0,%0,29"   "\n\t"
306         "andi.  %0,%0,1"    "\n"
307         : /*out*/"=b"(success)
308         : /*in*/ "b"(p), "b"(((unsigned long)n) << 48)
309         : /*trash*/ "memory", "cc", "r15"
310      );
311   } while (success != 1);
312#elif defined(VGA_ppc64le)
313   /* Nasty hack.  Does correctly atomically do *p += n, but only if p
314      is 8-aligned -- guaranteed by caller. */
315   unsigned long success;
316   do {
317      __asm__ __volatile__(
318         "ldarx  15,0,%1"    "\n\t"
319         "add    15,15,%2"   "\n\t"
320         "stdcx. 15,0,%1"    "\n\t"
321         "mfcr   %0"         "\n\t"
322         "srwi   %0,%0,29"   "\n\t"
323         "andi.  %0,%0,1"    "\n"
324         : /*out*/"=b"(success)
325         : /*in*/ "b"(p), "b"(((unsigned long)n))
326         : /*trash*/ "memory", "cc", "r15"
327      );
328   } while (success != 1);
329#elif defined(VGA_arm)
330   unsigned int block[3]
331      = { (unsigned int)p, (unsigned int)n, 0xFFFFFFFF };
332   do {
333      __asm__ __volatile__(
334         "mov    r5, %0"         "\n\t"
335         "ldr    r9, [r5, #0]"   "\n\t" // p
336         "ldr    r10, [r5, #4]"  "\n\t" // n
337         "ldrexh r8, [r9]"       "\n\t"
338         "add    r8, r8, r10"    "\n\t"
339         "strexh r4, r8, [r9]"   "\n\t"
340         "str    r4, [r5, #8]"   "\n\t"
341         : /*out*/
342         : /*in*/ "r"(&block[0])
343         : /*trash*/ "memory", "cc", "r5", "r8", "r9", "r10", "r4"
344      );
345   } while (block[2] != 0);
346#elif defined(VGA_arm64)
347   unsigned long long int block[3]
348      = { (unsigned long long int)p, (unsigned long long int)n,
349          0xFFFFFFFFFFFFFFFFULL};
350   do {
351      __asm__ __volatile__(
352         "mov   x5, %0"         "\n\t"
353         "ldr   x9, [x5, #0]"   "\n\t" // p
354         "ldr   x10, [x5, #8]"  "\n\t" // n
355         "ldxrh w8, [x9]"       "\n\t"
356         "add   x8, x8, x10"    "\n\t"
357         "stxrh w4, w8, [x9]"    "\n\t"
358         "str   x4, [x5, #16]"   "\n\t"
359         : /*out*/
360         : /*in*/ "r"(&block[0])
361         : /*trash*/ "memory", "cc", "x5", "x8", "x9", "x10", "x4"
362      );
363   } while (block[2] != 0);
364#elif defined(VGA_s390x)
365   int dummy;
366   __asm__ __volatile__(
367      "   l	0,%0\n\t"
368      "0: st	0,%1\n\t"
369      "   icm	1,3,%1\n\t"
370      "   ar	1,%2\n\t"
371      "   stcm  1,3,%1\n\t"
372      "   l     1,%1\n\t"
373      "   cs	0,1,%0\n\t"
374      "   jl    0b\n\t"
375      : "+m" (*p), "+m" (dummy)
376      : "d" (n)
377      : "cc", "memory", "0", "1");
378#elif defined(VGA_mips32)
379   /* We rely on the fact that p is 4-aligned. Otherwise 'll' may throw an
380      exception that can cause this function to fail. */
381#if defined (_MIPSEL)
382   unsigned int block[3]
383      = { (unsigned int)p, (unsigned int)n, 0x0 };
384   do {
385      __asm__ __volatile__(
386         "move $t0, %0"           "\n\t"
387         "lw   $t1, 0($t0)"       "\n\t"  // p
388         "lw   $t2, 4($t0)"       "\n\t"  // n
389         "andi $t2, $t2, 0xFFFF"  "\n\t"  // n = n and 0xFFFF
390         "li   $t4, 0xFFFF"       "\n\t"
391         "nor  $t4, $t4, $zero"   "\n\t"  // $t4 = 0xFFFF0000
392         "ll   $t3, 0($t1)"       "\n\t"  // $t3 = old value
393         "and  $t4, $t4, $t3"     "\n\t"  // $t4 = $t3 and 0xFFFF0000
394         "addu $t3, $t3, $t2"     "\n\t"  // $t3 = $t3 + n
395         "andi $t3, $t3, 0xFFFF"  "\n\t"  // $t3 = $t3 and 0xFFFF
396         "or   $t3, $t3, $t4"     "\n\t"  // $t3 = $t3 or $t4
397         "sc   $t3, 0($t1)"       "\n\t"
398         "sw   $t3, 8($t0)"       "\n\t"  // save result
399         : /*out*/
400         : /*in*/ "r"(&block[0])
401         : /*trash*/ "memory", "t0", "t1", "t2", "t3", "t4"
402      );
403   } while (block[2] != 1);
404#elif defined (_MIPSEB)
405   unsigned int block[3]
406      = { (unsigned int)p, (unsigned int)n << 16, 0x0 };
407   do {
408      __asm__ __volatile__(
409         "move $t0, %0"          "\n\t"
410         "lw   $t1, 0($t0)"      "\n\t"  // p
411         "lw   $t2, 4($t0)"      "\n\t"  // n
412         "ll   $t3, 0($t1)"      "\n\t"
413         "addu $t3, $t3, $t2"    "\n\t"
414         "sc   $t3, 0($t1)"      "\n\t"
415         "sw   $t3, 8($t0)"      "\n\t"
416         : /*out*/
417         : /*in*/ "r"(&block[0])
418         : /*trash*/ "memory", "t0", "t1", "t2", "t3"
419      );
420   } while (block[2] != 1);
421#endif
422#elif defined(VGA_mips64)
423   /* We rely on the fact that p is 4-aligned. Otherwise 'll' may throw an
424      exception that can cause this function to fail. */
425#if defined (_MIPSEL)
426   unsigned long block[3]
427      = { (unsigned long)p, (unsigned long)n, 0x0ULL };
428   do {
429      __asm__ __volatile__(
430         "move $t0, %0"           "\n\t"
431         "ld   $t1, 0($t0)"       "\n\t"  // p
432         "ld   $t2, 8($t0)"       "\n\t"  // n
433         "andi $t2, $t2, 0xFFFF"  "\n\t"  // n = n and 0xFFFF
434         "li   $s0, 0xFFFF"       "\n\t"
435         "nor  $s0, $s0, $zero"   "\n\t"  // $s0= 0xFFFF0000
436         "ll   $t3, 0($t1)"       "\n\t"  // $t3 = old value
437         "and  $s0, $s0, $t3"     "\n\t"  // $s0 = $t3 and 0xFFFF0000
438         "addu $t3, $t3, $t2"     "\n\t"  // $t3 = $t3 + n
439         "andi $t3, $t3, 0xFFFF"  "\n\t"  // $t3 = $t3 and 0xFFFF
440         "or   $t3, $t3, $s0"     "\n\t"  // $t3 = $t3 or $s0
441         "sc   $t3, 0($t1)"       "\n\t"
442         "sw   $t3, 16($t0)"      "\n\t"  // save result
443         : /*out*/
444         : /*in*/ "r"(&block[0])
445         : /*trash*/ "memory", "t0", "t1", "t2", "t3", "s0"
446      );
447   } while (block[2] != 1);
448#elif defined (_MIPSEB)
449   unsigned long block[3]
450      = { (unsigned long)p, (unsigned long)n << 48, 0x0 };
451   do {
452      __asm__ __volatile__(
453         "move  $t0, %0"          "\n\t"
454         "ld    $t1, 0($t0)"      "\n\t"  // p
455         "ld    $t2, 8($t0)"      "\n\t"  // n
456         "lld   $t3, 0($t1)"      "\n\t"
457         "daddu $t3, $t3, $t2"    "\n\t"
458         "scd   $t3, 0($t1)"      "\n\t"
459         "sd    $t3, 16($t0)"     "\n\t"
460         : /*out*/
461         : /*in*/ "r"(&block[0])
462         : /*trash*/ "memory", "t0", "t1", "t2", "t3"
463      );
464   } while (block[2] != 1);
465#endif
466#elif defined(VGA_tilegx)
467   int i;
468   unsigned int *p4 = (unsigned int *)(((unsigned long long)p + 3) & (~3ULL));
469   unsigned int  mask = (0xffff) << ((int)p & 3);
470   unsigned int  add = (n & 0xffff) << ((int)p & 3);
471   unsigned int x, new;
472
473   while(1) {
474      x = *p4;
475      new = (x & (~mask)) | ((x + add) & mask);
476      __insn_mtspr(0x2780, x);
477      if ( __insn_cmpexch4(p4, new) == x)
478         break;
479   }
480#else
481# error "Unsupported arch"
482#endif
483}
484
/* Atomically do *p += n on a 32-bit location.  Caller guarantees p is
   8-aligned (hence word-aligned), which the LL/SC variants rely on. */
__attribute__((noinline)) void atomic_add_32bit ( int* p, int n )
{
#if defined(VGA_x86)
   /* x86: single LOCK-prefixed 32-bit add. */
   unsigned long block[2];
   block[0] = (unsigned long)p;
   block[1] = n;
   __asm__ __volatile__(
      "movl 0(%%esi),%%eax"       "\n\t"
      "movl 4(%%esi),%%ebx"       "\n\t"
      "lock; addl %%ebx,(%%eax)"  "\n"
      : : "S"(&block[0])/* S means "esi only" */ : "memory","cc","eax","ebx"
   );
#elif defined(VGA_amd64)
   unsigned long block[2];
   block[0] = (unsigned long)p;
   block[1] = n;
   __asm__ __volatile__(
      "movq 0(%%rsi),%%rax"       "\n\t"
      "movq 8(%%rsi),%%rbx"       "\n\t"
      "lock; addl %%ebx,(%%rax)"  "\n"
      : : "S"(&block[0])/* S means "rsi only" */ : "memory","cc","rax","rbx"
   );
#elif defined(VGA_ppc32)
   /* lwarx/stwcx. retry loop; 'success' ends up 1 once the conditional
      store succeeds (extracted from CR field 0 via mfcr/srwi/andi.). */
   unsigned long success;
   do {
      __asm__ __volatile__(
         "lwarx  15,0,%1"    "\n\t"
         "add    15,15,%2"   "\n\t"
         "stwcx. 15,0,%1"    "\n\t"
         "mfcr   %0"         "\n\t"
         "srwi   %0,%0,29"   "\n\t"
         "andi.  %0,%0,1"    "\n"
         : /*out*/"=b"(success)
         : /*in*/ "b"(p), "b"(n)
         : /*trash*/ "memory", "cc", "r15"
      );
   } while (success != 1);
#elif defined(VGA_ppc64be)
   /* Nasty hack.  Does correctly atomically do *p += n, but only if p
      is 8-aligned -- guaranteed by caller. */
   /* Big-endian: the target word is the doubleword's top half, so n is
      pre-shifted left by 32. */
   unsigned long success;
   do {
      __asm__ __volatile__(
         "ldarx  15,0,%1"    "\n\t"
         "add    15,15,%2"   "\n\t"
         "stdcx. 15,0,%1"    "\n\t"
         "mfcr   %0"         "\n\t"
         "srwi   %0,%0,29"   "\n\t"
         "andi.  %0,%0,1"    "\n"
         : /*out*/"=b"(success)
         : /*in*/ "b"(p), "b"(((unsigned long)n) << 32)
         : /*trash*/ "memory", "cc", "r15"
      );
   } while (success != 1);
#elif defined(VGA_ppc64le)
   /* Nasty hack.  Does correctly atomically do *p += n, but only if p
      is 8-aligned -- guaranteed by caller. */
   /* Little-endian: the target word is the doubleword's low half, so no
      shift of n is needed. */
   unsigned long success;
   do {
      __asm__ __volatile__(
         "ldarx  15,0,%1"    "\n\t"
         "add    15,15,%2"   "\n\t"
         "stdcx. 15,0,%1"    "\n\t"
         "mfcr   %0"         "\n\t"
         "srwi   %0,%0,29"   "\n\t"
         "andi.  %0,%0,1"    "\n"
         : /*out*/"=b"(success)
         : /*in*/ "b"(p), "b"(((unsigned long)n))
         : /*trash*/ "memory", "cc", "r15"
      );
   } while (success != 1);
#elif defined(VGA_arm)
   /* LDREX/STREX retry loop; block[2] receives the strex status
      (0 on success). */
   unsigned int block[3]
      = { (unsigned int)p, (unsigned int)n, 0xFFFFFFFF };
   do {
      __asm__ __volatile__(
         "mov   r5, %0"         "\n\t"
         "ldr   r9, [r5, #0]"   "\n\t" // p
         "ldr   r10, [r5, #4]"  "\n\t" // n
         "ldrex r8, [r9]"       "\n\t"
         "add   r8, r8, r10"    "\n\t"
         "strex r4, r8, [r9]"   "\n\t"
         "str   r4, [r5, #8]"   "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "cc", "r5", "r8", "r9", "r10", "r4"
      );
   } while (block[2] != 0);
#elif defined(VGA_arm64)
   /* LDXR/STXR retry loop on a 32-bit word; block[2] receives the stxr
      status. */
   unsigned long long int block[3]
      = { (unsigned long long int)p, (unsigned long long int)n,
          0xFFFFFFFFFFFFFFFFULL};
   do {
      __asm__ __volatile__(
         "mov   x5, %0"         "\n\t"
         "ldr   x9, [x5, #0]"   "\n\t" // p
         "ldr   x10, [x5, #8]"  "\n\t" // n
         "ldxr  w8, [x9]"       "\n\t"
         "add   x8, x8, x10"    "\n\t"
         "stxr  w4, w8, [x9]"    "\n\t"
         "str   x4, [x5, #16]"   "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "cc", "x5", "x8", "x9", "x10", "x4"
      );
   } while (block[2] != 0);
#elif defined(VGA_s390x)
   /* Straight compare-and-swap loop: reload, add, cs; jl retries while
      the comparison failed. */
   __asm__ __volatile__(
      "   l	0,%0\n\t"
      "0: lr	1,0\n\t"
      "   ar	1,%1\n\t"
      "   cs	0,1,%0\n\t"
      "   jl    0b\n\t"
      : "+m" (*p)
      : "d" (n)
      : "cc", "memory", "0", "1");
#elif defined(VGA_mips32)
   /* ll/sc retry loop; block[2] receives the sc status (1 on success). */
   unsigned int block[3]
      = { (unsigned int)p, (unsigned int)n, 0x0 };
   do {
      __asm__ __volatile__(
         "move $t0, %0"        "\n\t"
         "lw   $t1, 0($t0)"    "\n\t"  // p
         "lw   $t2, 4($t0)"    "\n\t"  // n
         "ll   $t3, 0($t1)"    "\n\t"
         "addu $t3, $t3, $t2"  "\n\t"
         "sc   $t3, 0($t1)"    "\n\t"
         "sw   $t3, 8($t0)"    "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "t0", "t1", "t2", "t3"
      );
   } while (block[2] != 1);
#elif defined(VGA_mips64)
   unsigned long block[3]
      = { (unsigned long)p, (unsigned long)n, 0x0ULL };
   do {
      __asm__ __volatile__(
         "move  $t0, %0"        "\n\t"
         "ld    $t1, 0($t0)"    "\n\t"  // p
         "ld    $t2, 8($t0)"    "\n\t"  // n
         "ll    $t3, 0($t1)"    "\n\t"
         "addu  $t3, $t3, $t2"  "\n\t"
         "sc    $t3, 0($t1)"    "\n\t"
         "sd    $t3, 16($t0)"   "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "t0", "t1", "t2", "t3"
      );
   } while (block[2] != 1);
#elif defined(VGA_tilegx)
   /* Hardware fetch-and-add does the whole job. */
    __insn_fetchadd4(p, n);
#else
# error "Unsupported arch"
#endif
}
641
/* Atomically do *p += n on a 64-bit location.  A deliberate no-op on
   the 32-bit-only targets listed first.  Caller guarantees p is
   8-aligned. */
__attribute__((noinline)) void atomic_add_64bit ( long long int* p, int n )
{
#if defined(VGA_x86) || defined(VGA_ppc32) || defined(VGA_mips32)
   /* do nothing; is not supported */
#elif defined(VGA_amd64)
   // this is a bit subtle.  It relies on the fact that, on a 64-bit platform,
   // sizeof(unsigned long long int) == sizeof(unsigned long) == sizeof(void*)
   unsigned long long int block[2];
   block[0] = (unsigned long long int)(unsigned long)p;
   block[1] = n;
   __asm__ __volatile__(
      "movq 0(%%rsi),%%rax"      "\n\t"
      "movq 8(%%rsi),%%rbx"      "\n\t"
      "lock; addq %%rbx,(%%rax)" "\n"
      : : "S"(&block[0])/* S means "rsi only" */ : "memory","cc","rax","rbx"
   );
#elif defined(VGA_ppc64be) || defined(VGA_ppc64le)
   /* ldarx/stdcx. retry loop; 'success' becomes 1 once the conditional
      store succeeds. */
   unsigned long success;
   do {
      __asm__ __volatile__(
         "ldarx  15,0,%1"    "\n\t"
         "add    15,15,%2"   "\n\t"
         "stdcx. 15,0,%1"    "\n\t"
         "mfcr   %0"         "\n\t"
         "srwi   %0,%0,29"   "\n\t"
         "andi.  %0,%0,1"    "\n"
         : /*out*/"=b"(success)
         : /*in*/ "b"(p), "b"(n)
         : /*trash*/ "memory", "cc", "r15"
      );
   } while (success != 1);
#elif defined(VGA_arm)
   /* Doubleword exclusive pair: LDREXD/STREXD on r0:r1 / r2:r3, with a
      two-instruction 64-bit add (adds + adc). */
   unsigned long long int block[3]
     = { (unsigned long long int)(unsigned long)p,
         (unsigned long long int)n,
         0xFFFFFFFFFFFFFFFFULL };
   do {
      __asm__ __volatile__(
         "mov    r5, %0"             "\n\t"
         "ldr    r8,     [r5, #0]"   "\n\t" // p
         "ldrd   r2, r3, [r5, #8]"   "\n\t" // n
         "ldrexd r0, r1, [r8]"       "\n\t"
         "adds   r2, r2, r0"         "\n\t"
         "adc    r3, r3, r1"         "\n\t"
         "strexd r1, r2, r3, [r8]"   "\n\t"
         "str    r1, [r5, #16]"      "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "cc", "r5", "r0", "r1", "r8", "r2", "r3"
      );
      /* 'str r1' overwrites only the low 32 bits of block[2]; the high
         32 bits are still 0xFFFFFFFF, so the loop exits once the strexd
         status word is 0.  NOTE(review): this assumes little-endian
         layout of block[2] -- confirm for big-endian ARM builds. */
   } while (block[2] != 0xFFFFFFFF00000000ULL);
#elif defined(VGA_arm64)
   /* LDXR/STXR retry loop on a full doubleword; block[2] receives the
      stxr status. */
   unsigned long long int block[3]
      = { (unsigned long long int)p, (unsigned long long int)n,
          0xFFFFFFFFFFFFFFFFULL};
   do {
      __asm__ __volatile__(
         "mov   x5, %0"         "\n\t"
         "ldr   x9, [x5, #0]"   "\n\t" // p
         "ldr   x10, [x5, #8]"  "\n\t" // n
         "ldxr  x8, [x9]"       "\n\t"
         "add   x8, x8, x10"    "\n\t"
         "stxr  w4, x8, [x9]"   "\n\t"
         "str   x4, [x5, #16]"   "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "cc", "x5", "x8", "x9", "x10", "x4"
      );
   } while (block[2] != 0);
#elif defined(VGA_s390x)
   /* 64-bit compare-and-swap loop (lg/lgr/agr/csg); jl retries while
      the comparison failed. */
   __asm__ __volatile__(
      "   lg	0,%0\n\t"
      "0: lgr	1,0\n\t"
      "   agr	1,%1\n\t"
      "   csg	0,1,%0\n\t"
      "   jl    0b\n\t"
      : "+m" (*p)
      : "d" (n)
      : "cc", "memory", "0", "1");
#elif defined(VGA_mips64)
   /* lld/scd retry loop; block[2] receives the scd status (1 on
      success). */
   unsigned long block[3]
      = { (unsigned long)p, (unsigned long)n, 0x0ULL };
   do {
      __asm__ __volatile__(
         "move  $t0, %0"        "\n\t"
         "ld    $t1, 0($t0)"    "\n\t" // p
         "ld    $t2, 8($t0)"    "\n\t" // n
         "lld   $t3, 0($t1)"    "\n\t"
         "daddu $t3, $t3, $t2"  "\n\t"
         "scd   $t3, 0($t1)"    "\n\t"
         "sd    $t3, 16($t0)"   "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "t0", "t1", "t2", "t3"
      );
   } while (block[2] != 1);
#elif defined(VGA_tilegx)
   /* Hardware 64-bit fetch-and-add. */
    __insn_fetchadd(p, n);
#else
# error "Unsupported arch"
#endif
}
744
745int main ( int argc, char** argv )
746{
747   int    i, status;
748   char*  page;
749   char*  p8;
750   short* p16;
751   int*   p32;
752   long long int* p64;
753   pid_t  child, p2;
754
755   printf("parent, pre-fork\n");
756
757   page = mmap( 0, sysconf(_SC_PAGESIZE),
758                   PROT_READ|PROT_WRITE,
759                   MAP_ANONYMOUS|MAP_SHARED, -1, 0 );
760   if (page == MAP_FAILED) {
761      perror("mmap failed");
762      exit(1);
763   }
764
765   p8  = (char*)(page+0);
766   p16 = (short*)(page+256);
767   p32 = (int*)(page+512);
768   p64 = (long long int*)(page+768);
769
770   assert( IS_8_ALIGNED(p8) );
771   assert( IS_8_ALIGNED(p16) );
772   assert( IS_8_ALIGNED(p32) );
773   assert( IS_8_ALIGNED(p64) );
774
775   memset(page, 0, 1024);
776
777   *p8  = 0;
778   *p16 = 0;
779   *p32 = 0;
780   *p64 = 0;
781
782   child = fork();
783   if (child == -1) {
784      perror("fork() failed\n");
785      return 1;
786   }
787
788   if (child == 0) {
789      /* --- CHILD --- */
790      printf("child\n");
791      for (i = 0; i < NNN; i++) {
792         atomic_add_8bit(p8, 1);
793         atomic_add_16bit(p16, 1);
794         atomic_add_32bit(p32, 1);
795         atomic_add_64bit(p64, 98765 ); /* ensure we hit the upper 32 bits */
796      }
797      return 1;
798      /* NOTREACHED */
799
800   }
801
802   /* --- PARENT --- */
803
804   printf("parent\n");
805
806   for (i = 0; i < NNN; i++) {
807      atomic_add_8bit(p8, 1);
808      atomic_add_16bit(p16, 1);
809      atomic_add_32bit(p32, 1);
810      atomic_add_64bit(p64, 98765 ); /* ensure we hit the upper 32 bits */
811   }
812
813   p2 = waitpid(child, &status, 0);
814   assert(p2 == child);
815
816   /* assert that child finished normally */
817   assert(WIFEXITED(status));
818
819   printf("FINAL VALUES:  8 bit %d,  16 bit %d,  32 bit %d,  64 bit %lld\n",
820          (int)(*(signed char*)p8), (int)(*p16), *p32, *p64 );
821
822   if (-74 == (int)(*(signed char*)p8)
823       && 32694 == (int)(*p16)
824       && 6913974 == *p32
825       && (0LL == *p64 || 682858642110LL == *p64)) {
826      printf("PASS\n");
827   } else {
828      printf("FAIL -- see source code for expected values\n");
829   }
830
831   printf("parent exits\n");
832
833   return 0;
834}
835