1
/* This is an example of a program which does atomic memory operations
   between two processes which share a page.  Valgrind 3.4.1 and
   earlier produce incorrect answers because they do not preserve
   atomicity of the relevant instructions in the generated code; but
   the post-DCAS-merge versions of Valgrind do behave correctly. */
7
/* On ARM, this can be compiled into either ARM or Thumb code, so as
   to test both A and T encodings of LDREX/STREX et al.  Also on ARM,
   it tests doubleword atomics (LDREXD, STREXD) which I don't think it
   does on any other platform. */
12
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <unistd.h>
#include <sys/wait.h>
#include "tests/sys_mman.h"
20
/* Number of increments each process (parent and child) applies to
   every shared counter in main(). */
#define NNN 3456987

/* Nonzero iff the address _ptr is a multiple of 8, i.e. its three
   low-order bits are all zero. */
#define IS_8_ALIGNED(_ptr)   (0 == (((unsigned long)(_ptr)) & 7))
24
25
/* Atomically perform *p += n on the single byte at p.

   p  address of the byte to update.  Several variants below operate
      on the whole aligned word or doubleword that starts at p, so p
      must be 8-aligned; main() asserts this before calling.
   n  amount to add; only its low 8 bits influence the stored byte.

   Exactly one implementation is compiled, chosen by a VGA_<arch>
   macro (presumably supplied by the build system -- nothing in this
   file defines one).  An unknown architecture is a compile-time
   error.  noinline keeps every call a genuine out-of-line call. */
__attribute__((noinline)) void atomic_add_8bit ( char* p, int n )
{
#if defined(VGA_x86)
   /* p and n travel through a two-word block addressed by %esi; the
      asm reloads them into %eax/%ebx and does a LOCK-prefixed byte
      add, which is the atomic operation under test. */
   unsigned long block[2];
   block[0] = (unsigned long)p;
   block[1] = n;
   __asm__ __volatile__(
      "movl 0(%%esi),%%eax"      "\n\t"
      "movl 4(%%esi),%%ebx"      "\n\t"
      "lock; addb %%bl,(%%eax)"  "\n"
      : : "S"(&block[0])/* S means "esi only" */ : "memory","cc","eax","ebx"
   );
#elif defined(VGA_amd64)
   /* Same scheme as x86, using 64-bit registers and 8-byte slots. */
   unsigned long block[2];
   block[0] = (unsigned long)p;
   block[1] = n;
   __asm__ __volatile__(
      "movq 0(%%rsi),%%rax"      "\n\t"
      "movq 8(%%rsi),%%rbx"      "\n\t"
      "lock; addb %%bl,(%%rax)"  "\n"
      : : "S"(&block[0])/* S means "rsi only" */ : "memory","cc","rax","rbx"
   );
#elif defined(VGA_ppc32)
   /* Nasty hack.  Does correctly atomically do *p += n, but only if p
      is 4-aligned -- guaranteed by caller.  The add is done on the
      whole word at p with n shifted into that word's most significant
      byte, so the code assumes the byte at p is the most significant
      one (big-endian).  mfcr/srwi/andi. pull out the CR0.EQ bit, which
      stwcx. sets when the conditional store succeeded; retry until it
      did. */
   unsigned long success;
   do {
      __asm__ __volatile__(
         "lwarx  15,0,%1"    "\n\t"
         "add    15,15,%2"   "\n\t"
         "stwcx. 15,0,%1"    "\n\t"
         "mfcr   %0"         "\n\t"
         "srwi   %0,%0,29"   "\n\t"
         "andi.  %0,%0,1"    "\n"
         : /*out*/"=b"(success)
         : /*in*/ "b"(p), "b"(((unsigned long)n) << 24)
         : /*trash*/ "memory", "cc", "r15"
      );
   } while (success != 1);
#elif defined(VGA_ppc64)
   /* Nasty hack.  Does correctly atomically do *p += n, but only if p
      is 8-aligned -- guaranteed by caller.  Same idea as ppc32, but on
      the doubleword at p, hence the shift by 56. */
   unsigned long success;
   do {
      __asm__ __volatile__(
         "ldarx  15,0,%1"    "\n\t"
         "add    15,15,%2"   "\n\t"
         "stdcx. 15,0,%1"    "\n\t"
         "mfcr   %0"         "\n\t"
         "srwi   %0,%0,29"   "\n\t"
         "andi.  %0,%0,1"    "\n"
         : /*out*/"=b"(success)
         : /*in*/ "b"(p), "b"(((unsigned long)n) << 56)
         : /*trash*/ "memory", "cc", "r15"
      );
   } while (success != 1);
#elif defined(VGA_arm)
   /* block = { p, n, status }.  The asm does a byte-sized exclusive
      load/add/store and writes the strexb status into block[2]; the
      loop retries until that status is 0. */
   unsigned int block[3]
      = { (unsigned int)p, (unsigned int)n, 0xFFFFFFFF };
   do {
      __asm__ __volatile__(
         "mov    r5, %0"         "\n\t"
         "ldr    r9, [r5, #0]"   "\n\t" // p
         "ldr    r10, [r5, #4]"  "\n\t" // n
         "ldrexb r8, [r9]"       "\n\t"
         "add    r8, r8, r10"    "\n\t"
         "strexb r4, r8, [r9]"   "\n\t"
         "str    r4, [r5, #8]"   "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "cc", "r5", "r8", "r9", "r10", "r4"
      );
   } while (block[2] != 0);
#elif defined(VGA_arm64)
   /* Same scheme as arm with 8-byte block slots: ldxrb/stxrb on the
      byte, status stored to block[2], retry until it is 0. */
   unsigned long long int block[3]
      = { (unsigned long long int)p, (unsigned long long int)n,
          0xFFFFFFFFFFFFFFFFULL};
   do {
      __asm__ __volatile__(
         "mov   x5, %0"         "\n\t"
         "ldr   x9, [x5, #0]"   "\n\t" // p
         "ldr   x10, [x5, #8]"  "\n\t" // n
         "ldxrb w8, [x9]"       "\n\t"
         "add   x8, x8, x10"    "\n\t"
         "stxrb w4, w8, [x9]"    "\n\t"
         "str   x4, [x5, #16]"   "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "cc", "x5", "x8", "x9", "x10", "x4"
      );
   } while (block[2] != 0);
#elif defined(VGA_s390x)
   /* Compare-and-swap loop on the 4-byte word at p (note that the asm
      reads a full word through the "+m"(*p) operand).  The old word
      (r0) is copied to `dummy`, n is added to dummy's first byte only
      (icm/stcm with mask 1), and the result (r1) is swapped in with
      cs.  On failure jl retries from label 0 without an explicit
      reload, relying on cs having refreshed r0 with the current memory
      contents. */
   int dummy;
   __asm__ __volatile__(
      "   l	0,%0\n\t"
      "0: st	0,%1\n\t"
      "   icm	1,1,%1\n\t"
      "   ar	1,%2\n\t"
      "   stcm  1,1,%1\n\t"
      "   l     1,%1\n\t"
      "   cs	0,1,%0\n\t"
      "   jl    0b\n\t"
      : "+m" (*p), "+m" (dummy)
      : "d" (n)
      : "cc", "memory", "0", "1");
#elif defined(VGA_mips32)
   /* We rely on the fact that p is 4-aligned. Otherwise 'll' may throw an
      exception that can cause this function to fail.
      block = { p, n (pre-shifted on big-endian), result }.  sc leaves
      its success flag in $t3, which is saved to block[2]; the loop
      retries until that flag is 1. */
#if defined (_MIPSEL)
   /* Little-endian: the byte at p is the low byte of the word, so the
      add is confined to the low 8 bits and the other 24 bits are
      merged back unchanged. */
   unsigned int block[3]
      = { (unsigned int)p, (unsigned int)n, 0x0 };
   do {
      __asm__ __volatile__(
         "move $t0, %0"           "\n\t"
         "lw   $t1, 0($t0)"       "\n\t"  // p
         "lw   $t2, 4($t0)"       "\n\t"  // n
         "andi $t2, $t2, 0xFF"    "\n\t"  // n = n and 0xFF
         "li   $t4, 0xFF"         "\n\t"
         "nor  $t4, $t4, $zero"   "\n\t"  // $t4 = 0xFFFFFF00
         "ll   $t3, 0($t1)"       "\n\t"  // $t3 = old value
         "and  $t4, $t4, $t3"     "\n\t"  // $t4 = $t3 and 0xFFFFFF00
         "addu $t3, $t3, $t2"     "\n\t"  // $t3 = $t3 + n
         "andi $t3, $t3, 0xFF"    "\n\t"  // $t3 = $t3 and 0xFF
         "or   $t3, $t3, $t4"     "\n\t"  // $t3 = $t3 or $t4
         "sc   $t3, 0($t1)"       "\n\t"
         "sw   $t3, 8($t0)"       "\n\t"  // save result
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "t0", "t1", "t2", "t3", "t4"
      );
   } while (block[2] != 1);
#elif defined (_MIPSEB)
   /* Big-endian: the byte at p is the top byte of the word, so n is
      pre-shifted by 24 and a plain word add suffices. */
   unsigned int block[3]
      = { (unsigned int)p, (unsigned int)n << 24, 0x0 };
   do {
      __asm__ __volatile__(
         "move $t0, %0"          "\n\t"
         "lw   $t1, 0($t0)"      "\n\t"  // p
         "lw   $t2, 4($t0)"      "\n\t"  // n
         "ll   $t3, 0($t1)"      "\n\t"
         "addu $t3, $t3, $t2"    "\n\t"
         "sc   $t3, 0($t1)"      "\n\t"
         "sw   $t3, 8($t0)"      "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "t0", "t1", "t2", "t3"
      );
   } while (block[2] != 1);
#endif
#elif defined(VGA_mips64)
   /* We rely on the fact that p is suitably aligned: 4 bytes for the
      'll' used by the little-endian variant, 8 bytes for the 'lld'
      used by the big-endian one (the caller 8-aligns p).  Otherwise
      the load-linked may throw an exception that can cause this
      function to fail.  As on mips32, the store-conditional's success
      flag ends up in block[2] and the loop retries until it is 1. */
#if defined (_MIPSEL)
   unsigned long block[3]
      = { (unsigned long)p, (unsigned long)n, 0x0ULL };
   do {
      __asm__ __volatile__(
         "move $t0, %0"           "\n\t"
         "ld   $t1, 0($t0)"       "\n\t"  // p
         "ld   $t2, 8($t0)"       "\n\t"  // n
         "andi $t2, $t2, 0xFF"    "\n\t"  // n = n and 0xFF
         "li   $s0, 0xFF"         "\n\t"
         "nor  $s0, $s0, $zero"   "\n\t"  // $s0 = 0xFFFFFF00
         "ll   $t3, 0($t1)"       "\n\t"  // $t3 = old value
         "and  $s0, $s0, $t3"     "\n\t"  // $s0 = $t3 and 0xFFFFFF00
         "addu $t3, $t3, $t2"     "\n\t"  // $t3 = $t3 + n
         "andi $t3, $t3, 0xFF"    "\n\t"  // $t3 = $t3 and 0xFF
         "or   $t3, $t3, $s0"     "\n\t"  // $t3 = $t3 or $s0
         "sc   $t3, 0($t1)"       "\n\t"
         "sw   $t3, 16($t0)"      "\n\t"  // save result
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "t0", "t1", "t2", "t3", "s0"
      );
   } while (block[2] != 1);
#elif defined (_MIPSEB)
   /* Big-endian: operate on the doubleword at p with n pre-shifted
      into its top byte. */
   unsigned long block[3]
      = { (unsigned long)p, (unsigned long)n << 56, 0x0 };
   do {
      __asm__ __volatile__(
         "move  $t0, %0"          "\n\t"
         "ld    $t1, 0($t0)"      "\n\t"  // p
         "ld    $t2, 8($t0)"      "\n\t"  // n
         "lld   $t3, 0($t1)"      "\n\t"
         "daddu $t3, $t3, $t2"    "\n\t"
         "scd   $t3, 0($t1)"      "\n\t"
         "sd    $t3, 16($t0)"     "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "t0", "t1", "t2", "t3"
      );
   } while (block[2] != 1);
#endif
#else
# error "Unsupported arch"
#endif
}
223
224
/* Atomically perform *p += n on the 16-bit value at p.

   p  address of the halfword to update.  Several variants below
      operate on the whole aligned word or doubleword that starts at
      p, so p must be 8-aligned; main() asserts this before calling.
   n  amount to add; only its low 16 bits influence the stored value.

   Exactly one implementation is compiled, chosen by a VGA_<arch>
   macro (presumably supplied by the build system -- nothing in this
   file defines one).  An unknown architecture is a compile-time
   error.  noinline keeps every call a genuine out-of-line call. */
__attribute__((noinline)) void atomic_add_16bit ( short* p, int n )
{
#if defined(VGA_x86)
   /* p and n travel through a two-word block addressed by %esi; the
      asm reloads them into %eax/%ebx and does a LOCK-prefixed 16-bit
      add. */
   unsigned long block[2];
   block[0] = (unsigned long)p;
   block[1] = n;
   __asm__ __volatile__(
      "movl 0(%%esi),%%eax"      "\n\t"
      "movl 4(%%esi),%%ebx"      "\n\t"
      "lock; addw %%bx,(%%eax)"  "\n"
      : : "S"(&block[0])/* S means "esi only" */ : "memory","cc","eax","ebx"
   );
#elif defined(VGA_amd64)
   /* Same scheme as x86, using 64-bit registers and 8-byte slots. */
   unsigned long block[2];
   block[0] = (unsigned long)p;
   block[1] = n;
   __asm__ __volatile__(
      "movq 0(%%rsi),%%rax"      "\n\t"
      "movq 8(%%rsi),%%rbx"      "\n\t"
      "lock; addw %%bx,(%%rax)"  "\n"
      : : "S"(&block[0])/* S means "rsi only" */ : "memory","cc","rax","rbx"
   );
#elif defined(VGA_ppc32)
   /* Nasty hack.  Does correctly atomically do *p += n, but only if p
      is 4-aligned -- guaranteed by caller (which 8-aligns it).  The
      add is done on the whole word at p with n shifted into that
      word's upper half, so the code assumes the halfword at p is the
      more significant one (big-endian).  mfcr/srwi/andi. pull out the
      CR0.EQ bit, which stwcx. sets when the conditional store
      succeeded; retry until it did. */
   unsigned long success;
   do {
      __asm__ __volatile__(
         "lwarx  15,0,%1"    "\n\t"
         "add    15,15,%2"   "\n\t"
         "stwcx. 15,0,%1"    "\n\t"
         "mfcr   %0"         "\n\t"
         "srwi   %0,%0,29"   "\n\t"
         "andi.  %0,%0,1"    "\n"
         : /*out*/"=b"(success)
         : /*in*/ "b"(p), "b"(((unsigned long)n) << 16)
         : /*trash*/ "memory", "cc", "r15"
      );
   } while (success != 1);
#elif defined(VGA_ppc64)
   /* Nasty hack.  Does correctly atomically do *p += n, but only if p
      is 8-aligned -- guaranteed by caller.  Same idea as ppc32, but on
      the doubleword at p, hence the shift by 48. */
   unsigned long success;
   do {
      __asm__ __volatile__(
         "ldarx  15,0,%1"    "\n\t"
         "add    15,15,%2"   "\n\t"
         "stdcx. 15,0,%1"    "\n\t"
         "mfcr   %0"         "\n\t"
         "srwi   %0,%0,29"   "\n\t"
         "andi.  %0,%0,1"    "\n"
         : /*out*/"=b"(success)
         : /*in*/ "b"(p), "b"(((unsigned long)n) << 48)
         : /*trash*/ "memory", "cc", "r15"
      );
   } while (success != 1);
#elif defined(VGA_arm)
   /* block = { p, n, status }.  The asm does a halfword exclusive
      load/add/store and writes the strexh status into block[2]; the
      loop retries until that status is 0. */
   unsigned int block[3]
      = { (unsigned int)p, (unsigned int)n, 0xFFFFFFFF };
   do {
      __asm__ __volatile__(
         "mov    r5, %0"         "\n\t"
         "ldr    r9, [r5, #0]"   "\n\t" // p
         "ldr    r10, [r5, #4]"  "\n\t" // n
         "ldrexh r8, [r9]"       "\n\t"
         "add    r8, r8, r10"    "\n\t"
         "strexh r4, r8, [r9]"   "\n\t"
         "str    r4, [r5, #8]"   "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "cc", "r5", "r8", "r9", "r10", "r4"
      );
   } while (block[2] != 0);
#elif defined(VGA_arm64)
   /* Same scheme as arm with 8-byte block slots: ldxrh/stxrh on the
      halfword, status stored to block[2], retry until it is 0. */
   unsigned long long int block[3]
      = { (unsigned long long int)p, (unsigned long long int)n,
          0xFFFFFFFFFFFFFFFFULL};
   do {
      __asm__ __volatile__(
         "mov   x5, %0"         "\n\t"
         "ldr   x9, [x5, #0]"   "\n\t" // p
         "ldr   x10, [x5, #8]"  "\n\t" // n
         "ldxrh w8, [x9]"       "\n\t"
         "add   x8, x8, x10"    "\n\t"
         "stxrh w4, w8, [x9]"    "\n\t"
         "str   x4, [x5, #16]"   "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "cc", "x5", "x8", "x9", "x10", "x4"
      );
   } while (block[2] != 0);
#elif defined(VGA_s390x)
   /* Compare-and-swap loop on the 4-byte word at p (note that the asm
      reads a full word through the "+m"(*p) operand).  The old word
      (r0) is copied to `dummy`, n is added to dummy's first two bytes
      only (icm/stcm with mask 3), and the result (r1) is swapped in
      with cs.  On failure jl retries from label 0 without an explicit
      reload, relying on cs having refreshed r0 with the current memory
      contents. */
   int dummy;
   __asm__ __volatile__(
      "   l	0,%0\n\t"
      "0: st	0,%1\n\t"
      "   icm	1,3,%1\n\t"
      "   ar	1,%2\n\t"
      "   stcm  1,3,%1\n\t"
      "   l     1,%1\n\t"
      "   cs	0,1,%0\n\t"
      "   jl    0b\n\t"
      : "+m" (*p), "+m" (dummy)
      : "d" (n)
      : "cc", "memory", "0", "1");
#elif defined(VGA_mips32)
   /* We rely on the fact that p is 4-aligned. Otherwise 'll' may throw an
      exception that can cause this function to fail.
      block = { p, n (pre-shifted on big-endian), result }.  sc leaves
      its success flag in $t3, which is saved to block[2]; the loop
      retries until that flag is 1. */
#if defined (_MIPSEL)
   /* Little-endian: the halfword at p is the low half of the word, so
      the add is confined to the low 16 bits and the upper 16 bits are
      merged back unchanged. */
   unsigned int block[3]
      = { (unsigned int)p, (unsigned int)n, 0x0 };
   do {
      __asm__ __volatile__(
         "move $t0, %0"           "\n\t"
         "lw   $t1, 0($t0)"       "\n\t"  // p
         "lw   $t2, 4($t0)"       "\n\t"  // n
         "andi $t2, $t2, 0xFFFF"  "\n\t"  // n = n and 0xFFFF
         "li   $t4, 0xFFFF"       "\n\t"
         "nor  $t4, $t4, $zero"   "\n\t"  // $t4 = 0xFFFF0000
         "ll   $t3, 0($t1)"       "\n\t"  // $t3 = old value
         "and  $t4, $t4, $t3"     "\n\t"  // $t4 = $t3 and 0xFFFF0000
         "addu $t3, $t3, $t2"     "\n\t"  // $t3 = $t3 + n
         "andi $t3, $t3, 0xFFFF"  "\n\t"  // $t3 = $t3 and 0xFFFF
         "or   $t3, $t3, $t4"     "\n\t"  // $t3 = $t3 or $t4
         "sc   $t3, 0($t1)"       "\n\t"
         "sw   $t3, 8($t0)"       "\n\t"  // save result
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "t0", "t1", "t2", "t3", "t4"
      );
   } while (block[2] != 1);
#elif defined (_MIPSEB)
   /* Big-endian: the halfword at p is the upper half of the word, so
      n is pre-shifted by 16 and a plain word add suffices. */
   unsigned int block[3]
      = { (unsigned int)p, (unsigned int)n << 16, 0x0 };
   do {
      __asm__ __volatile__(
         "move $t0, %0"          "\n\t"
         "lw   $t1, 0($t0)"      "\n\t"  // p
         "lw   $t2, 4($t0)"      "\n\t"  // n
         "ll   $t3, 0($t1)"      "\n\t"
         "addu $t3, $t3, $t2"    "\n\t"
         "sc   $t3, 0($t1)"      "\n\t"
         "sw   $t3, 8($t0)"      "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "t0", "t1", "t2", "t3"
      );
   } while (block[2] != 1);
#endif
#elif defined(VGA_mips64)
   /* We rely on the fact that p is suitably aligned: 4 bytes for the
      'll' used by the little-endian variant, 8 bytes for the 'lld'
      used by the big-endian one (the caller 8-aligns p).  Otherwise
      the load-linked may throw an exception that can cause this
      function to fail.  As on mips32, the store-conditional's success
      flag ends up in block[2] and the loop retries until it is 1. */
#if defined (_MIPSEL)
   unsigned long block[3]
      = { (unsigned long)p, (unsigned long)n, 0x0ULL };
   do {
      __asm__ __volatile__(
         "move $t0, %0"           "\n\t"
         "ld   $t1, 0($t0)"       "\n\t"  // p
         "ld   $t2, 8($t0)"       "\n\t"  // n
         "andi $t2, $t2, 0xFFFF"  "\n\t"  // n = n and 0xFFFF
         "li   $s0, 0xFFFF"       "\n\t"
         "nor  $s0, $s0, $zero"   "\n\t"  // $s0= 0xFFFF0000
         "ll   $t3, 0($t1)"       "\n\t"  // $t3 = old value
         "and  $s0, $s0, $t3"     "\n\t"  // $s0 = $t3 and 0xFFFF0000
         "addu $t3, $t3, $t2"     "\n\t"  // $t3 = $t3 + n
         "andi $t3, $t3, 0xFFFF"  "\n\t"  // $t3 = $t3 and 0xFFFF
         "or   $t3, $t3, $s0"     "\n\t"  // $t3 = $t3 or $s0
         "sc   $t3, 0($t1)"       "\n\t"
         "sw   $t3, 16($t0)"      "\n\t"  // save result
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "t0", "t1", "t2", "t3", "s0"
      );
   } while (block[2] != 1);
#elif defined (_MIPSEB)
   /* Big-endian: operate on the doubleword at p with n pre-shifted
      into its top 16 bits. */
   unsigned long block[3]
      = { (unsigned long)p, (unsigned long)n << 48, 0x0 };
   do {
      __asm__ __volatile__(
         "move  $t0, %0"          "\n\t"
         "ld    $t1, 0($t0)"      "\n\t"  // p
         "ld    $t2, 8($t0)"      "\n\t"  // n
         "lld   $t3, 0($t1)"      "\n\t"
         "daddu $t3, $t3, $t2"    "\n\t"
         "scd   $t3, 0($t1)"      "\n\t"
         "sd    $t3, 16($t0)"     "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "t0", "t1", "t2", "t3"
      );
   } while (block[2] != 1);
#endif
#else
# error "Unsupported arch"
#endif
}
422
/* Atomically perform *p += n on the 32-bit value at p.

   p  address of the word to update.  The ppc64 variant operates on
      the whole doubleword that starts at p, so p must be 8-aligned;
      main() asserts this before calling.
   n  amount to add.

   Exactly one implementation is compiled, chosen by a VGA_<arch>
   macro (presumably supplied by the build system -- nothing in this
   file defines one).  An unknown architecture is a compile-time
   error.  noinline keeps every call a genuine out-of-line call. */
__attribute__((noinline)) void atomic_add_32bit ( int* p, int n )
{
#if defined(VGA_x86)
   /* p and n travel through a two-word block addressed by %esi; the
      asm reloads them into %eax/%ebx and does a LOCK-prefixed 32-bit
      add. */
   unsigned long block[2];
   block[0] = (unsigned long)p;
   block[1] = n;
   __asm__ __volatile__(
      "movl 0(%%esi),%%eax"       "\n\t"
      "movl 4(%%esi),%%ebx"       "\n\t"
      "lock; addl %%ebx,(%%eax)"  "\n"
      : : "S"(&block[0])/* S means "esi only" */ : "memory","cc","eax","ebx"
   );
#elif defined(VGA_amd64)
   /* Same scheme as x86, using 64-bit registers and 8-byte slots. */
   unsigned long block[2];
   block[0] = (unsigned long)p;
   block[1] = n;
   __asm__ __volatile__(
      "movq 0(%%rsi),%%rax"       "\n\t"
      "movq 8(%%rsi),%%rbx"       "\n\t"
      "lock; addl %%ebx,(%%rax)"  "\n"
      : : "S"(&block[0])/* S means "rsi only" */ : "memory","cc","rax","rbx"
   );
#elif defined(VGA_ppc32)
   /* Native word-sized lwarx/stwcx. loop.  mfcr/srwi/andi. pull out
      the CR0.EQ bit, which stwcx. sets when the conditional store
      succeeded; retry until it did. */
   unsigned long success;
   do {
      __asm__ __volatile__(
         "lwarx  15,0,%1"    "\n\t"
         "add    15,15,%2"   "\n\t"
         "stwcx. 15,0,%1"    "\n\t"
         "mfcr   %0"         "\n\t"
         "srwi   %0,%0,29"   "\n\t"
         "andi.  %0,%0,1"    "\n"
         : /*out*/"=b"(success)
         : /*in*/ "b"(p), "b"(n)
         : /*trash*/ "memory", "cc", "r15"
      );
   } while (success != 1);
#elif defined(VGA_ppc64)
   /* Nasty hack.  Does correctly atomically do *p += n, but only if p
      is 8-aligned -- guaranteed by caller.  The add is done on the
      whole doubleword at p with n shifted into its upper 32 bits, so
      the code assumes the word at p is the more significant half
      (big-endian). */
   unsigned long success;
   do {
      __asm__ __volatile__(
         "ldarx  15,0,%1"    "\n\t"
         "add    15,15,%2"   "\n\t"
         "stdcx. 15,0,%1"    "\n\t"
         "mfcr   %0"         "\n\t"
         "srwi   %0,%0,29"   "\n\t"
         "andi.  %0,%0,1"    "\n"
         : /*out*/"=b"(success)
         : /*in*/ "b"(p), "b"(((unsigned long)n) << 32)
         : /*trash*/ "memory", "cc", "r15"
      );
   } while (success != 1);
#elif defined(VGA_arm)
   /* block = { p, n, status }.  The asm does a word-sized exclusive
      load/add/store and writes the strex status into block[2]; the
      loop retries until that status is 0. */
   unsigned int block[3]
      = { (unsigned int)p, (unsigned int)n, 0xFFFFFFFF };
   do {
      __asm__ __volatile__(
         "mov   r5, %0"         "\n\t"
         "ldr   r9, [r5, #0]"   "\n\t" // p
         "ldr   r10, [r5, #4]"  "\n\t" // n
         "ldrex r8, [r9]"       "\n\t"
         "add   r8, r8, r10"    "\n\t"
         "strex r4, r8, [r9]"   "\n\t"
         "str   r4, [r5, #8]"   "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "cc", "r5", "r8", "r9", "r10", "r4"
      );
   } while (block[2] != 0);
#elif defined(VGA_arm64)
   /* Same scheme as arm with 8-byte block slots: 32-bit ldxr/stxr
      (w registers), status stored to block[2], retry until it is 0. */
   unsigned long long int block[3]
      = { (unsigned long long int)p, (unsigned long long int)n,
          0xFFFFFFFFFFFFFFFFULL};
   do {
      __asm__ __volatile__(
         "mov   x5, %0"         "\n\t"
         "ldr   x9, [x5, #0]"   "\n\t" // p
         "ldr   x10, [x5, #8]"  "\n\t" // n
         "ldxr  w8, [x9]"       "\n\t"
         "add   x8, x8, x10"    "\n\t"
         "stxr  w4, w8, [x9]"    "\n\t"
         "str   x4, [x5, #16]"   "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "cc", "x5", "x8", "x9", "x10", "x4"
      );
   } while (block[2] != 0);
#elif defined(VGA_s390x)
   /* Compare-and-swap loop: r0 holds the expected old word, r1 the
      old word plus n.  On failure jl retries from label 0 without an
      explicit reload, relying on cs having refreshed r0 with the
      current memory contents. */
   __asm__ __volatile__(
      "   l	0,%0\n\t"
      "0: lr	1,0\n\t"
      "   ar	1,%1\n\t"
      "   cs	0,1,%0\n\t"
      "   jl    0b\n\t"
      : "+m" (*p)
      : "d" (n)
      : "cc", "memory", "0", "1");
#elif defined(VGA_mips32)
   /* block = { p, n, result }.  Word-sized ll/sc; sc leaves its
      success flag in $t3, which is saved to block[2]; the loop retries
      until that flag is 1. */
   unsigned int block[3]
      = { (unsigned int)p, (unsigned int)n, 0x0 };
   do {
      __asm__ __volatile__(
         "move $t0, %0"        "\n\t"
         "lw   $t1, 0($t0)"    "\n\t"  // p
         "lw   $t2, 4($t0)"    "\n\t"  // n
         "ll   $t3, 0($t1)"    "\n\t"
         "addu $t3, $t3, $t2"  "\n\t"
         "sc   $t3, 0($t1)"    "\n\t"
         "sw   $t3, 8($t0)"    "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "t0", "t1", "t2", "t3"
      );
   } while (block[2] != 1);
#elif defined(VGA_mips64)
   /* Same as mips32 but with 8-byte block slots (ld/sd); the atomic
      itself is still the 32-bit ll/sc pair. */
   unsigned long block[3]
      = { (unsigned long)p, (unsigned long)n, 0x0ULL };
   do {
      __asm__ __volatile__(
         "move  $t0, %0"        "\n\t"
         "ld    $t1, 0($t0)"    "\n\t"  // p
         "ld    $t2, 8($t0)"    "\n\t"  // n
         "ll    $t3, 0($t1)"    "\n\t"
         "addu  $t3, $t3, $t2"  "\n\t"
         "sc    $t3, 0($t1)"    "\n\t"
         "sd    $t3, 16($t0)"   "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "t0", "t1", "t2", "t3"
      );
   } while (block[2] != 1);
#else
# error "Unsupported arch"
#endif
}
560
/* Atomically perform *p += n on the 64-bit value at p.

   p  address of the doubleword to update; main() passes an 8-aligned
      address.
   n  amount to add.

   On x86, ppc32 and mips32 this function deliberately does nothing,
   so *p keeps its previous value there (main() accepts 0 as a valid
   final 64-bit result for that reason).

   Exactly one implementation is compiled, chosen by a VGA_<arch>
   macro (presumably supplied by the build system -- nothing in this
   file defines one).  An unknown architecture is a compile-time
   error.  noinline keeps every call a genuine out-of-line call. */
__attribute__((noinline)) void atomic_add_64bit ( long long int* p, int n )
{
#if defined(VGA_x86) || defined(VGA_ppc32) || defined(VGA_mips32)
   /* do nothing; is not supported */
#elif defined(VGA_amd64)
   // this is a bit subtle.  It relies on the fact that, on a 64-bit platform,
   // sizeof(unsigned long long int) == sizeof(unsigned long) == sizeof(void*)
   unsigned long long int block[2];
   block[0] = (unsigned long long int)(unsigned long)p;
   block[1] = n;
   __asm__ __volatile__(
      "movq 0(%%rsi),%%rax"      "\n\t"
      "movq 8(%%rsi),%%rbx"      "\n\t"
      "lock; addq %%rbx,(%%rax)" "\n"
      : : "S"(&block[0])/* S means "rsi only" */ : "memory","cc","rax","rbx"
   );
#elif defined(VGA_ppc64)
   /* Native doubleword ldarx/stdcx. loop.  mfcr/srwi/andi. pull out
      the CR0.EQ bit, which stdcx. sets when the conditional store
      succeeded; retry until it did. */
   unsigned long success;
   do {
      __asm__ __volatile__(
         "ldarx  15,0,%1"    "\n\t"
         "add    15,15,%2"   "\n\t"
         "stdcx. 15,0,%1"    "\n\t"
         "mfcr   %0"         "\n\t"
         "srwi   %0,%0,29"   "\n\t"
         "andi.  %0,%0,1"    "\n"
         : /*out*/"=b"(success)
         : /*in*/ "b"(p), "b"(n)
         : /*trash*/ "memory", "cc", "r15"
      );
   } while (success != 1);
#elif defined(VGA_arm)
   /* block = { p, n, status }, each slot 8 bytes.  The 64-bit add is
      an adds/adc pair between ldrexd and strexd.  The 32-bit strexd
      status is stored with a word-sized str at offset 16, so it
      overwrites only the first four bytes of block[2]; the other four
      keep their initial 0xFFFFFFFF.  Hence success (status 0) shows up
      as 0xFFFFFFFF00000000 -- the status landing in the low half
      assumes a little-endian target. */
   unsigned long long int block[3]
     = { (unsigned long long int)(unsigned long)p,
         (unsigned long long int)n,
         0xFFFFFFFFFFFFFFFFULL };
   do {
      __asm__ __volatile__(
         "mov    r5, %0"             "\n\t"
         "ldr    r8,     [r5, #0]"   "\n\t" // p
         "ldrd   r2, r3, [r5, #8]"   "\n\t" // n
         "ldrexd r0, r1, [r8]"       "\n\t"
         "adds   r2, r2, r0"         "\n\t"
         "adc    r3, r3, r1"         "\n\t"
         "strexd r1, r2, r3, [r8]"   "\n\t"
         "str    r1, [r5, #16]"      "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "cc", "r5", "r0", "r1", "r8", "r2", "r3"
      );
   } while (block[2] != 0xFFFFFFFF00000000ULL);
#elif defined(VGA_arm64)
   /* block = { p, n, status }.  64-bit ldxr/stxr (x8), status (w4)
      stored to block[2]; the loop retries until that status is 0. */
   unsigned long long int block[3]
      = { (unsigned long long int)p, (unsigned long long int)n,
          0xFFFFFFFFFFFFFFFFULL};
   do {
      __asm__ __volatile__(
         "mov   x5, %0"         "\n\t"
         "ldr   x9, [x5, #0]"   "\n\t" // p
         "ldr   x10, [x5, #8]"  "\n\t" // n
         "ldxr  x8, [x9]"       "\n\t"
         "add   x8, x8, x10"    "\n\t"
         "stxr  w4, x8, [x9]"   "\n\t"
         "str   x4, [x5, #16]"   "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "cc", "x5", "x8", "x9", "x10", "x4"
      );
   } while (block[2] != 0);
#elif defined(VGA_s390x)
   /* 64-bit compare-and-swap loop: r0 holds the expected old value,
      r1 the old value plus n.  On failure jl retries from label 0
      without an explicit reload, relying on csg having refreshed r0
      with the current memory contents. */
   __asm__ __volatile__(
      "   lg	0,%0\n\t"
      "0: lgr	1,0\n\t"
      "   agr	1,%1\n\t"
      "   csg	0,1,%0\n\t"
      "   jl    0b\n\t"
      : "+m" (*p)
      : "d" (n)
      : "cc", "memory", "0", "1");
#elif defined(VGA_mips64)
   /* block = { p, n, result }.  Doubleword lld/scd; scd leaves its
      success flag in $t3, which is saved to block[2]; the loop retries
      until that flag is 1. */
   unsigned long block[3]
      = { (unsigned long)p, (unsigned long)n, 0x0ULL };
   do {
      __asm__ __volatile__(
         "move  $t0, %0"        "\n\t"
         "ld    $t1, 0($t0)"    "\n\t" // p
         "ld    $t2, 8($t0)"    "\n\t" // n
         "lld   $t3, 0($t1)"    "\n\t"
         "daddu $t3, $t3, $t2"  "\n\t"
         "scd   $t3, 0($t1)"    "\n\t"
         "sd    $t3, 16($t0)"   "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "t0", "t1", "t2", "t3"
      );
   } while (block[2] != 1);
#else
# error "Unsupported arch"
#endif
}
661
662int main ( int argc, char** argv )
663{
664   int    i, status;
665   char*  page;
666   char*  p8;
667   short* p16;
668   int*   p32;
669   long long int* p64;
670   pid_t  child, p2;
671
672   printf("parent, pre-fork\n");
673
674   page = mmap( 0, sysconf(_SC_PAGESIZE),
675                   PROT_READ|PROT_WRITE,
676                   MAP_ANONYMOUS|MAP_SHARED, -1, 0 );
677   if (page == MAP_FAILED) {
678      perror("mmap failed");
679      exit(1);
680   }
681
682   p8  = (char*)(page+0);
683   p16 = (short*)(page+256);
684   p32 = (int*)(page+512);
685   p64 = (long long int*)(page+768);
686
687   assert( IS_8_ALIGNED(p8) );
688   assert( IS_8_ALIGNED(p16) );
689   assert( IS_8_ALIGNED(p32) );
690   assert( IS_8_ALIGNED(p64) );
691
692   memset(page, 0, 1024);
693
694   *p8  = 0;
695   *p16 = 0;
696   *p32 = 0;
697   *p64 = 0;
698
699   child = fork();
700   if (child == -1) {
701      perror("fork() failed\n");
702      return 1;
703   }
704
705   if (child == 0) {
706      /* --- CHILD --- */
707      printf("child\n");
708      for (i = 0; i < NNN; i++) {
709         atomic_add_8bit(p8, 1);
710         atomic_add_16bit(p16, 1);
711         atomic_add_32bit(p32, 1);
712         atomic_add_64bit(p64, 98765 ); /* ensure we hit the upper 32 bits */
713      }
714      return 1;
715      /* NOTREACHED */
716
717   }
718
719   /* --- PARENT --- */
720
721   printf("parent\n");
722
723   for (i = 0; i < NNN; i++) {
724      atomic_add_8bit(p8, 1);
725      atomic_add_16bit(p16, 1);
726      atomic_add_32bit(p32, 1);
727      atomic_add_64bit(p64, 98765 ); /* ensure we hit the upper 32 bits */
728   }
729
730   p2 = waitpid(child, &status, 0);
731   assert(p2 == child);
732
733   /* assert that child finished normally */
734   assert(WIFEXITED(status));
735
736   printf("FINAL VALUES:  8 bit %d,  16 bit %d,  32 bit %d,  64 bit %lld\n",
737          (int)(*(signed char*)p8), (int)(*p16), *p32, *p64 );
738
739   if (-74 == (int)(*(signed char*)p8)
740       && 32694 == (int)(*p16)
741       && 6913974 == *p32
742       && (0LL == *p64 || 682858642110LL == *p64)) {
743      printf("PASS\n");
744   } else {
745      printf("FAIL -- see source code for expected values\n");
746   }
747
748   printf("parent exits\n");
749
750   return 0;
751}
752