1// Copyright 2010 the V8 project authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5// This file is an internal atomic implementation, use atomicops.h instead. 6// 7// LinuxKernelCmpxchg and Barrier_AtomicIncrement are from Google Gears. 8 9#ifndef V8_BASE_ATOMICOPS_INTERNALS_ARM_GCC_H_ 10#define V8_BASE_ATOMICOPS_INTERNALS_ARM_GCC_H_ 11 12#if defined(__QNXNTO__) 13#include <sys/cpuinline.h> 14#endif 15 16namespace v8 { 17namespace base { 18 19// Memory barriers on ARM are funky, but the kernel is here to help: 20// 21// * ARMv5 didn't support SMP, there is no memory barrier instruction at 22// all on this architecture, or when targeting its machine code. 23// 24// * Some ARMv6 CPUs support SMP. A full memory barrier can be produced by 25// writing a random value to a very specific coprocessor register. 26// 27// * On ARMv7, the "dmb" instruction is used to perform a full memory 28// barrier (though writing to the co-processor will still work). 29// However, on single core devices (e.g. Nexus One, or Nexus S), 30// this instruction will take up to 200 ns, which is huge, even though 31// it's completely un-needed on these devices. 32// 33// * There is no easy way to determine at runtime if the device is 34// single or multi-core. However, the kernel provides a useful helper 35// function at a fixed memory address (0xffff0fa0), which will always 36// perform a memory barrier in the most efficient way. I.e. on single 37// core devices, this is an empty function that exits immediately. 38// On multi-core devices, it implements a full memory barrier. 39// 40// * This source could be compiled to ARMv5 machine code that runs on a 41// multi-core ARMv6 or ARMv7 device. In this case, memory barriers 42// are needed for correct execution. Always call the kernel helper, even 43// when targeting ARMv5TE. 
//

// Issues a full memory barrier using the most efficient mechanism
// available on the current platform (see the discussion above).
inline void MemoryBarrier() {
#if defined(__linux__) || defined(__ANDROID__)
  // Note: This is a function call, which is also an implicit compiler barrier.
  typedef void (*KernelMemoryBarrierFunc)();
  // 0xffff0fa0 is the fixed address of the kernel-provided barrier helper
  // described in the comment block above.
  ((KernelMemoryBarrierFunc)0xffff0fa0)();
#elif defined(__QNXNTO__)
  __cpu_membarrier();
#else
#error MemoryBarrier() is not implemented on this platform.
#endif
}

// An ARM toolchain would only define one of these depending on which
// variant of the target architecture is being used. This tests against
// any known ARMv6 or ARMv7 variant, where it is possible to directly
// use ldrex/strex instructions to implement fast atomic operations.
#if defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \
    defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || \
    defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \
    defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || \
    defined(__ARM_ARCH_6KZ__) || defined(__ARM_ARCH_6T2__)

// Atomically compares *ptr with old_value and, if they are equal, stores
// new_value to *ptr. Returns the value of *ptr observed before the store
// attempt. Provides no memory ordering guarantees.
inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr,
                                         Atomic32 old_value,
                                         Atomic32 new_value) {
  Atomic32 prev_value;
  int reloop;
  do {
    // The following is equivalent to:
    //
    //   prev_value = LDREX(ptr)
    //   reloop = 0
    //   if (prev_value == old_value)
    //      reloop = STREX(ptr, new_value)
    //
    // STREX writes a non-zero status on failure (exclusive monitor lost),
    // in which case we loop and retry the whole sequence.
    __asm__ __volatile__("    ldrex %0, [%3]\n"
                         "    mov %1, #0\n"
                         "    cmp %0, %4\n"
#ifdef __thumb2__
                         // Thumb-2 requires an IT (if-then) prefix before the
                         // conditional strexeq below.
                         "    it eq\n"
#endif
                         "    strexeq %1, %5, [%3]\n"
                         : "=&r"(prev_value), "=&r"(reloop), "+m"(*ptr)
                         : "r"(ptr), "r"(old_value), "r"(new_value)
                         : "cc", "memory");
  } while (reloop != 0);
  return prev_value;
}

// Compare-and-swap with acquire semantics: the full barrier is issued
// after the swap attempt.
inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr,
                                       Atomic32 old_value,
                                       Atomic32 new_value) {
  Atomic32 result = NoBarrier_CompareAndSwap(ptr, old_value, new_value);
  MemoryBarrier();
  return result;
}

// Compare-and-swap with release semantics: the full barrier is issued
// before the swap attempt.
inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr,
                                       Atomic32 old_value,
                                       Atomic32 new_value) {
  MemoryBarrier();
  return NoBarrier_CompareAndSwap(ptr, old_value, new_value);
}

// Atomically adds |increment| to *ptr and returns the new (incremented)
// value. Provides no memory ordering guarantees.
inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32* ptr,
                                          Atomic32 increment) {
  Atomic32 value;
  int reloop;
  do {
    // Equivalent to:
    //
    //  value = LDREX(ptr)
    //  value += increment
    //  reloop = STREX(ptr, value)
    //
    __asm__ __volatile__("    ldrex %0, [%3]\n"
                         "    add %0, %0, %4\n"
                         "    strex %1, %0, [%3]\n"
                         : "=&r"(value), "=&r"(reloop), "+m"(*ptr)
                         : "r"(ptr), "r"(increment)
                         : "cc", "memory");
  } while (reloop);
  return value;
}

// Atomic increment bracketed by full memory barriers on both sides.
inline Atomic32 Barrier_AtomicIncrement(volatile Atomic32* ptr,
                                        Atomic32 increment) {
  // TODO(digit): Investigate if it's possible to implement this with
  // a single MemoryBarrier() operation between the LDREX and STREX.
  // See http://crbug.com/246514
  MemoryBarrier();
  Atomic32 result = NoBarrier_AtomicIncrement(ptr, increment);
  MemoryBarrier();
  return result;
}

// Atomically stores new_value to *ptr and returns the value that was
// there before. Provides no memory ordering guarantees.
inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr,
                                         Atomic32 new_value) {
  Atomic32 old_value;
  int reloop;
  do {
    // Equivalent to:
    //
    //  old_value = LDREX(ptr)
    //  reloop = STREX(ptr, new_value)
    __asm__ __volatile__("    ldrex %0, [%3]\n"
                         "    strex %1, %4, [%3]\n"
                         : "=&r"(old_value), "=&r"(reloop), "+m"(*ptr)
                         : "r"(ptr), "r"(new_value)
                         : "cc", "memory");
  } while (reloop != 0);
  return old_value;
}

// This tests against any known ARMv5 variant.
#elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || \
    defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__)

// The kernel also provides a helper function to perform an atomic
// compare-and-swap operation at the hard-wired address 0xffff0fc0.
// On ARMv5, this is implemented by a special code path that the kernel
// detects and treats specially when thread pre-emption happens.
// On ARMv6 and higher, it uses LDREX/STREX instructions instead.
//
// Note that this always performs a full memory barrier, so there is no
// need to add calls to MemoryBarrier() before or after it. It also
// returns 0 on success, and non-zero on failure — the retry loops below
// rely on exactly that convention.
//
// Available and reliable since Linux 2.6.24. Both Android and ChromeOS
// use newer kernel revisions, so this should not be a concern.
namespace {

// Thin wrapper around the kernel cmpxchg helper at the fixed address
// 0xffff0fc0. Returns 0 if *ptr matched old_value and new_value was
// stored, non-zero otherwise (callers must re-read *ptr and retry).
inline int LinuxKernelCmpxchg(Atomic32 old_value,
                              Atomic32 new_value,
                              volatile Atomic32* ptr) {
  typedef int (*KernelCmpxchgFunc)(Atomic32, Atomic32, volatile Atomic32*);
  return ((KernelCmpxchgFunc)0xffff0fc0)(old_value, new_value, ptr);
}

}  // namespace

// Atomically compares *ptr with old_value and, if they are equal, stores
// new_value to *ptr. Returns the value of *ptr observed before the
// operation. (The kernel helper already acts as a full barrier; see the
// note above.)
inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr,
                                         Atomic32 old_value,
                                         Atomic32 new_value) {
  Atomic32 prev_value;
  for (;;) {
    prev_value = *ptr;
    if (prev_value != old_value)
      return prev_value;
    if (!LinuxKernelCmpxchg(old_value, new_value, ptr))
      return old_value;
    // Otherwise *ptr changed between the read and the cmpxchg; loop and
    // re-examine it.
  }
}

// Atomically stores new_value to *ptr and returns the previous value.
inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr,
                                         Atomic32 new_value) {
  Atomic32 old_value;
  do {
    old_value = *ptr;
  } while (LinuxKernelCmpxchg(old_value, new_value, ptr));
  return old_value;
}

// On ARMv5 the kernel helper always issues full barriers, so the
// "no barrier" increment simply forwards to the barrier version defined
// just below (both are declared ahead of this point via atomicops.h —
// presumably; verify against the including header).
inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32* ptr,
                                          Atomic32 increment) {
  return Barrier_AtomicIncrement(ptr, increment);
}

// Atomically adds |increment| to *ptr and returns the incremented value.
// Barriers are provided by the kernel cmpxchg helper.
inline Atomic32 Barrier_AtomicIncrement(volatile Atomic32* ptr,
                                        Atomic32 increment) {
  for (;;) {
    // Atomic exchange the old value with an incremented one.
    Atomic32 old_value = *ptr;
    Atomic32 new_value = old_value + increment;
    if (!LinuxKernelCmpxchg(old_value, new_value, ptr)) {
      // The exchange took place as expected.
      return new_value;
    }
    // Otherwise, *ptr changed mid-loop and we need to retry.
  }
}

// Compare-and-swap that guarantees a full barrier on every path, including
// the early-return path where no store happens.
inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr,
                                       Atomic32 old_value,
                                       Atomic32 new_value) {
  Atomic32 prev_value;
  for (;;) {
    prev_value = *ptr;
    if (prev_value != old_value) {
      // Always ensure acquire semantics.
      MemoryBarrier();
      return prev_value;
    }
    if (!LinuxKernelCmpxchg(old_value, new_value, ptr))
      return old_value;
  }
}

// Compare-and-swap with release semantics.
inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr,
                                       Atomic32 old_value,
                                       Atomic32 new_value) {
  // This could be implemented as:
  //    MemoryBarrier();
  //    return NoBarrier_CompareAndSwap();
  //
  // But would use 3 barriers per successful CAS. To save performance,
  // use Acquire_CompareAndSwap(). Its implementation guarantees that:
  // - A successful swap uses only 2 barriers (in the kernel helper).
  // - An early return due to (prev_value != old_value) performs
  //   a memory barrier with no store, which is equivalent to the
  //   generic implementation above.
  return Acquire_CompareAndSwap(ptr, old_value, new_value);
}

#else
#  error "Your CPU's ARM architecture is not supported yet"
#endif

// NOTE: Atomicity of the following load and store operations is only
// guaranteed in case of 32-bit alignment of |ptr| values.
263 264inline void NoBarrier_Store(volatile Atomic32* ptr, Atomic32 value) { 265 *ptr = value; 266} 267 268inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) { 269 *ptr = value; 270 MemoryBarrier(); 271} 272 273inline void Release_Store(volatile Atomic32* ptr, Atomic32 value) { 274 MemoryBarrier(); 275 *ptr = value; 276} 277 278inline Atomic32 NoBarrier_Load(volatile const Atomic32* ptr) { return *ptr; } 279 280inline Atomic32 Acquire_Load(volatile const Atomic32* ptr) { 281 Atomic32 value = *ptr; 282 MemoryBarrier(); 283 return value; 284} 285 286inline Atomic32 Release_Load(volatile const Atomic32* ptr) { 287 MemoryBarrier(); 288 return *ptr; 289} 290 291// Byte accessors. 292 293inline void NoBarrier_Store(volatile Atomic8* ptr, Atomic8 value) { 294 *ptr = value; 295} 296 297inline Atomic8 NoBarrier_Load(volatile const Atomic8* ptr) { return *ptr; } 298 299} } // namespace v8::base 300 301#endif // V8_BASE_ATOMICOPS_INTERNALS_ARM_GCC_H_ 302