// Copyright 2010 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// This file is an internal atomic implementation, use atomicops.h instead.
//
// LinuxKernelCmpxchg and Barrier_AtomicIncrement are from Google Gears.

#ifndef V8_BASE_ATOMICOPS_INTERNALS_ARM_GCC_H_
#define V8_BASE_ATOMICOPS_INTERNALS_ARM_GCC_H_

#if defined(__QNXNTO__)
#include <sys/cpuinline.h>
#endif

namespace v8 {
namespace base {

// Memory barriers on ARM are funky, but the kernel is here to help:
//
// * ARMv5 didn't support SMP; there is no memory barrier instruction at
//   all on this architecture, or when targeting its machine code.
//
// * Some ARMv6 CPUs support SMP. A full memory barrier can be produced by
//   writing a random value to a very specific coprocessor register.
//
// * On ARMv7, the "dmb" instruction is used to perform a full memory
//   barrier (though writing to the co-processor will still work).
//   However, on single core devices (e.g. Nexus One, or Nexus S),
//   this instruction will take up to 200 ns, which is huge, even though
//   it's completely un-needed on these devices.
//
// * There is no easy way to determine at runtime if the device is
//   single or multi-core. However, the kernel provides a useful helper
//   function at a fixed memory address (0xffff0fa0), which will always
//   perform a memory barrier in the most efficient way. I.e. on single
//   core devices, this is an empty function that exits immediately.
//   On multi-core devices, it implements a full memory barrier.
//
// * This source could be compiled to ARMv5 machine code that runs on a
//   multi-core ARMv6 or ARMv7 device. In this case, memory barriers
//   are needed for correct execution. Always call the kernel helper, even
//   when targeting ARMv5TE.
//

inline void MemoryBarrier() {
#if defined(__linux__) || defined(__ANDROID__)
  // Note: This is a function call, which is also an implicit compiler barrier.
  typedef void (*KernelMemoryBarrierFunc)();
  ((KernelMemoryBarrierFunc)0xffff0fa0)();
#elif defined(__QNXNTO__)
  __cpu_membarrier();
#else
#error MemoryBarrier() is not implemented on this platform.
#endif
}

// An ARM toolchain would only define one of these depending on which
// variant of the target architecture is being used. This tests against
// any known ARMv6 or ARMv7 variant, where it is possible to directly
// use ldrex/strex instructions to implement fast atomic operations.
#if defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \
    defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || \
    defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \
    defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || \
    defined(__ARM_ARCH_6KZ__) || defined(__ARM_ARCH_6T2__)

inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr,
                                         Atomic32 old_value,
                                         Atomic32 new_value) {
  Atomic32 prev_value;
  int reloop;
  do {
    // The following is equivalent to:
    //
    //   prev_value = LDREX(ptr)
    //   reloop = 0
    //   if (prev_value != old_value)
    //      reloop = STREX(ptr, new_value)
    __asm__ __volatile__("    ldrex %0, [%3]\n"
                         "    mov %1, #0\n"
                         "    cmp %0, %4\n"
#ifdef __thumb2__
                         "    it eq\n"
#endif
                         "    strexeq %1, %5, [%3]\n"
                         : "=&r"(prev_value), "=&r"(reloop), "+m"(*ptr)
                         : "r"(ptr), "r"(old_value), "r"(new_value)
                         : "cc", "memory");
  } while (reloop != 0);
  return prev_value;
}
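
// A note on the inline-assembly constraints used by the ldrex/strex loops in
// this file (added here for clarity): "=&r" marks an early-clobber output
// register, "+m"(*ptr) tells the compiler that the pointed-to memory is both
// read and written, the "cc" clobber covers the flag-setting "cmp", and the
// "memory" clobber keeps the compiler from reordering other memory accesses
// across the sequence.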

inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr,
                                       Atomic32 old_value,
                                       Atomic32 new_value) {
  Atomic32 result = NoBarrier_CompareAndSwap(ptr, old_value, new_value);
  MemoryBarrier();
  return result;
}

inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr,
                                       Atomic32 old_value,
                                       Atomic32 new_value) {
  MemoryBarrier();
  return NoBarrier_CompareAndSwap(ptr, old_value, new_value);
}
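
// Illustrative sketch (not part of this header): one way a caller might use
// these primitives is a minimal spinlock. Lock()/Unlock() are hypothetical
// names; Release_Store() is defined further down in this file.
//
//   static Atomic32 lock_word = 0;
//
//   void Lock() {
//     // Loop until we observe 0 and atomically swap in 1; the acquire
//     // barrier orders the critical section after the successful swap.
//     while (Acquire_CompareAndSwap(&lock_word, 0, 1) != 0) {
//       // Spin while another thread holds the lock.
//     }
//   }
//
//   void Unlock() {
//     // The release barrier publishes the critical section's writes
//     // before the lock word is cleared.
//     Release_Store(&lock_word, 0);
//   }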

inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32* ptr,
                                          Atomic32 increment) {
  Atomic32 value;
  int reloop;
  do {
    // Equivalent to:
    //
    //  value = LDREX(ptr)
    //  value += increment
    //  reloop = STREX(ptr, value)
    //
    __asm__ __volatile__("    ldrex %0, [%3]\n"
                         "    add %0, %0, %4\n"
                         "    strex %1, %0, [%3]\n"
                         : "=&r"(value), "=&r"(reloop), "+m"(*ptr)
                         : "r"(ptr), "r"(increment)
                         : "cc", "memory");
  } while (reloop);
  return value;
}

inline Atomic32 Barrier_AtomicIncrement(volatile Atomic32* ptr,
                                        Atomic32 increment) {
  // TODO(digit): Investigate if it's possible to implement this with
  // a single MemoryBarrier() operation between the LDREX and STREX.
  // See http://crbug.com/246514
  MemoryBarrier();
  Atomic32 result = NoBarrier_AtomicIncrement(ptr, increment);
  MemoryBarrier();
  return result;
}
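
// Illustrative sketch (not part of this header): a barriered decrement is one
// way to release a reference count safely. The names below are hypothetical.
//
//   static Atomic32 ref_count = 1;
//
//   void Release() {
//     // The full barriers around the decrement order prior writes before
//     // the decrement, and the destruction after it.
//     if (Barrier_AtomicIncrement(&ref_count, -1) == 0) {
//       DestroyObject();  // last reference dropped
//     }
//   }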

inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr,
                                         Atomic32 new_value) {
  Atomic32 old_value;
  int reloop;
  do {
    // old_value = LDREX(ptr)
    // reloop = STREX(ptr, new_value)
    __asm__ __volatile__("   ldrex %0, [%3]\n"
                         "   strex %1, %4, [%3]\n"
                         : "=&r"(old_value), "=&r"(reloop), "+m"(*ptr)
                         : "r"(ptr), "r"(new_value)
                         : "cc", "memory");
  } while (reloop != 0);
  return old_value;
}

// This tests against any known ARMv5 variant.
#elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || \
      defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__)

// The kernel also provides a helper function to perform an atomic
// compare-and-swap operation at the hard-wired address 0xffff0fc0.
// On ARMv5, this is implemented by a special code path that the kernel
// detects and treats specially when thread pre-emption happens.
// On ARMv6 and higher, it uses LDREX/STREX instructions instead.
//
// Note that this always performs a full memory barrier, so there is no
// need to add calls to MemoryBarrier() before or after it. It also
// returns 0 on success, and non-zero on failure.
//
// Available and reliable since Linux 2.6.24. Both Android and ChromeOS
// use newer kernel revisions, so this should not be a concern.
namespace {

inline int LinuxKernelCmpxchg(Atomic32 old_value,
                              Atomic32 new_value,
                              volatile Atomic32* ptr) {
  typedef int (*KernelCmpxchgFunc)(Atomic32, Atomic32, volatile Atomic32*);
  return ((KernelCmpxchgFunc)0xffff0fc0)(old_value, new_value, ptr);
}

}  // namespace

inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr,
                                         Atomic32 old_value,
                                         Atomic32 new_value) {
  Atomic32 prev_value;
  for (;;) {
    prev_value = *ptr;
    if (prev_value != old_value)
      return prev_value;
    if (!LinuxKernelCmpxchg(old_value, new_value, ptr))
      return old_value;
  }
}

inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr,
                                         Atomic32 new_value) {
  Atomic32 old_value;
  do {
    old_value = *ptr;
  } while (LinuxKernelCmpxchg(old_value, new_value, ptr));
  return old_value;
}

inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32* ptr,
                                          Atomic32 increment) {
  return Barrier_AtomicIncrement(ptr, increment);
}

inline Atomic32 Barrier_AtomicIncrement(volatile Atomic32* ptr,
                                        Atomic32 increment) {
  for (;;) {
    // Atomically exchange the old value with an incremented one.
    Atomic32 old_value = *ptr;
    Atomic32 new_value = old_value + increment;
    if (!LinuxKernelCmpxchg(old_value, new_value, ptr)) {
      // The exchange took place as expected.
      return new_value;
    }
    // Otherwise, *ptr changed mid-loop and we need to retry.
  }
}

inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr,
                                       Atomic32 old_value,
                                       Atomic32 new_value) {
  Atomic32 prev_value;
  for (;;) {
    prev_value = *ptr;
    if (prev_value != old_value) {
      // Always ensure acquire semantics.
      MemoryBarrier();
      return prev_value;
    }
    if (!LinuxKernelCmpxchg(old_value, new_value, ptr))
      return old_value;
  }
}

inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr,
                                       Atomic32 old_value,
                                       Atomic32 new_value) {
  // This could be implemented as:
  //    MemoryBarrier();
  //    return NoBarrier_CompareAndSwap();
  //
  // But that would use 3 barriers per successful CAS. To improve performance,
  // use Acquire_CompareAndSwap(). Its implementation guarantees that:
  // - A successful swap uses only 2 barriers (in the kernel helper).
  // - An early return due to (prev_value != old_value) performs
  //   a memory barrier with no store, which is equivalent to the
  //   generic implementation above.
  return Acquire_CompareAndSwap(ptr, old_value, new_value);
}

#else
#  error "Your CPU's ARM architecture is not supported yet"
#endif

// NOTE: Atomicity of the following load and store operations is only
// guaranteed when |ptr| values are 32-bit aligned.

inline void NoBarrier_Store(volatile Atomic32* ptr, Atomic32 value) {
  *ptr = value;
}

inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) {
  *ptr = value;
  MemoryBarrier();
}

inline void Release_Store(volatile Atomic32* ptr, Atomic32 value) {
  MemoryBarrier();
  *ptr = value;
}

inline Atomic32 NoBarrier_Load(volatile const Atomic32* ptr) { return *ptr; }

inline Atomic32 Acquire_Load(volatile const Atomic32* ptr) {
  Atomic32 value = *ptr;
  MemoryBarrier();
  return value;
}

inline Atomic32 Release_Load(volatile const Atomic32* ptr) {
  MemoryBarrier();
  return *ptr;
}
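
// Illustrative sketch (not part of this header): Release_Store() and
// Acquire_Load() can be paired to publish data between threads. The variable
// names below are hypothetical.
//
//   // Producer thread: write the payload, then publish the flag; the
//   // barrier runs before the flag store.
//   payload = ComputePayload();
//   Release_Store(&ready_flag, 1);
//
//   // Consumer thread: load the flag; the barrier runs after the load, so
//   // observing 1 guarantees the payload write is visible.
//   if (Acquire_Load(&ready_flag) == 1) {
//     UsePayload(payload);
//   }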

// Byte accessors.

inline void NoBarrier_Store(volatile Atomic8* ptr, Atomic8 value) {
  *ptr = value;
}

inline Atomic8 NoBarrier_Load(volatile const Atomic8* ptr) { return *ptr; }

} }  // namespace v8::base

#endif  // V8_BASE_ATOMICOPS_INTERNALS_ARM_GCC_H_