1 /*
2  * Copyright (C) 2017 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 // Our goal is to measure the cost of various C++ atomic operations.
18 // Android doesn't really control those. But since some of these operations can be quite
19 // expensive, this may be useful input for development of higher level code.
20 // Expected mappings from C++ atomics to hardware primitives can be found at
21 // http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html .
22 
23 #include <benchmark/benchmark.h>
24 #include <atomic>
25 #include <mutex>
26 
27 // We time atomic operations separated by a volatile (not atomic!) increment.  This ensures
28 // that the compiler emits memory instructions (e.g. load or store) prior to any fence or the
29 // like.  That in turn ensures that the CPU has outstanding memory operations when the fence
30 // is executed.
31 
32 // In most respects, we compute best case values. Since there is only one thread, there are no
33 // coherence misses.
34 
35 // We assume that the compiler is not smart enough to optimize away fences in a single-threaded
36 // program. If that changes, we'll need to add a second thread.
37 
38 volatile unsigned counter;
39 
40 std::atomic<int> test_loc(0);
41 
42 volatile unsigned sink;
43 
44 std::mutex mtx;
45 
BM_empty(benchmark::State & state)46 void BM_empty(benchmark::State& state) {
47   while (state.KeepRunning()) {
48     ++counter;
49   }
50 }
51 BENCHMARK(BM_empty);
52 
BM_load_relaxed(benchmark::State & state)53 static void BM_load_relaxed(benchmark::State& state) {
54   unsigned result = 0;
55   while (state.KeepRunning()) {
56     result += test_loc.load(std::memory_order_relaxed);
57     ++counter;
58   }
59   sink = result;
60 }
61 BENCHMARK(BM_load_relaxed);
62 
BM_load_acquire(benchmark::State & state)63 static void BM_load_acquire(benchmark::State& state) {
64   unsigned result = 0;
65   while (state.KeepRunning()) {
66     result += test_loc.load(std::memory_order_acquire);
67     ++counter;
68   }
69   sink = result;
70 }
71 BENCHMARK(BM_load_acquire);
72 
BM_store_release(benchmark::State & state)73 static void BM_store_release(benchmark::State& state) {
74   int i = counter;
75   while (state.KeepRunning()) {
76     test_loc.store(++i, std::memory_order_release);
77     ++counter;
78   }
79 }
80 BENCHMARK(BM_store_release);
81 
BM_store_seq_cst(benchmark::State & state)82 static void BM_store_seq_cst(benchmark::State& state) {
83   int i = counter;
84   while (state.KeepRunning()) {
85     test_loc.store(++i, std::memory_order_seq_cst);
86     ++counter;
87   }
88 }
89 BENCHMARK(BM_store_seq_cst);
90 
BM_fetch_add_relaxed(benchmark::State & state)91 static void BM_fetch_add_relaxed(benchmark::State& state) {
92   unsigned result = 0;
93   while (state.KeepRunning()) {
94     result += test_loc.fetch_add(1, std::memory_order_relaxed);
95     ++counter;
96   }
97   sink = result;
98 }
99 BENCHMARK(BM_fetch_add_relaxed);
100 
BM_fetch_add_seq_cst(benchmark::State & state)101 static void BM_fetch_add_seq_cst(benchmark::State& state) {
102   unsigned result = 0;
103   while (state.KeepRunning()) {
104     result += test_loc.fetch_add(1, std::memory_order_seq_cst);
105     ++counter;
106   }
107   sink = result;
108 }
109 BENCHMARK(BM_fetch_add_seq_cst);
110 
111 // The fence benchmarks include a relaxed load to make it much harder to optimize away
112 // the fence.
113 
BM_acquire_fence(benchmark::State & state)114 static void BM_acquire_fence(benchmark::State& state) {
115   unsigned result = 0;
116   while (state.KeepRunning()) {
117     result += test_loc.load(std::memory_order_relaxed);
118     std::atomic_thread_fence(std::memory_order_acquire);
119     ++counter;
120   }
121   sink = result;
122 }
123 BENCHMARK(BM_acquire_fence);
124 
BM_seq_cst_fence(benchmark::State & state)125 static void BM_seq_cst_fence(benchmark::State& state) {
126   unsigned result = 0;
127   while (state.KeepRunning()) {
128     result += test_loc.load(std::memory_order_relaxed);
129     std::atomic_thread_fence(std::memory_order_seq_cst);
130     ++counter;
131   }
132   sink = result;
133 }
134 BENCHMARK(BM_seq_cst_fence);
135 
136 // For comparison, also throw in a critical section version:
137 
BM_fetch_add_cs(benchmark::State & state)138 static void BM_fetch_add_cs(benchmark::State& state) {
139   unsigned result = 0;
140   while (state.KeepRunning()) {
141     {
142       std::lock_guard<std::mutex> _(mtx);
143       result += ++counter;
144     }
145   }
146   sink = result;
147 }
148 BENCHMARK(BM_fetch_add_cs);
149