1 /*
2 * Copyright (C) 2017 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 // Our goal is to measure the cost of various C++ atomic operations.
18 // Android doesn't really control those. But since some of these operations can be quite
19 // expensive, this may be useful input for development of higher level code.
20 // Expected mappings from C++ atomics to hardware primitives can be found at
21 // http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html .
22
23 #include <benchmark/benchmark.h>
24 #include <atomic>
25 #include <mutex>
26
27 // We time atomic operations separated by a volatile (not atomic!) increment. This ensures
28 // that the compiler emits memory instructions (e.g. load or store) prior to any fence or the
29 // like. That in turn ensures that the CPU has outstanding memory operations when the fence
30 // is executed.
31
32 // In most respects, we compute best case values. Since there is only one thread, there are no
33 // coherence misses.
34
35 // We assume that the compiler is not smart enough to optimize away fences in a single-threaded
36 // program. If that changes, we'll need to add a second thread.
37
38 volatile unsigned counter;
39
40 std::atomic<int> test_loc(0);
41
42 volatile unsigned sink;
43
44 std::mutex mtx;
45
BM_empty(benchmark::State & state)46 void BM_empty(benchmark::State& state) {
47 while (state.KeepRunning()) {
48 ++counter;
49 }
50 }
51 BENCHMARK(BM_empty);
52
BM_load_relaxed(benchmark::State & state)53 static void BM_load_relaxed(benchmark::State& state) {
54 unsigned result = 0;
55 while (state.KeepRunning()) {
56 result += test_loc.load(std::memory_order_relaxed);
57 ++counter;
58 }
59 sink = result;
60 }
61 BENCHMARK(BM_load_relaxed);
62
BM_load_acquire(benchmark::State & state)63 static void BM_load_acquire(benchmark::State& state) {
64 unsigned result = 0;
65 while (state.KeepRunning()) {
66 result += test_loc.load(std::memory_order_acquire);
67 ++counter;
68 }
69 sink = result;
70 }
71 BENCHMARK(BM_load_acquire);
72
BM_store_release(benchmark::State & state)73 static void BM_store_release(benchmark::State& state) {
74 int i = counter;
75 while (state.KeepRunning()) {
76 test_loc.store(++i, std::memory_order_release);
77 ++counter;
78 }
79 }
80 BENCHMARK(BM_store_release);
81
BM_store_seq_cst(benchmark::State & state)82 static void BM_store_seq_cst(benchmark::State& state) {
83 int i = counter;
84 while (state.KeepRunning()) {
85 test_loc.store(++i, std::memory_order_seq_cst);
86 ++counter;
87 }
88 }
89 BENCHMARK(BM_store_seq_cst);
90
BM_fetch_add_relaxed(benchmark::State & state)91 static void BM_fetch_add_relaxed(benchmark::State& state) {
92 unsigned result = 0;
93 while (state.KeepRunning()) {
94 result += test_loc.fetch_add(1, std::memory_order_relaxed);
95 ++counter;
96 }
97 sink = result;
98 }
99 BENCHMARK(BM_fetch_add_relaxed);
100
BM_fetch_add_seq_cst(benchmark::State & state)101 static void BM_fetch_add_seq_cst(benchmark::State& state) {
102 unsigned result = 0;
103 while (state.KeepRunning()) {
104 result += test_loc.fetch_add(1, std::memory_order_seq_cst);
105 ++counter;
106 }
107 sink = result;
108 }
109 BENCHMARK(BM_fetch_add_seq_cst);
110
111 // The fence benchmarks include a relaxed load to make it much harder to optimize away
112 // the fence.
113
BM_acquire_fence(benchmark::State & state)114 static void BM_acquire_fence(benchmark::State& state) {
115 unsigned result = 0;
116 while (state.KeepRunning()) {
117 result += test_loc.load(std::memory_order_relaxed);
118 std::atomic_thread_fence(std::memory_order_acquire);
119 ++counter;
120 }
121 sink = result;
122 }
123 BENCHMARK(BM_acquire_fence);
124
BM_seq_cst_fence(benchmark::State & state)125 static void BM_seq_cst_fence(benchmark::State& state) {
126 unsigned result = 0;
127 while (state.KeepRunning()) {
128 result += test_loc.load(std::memory_order_relaxed);
129 std::atomic_thread_fence(std::memory_order_seq_cst);
130 ++counter;
131 }
132 sink = result;
133 }
134 BENCHMARK(BM_seq_cst_fence);
135
136 // For comparison, also throw in a critical section version:
137
BM_fetch_add_cs(benchmark::State & state)138 static void BM_fetch_add_cs(benchmark::State& state) {
139 unsigned result = 0;
140 while (state.KeepRunning()) {
141 {
142 std::lock_guard<std::mutex> _(mtx);
143 result += ++counter;
144 }
145 }
146 sink = result;
147 }
148 BENCHMARK(BM_fetch_add_cs);
149