1 /*
2  * Copyright (C) 2020 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 // See /docs/design-docs/protozero.md for rationale and results.
18 
#include <string.h>
#include <unistd.h>

#include <memory>
#include <new>
#include <vector>

#include <benchmark/benchmark.h>

#include "perfetto/base/compiler.h"
#include "perfetto/protozero/static_buffer.h"

// Autogenerated headers in out/*/gen/
#include "src/protozero/test/example_proto/library.pbzero.h"
#include "src/protozero/test/example_proto/test_messages.pb.h"
#include "src/protozero/test/example_proto/test_messages.pbzero.h"
33 
34 // Generated by the protozero plugin.
35 namespace pbzero = protozero::test::protos::pbzero;
36 
37 // Generated by the official protobuf compiler.
38 namespace pblite = protozero::test::protos;
39 
40 namespace {
41 
// Size of the slot that a single benchmark iteration writes into.
// This needs to be > the max size written by each iteration.
constexpr size_t kBufPerIteration = 512;

// Write cyclically on a 64 MB buffer set to simulate a realistic tracing
// scenario.
constexpr size_t kTotalWorkingSetSize = 64 * 1024 * 1024;

// Clobber() reinterprets the start of every slot as a uint64_t array, so each
// slot offset (a multiple of kBufPerIteration) must keep 64-bit alignment.
static_assert(kBufPerIteration % sizeof(uint64_t) == 0,
              "kBufPerIteration must be a multiple of sizeof(uint64_t)");

// The working set is carved into kTotalWorkingSetSize / kBufPerIteration
// slots; require an exact fit so that no tail of the buffer is left unused.
static_assert(kTotalWorkingSetSize % kBufPerIteration == 0,
              "kTotalWorkingSetSize must be a multiple of kBufPerIteration");

// Backing storage for all the serializers under test.
alignas(uint64_t) char g_out_buffer[kTotalWorkingSetSize];

// Start of the slot that the current iteration writes into.
char* g_cur = g_out_buffer;
51 
// Fake input shared by all the benchmarks. Words [0..3] feed the four integer
// fields. Words [4..7] are reinterpreted as a C string: the most significant
// byte of the last word is zero, so the string is always NUL-terminated within
// the array.
uint64_t g_fake_input_simple[] = {0x12345678,
                                  0x90ABCDEF,
                                  0x11111111,
                                  0xFFFFFFFF,
                                  0x6666666666666666ULL,
                                  0x6666666666666666ULL,
                                  0x6666666666666666ULL,
                                  0x0066666666666666ULL};

// Speed-of-light serializer. A very simple C++ class that just appends data
// into a linear buffer making all sorts of favourable assumptions. It does not
// use any binary-stable encoding, it does not perform bound checking,
// all writes are 64-bit aligned, it doesn't deal with any thread-safety.
// The speed-of-light serializer serves as a reference for how fast a serializer
// could be if argument marshalling and bound checking were zero cost.
struct SOLMsg {
  // Copies |x| at the write cursor and advances the cursor by one full 64-bit
  // slot, regardless of sizeof(T), so that the next field stays 64-bit
  // aligned.
  template <typename T>
  void Append(T x) {
    static_assert(sizeof(T) <= sizeof(uint64_t),
                  "Append() reserves exactly one 64-bit slot per value");
    // The reinterpret_cast is to give favorable alignment guarantees.
    // The memcpy is expected to be elided by the compiler, which should emit
    // just a 64-bit aligned mov instruction.
    memcpy(reinterpret_cast<void*>(ptr_), &x, sizeof(x));
    ptr_ += sizeof(uint64_t);
  }

  void set_field_int32(int32_t x) { Append(x); }
  void set_field_uint32(uint32_t x) { Append(x); }
  void set_field_int64(int64_t x) { Append(x); }
  void set_field_uint64(uint64_t x) { Append(x); }

  // Copies |str|, including its NUL terminator, at the write cursor and then
  // moves the cursor past it, rounded up to the next 64-bit slot.
  // strcpy() returns its destination argument, so assigning its result to
  // |ptr_| would leave the cursor where it was and let the next field
  // overwrite the string.
  void set_field_string(const char* str) {
    const size_t len_with_nul = strlen(str) + 1;
    memcpy(ptr_, str, len_with_nul);
    constexpr size_t kSlotMask = sizeof(uint64_t) - 1;
    ptr_ += (len_with_nul + kSlotMask) & ~kSlotMask;
  }

  // Constructs the nested message in the memory that immediately follows this
  // object. No bound checking: the caller must have room for it.
  SOLMsg* add_field_nested() { return new (this + 1) SOLMsg(); }

  // Room for the whole fake input plus one spare 64-bit slot.
  alignas(uint64_t) char storage_[sizeof(g_fake_input_simple) + 8];

  // Write cursor into |storage_|.
  char* ptr_ = &storage_[0];
};
88 
89 template <typename T>
FillMessage_Simple(T * msg)90 PERFETTO_ALWAYS_INLINE void FillMessage_Simple(T* msg) {
91   benchmark::DoNotOptimize(g_fake_input_simple);
92   msg->set_field_int32(static_cast<int32_t>(g_fake_input_simple[0]));
93   msg->set_field_uint32(static_cast<uint32_t>(g_fake_input_simple[1]));
94   msg->set_field_int64(static_cast<int64_t>(g_fake_input_simple[2]));
95   msg->set_field_uint64(static_cast<uint64_t>(g_fake_input_simple[3]));
96   msg->set_field_string(reinterpret_cast<const char*>(&g_fake_input_simple[4]));
97 }
98 
99 template <typename T>
FillMessage_Nested(T * msg,int depth=0)100 PERFETTO_ALWAYS_INLINE void FillMessage_Nested(T* msg, int depth = 0) {
101   benchmark::DoNotOptimize(g_fake_input_simple);
102   FillMessage_Simple(msg);
103   if (depth < 3) {
104     auto* child = msg->add_field_nested();
105     FillMessage_Nested(child, depth + 1);
106   }
107 }
108 
Clobber(benchmark::State & state)109 PERFETTO_ALWAYS_INLINE void Clobber(benchmark::State& state) {
110   uint64_t* buf = reinterpret_cast<uint64_t*>(g_cur);
111 
112   // Read-back the data written to have a realistic evaluation of the
113   // speed-of-light scenario. This is to deal with architecture of modern CPUs.
114   // If we write a bunch of memory bytes, never read-back from them, and then
115   // just over-write them, the CPU can just throw away the whole stream of
116   // instructions that produced them, if that's still in flight and tracked in
117   // the out-of-order units.
118   // The buf[i-1] ^= buf forces the CPU to consume the result of the writes.
119   buf[0] = reinterpret_cast<uint64_t>(&state);
120   for (size_t i = 1; i < kBufPerIteration / sizeof(uint64_t); i++)
121     buf[i] ^= buf[i - 1];
122   if (buf[(kBufPerIteration / sizeof(uint64_t)) - 1] == 42)
123     PERFETTO_CHECK(false);
124   benchmark::DoNotOptimize(buf);
125 
126   constexpr size_t kWrap = kTotalWorkingSetSize / kBufPerIteration;
127   g_cur = &g_out_buffer[(state.iterations() % kWrap) * kBufPerIteration];
128   benchmark::ClobberMemory();
129 }
130 
131 }  // namespace
132 
BM_Protozero_Simple_Libprotobuf(benchmark::State & state)133 static void BM_Protozero_Simple_Libprotobuf(benchmark::State& state) {
134   while (state.KeepRunning()) {
135     {
136       // The nested block is to account for RAII finalizers.
137       pblite::EveryField msg;
138       FillMessage_Simple(&msg);
139       msg.SerializeToArray(g_cur, kBufPerIteration);
140     }
141     Clobber(state);
142   }
143 }
144 
BM_Protozero_Simple_Protozero(benchmark::State & state)145 static void BM_Protozero_Simple_Protozero(benchmark::State& state) {
146   while (state.KeepRunning()) {
147     {
148       protozero::StaticBuffered<pbzero::EveryField> msg(g_cur,
149                                                         kBufPerIteration);
150       FillMessage_Simple(msg.get());
151     }
152     Clobber(state);
153   }
154 }
155 
BM_Protozero_Simple_SpeedOfLight(benchmark::State & state)156 static void BM_Protozero_Simple_SpeedOfLight(benchmark::State& state) {
157   while (state.KeepRunning()) {
158     SOLMsg* msg = new (g_cur) SOLMsg();
159     FillMessage_Simple(msg);
160     Clobber(state);
161   }
162 }
163 
BM_Protozero_Nested_Libprotobuf(benchmark::State & state)164 static void BM_Protozero_Nested_Libprotobuf(benchmark::State& state) {
165   while (state.KeepRunning()) {
166     {
167       pblite::EveryField msg;
168       FillMessage_Nested(&msg);
169       msg.SerializeToArray(g_cur, kBufPerIteration);
170     }
171     Clobber(state);
172   }
173 }
174 
BM_Protozero_Nested_Protozero(benchmark::State & state)175 static void BM_Protozero_Nested_Protozero(benchmark::State& state) {
176   while (state.KeepRunning()) {
177     {
178       protozero::StaticBuffered<pbzero::EveryField> msg(g_cur,
179                                                         kBufPerIteration);
180       FillMessage_Nested(msg.get());
181     }
182     Clobber(state);
183   }
184 }
185 
BM_Protozero_Nested_SpeedOfLight(benchmark::State & state)186 static void BM_Protozero_Nested_SpeedOfLight(benchmark::State& state) {
187   while (state.KeepRunning()) {
188     SOLMsg* msg = new (g_cur) SOLMsg();
189     FillMessage_Nested(msg);
190     Clobber(state);
191   }
192 }
193 
// Register the flat-message variants: libprotobuf, protozero and the
// speed-of-light reference.
BENCHMARK(BM_Protozero_Simple_Libprotobuf);
BENCHMARK(BM_Protozero_Simple_Protozero);
BENCHMARK(BM_Protozero_Simple_SpeedOfLight);

// Register the nested-message variants, in the same order.
BENCHMARK(BM_Protozero_Nested_Libprotobuf);
BENCHMARK(BM_Protozero_Nested_Protozero);
BENCHMARK(BM_Protozero_Nested_SpeedOfLight);
201