1 /*
2  * Copyright (C) 2019 The Android Open Source Project
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  *  * Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  *  * Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in
12  *    the documentation and/or other materials provided with the
13  *    distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
18  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
19  * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
21  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
22  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
23  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
25  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <malloc.h>
30 #include <unistd.h>
31 
32 #include <condition_variable>
33 #include <mutex>
34 #include <random>
35 #include <thread>
36 #include <vector>
37 
38 #include <benchmark/benchmark.h>
39 #include "ScopedDecayTimeRestorer.h"
40 #include "util.h"
41 
42 #if defined(__BIONIC__)
43 
RunMalloptPurge(benchmark::State & state,int purge_value)44 static void RunMalloptPurge(benchmark::State& state, int purge_value) {
45   ScopedDecayTimeRestorer restorer;
46 
47   static size_t sizes[] = {8, 16, 32, 64, 128, 1024, 4096, 16384, 65536, 131072, 1048576};
48   static int pagesize = getpagesize();
49   mallopt(M_DECAY_TIME, 1);
50   mallopt(M_PURGE_ALL, 0);
51   for (auto _ : state) {
52     state.PauseTiming();
53     std::vector<void*> ptrs;
54     for (auto size : sizes) {
55       // Allocate at least two pages worth of the allocations.
56       for (size_t allocated = 0; allocated < 2 * static_cast<size_t>(pagesize); allocated += size) {
57         void* ptr = malloc(size);
58         if (ptr == nullptr) {
59           state.SkipWithError("Failed to allocate memory");
60         }
61         MakeAllocationResident(ptr, size, pagesize);
62         ptrs.push_back(ptr);
63       }
64     }
65     // Free the memory, which should leave many of the pages resident until
66     // the purge call.
67     for (auto ptr : ptrs) {
68       free(ptr);
69     }
70     ptrs.clear();
71     state.ResumeTiming();
72 
73     mallopt(purge_value, 0);
74   }
75 }
76 
RunThreadsThroughput(benchmark::State & state,size_t size,size_t num_threads)77 static void RunThreadsThroughput(benchmark::State& state, size_t size, size_t num_threads) {
78   constexpr size_t kMaxBytes = 1 << 24;
79   constexpr size_t kMaxThreads = 8;
80   constexpr size_t kMinRounds = 4;
81   const size_t MaxAllocCounts = kMaxBytes / size;
82   std::mutex m;
83   bool ready = false;
84   std::condition_variable cv;
85   std::thread* threads[kMaxThreads];
86 
87   // The goal is to create malloc/free interleaving patterns across threads.
88   // The bytes processed by each thread will be the same. The difference is the
89   // patterns. Here's an example:
90   //
91   // A: Allocation
92   // D: Deallocation
93   //
94   //   T1    T2    T3
95   //   A     A     A
96   //   A     A     D
97   //   A     D     A
98   //   A     D     D
99   //   D     A     A
100   //   D     A     D
101   //   D     D     A
102   //   D     D     D
103   //
104   // To do this, `AllocCounts` and `AllocRounds` will be adjusted according to the
105   // thread id.
106   auto thread_task = [&](size_t id) {
107     {
108       std::unique_lock lock(m);
109       // Wait until all threads are created.
110       cv.wait(lock, [&] { return ready; });
111     }
112 
113     void** MemPool;
114     const size_t AllocCounts = (MaxAllocCounts >> id);
115     const size_t AllocRounds = (kMinRounds << id);
116     MemPool = new void*[AllocCounts];
117 
118     for (size_t i = 0; i < AllocRounds; ++i) {
119       for (size_t j = 0; j < AllocCounts; ++j) {
120         void* ptr = malloc(size);
121         MemPool[j] = ptr;
122       }
123 
124       // Use a fix seed to reduce the noise of different round of benchmark.
125       const unsigned seed = 33529;
126       std::shuffle(MemPool, &MemPool[AllocCounts], std::default_random_engine(seed));
127 
128       for (size_t j = 0; j < AllocCounts; ++j) free(MemPool[j]);
129     }
130 
131     delete[] MemPool;
132   };
133 
134   for (auto _ : state) {
135     state.PauseTiming();
136     // Don't need to acquire the lock because no thread is created.
137     ready = false;
138 
139     for (size_t i = 0; i < num_threads; ++i) threads[i] = new std::thread(thread_task, i);
140 
141     state.ResumeTiming();
142 
143     {
144       std::unique_lock lock(m);
145       ready = true;
146     }
147 
148     cv.notify_all();
149 
150     for (size_t i = 0; i < num_threads; ++i) {
151       threads[i]->join();
152       delete threads[i];
153     }
154   }
155 
156   const size_t ThreadsBytesProcessed = kMaxBytes * kMinRounds * num_threads;
157   state.SetBytesProcessed(ThreadsBytesProcessed * static_cast<size_t>(state.iterations()));
158 }
159 
BM_mallopt_purge(benchmark::State & state)160 static void BM_mallopt_purge(benchmark::State& state) {
161   RunMalloptPurge(state, M_PURGE);
162 }
163 BIONIC_BENCHMARK(BM_mallopt_purge);
164 
BM_mallopt_purge_all(benchmark::State & state)165 static void BM_mallopt_purge_all(benchmark::State& state) {
166   RunMalloptPurge(state, M_PURGE_ALL);
167 }
168 BIONIC_BENCHMARK(BM_mallopt_purge_all);
169 
// Defines and registers a benchmark function named
// BM_malloc_threads_throughput_<ALLOC_SIZE>_<THREAD_COUNT> that forwards both
// values to RunThreadsThroughput().
//
// Each generated benchmark exercises only one allocation size, so that the
// impact of contention can be observed more often.
#define BM_MALLOC_THREADS_THROUGHPUT(ALLOC_SIZE, THREAD_COUNT)                \
  static void BM_malloc_threads_throughput_##ALLOC_SIZE##_##THREAD_COUNT(     \
      benchmark::State& state) {                                              \
    RunThreadsThroughput(state, ALLOC_SIZE, THREAD_COUNT);                    \
  }                                                                           \
  BIONIC_BENCHMARK(BM_malloc_threads_throughput_##ALLOC_SIZE##_##THREAD_COUNT);
177 
178 // There are three block categories in Scudo, we choose 1 from each category.
179 BM_MALLOC_THREADS_THROUGHPUT(64, 2);
180 BM_MALLOC_THREADS_THROUGHPUT(64, 4);
181 BM_MALLOC_THREADS_THROUGHPUT(64, 8);
182 BM_MALLOC_THREADS_THROUGHPUT(512, 2);
183 BM_MALLOC_THREADS_THROUGHPUT(512, 4);
184 BM_MALLOC_THREADS_THROUGHPUT(512, 8);
185 BM_MALLOC_THREADS_THROUGHPUT(8192, 2);
186 BM_MALLOC_THREADS_THROUGHPUT(8192, 4);
187 BM_MALLOC_THREADS_THROUGHPUT(8192, 8);
188 
189 #endif
190