1 // Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifdef __APPLE__
16 #include <sys/time.h>
17 #endif
18 
#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <map>
#include <vector>
25 #ifdef __APPLE__
26 #include <TargetConditionals.h>
27 #endif
28 
29 #include "test.h"
30 
// Bit-depth parameter set used as the second template argument of Gemm<> in
// this benchmark. The default below applies only when the build has not
// already defined the macro.
#if !defined(GEMMLOWP_TEST_BIT_DEPTH_PARAMS)
#define GEMMLOWP_TEST_BIT_DEPTH_PARAMS DefaultL8R8BitDepthParams
#endif

// Emit a compile-time warning when targeting ARM while GEMMLOWP_NEON is not
// defined.
#if defined(__arm__) && !defined(GEMMLOWP_NEON)
#warning "Building without NEON support on ARM, check your compiler setup!"
#endif

// Emit a compile-time warning when the compiler targets SSE4.2 while
// GEMMLOWP_SSE4 is not defined.
#if defined(__SSE4_2__) && !defined(GEMMLOWP_SSE4)
#warning "Building without SSE4.2 support on SSE4.2 enabled machine, check your compiler setup!"
#endif
43 
44 namespace gemmlowp {
45 
// Shortest duration, in seconds, that a timed batch must last before
// time_for_gemms() accepts its measurement.
const double min_accurate_duration = 0.1;
// Lower bound, in bytes (16 MiB), on the combined size of the matrix pool that
// time_for_gemms() allocates and rotates through.
const std::size_t min_working_set_size = static_cast<std::size_t>(16) << 20;
48 
// Dimensions of a single GEMM problem: a (rows x depth) left-hand side
// multiplied by a (depth x cols) right-hand side, giving a (rows x cols)
// result.
struct gemm_t {
  int rows;
  int depth;
  int cols;

  // Builds a GEMM with the given dimensions.
  gemm_t(int r, int d, int c) : rows(r), depth(d), cols(c) {}

  // Builds an empty (0 x 0 x 0) GEMM.
  gemm_t() : gemm_t(0, 0, 0) {}
};

// Strict weak ordering on gemm_t: lexicographic comparison of
// (rows, depth, cols). Lets gemm_t be used as a std::map key.
bool operator<(const gemm_t& a, const gemm_t& b) {
  if (a.rows != b.rows) {
    return a.rows < b.rows;
  }
  if (a.depth != b.depth) {
    return a.depth < b.depth;
  }
  return a.cols < b.cols;
}
60 
61 template <typename LhsType, typename RhsType, typename ResultType>
time_for_gemms(GemmContext * context,const std::vector<gemm_t> & gemms)62 double time_for_gemms(GemmContext* context, const std::vector<gemm_t>& gemms) {
63   typedef std::uint8_t Scalar;
64 
65   // set up the matrix pool
66 
67   std::size_t combined_gemm_sizes = 0;
68   for (auto gemm : gemms) {
69     int rows = gemm.rows;
70     int depth = gemm.depth;
71     int cols = gemm.cols;
72     combined_gemm_sizes +=
73         sizeof(Scalar) * (rows * depth + depth * cols + rows * cols);
74   }
75 
76   const std::size_t pool_size = 1 + min_working_set_size / combined_gemm_sizes;
77 
78   std::vector<LhsType> lhs(pool_size * gemms.size());
79   std::vector<RhsType> rhs(pool_size * gemms.size());
80   std::vector<ResultType> result(pool_size * gemms.size());
81 
82   for (std::size_t i = 0; i < pool_size; i++) {
83     for (std::size_t j = 0; j < gemms.size(); j++) {
84       int k = i * gemms.size() + j;
85       lhs[k].Resize(gemms[j].rows, gemms[j].depth);
86       MakeConstant(&lhs[k], 0);
87       rhs[k].Resize(gemms[j].depth, gemms[j].cols);
88       MakeConstant(&rhs[k], 0);
89       result[k].Resize(gemms[j].rows, gemms[j].cols);
90       MakeConstant(&result[k], 0);
91     }
92   }
93 
94   // main benchmark loop
95 
96   int iters_at_a_time = 1;
97   float time_per_iter = 0.0f;
98   std::size_t pool_index = 0;
99 
100   while (true) {
101     double starttime = real_time_in_seconds();
102     for (int i = 0; i < iters_at_a_time; i++) {
103       for (size_t j = 0; j < gemms.size(); j++) {
104         size_t k = pool_index * gemms.size() + j;
105         Gemm<std::uint8_t, GEMMLOWP_TEST_BIT_DEPTH_PARAMS>(
106             context, lhs[k].const_map(), rhs[k].const_map(), &result[k].map(),
107             -75, -91, 74980, 123, 20);
108       }
109       pool_index++;
110       if (pool_index == pool_size) {
111         pool_index = 0;
112       }
113     }
114     double endtime = real_time_in_seconds();
115 
116     const float timing = static_cast<float>(endtime - starttime);
117 
118     if (timing >= min_accurate_duration) {
119       time_per_iter = timing / iters_at_a_time;
120       break;
121     }
122 
123     iters_at_a_time *= 2;
124   }
125 
126   return time_per_iter;
127 }
128 
129 template <typename LhsType, typename RhsType, typename ResultType>
gflops_for_gemms(GemmContext * context,const std::vector<gemm_t> & gemms)130 double gflops_for_gemms(GemmContext* context,
131                         const std::vector<gemm_t>& gemms) {
132   const double time_per_iter =
133       time_for_gemms<LhsType, RhsType, ResultType>(context, gemms);
134   double ops = 0;
135   for (auto gemm : gemms) {
136     ops += 2.0 * gemm.rows * gemm.depth * gemm.cols;
137   }
138   return 1e-9 * ops / time_per_iter;
139 }
140 
benchmark(GemmContext * context)141 void benchmark(GemmContext* context) {
142   std::map<gemm_t, std::vector<double>> benchmark_results;
143 
144   std::vector<gemm_t> benchmark_gemms;
145   benchmark_gemms.emplace_back(10, 10, 10);
146   benchmark_gemms.emplace_back(20, 20, 20);
147   benchmark_gemms.emplace_back(30, 30, 30);
148   benchmark_gemms.emplace_back(40, 40, 40);
149   benchmark_gemms.emplace_back(50, 50, 50);
150   benchmark_gemms.emplace_back(60, 60, 60);
151   benchmark_gemms.emplace_back(64, 256, 147);
152   benchmark_gemms.emplace_back(100, 100, 1);
153   benchmark_gemms.emplace_back(100, 100, 100);
154   benchmark_gemms.emplace_back(100, 1000, 100);
155   benchmark_gemms.emplace_back(1000, 1000, 1);
156   benchmark_gemms.emplace_back(1000, 1000, 10);
157   benchmark_gemms.emplace_back(1000, 1000, 100);
158   benchmark_gemms.emplace_back(1000, 1000, 1000);
159 
160   const int repeat = 2;
161 
162   typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType;
163   typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType;
164   typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType;
165 
166 #ifdef GEMMLOWP_TEST_PROFILE
167   gemmlowp::RegisterCurrentThreadForProfiling();
168   gemmlowp::StartProfiling();
169 #endif
170 
171   // We don't record the first repetition, it's just warm-up.
172   for (int r = 0; r < repeat + 1; r++) {
173     std::cout << "repetition " << r + 1 << "/" << repeat + 1 << "...\r"
174               << std::flush;
175     for (auto gemm : benchmark_gemms) {
176       double gflops = 0;
177       std::vector<gemm_t> unique_gemm;
178       unique_gemm.push_back(gemm);
179       gflops =
180           gflops_for_gemms<LhsType, RhsType, ResultType>(context, unique_gemm);
181       if (r > 0) {
182         benchmark_results[gemm].emplace_back(gflops);
183       }
184     }
185   }
186 
187 #ifdef GEMMLOWP_TEST_PROFILE
188   gemmlowp::FinishProfiling();
189 #endif
190 
191   std::cout << "                                                \r"
192             << std::flush;
193 
194   std::cout.precision(4);
195 
196   for (auto b : benchmark_results) {
197     sort(b.second.begin(), b.second.end());
198     std::cout << b.first.rows << "x" << b.first.depth << "x" << b.first.cols
199               << " : " << b.second.back() << " GFlops/s" << std::endl;
200   }
201   std::cout << std::endl;
202 }
203 
benchmark_gemm_sizes(GemmContext * context,const std::vector<gemm_t> & gemms,double mintime)204 void benchmark_gemm_sizes(GemmContext* context,
205                           const std::vector<gemm_t>& gemms, double mintime) {
206   typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType;
207   typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType;
208   typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType;
209 
210   std::vector<float> gemm_times;
211   std::cout << "running for " << mintime << " seconds..." << std::endl;
212 
213 #ifdef GEMMLOWP_TEST_PROFILE
214   gemmlowp::RegisterCurrentThreadForProfiling();
215   gemmlowp::StartProfiling();
216 #endif
217 
218   double starttime = real_time_in_seconds();
219   while (real_time_in_seconds() < starttime + mintime) {
220     gemm_times.push_back(
221         time_for_gemms<LhsType, RhsType, ResultType>(context, gemms));
222   }
223 
224 #ifdef GEMMLOWP_TEST_PROFILE
225   gemmlowp::FinishProfiling();
226 #endif
227 
228   std::sort(gemm_times.begin(), gemm_times.end());
229 
230   double sum_gemm_times = 0;
231   double sum_gemm_times_trimmed = 0;
232   int count_gemm_times_trimmed = 0;
233   const float trim_ratio = 0.25;
234   const size_t count_trimmed = gemm_times.size() * trim_ratio;
235   double sum_gemm_times_best = 0;
236   int count_gemm_times_best = 0;
237   const float best_ratio = 0.1;
238   const size_t count_best = gemm_times.size() * best_ratio;
239 
240   for (size_t i = 0; i < gemm_times.size(); i++) {
241     sum_gemm_times += gemm_times[i];
242     if (i >= count_trimmed && i < gemm_times.size() - count_trimmed) {
243       sum_gemm_times_trimmed += gemm_times[i];
244       count_gemm_times_trimmed++;
245     }
246     if (i < count_best) {
247       sum_gemm_times_best += gemm_times[i];
248       count_gemm_times_best++;
249     }
250   }
251 
252   const double min_latency = gemm_times.front();
253   const double max_latency = gemm_times.back();
254   const double mean_latency = sum_gemm_times / gemm_times.size();
255   const double trimmed_mean_latency =
256       sum_gemm_times_trimmed / count_gemm_times_trimmed;
257   const double best_mean_latency = sum_gemm_times_best / count_gemm_times_best;
258 
259   std::cout << "Graph latency (over " << gemm_times.size()
260             << " iterations):" << std::endl;
261   std::cout << "  Best:             " << min_latency << "s" << std::endl;
262   std::cout << "  Worst:            " << max_latency << "s" << std::endl;
263   std::cout << "  Mean:             " << mean_latency << "s" << std::endl;
264   std::cout << "  " << 100 * trim_ratio
265             << "% trimmed mean: " << trimmed_mean_latency << "s" << std::endl;
266   std::cout << "  Mean of " << 100 * best_ratio
267             << "% best: " << best_mean_latency << "s" << std::endl;
268 }
269 
benchmark_googlenet(GemmContext * context)270 void benchmark_googlenet(GemmContext* context) {
271   // These are the m, n, k sizes for a typical GoogLeNet.
272   const int googlenet_gemm_sizes[] = {
273       12544, 64,  147, 3136, 64,   64,   3136, 192,  576,  784, 64,   192,
274       784,   96,  192, 784,  128,  864,  784,  16,   192,  784, 32,   400,
275       784,   32,  192, 784,  128,  256,  784,  128,  256,  784, 192,  1152,
276       784,   32,  256, 784,  96,   800,  784,  64,   256,  196, 192,  480,
277       196,   96,  480, 196,  204,  864,  196,  16,   480,  196, 48,   400,
278       196,   64,  480, 196,  160,  508,  196,  112,  508,  196, 224,  1008,
279       196,   24,  508, 196,  64,   600,  196,  64,   508,  196, 128,  512,
280       196,   128, 512, 196,  256,  1152, 196,  24,   512,  196, 64,   600,
281       196,   64,  512, 196,  112,  512,  196,  144,  512,  196, 288,  1296,
282       196,   32,  512, 196,  64,   800,  196,  64,   512,  196, 256,  528,
283       196,   160, 528, 196,  320,  1440, 196,  32,   528,  196, 128,  800,
284       196,   128, 528, 49,   256,  832,  49,   160,  832,  49,  320,  1440,
285       49,    48,  832, 49,   128,  1200, 49,   128,  832,  49,  384,  832,
286       49,    192, 832, 49,   384,  1728, 49,   48,   832,  49,  128,  1200,
287       49,    128, 832, 16,   128,  508,  1,    1024, 2048, 1,   1008, 1024,
288       16,    128, 528, 1,    1024, 2048, 1,    1008, 1024, 1,   1008, 1024,
289   };
290   assert(sizeof(googlenet_gemm_sizes) % (3 * sizeof(googlenet_gemm_sizes[0])) ==
291          0);
292   const std::size_t num_googlenet_gemms =
293       sizeof(googlenet_gemm_sizes) / (3 * sizeof(googlenet_gemm_sizes[0]));
294 
295   std::vector<gemm_t> googlenet_gemms(num_googlenet_gemms);
296   for (std::size_t i = 0; i < num_googlenet_gemms; i++) {
297     googlenet_gemms[i].rows = googlenet_gemm_sizes[3 * i + 1];
298     googlenet_gemms[i].depth = googlenet_gemm_sizes[3 * i + 2];
299     googlenet_gemms[i].cols = googlenet_gemm_sizes[3 * i + 0];
300   }
301 
302   const double mintime = 20.0;
303   benchmark_gemm_sizes(context, googlenet_gemms, mintime);
304 }
305 
benchmark_small_model(GemmContext * context)306 void benchmark_small_model(GemmContext* context) {
307   // These are the m, n, k sizes for a small model with large batches.
308   const int small_model_gemm_sizes[] = {
309       29232, 16, 25, 7308, 6, 400, 203, 3002, 216,
310   };
311   assert(sizeof(small_model_gemm_sizes) %
312              (3 * sizeof(small_model_gemm_sizes[0])) ==
313          0);
314   const std::size_t num_small_model_gemms =
315       sizeof(small_model_gemm_sizes) / (3 * sizeof(small_model_gemm_sizes[0]));
316 
317   std::vector<gemm_t> small_model_gemms(num_small_model_gemms);
318   for (std::size_t i = 0; i < num_small_model_gemms; i++) {
319     small_model_gemms[i].rows = small_model_gemm_sizes[3 * i + 1];
320     small_model_gemms[i].depth = small_model_gemm_sizes[3 * i + 2];
321     small_model_gemms[i].cols = small_model_gemm_sizes[3 * i + 0];
322   }
323 
324   const double mintime = 10.0;
325   benchmark_gemm_sizes(context, small_model_gemms, mintime);
326 }
327 
benchmark_all()328 void benchmark_all() {
329   {
330     gemmlowp::GemmContext context;
331     std::cout << "Benchmarking small model GEMMs..." << std::endl;
332     gemmlowp::benchmark_small_model(&context);
333   }
334 
335   {
336     gemmlowp::GemmContext context;
337     std::cout << "Benchmarking typical GoogLeNet GEMMs..." << std::endl;
338     gemmlowp::benchmark_googlenet(&context);
339   }
340 
341   {
342     gemmlowp::GemmContext context;
343     context.set_max_num_threads(0);
344     std::cout << "Benchmarking multi-threaded mode..." << std::endl;
345     gemmlowp::benchmark(&context);
346   }
347 
348   {
349     gemmlowp::GemmContext context;
350     context.set_max_num_threads(1);
351     std::cout << "Benchmarking single-threaded mode..." << std::endl;
352     gemmlowp::benchmark(&context);
353   }
354 }
355 
356 }  // end namespace gemmlowp
357 
358 // For iOS, we need to define our own main(), so skip it here.
359 #if !(defined(__APPLE__) && (TARGET_OS_IPHONE || TARGET_IPHONE_SIMULATOR))
main()360 int main() { gemmlowp::benchmark_all(); }
361 #endif
362