1 // Copyright 2015 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include <unistd.h>
16 #ifdef __APPLE__
17 #include <sys/time.h>
18 #endif
19 
20 #include <cstdint>
21 #include <cstdlib>
22 #include <ctime>
23 #include <iostream>
24 #include <map>
25 #include <vector>
26 #ifdef __APPLE__
27 #include <TargetConditionals.h>
28 #endif
29 
30 #include "test.h"
31 
32 #ifndef GEMMLOWP_TEST_BIT_DEPTH_PARAMS
33 #define GEMMLOWP_TEST_BIT_DEPTH_PARAMS DefaultL8R8BitDepthParams
34 #endif
35 
36 #if defined(__arm__) && !defined(GEMMLOWP_NEON)
37 #warning "Building without NEON support on ARM, check your compiler setup!"
38 #endif
39 
40 namespace gemmlowp {
41 
time()42 double time() {
43 #ifdef __APPLE__
44   timeval t;
45   gettimeofday(&t, nullptr);
46   return t.tv_sec + 1e-6 * t.tv_usec;
47 #else
48   timespec t;
49   clock_gettime(CLOCK_REALTIME, &t);
50   return t.tv_sec + 1e-9 * t.tv_nsec;
51 #endif
52 }
53 
54 const double min_accurate_duration = 1e-1;
55 const std::size_t min_working_set_size = 16 * 1024 * 1024;
56 
57 struct gemm_t {
58   int rows, depth, cols;
gemm_tgemmlowp::gemm_t59   gemm_t() : rows(0), depth(0), cols(0) {}
gemm_tgemmlowp::gemm_t60   gemm_t(int r, int d, int c) : rows(r), depth(d), cols(c) {}
61 };
62 
operator <(const gemm_t & a,const gemm_t & b)63 bool operator<(const gemm_t& a, const gemm_t& b) {
64   return a.rows < b.rows ||
65          (a.rows <= b.rows &&
66           (a.depth < b.depth || (a.depth <= b.depth && (a.cols < b.cols))));
67 }
68 
69 template <typename LhsType, typename RhsType, typename ResultType>
time_for_gemms(GemmContext * context,const std::vector<gemm_t> & gemms)70 double time_for_gemms(GemmContext* context, const std::vector<gemm_t>& gemms) {
71   typedef std::uint8_t Scalar;
72 
73   // set up the matrix pool
74 
75   std::size_t combined_gemm_sizes = 0;
76   for (auto gemm : gemms) {
77     int rows = gemm.rows;
78     int depth = gemm.depth;
79     int cols = gemm.cols;
80     combined_gemm_sizes +=
81         sizeof(Scalar) * (rows * depth + depth * cols + rows * cols);
82   }
83 
84   const std::size_t pool_size = 1 + min_working_set_size / combined_gemm_sizes;
85 
86   std::vector<LhsType> lhs(pool_size * gemms.size());
87   std::vector<RhsType> rhs(pool_size * gemms.size());
88   std::vector<ResultType> result(pool_size * gemms.size());
89 
90   for (std::size_t i = 0; i < pool_size; i++) {
91     for (std::size_t j = 0; j < gemms.size(); j++) {
92       int k = i * gemms.size() + j;
93       lhs[k].Resize(gemms[j].rows, gemms[j].depth);
94       MakeConstant(&lhs[k], 0);
95       rhs[k].Resize(gemms[j].depth, gemms[j].cols);
96       MakeConstant(&rhs[k], 0);
97       result[k].Resize(gemms[j].rows, gemms[j].cols);
98       MakeConstant(&result[k], 0);
99     }
100   }
101 
102   // main benchmark loop
103 
104   int iters_at_a_time = 1;
105   float time_per_iter = 0.0f;
106   std::size_t pool_index = 0;
107 
108   while (true) {
109     double starttime = time();
110     for (int i = 0; i < iters_at_a_time; i++) {
111       for (size_t j = 0; j < gemms.size(); j++) {
112         int k = pool_index * gemms.size() + j;
113         Gemm<std::uint8_t, GEMMLOWP_TEST_BIT_DEPTH_PARAMS>(
114             context, lhs[k].const_map(), rhs[k].const_map(), &result[k].map(),
115             -75, -91, 74980, 123, 20);
116       }
117       pool_index++;
118       if (pool_index == pool_size) {
119         pool_index = 0;
120       }
121     }
122     double endtime = time();
123 
124     const float timing = static_cast<float>(endtime - starttime);
125 
126     if (timing >= min_accurate_duration) {
127       time_per_iter = timing / iters_at_a_time;
128       break;
129     }
130 
131     iters_at_a_time *= 2;
132   }
133 
134   return time_per_iter;
135 }
136 
137 template <typename LhsType, typename RhsType, typename ResultType>
gflops_for_gemms(GemmContext * context,const std::vector<gemm_t> & gemms)138 double gflops_for_gemms(GemmContext* context,
139                         const std::vector<gemm_t>& gemms) {
140   const double time_per_iter =
141       time_for_gemms<LhsType, RhsType, ResultType>(context, gemms);
142   double ops = 0;
143   for (auto gemm : gemms) {
144     ops += 2.0 * gemm.rows * gemm.depth * gemm.cols;
145   }
146   return 1e-9 * ops / time_per_iter;
147 }
148 
benchmark(GemmContext * context)149 void benchmark(GemmContext* context) {
150   std::map<gemm_t, std::vector<double>> benchmark_results;
151 
152   std::vector<gemm_t> benchmark_gemms;
153   benchmark_gemms.emplace_back(10, 10, 10);
154   benchmark_gemms.emplace_back(20, 20, 20);
155   benchmark_gemms.emplace_back(30, 30, 30);
156   benchmark_gemms.emplace_back(40, 40, 40);
157   benchmark_gemms.emplace_back(50, 50, 50);
158   benchmark_gemms.emplace_back(60, 60, 60);
159   benchmark_gemms.emplace_back(64, 256, 147);
160   benchmark_gemms.emplace_back(100, 100, 1);
161   benchmark_gemms.emplace_back(100, 100, 100);
162   benchmark_gemms.emplace_back(100, 1000, 100);
163   benchmark_gemms.emplace_back(1000, 1000, 1);
164   benchmark_gemms.emplace_back(1000, 1000, 10);
165   benchmark_gemms.emplace_back(1000, 1000, 100);
166   benchmark_gemms.emplace_back(1000, 1000, 1000);
167 
168   const int repeat = 2;
169 
170   typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType;
171   typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType;
172   typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType;
173 
174 #ifdef GEMMLOWP_TEST_PROFILE
175   gemmlowp::RegisterCurrentThreadForProfiling();
176   gemmlowp::StartProfiling();
177 #endif
178 
179   // We don't record the first repetition, it's just warm-up.
180   for (int r = 0; r < repeat + 1; r++) {
181     std::cout << "repetition " << r + 1 << "/" << repeat + 1 << "...\r"
182               << std::flush;
183     for (auto gemm : benchmark_gemms) {
184       double gflops = 0;
185       std::vector<gemm_t> unique_gemm;
186       unique_gemm.push_back(gemm);
187       gflops =
188           gflops_for_gemms<LhsType, RhsType, ResultType>(context, unique_gemm);
189       if (r > 0) {
190         benchmark_results[gemm].emplace_back(gflops);
191       }
192     }
193   }
194 
195 #ifdef GEMMLOWP_TEST_PROFILE
196   gemmlowp::FinishProfiling();
197 #endif
198 
199   std::cout << "                                                \r"
200             << std::flush;
201 
202   std::cout.precision(4);
203 
204   for (auto b : benchmark_results) {
205     sort(b.second.begin(), b.second.end());
206     std::cout << b.first.rows << "x" << b.first.depth << "x" << b.first.cols
207               << " : " << b.second.back() << " GFlops/s" << std::endl;
208   }
209   std::cout << std::endl;
210 }
211 
benchmark_gemm_sizes(GemmContext * context,const std::vector<gemm_t> & gemms,double mintime)212 void benchmark_gemm_sizes(GemmContext* context,
213                           const std::vector<gemm_t>& gemms, double mintime) {
214   typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType;
215   typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType;
216   typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType;
217 
218   std::vector<float> gemm_times;
219   std::cout << "running for " << mintime << " seconds..." << std::endl;
220 
221 #ifdef GEMMLOWP_TEST_PROFILE
222   gemmlowp::RegisterCurrentThreadForProfiling();
223   gemmlowp::StartProfiling();
224 #endif
225 
226   double starttime = time();
227   while (time() < starttime + mintime) {
228     gemm_times.push_back(
229         time_for_gemms<LhsType, RhsType, ResultType>(context, gemms));
230   }
231 
232 #ifdef GEMMLOWP_TEST_PROFILE
233   gemmlowp::FinishProfiling();
234 #endif
235 
236   std::sort(gemm_times.begin(), gemm_times.end());
237 
238   double sum_gemm_times = 0;
239   double sum_gemm_times_trimmed = 0;
240   int count_gemm_times_trimmed = 0;
241   const float trim_ratio = 0.25;
242   const size_t count_trimmed = gemm_times.size() * trim_ratio;
243   double sum_gemm_times_best = 0;
244   int count_gemm_times_best = 0;
245   const float best_ratio = 0.1;
246   const size_t count_best = gemm_times.size() * best_ratio;
247 
248   for (size_t i = 0; i < gemm_times.size(); i++) {
249     sum_gemm_times += gemm_times[i];
250     if (i >= count_trimmed && i < gemm_times.size() - count_trimmed) {
251       sum_gemm_times_trimmed += gemm_times[i];
252       count_gemm_times_trimmed++;
253     }
254     if (i < count_best) {
255       sum_gemm_times_best += gemm_times[i];
256       count_gemm_times_best++;
257     }
258   }
259 
260   const double min_latency = gemm_times.front();
261   const double max_latency = gemm_times.back();
262   const double mean_latency = sum_gemm_times / gemm_times.size();
263   const double trimmed_mean_latency =
264       sum_gemm_times_trimmed / count_gemm_times_trimmed;
265   const double best_mean_latency = sum_gemm_times_best / count_gemm_times_best;
266 
267   std::cout << "Graph latency (over " << gemm_times.size()
268             << " iterations):" << std::endl;
269   std::cout << "  Best:             " << min_latency << "s" << std::endl;
270   std::cout << "  Worst:            " << max_latency << "s" << std::endl;
271   std::cout << "  Mean:             " << mean_latency << "s" << std::endl;
272   std::cout << "  " << 100 * trim_ratio
273             << "% trimmed mean: " << trimmed_mean_latency << "s" << std::endl;
274   std::cout << "  Mean of " << 100 * best_ratio
275             << "% best: " << best_mean_latency << "s" << std::endl;
276 }
277 
benchmark_googlenet(GemmContext * context)278 void benchmark_googlenet(GemmContext* context) {
279   // These are the m, n, k sizes for a typical GoogLeNet.
280   const int googlenet_gemm_sizes[] = {
281       12544, 64,  147, 3136, 64,   64,   3136, 192,  576,  784, 64,   192,
282       784,   96,  192, 784,  128,  864,  784,  16,   192,  784, 32,   400,
283       784,   32,  192, 784,  128,  256,  784,  128,  256,  784, 192,  1152,
284       784,   32,  256, 784,  96,   800,  784,  64,   256,  196, 192,  480,
285       196,   96,  480, 196,  204,  864,  196,  16,   480,  196, 48,   400,
286       196,   64,  480, 196,  160,  508,  196,  112,  508,  196, 224,  1008,
287       196,   24,  508, 196,  64,   600,  196,  64,   508,  196, 128,  512,
288       196,   128, 512, 196,  256,  1152, 196,  24,   512,  196, 64,   600,
289       196,   64,  512, 196,  112,  512,  196,  144,  512,  196, 288,  1296,
290       196,   32,  512, 196,  64,   800,  196,  64,   512,  196, 256,  528,
291       196,   160, 528, 196,  320,  1440, 196,  32,   528,  196, 128,  800,
292       196,   128, 528, 49,   256,  832,  49,   160,  832,  49,  320,  1440,
293       49,    48,  832, 49,   128,  1200, 49,   128,  832,  49,  384,  832,
294       49,    192, 832, 49,   384,  1728, 49,   48,   832,  49,  128,  1200,
295       49,    128, 832, 16,   128,  508,  1,    1024, 2048, 1,   1008, 1024,
296       16,    128, 528, 1,    1024, 2048, 1,    1008, 1024, 1,   1008, 1024,
297   };
298   assert(sizeof(googlenet_gemm_sizes) % (3 * sizeof(googlenet_gemm_sizes[0])) ==
299          0);
300   const std::size_t num_googlenet_gemms =
301       sizeof(googlenet_gemm_sizes) / (3 * sizeof(googlenet_gemm_sizes[0]));
302 
303   std::vector<gemm_t> googlenet_gemms(num_googlenet_gemms);
304   for (std::size_t i = 0; i < num_googlenet_gemms; i++) {
305     googlenet_gemms[i].rows = googlenet_gemm_sizes[3 * i + 1];
306     googlenet_gemms[i].depth = googlenet_gemm_sizes[3 * i + 2];
307     googlenet_gemms[i].cols = googlenet_gemm_sizes[3 * i + 0];
308   }
309 
310   const double mintime = 20.0;
311   benchmark_gemm_sizes(context, googlenet_gemms, mintime);
312 }
313 
benchmark_small_model(GemmContext * context)314 void benchmark_small_model(GemmContext* context) {
315   // These are the m, n, k sizes for a small model with large batches.
316   const int small_model_gemm_sizes[] = {
317       29232, 16, 25, 7308, 6, 400, 203, 3002, 216,
318   };
319   assert(sizeof(small_model_gemm_sizes) %
320              (3 * sizeof(small_model_gemm_sizes[0])) ==
321          0);
322   const std::size_t num_small_model_gemms =
323       sizeof(small_model_gemm_sizes) / (3 * sizeof(small_model_gemm_sizes[0]));
324 
325   std::vector<gemm_t> small_model_gemms(num_small_model_gemms);
326   for (std::size_t i = 0; i < num_small_model_gemms; i++) {
327     small_model_gemms[i].rows = small_model_gemm_sizes[3 * i + 1];
328     small_model_gemms[i].depth = small_model_gemm_sizes[3 * i + 2];
329     small_model_gemms[i].cols = small_model_gemm_sizes[3 * i + 0];
330   }
331 
332   const double mintime = 10.0;
333   benchmark_gemm_sizes(context, small_model_gemms, mintime);
334 }
335 
benchmark_all()336 void benchmark_all() {
337   {
338     gemmlowp::GemmContext context;
339     std::cout << "Benchmarking small model GEMMs..." << std::endl;
340     gemmlowp::benchmark_small_model(&context);
341   }
342 
343   {
344     gemmlowp::GemmContext context;
345     std::cout << "Benchmarking typical GoogLeNet GEMMs..." << std::endl;
346     gemmlowp::benchmark_googlenet(&context);
347   }
348 
349   {
350     gemmlowp::GemmContext context;
351     std::cout << "Benchmarking default mode (typically multi-threaded)..."
352               << std::endl;
353     gemmlowp::benchmark(&context);
354   }
355 
356   {
357     gemmlowp::GemmContext context;
358     context.set_max_num_threads(1);
359     std::cout << "Benchmarking single-threaded mode..." << std::endl;
360     gemmlowp::benchmark(&context);
361   }
362 }
363 
364 }  // end namespace gemmlowp
365 
366 // For iOS, we need to define our own main(), so skip it here.
367 #if !(defined(__APPLE__) && (TARGET_OS_IPHONE || TARGET_IPHONE_SIMULATOR))
main()368 int main() { gemmlowp::benchmark_all(); }
369 #endif
370