// Example command line to build on Android ARM64:
/*
~/android/toolchains/r15c-aarch64/bin/aarch64-linux-android-clang++ \
test/benchmark_all_sizes.cc -o /tmp/b -O3 --std=c++11 -fPIE -static \
-DBENCHMARK_QUICK -DBENCHMARK_8bit
*/

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <iostream>
#include <map>
#include <random>
#include <set>
#include <tuple>
#include <vector>

#include "../public/gemmlowp.h"

#if defined GEMMLOWP_ANDROID && defined GEMMLOWP_ARM_32
// Compilation workaround
namespace std {
  using ::round;
}
#endif

// Minimum duration of each benchmark measurement.
const double kBenchmarkSecs = 0.1;

// Sleep time before each benchmark measurement, to prevent over-heating
// from skewing consecutive measurements (0 disables the cooldown).
const int kCooldownBeforeBenchmarkSecs = 0;

// Number of benchmark passes.
const int kPasses = 4;

#ifdef BENCHMARK_NUM_THREADS
const int kNumThreads = BENCHMARK_NUM_THREADS;
#else
const int kNumThreads = 1;
#endif

namespace gemmlowp {

// gemmlowp itself doesn't have a Matrix class, only a MatrixMap class,
// since it only maps existing data. In tests though, we need to
// create our own matrices.
template <typename tScalar, MapOrder tOrder>
class Matrix : public MatrixMap<tScalar, tOrder> {
 public:
  typedef MatrixMap<tScalar, tOrder> Map;
  typedef MatrixMap<const tScalar, tOrder> ConstMap;
  typedef typename Map::Scalar Scalar;
  static const MapOrder Order = tOrder;
  using Map::cols_;
  using Map::data_;
  using Map::kOrder;
  using Map::rows_;
  using Map::stride_;

 public:
  Matrix() : Map(nullptr, 0, 0, 0) {}

  Matrix(int rows, int cols) : Map(nullptr, 0, 0, 0) { Resize(rows, cols); }

  Matrix(const Matrix& other) : Map(nullptr, 0, 0, 0) { *this = other; }

  Matrix& operator=(const Matrix& other) {
    Resize(other.rows_, other.cols_);
    std::memcpy(data_, other.data_, size() * sizeof(Scalar));
    return *this;
  }

  friend bool operator==(const Matrix& a, const Matrix& b) {
    return a.rows_ == b.rows_ && a.cols_ == b.cols_ &&
           !std::memcmp(a.data_, b.data_, a.size() * sizeof(Scalar));
  }

  void Resize(int rows, int cols) {
    rows_ = rows;
    cols_ = cols;
    stride_ = kOrder == MapOrder::ColMajor ? rows : cols;
    storage.resize(size());
    data_ = storage.data();
  }

  int size() const { return rows_ * cols_; }

  Map& map() { return *static_cast<Map*>(this); }

  ConstMap const_map() const { return ConstMap(data_, rows_, cols_, stride_); }

 protected:
  std::vector<Scalar> storage;
};

// Fills a matrix with 128, which under the -128 offsets passed to
// GemmWithOutputPipeline below is the quantized representation of zero.
template <typename MatrixType>
void MakeZero(MatrixType* m) {
  for (int c = 0; c < m->cols(); c++) {
    for (int r = 0; r < m->rows(); r++) {
      (*m)(r, c) = 128;
    }
  }
}

}  // end namespace gemmlowp

template <typename BitDepthParams>
float benchmark_8bit(int rows, int depth, int cols) {
  using namespace gemmlowp;
  typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType;
  typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType;
  typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType;

  LhsType lhs;
  RhsType rhs;
  ResultType result;
  lhs.Resize(rows, depth);
  rhs.Resize(depth, cols);
  result.Resize(rows, cols);
  MakeZero(&lhs);
  MakeZero(&rhs);
  MakeZero(&result);

  typedef std::tuple<OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint,
                     OutputStageSaturatingCastToUint8>
      Pipeline;
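  // The quantize-down stage rescales int32 accumulators back to uint8:
  // roughly, result = offset + (accum * multiplier) >> shift, with the
  // multiplier applied as a fixed-point number. The constants below are
  // arbitrary benchmark values, not a calibrated quantization scheme.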
  gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint
      quantize_down_stage;
  quantize_down_stage.result_offset_after_shift = 128;
  quantize_down_stage.result_fixedpoint_multiplier = 1234567890;
  quantize_down_stage.result_shift = 16;
  gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage;
  const auto output_pipeline =
      std::make_tuple(quantize_down_stage, saturating_cast_stage);
  GemmContext gemm_context;
  gemm_context.set_max_num_threads(kNumThreads);
  // Warm-up run, excluded from the timed measurement below.
  gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::uint8_t, BitDepthParams>(
      &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
      -128, output_pipeline);

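  // Timed loop: run the GEMM in batches, doubling the batch size each
  // iteration so the clock is queried only O(log iters) times.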
  double time_start = real_time_in_seconds();
  double t = time_start;
  int iters = 0;
  int iters_at_a_time = 1;
  while (t - time_start < kBenchmarkSecs) {
    for (int i = 0; i < iters_at_a_time; i++) {
      gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::uint8_t,
                                       BitDepthParams>(
          &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
          -128, output_pipeline);
      iters++;
    }
    iters_at_a_time *= 2;
    t = real_time_in_seconds();
  }
  return (t - time_start) / iters;
}

template <typename BitDepthParams>
float benchmark_8bit_to_32bit(int rows, int depth, int cols) {
  using namespace gemmlowp;
  typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType;
  typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType;
  typedef Matrix<std::int32_t, MapOrder::ColMajor> ResultType;

  LhsType lhs;
  RhsType rhs;
  ResultType result;
  lhs.Resize(rows, depth);
  rhs.Resize(depth, cols);
  result.Resize(rows, cols);
  MakeZero(&lhs);
  MakeZero(&rhs);
  MakeZero(&result);

  typedef std::tuple<> EmptyPipeline;
  GemmContext gemm_context;
  gemm_context.set_max_num_threads(kNumThreads);
  // Warm-up run, excluded from the timed measurement below.
  gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::int32_t, BitDepthParams>(
      &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
      -128, EmptyPipeline());

  // Timed loop, as in benchmark_8bit above.
  double time_start = real_time_in_seconds();
  double t = time_start;
  int iters = 0;
  int iters_at_a_time = 1;
  while (t - time_start < kBenchmarkSecs) {
    for (int i = 0; i < iters_at_a_time; i++) {
      gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::int32_t,
                                       BitDepthParams>(
          &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
          -128, EmptyPipeline());
      iters++;
    }
    iters_at_a_time *= 2;
    t = real_time_in_seconds();
  }
  return (t - time_start) / iters;
}
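
// The BENCHMARK_float path in benchmark() below calls benchmark_float(),
// whose definition is not part of this excerpt. The following is a minimal
// sketch, assuming the intended semantics are "time a plain float GEMM of
// the same shape": a naive triple loop, not an optimized float GEMM, so
// treat its numbers as a rough reference only.
#ifdef BENCHMARK_float
float benchmark_float(int rows, int depth, int cols) {
  std::vector<float> lhs(rows * depth, 0.5f);    // row-major, rows x depth
  std::vector<float> rhs(depth * cols, 0.5f);    // column-major, depth x cols
  std::vector<float> result(rows * cols, 0.0f);  // column-major, rows x cols

  // Same doubling-batch timed loop as in the 8bit benchmarks above.
  double time_start = gemmlowp::real_time_in_seconds();
  double t = time_start;
  int iters = 0;
  int iters_at_a_time = 1;
  while (t - time_start < kBenchmarkSecs) {
    for (int i = 0; i < iters_at_a_time; i++) {
      // Naive GEMM: result = lhs * rhs.
      for (int c = 0; c < cols; c++) {
        for (int r = 0; r < rows; r++) {
          float accum = 0;
          for (int d = 0; d < depth; d++) {
            accum += lhs[r * depth + d] * rhs[d + depth * c];
          }
          result[r + rows * c] = accum;
        }
      }
      iters++;
    }
    iters_at_a_time *= 2;
    t = gemmlowp::real_time_in_seconds();
  }
  // Keep the result observable so the loop is not optimized away.
  volatile float sink = result[0];
  (void)sink;
  return (t - time_start) / iters;
}
#endif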

struct Shape {
  int rows;
  int depth;
  int cols;
};

bool operator==(const Shape& s1, const Shape& s2) {
  return s1.rows == s2.rows && s1.depth == s2.depth && s1.cols == s2.cols;
}

// Lexicographic order by (depth, rows, cols); used as the std::map key order.
bool operator<(const Shape& shape1, const Shape& shape2) {
  return shape1.depth < shape2.depth ||
         (shape1.depth == shape2.depth &&
          (shape1.rows < shape2.rows ||
           (shape1.rows == shape2.rows && shape1.cols < shape2.cols)));
}

#ifdef _WIN32
#include <windows.h>
// Win32 Sleep() takes milliseconds, unlike POSIX sleep() which takes seconds.
#define sleep(t) Sleep(1000 * (t))
#else
#include <unistd.h>
#endif

float benchmark(const Shape& shape) {
  if (kCooldownBeforeBenchmarkSecs) {
    sleep(kCooldownBeforeBenchmarkSecs);
  }
#if defined BENCHMARK_8bit
  // Benchmark the fast 8bit path, using L8R8WithLhsNonzeroBitDepthParams.
  // This is the recommended default: it's what most applications want to
  // use, as it's the fastest. The contract is that LHS must take values in
  // [1, 255], while RHS can take any value in [0, 255].
  return benchmark_8bit<gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
      shape.rows, shape.depth, shape.cols);
#elif defined BENCHMARK_8bit_wide
  // Variant benchmarking the slower (mostly legacy) DefaultL8R8BitDepthParams.
  // The only contract difference is that both LHS and RHS can take values in
  // [0, 255].
  return benchmark_8bit<gemmlowp::DefaultL8R8BitDepthParams>(
      shape.rows, shape.depth, shape.cols);
#elif defined BENCHMARK_8bit_to_32bit
  // Variant of BENCHMARK_8bit where the user asks for raw int32 accumulators
  // instead of an 8bit-downscaled result.
  return benchmark_8bit_to_32bit<gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
      shape.rows, shape.depth, shape.cols);
#elif defined BENCHMARK_8bit_to_32bit_wide
  // Variant of BENCHMARK_8bit_wide where the user asks for raw int32
  // accumulators instead of an 8bit-downscaled result.
  return benchmark_8bit_to_32bit<gemmlowp::DefaultL8R8BitDepthParams>(
      shape.rows, shape.depth, shape.cols);
#elif defined BENCHMARK_float
  // Float path; see the reference sketch of benchmark_float above.
  return benchmark_float(shape.rows, shape.depth, shape.cols);
#else
#error What arithmetic path should we benchmark? (Suggestion: #define BENCHMARK_8bit)
#endif
}
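
// Returns the set of matrix sizes to benchmark: powers of two in [1, 2048],
// half-octave steps (factor sqrt(2)) in [8, 2048], and quarter-octave steps
// (factor 2^(1/4)) in [16, 512], rounded to the nearest integer.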
std::set<int> all_sizes() {
  std::set<int> sizes;
  for (int i = 1; i <= 2048; i *= 2) {
    sizes.insert(i);
  }
  for (double x = 8; x <= 2048; x *= std::sqrt(2.)) {
    sizes.insert(static_cast<int>(std::round(x)));
  }
  for (double x = 16; x <= 512; x *= std::pow(2., 1. / 4.)) {
    sizes.insert(static_cast<int>(std::round(x)));
  }
  return sizes;
}

std::mt19937& RandomEngine() {
  static std::mt19937 engine;
  return engine;
}
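
// The engine above is default-seeded: shuffles based on it produce the same
// "random" order on every run, which keeps benchmark runs comparable.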

std::vector<Shape> all_shapes_in_random_order() {
  std::vector<Shape> shapes;
  const std::set<int> sizes = all_sizes();
#if defined BENCHMARK_ROWS
  // Benchmark one specific shape.
  Shape shape;
  shape.rows = BENCHMARK_ROWS;
  shape.depth = BENCHMARK_DEPTH;
  shape.cols = BENCHMARK_COLS;
  shapes.push_back(shape);
#elif defined BENCHMARK_QUICK
  // Benchmark an assortment of cubic shapes.
  for (int size : sizes) {
    Shape shape;
    shape.rows = size;
    shape.depth = size;
    shape.cols = size;
    shapes.push_back(shape);
  }
#elif defined BENCHMARK_EXHAUSTIVE
  // Benchmark all combinations of sizes.
  for (int rows : sizes) {
    for (int depth : sizes) {
      for (int cols : sizes) {
        Shape shape;
        shape.rows = rows;
        shape.depth = depth;
        shape.cols = cols;
        shapes.push_back(shape);
      }
    }
  }
#else
#error What shapes should we benchmark? (Suggestion: #define BENCHMARK_QUICK)
#endif
  std::shuffle(std::begin(shapes), std::end(shapes), RandomEngine());
  return shapes;
}

// Runs all benchmark shapes kPasses times in random order, keeping for each
// shape the minimum latency observed across passes.
void run_benchmarks(std::map<Shape, float>* results) {
  std::vector<Shape> shapes;
  for (int pass = 0; pass < kPasses; pass++) {
    const std::vector<Shape> pass_shapes = all_shapes_in_random_order();
    shapes.insert(std::end(shapes), std::begin(pass_shapes),
                  std::end(pass_shapes));
  }

  const double time_start = gemmlowp::real_time_in_seconds();
  for (std::size_t i = 0; i < shapes.size(); i++) {
    const double ratio = static_cast<double>(i) / shapes.size();
    const double elapsed = gemmlowp::real_time_in_seconds() - time_start;
    const double elapsed_hours = elapsed / 3600.;
    // Guard against a division by zero in the ETA on the first iteration.
    const double eta_hours =
        ratio > 0 ? elapsed_hours * (1. - ratio) / ratio : 0.;
    fprintf(stderr,
            "Benchmarking: %.2f%% done, Elapsed: %.2f hours, ETA: %.2f "
            "hours...   \r",
            100. * ratio, elapsed_hours, eta_hours);
    fflush(stderr);
    const Shape& shape = shapes[i];
    float latency = benchmark(shape);
    if (results->count(shape)) {
      (*results)[shape] = std::min(latency, (*results)[shape]);
    } else {
      (*results)[shape] = latency;
    }
  }
  fprintf(stderr, "\n");
}

int main() {
  std::map<Shape, float> results;
  run_benchmarks(&results);
  printf("Using %d thread(s)\n", kNumThreads);
  printf("depth,rows,cols,latency(s),Gop/s\n");
  for (const auto& result : results) {
    const Shape& shape = result.first;
    // Gop/s counts 2 ops (multiply and add) per entry of the
    // depth x rows x cols multiply-accumulate volume.
    printf("%d,%d,%d,%.4g,%.4g\n", shape.depth, shape.rows, shape.cols,
           result.second,
           2e-9 * shape.depth * shape.rows * shape.cols / result.second);
  }
}