1 // Copyright 2016 The Gemmlowp Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
#include <unistd.h>
#ifdef __APPLE__
#include <sys/time.h>
#endif

#include <chrono>
#include <cstdint>
#include <cstdlib>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <limits>
#include <map>
#include <memory>
#include <string>
#include <vector>
28 
29 #include "multi_thread_transform.h"
30 #include "transform_kernels.h"
31 
32 using namespace gemmlowp::meta;
33 
// Returns a timestamp in seconds, read from a monotonic clock.
//
// Only the difference between two return values is meaningful: the clock's
// epoch is unspecified. A steady clock is used instead of the wall clock so
// that system time adjustments (NTP, manual changes) occurring between two
// calls cannot shorten, lengthen or negate a measured interval.
double time() {
  typedef std::chrono::duration<double> Seconds;
  const std::chrono::steady_clock::time_point now =
      std::chrono::steady_clock::now();
  return std::chrono::duration_cast<Seconds>(now.time_since_epoch()).count();
}
45 
// Kernel-size template argument that run_benchmark passes to Transform1D and
// MultiThreadTransform1D. A typed constant rather than a macro, so the name
// obeys normal scoping rules and cannot rewrite unrelated tokens.
constexpr int kernel_size = 16;
47 
48 template <typename Context, typename Params>
run_benchmark(const std::string & name,int repetitions,int elements,Context * context,const Params & params)49 void run_benchmark(const std::string& name, int repetitions, int elements,
50                    Context* context, const Params& params) {
51   std::cout << "Benchmark: " << name << std::endl;
52   std::cout << "Warmup single." << std::endl;
53 
54   for (int i = 0; i < 10; ++i) {
55     Transform1D<Params, kernel_size>(params);
56   }
57 
58   std::cout << "Benchmark single." << std::endl;
59 
60   double start = time();
61 
62   for (int i = 0; i < repetitions; ++i) {
63     Transform1D<Params, kernel_size>(params);
64   }
65 
66   double wall_time = time() - start;
67   double ops = static_cast<double>(elements) * repetitions;
68   std::cout << "Avg: " << (wall_time / repetitions) << std::endl;
69   std::cout << "Perf: " << static_cast<std::int64_t>(ops / wall_time) << "/s."
70             << std::endl;
71 
72   std::cout << "Warmup single." << std::endl;
73 
74   for (int i = 0; i < 10; ++i) {
75     MultiThreadTransform1D<Context, Params, kernel_size>(context, params);
76   }
77 
78   std::cout << "Benchmark multi." << std::endl;
79 
80   start = time();
81 
82   for (int i = 0; i < repetitions; ++i) {
83     MultiThreadTransform1D<Context, Params, kernel_size>(context, params);
84   }
85 
86   wall_time = time() - start;
87   ops = static_cast<double>(elements) * repetitions;
88   std::cout << "Avg: " << (wall_time / repetitions) << std::endl;
89   std::cout << "Perf: " << static_cast<std::int64_t>(ops / wall_time) << "/s."
90             << std::endl;
91 }
92 
main()93 int main() {
94   const int repetitions = 500;
95   const int elements = 4 * 1024 * 1024;
96 
97   std::unique_ptr<std::int32_t[]> int32_array(new std::int32_t[elements]);
98   std::unique_ptr<std::uint8_t[]> uint8_array(new std::uint8_t[elements]);
99   std::unique_ptr<float[]> float_array(new float[elements]);
100 
101   typedef SimpleContext<gemmlowp::WorkersPool> Context;
102   Context context(4, new gemmlowp::WorkersPool());
103 
104   typedef Transform1DParams<std::int32_t, std::uint8_t, Requantize> RequantizeParams;
105   RequantizeParams requantize_params;
106   requantize_params.input = int32_array.get();
107   requantize_params.output = uint8_array.get();
108   requantize_params.kernel.count = elements;
109   requantize_params.kernel.input_range_min = -100.0f;
110   requantize_params.kernel.input_range_scale =
111       200.0f / ((static_cast<std::int64_t>(1) << 32) - 1);
112   requantize_params.kernel.input_range_offset =
113       static_cast<float>(std::numeric_limits<std::int32_t>::lowest());
114   requantize_params.kernel.output_range_min = -200.0f;
115   requantize_params.kernel.one_over_output_range_scale =
116       static_cast<float>((static_cast<std::int64_t>(1) << 8) - 1) / 500.0f;
117   requantize_params.kernel.output_range_offset =
118       static_cast<float>(std::numeric_limits<std::uint8_t>::lowest());
119 
120   run_benchmark("Requantize", repetitions, elements, &context,
121                 requantize_params);
122 
123   typedef Transform1DParams<std::uint8_t, float, Dequantize> DequantizeParams;
124   DequantizeParams dequantize_params;
125   dequantize_params.input = uint8_array.get();
126   dequantize_params.output = float_array.get();
127   dequantize_params.kernel.count = elements;
128   dequantize_params.kernel.range_min = -100.0f;
129   dequantize_params.kernel.range_scale =
130       static_cast<float>((static_cast<std::int64_t>(1) << 8) - 1) / 200.0f;
131   dequantize_params.kernel.range_offset =
132       static_cast<float>(std::numeric_limits<std::uint8_t>::lowest());
133 
134   run_benchmark("Dequantize", repetitions, elements, &context,
135                 dequantize_params);
136 
137   typedef Transform1DParams<float, std::uint8_t, Quantize> QuantizeParams;
138   QuantizeParams quantize_params;
139   quantize_params.input = float_array.get();
140   quantize_params.output = uint8_array.get();
141   quantize_params.kernel.count = elements;
142   quantize_params.kernel.range_min = -100.0f;
143   quantize_params.kernel.range_scale =
144       200.0f / ((static_cast<std::int64_t>(1) << 8) - 1);
145   quantize_params.kernel.range_offset =
146       static_cast<float>(std::numeric_limits<std::uint8_t>::lowest());
147 
148   run_benchmark("Quantize", repetitions, elements, &context, quantize_params);
149 
150   return 0;
151 }
152