1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 // Simple benchmarking facility.
17 #ifndef TENSORFLOW_CORE_PLATFORM_DEFAULT_TEST_BENCHMARK_H_
18 #define TENSORFLOW_CORE_PLATFORM_DEFAULT_TEST_BENCHMARK_H_
19 
20 #include <utility>
21 #include <vector>
22 
23 #if defined(_MSC_VER)
24 #include <intrin.h>  // for _ReadWriteBarrier
25 #endif
26 
27 #include "tensorflow/core/platform/logging.h"
28 #include "tensorflow/core/platform/macros.h"
29 #include "tensorflow/core/platform/platform.h"
30 #include "tensorflow/core/platform/types.h"
31 
// BENCHMARK(n) defines a file-local static pointer, named from `n` and the
// current line number, and initializes it with a heap-allocated
// ::tensorflow::testing::Benchmark built from the stringified name `#n` and the
// routine `n` itself. The macro never frees the object, and the pointer is
// marked TF_ATTRIBUTE_UNUSED because nothing refers to it afterwards.
//
// TF_BENCHMARK_CONCAT forwards to TF_BENCHMARK_CONCAT2 so that __LINE__ is
// macro-expanded to the actual line number before `##` pastes the three
// pieces together.
//
// NOTE(review): constructing the Benchmark is presumably what registers it
// (see the private Benchmark::Register()) -- confirm in the .cc file.
// NOTE(review): the generated identifier starts with a double underscore,
// which the C++ standard reserves for the implementation.
#define BENCHMARK(n)                                            \
  static ::tensorflow::testing::Benchmark* TF_BENCHMARK_CONCAT( \
      __benchmark_, n, __LINE__) TF_ATTRIBUTE_UNUSED =          \
      (new ::tensorflow::testing::Benchmark(#n, (n)))
#define TF_BENCHMARK_CONCAT(a, b, c) TF_BENCHMARK_CONCAT2(a, b, c)
#define TF_BENCHMARK_CONCAT2(a, b, c) a##b##c
38 
// Forward declaration of ::testing::benchmark::State. The class itself is
// defined further down in this header; declaring it here lets
// tensorflow::testing::Benchmark name it in function-pointer signatures.
namespace testing {
namespace benchmark {
class State;
}
}  // namespace testing
44 
45 namespace tensorflow {
46 namespace testing {
namespace internal {
// Declaration only; the definition is not part of this header.
// DoNotOptimize() passes the address of its argument to this function when
// building with MSVC.
void UseCharPointer(char const volatile*);
}
50 
51 // The DoNotOptimize(...) function can be used to prevent a value or
52 // expression from being optimized away by the compiler. This function is
53 // intended to add little to no overhead.
54 // See: http://stackoverflow.com/questions/28287064
55 //
56 // The specific guarantees of DoNotOptimize(x) are:
57 //  1) x, and any data it transitively points to, will exist (in a register or
58 //     in memory) at the current point in the program.
59 //  2) The optimizer will assume that DoNotOptimize(x) could mutate x or
60 //     anything it transitively points to (although it actually doesn't).
61 //
62 // To see this in action:
63 //
64 //   void BM_multiply(benchmark::State& state) {
65 //     int a = 2;
66 //     int b = 4;
67 //     for (auto _ : state) {
68 //       testing::DoNotOptimize(a);
69 //       testing::DoNotOptimize(b);
70 //       int c = a * b;
71 //       testing::DoNotOptimize(c);
72 //     }
73 //   }
74 //   BENCHMARK(BM_multiply);
75 //
76 // Guarantee (2) applied to 'a' and 'b' prevents the compiler lifting the
77 // multiplication outside of the loop. Guarantee (1) applied to 'c' prevents the
78 // compiler from optimizing away 'c' as dead code.
template <class T>
void DoNotOptimize(const T& var) {
#if !defined(_MSC_VER)
  // Empty inline-asm statement that lists `var` as a read-write memory
  // operand. No instruction is emitted, but the compiler has to treat `var`
  // as both used and possibly modified at this point.
  asm volatile("" : "+m"(const_cast<T&>(var)));
#else
  // MSVC path: pass the address of `var` to an out-of-line function and then
  // issue a compiler read/write barrier.
  internal::UseCharPointer(reinterpret_cast<char const volatile*>(&var));
  _ReadWriteBarrier();
#endif
}
88 
89 class Benchmark {
90  public:
91   [[deprecated("use `benchmark::State&` instead.")]] Benchmark(const char* name,
92                                                                void (*fn)(int));
93 
94   [[deprecated("use `benchmark::State&` instead.")]] Benchmark(const char* name,
95                                                                void (*fn)(int,
96                                                                           int));
97 
98   [[deprecated("use `benchmark::State&` instead.")]] Benchmark(
99       const char* name, void (*fn)(int, int, int));
100 
101   Benchmark(const char* name, void (*fn)(::testing::benchmark::State&));
102 
103   Benchmark* Arg(int x);
104   Benchmark* ArgPair(int x, int y);
105   Benchmark* Range(int lo, int hi);
106   Benchmark* RangePair(int lo1, int hi1, int lo2, int hi2);
107 
108   Benchmark* UseRealTime();
109 
110   static void Run(const char* pattern);
111 
112  private:
113   string name_;
114   int num_args_;
115   int instantiated_num_args_ = -1;
116   std::vector<std::pair<int, int> > args_;
117   void (*fn0_)(int) = nullptr;
118   void (*fn1_)(int, int) = nullptr;
119   void (*fn2_)(int, int, int) = nullptr;
120   void (*fn_state_)(::testing::benchmark::State&) = nullptr;
121 
122   void Register();
123   void Run(int arg1, int arg2, int* run_count, double* run_seconds);
124 
125   void CheckArgCount(int expected);
126 };
127 
// Free-function benchmark API. Only the declarations are visible in this
// header; the notes below are inferred from names and signatures and should be
// confirmed against the definitions.

// NOTE(review): presumably runs the registered benchmarks -- confirm.
void RunBenchmarks();
// NOTE(review): presumably attaches `label` to the currently running
// benchmark's report line -- confirm.
void SetLabel(const std::string& label);
// NOTE(review): presumably record the number of bytes / items processed by
// the currently running benchmark -- confirm.
void BytesProcessed(int64);
void ItemsProcessed(int64);
// NOTE(review): presumably resume / pause the benchmark timer -- confirm.
void StartTiming();
void StopTiming();
// NOTE(review): presumably selects wall-clock timing -- confirm.
void UseRealTime();
135 
136 }  // namespace testing
137 }  // namespace tensorflow
138 
// Support the `void BM_Func(benchmark::State&)` interface so that it is
// compatible with the internal version.
141 namespace testing {
142 namespace benchmark {
143 // State is passed as an argument to a benchmark function.
144 // Each thread in threaded benchmarks receives own object.
145 class State {
146  public:
147   // Incomplete iterator-like type with dummy value type so that
148   // benchmark::State can support iteration with a range-based for loop.
149   //
150   // The only supported usage:
151   //
152   //   static void BM_Foo(benchmark::State& state) {
153   //     for (auto s : state) {
154   //       // perform single iteration
155   //     }
156   //   }
157   //
158   // This is meant to replace the deprecated API :
159   //
160   //   static void BM_Foo(int iters) {
161   //     while (iters-- > 0) {
162   //       // perform single iteration
163   //     }
164   //   }
165   //
166   // See go/benchmark#old-benchmark-interface for more details.
167   class Iterator {
168    public:
169     struct Value {
170       // Non-trivial destructor to avoid warning for unused dummy variable in
171       // the range-based for loop.
~ValueValue172       ~Value() {}
173     };
174 
175     explicit Iterator(State* parent);
176 
177     Iterator& operator++();
178 
179     bool operator!=(const Iterator& other);
180 
181     Value operator*();
182 
183    private:
184     State* const parent_;
185   };
186 
187   Iterator begin();
188   Iterator end();
189 
190   void PauseTiming();
191   void ResumeTiming();
192 
193   // Set the number of bytes processed by the current benchmark
194   // execution.  This routine is typically called once at the end of a
195   // throughput oriented benchmark.  If this routine is called with a
196   // value > 0, then bytes processed per second is also reported.
197   void SetBytesProcessed(::tensorflow::int64 bytes);
198 
199   // If this routine is called with items > 0, then an items/s
200   // label is printed on the benchmark report line for the currently
201   // executing benchmark. It is typically called at the end of a processing
202   // benchmark where a processing items/second output is desired.
203   void SetItemsProcessed(::tensorflow::int64 items);
204 
205   // If this method is called, the specified label is printed at the
206   // end of the benchmark report line for the currently executing
207   // benchmark.  Example:
208   //  static void BM_Compress(benchmark::State& state) {
209   //    ...
210   //    double compression = input_size / output_size;
211   //    state.SetLabel(StringPrintf("compress:%.1f%%", 100.0*compression));
212   //  }
213   // Produces output that looks like:
214   //  BM_Compress   50         50   14115038  compress:27.3%
215   //
216   // REQUIRES: a benchmark is currently executing
217   void SetLabel(absl::string_view label);
218 
219   // For parameterized benchmarks, range(i) returns the value of the ith
220   // parameter. Simple benchmarks are not parameterized and do not need to call
221   // range().
222   int range(size_t i) const;
223 
224   // Total number of iterations processed so far.
225   size_t iterations() const;
226 
227   const size_t
228       max_iterations;  // NOLINT: for compatibility with OSS benchmark library
229 
230   // Disallow copy and assign.
231   State(const State&) = delete;
232   State& operator=(const State&) = delete;
233 
234  protected:
235   friend class tensorflow::testing::Benchmark;
236   State(size_t max_iterations, int formal_arg_count, std::vector<int> args);
237 
238  private:
239   size_t completed_iterations_;
240   const int formal_arg_count_;
241   const std::vector<int> args_;
242 };
243 
Iterator(State * parent)244 inline State::Iterator::Iterator(State* parent) : parent_(parent) {}
245 
iterations()246 inline size_t State::iterations() const { return completed_iterations_; }
247 
248 inline bool State::Iterator::operator!=(const Iterator& other) {
249   DCHECK_EQ(other.parent_, nullptr);
250   DCHECK_NE(parent_, nullptr);
251 
252   if (parent_->completed_iterations_ < parent_->max_iterations) {
253     return true;
254   }
255 
256   ++parent_->completed_iterations_;
257   // If this is the last iteration, stop the timer.
258   parent_->PauseTiming();
259   return false;
260 }
261 
262 inline State::Iterator& State::Iterator::operator++() {
263   DCHECK_LT(parent_->completed_iterations_, parent_->max_iterations);
264   ++parent_->completed_iterations_;
265   return *this;
266 }
267 
268 inline State::Iterator::Value State::Iterator::operator*() { return Value(); }
269 
begin()270 inline State::Iterator State::begin() {
271   // Starts the timer here because if the code uses this API, it expects
272   // the timer to starts at the beginning of this loop.
273   ResumeTiming();
274   return Iterator(this);
275 }
276 
end()277 inline State::Iterator State::end() { return Iterator(nullptr); }
278 
// Declaration only; the definition is not part of this header.
// NOTE(review): presumably runs the selected benchmarks, mirroring the entry
// point of the same name in the OSS benchmark library -- confirm.
void RunSpecifiedBenchmarks();
280 
281 }  // namespace benchmark
282 }  // namespace testing
283 
284 #endif  // TENSORFLOW_CORE_PLATFORM_DEFAULT_TEST_BENCHMARK_H_
285