1# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7#     http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14# ==============================================================================
15"""Benchmarks for autotuning performance knobs."""
16from __future__ import absolute_import
17from __future__ import division
18from __future__ import print_function
19
20
21import numpy as np
22
23from tensorflow.python.data.benchmarks import benchmark_base
24from tensorflow.python.data.ops import dataset_ops
25from tensorflow.python.ops import math_ops
26
27
28class AutotuneBenchmark(benchmark_base.DatasetBenchmarkBase):
29  """Benchmarks for autotuning performance knobs."""
30
31  def _run_benchmark(self, dataset, autotune, autotune_buffers,
32                     benchmark_iters, benchmark_label):
33    options = dataset_ops.Options()
34    options.experimental_optimization.apply_default_optimizations = False
35    options.experimental_optimization.autotune = autotune
36    options.experimental_optimization.autotune_buffers = autotune_buffers
37    dataset = dataset.with_options(options)
38
39    autotune_string = "_autotune_{}".format(
40        "parallelism_and_buffer_sizes"
41        if autotune_buffers else "parallelism_only")
42    wall_time = self.run_and_report_benchmark(
43        dataset=dataset,
44        num_elements=benchmark_iters,
45        warmup=True,
46        iters=1,
47        name=benchmark_label + (autotune_string if autotune else ""))
48    return wall_time
49
50  def benchmark_batch(self):
51    a = self._benchmark_batch(autotune=False)
52    b = self._benchmark_batch(autotune=True, autotune_buffers=False)
53    c = self._benchmark_batch(autotune=True, autotune_buffers=True)
54    print("autotune parallelism vs no autotuning speedup: {}".format(a / b))
55    print("autotune parallelism and buffer sizes vs no autotuning speedup: {}"
56          .format(a / c))
57
58  def _benchmark_batch(self, autotune, autotune_buffers=False):
59    batch_size = 128
60    k = 1024
61    dataset = dataset_ops.Dataset.from_tensors(
62        (np.random.rand(1, 4 * k), np.random.rand(4 * k, 1))).repeat()
63    dataset = dataset.map(math_ops.matmul)
64    dataset = dataset.batch(
65        batch_size=batch_size, num_parallel_calls=dataset_ops.AUTOTUNE)
66    return self._run_benchmark(
67        dataset,
68        autotune,
69        autotune_buffers,
70        benchmark_iters=10000,
71        benchmark_label="batch")
72
73  def benchmark_map(self):
74    a = self._benchmark_map(autotune=False)
75    b = self._benchmark_map(autotune=True, autotune_buffers=False)
76    c = self._benchmark_map(autotune=True, autotune_buffers=True)
77    print("autotune parallelism vs no autotuning speedup: {}".format(a / b))
78    print("autotune parallelism and buffer sizes vs no autotuning speedup: {}"
79          .format(a / c))
80
81  def _benchmark_map(self, autotune, autotune_buffers=False):
82    k = 1024 * 1024
83    dataset = dataset_ops.Dataset.from_tensors(
84        (np.random.rand(1, 4 * k), np.random.rand(4 * k, 1))).repeat()
85    dataset = dataset.map(
86        math_ops.matmul, num_parallel_calls=dataset_ops.AUTOTUNE)
87    return self._run_benchmark(
88        dataset=dataset,
89        autotune=autotune,
90        autotune_buffers=autotune_buffers,
91        benchmark_iters=10000,
92        benchmark_label="map")
93
94  def benchmark_map_and_batch(self):
95    a = self._benchmark_map_and_batch(autotune=False)
96    b = self._benchmark_map_and_batch(autotune=True, autotune_buffers=False)
97    c = self._benchmark_map_and_batch(autotune=True, autotune_buffers=True)
98    print("autotune parallelism vs no autotuning speedup: {}".format(a / b))
99    print("autotune parallelism and buffer sizes vs no autotuning speedup: {}"
100          .format(a / c))
101
102  def _benchmark_map_and_batch(self, autotune, autotune_buffers=False):
103    batch_size = 16
104    k = 1024 * 1024
105    dataset = dataset_ops.Dataset.from_tensors(
106        (np.random.rand(1, 4 * k), np.random.rand(4 * k, 1))).repeat()
107    dataset = dataset.map(
108        math_ops.matmul, num_parallel_calls=dataset_ops.AUTOTUNE)
109    dataset = dataset.batch(batch_size=batch_size)
110    return self._run_benchmark(
111        dataset=dataset,
112        autotune=autotune,
113        autotune_buffers=autotune_buffers,
114        benchmark_iters=1000,
115        benchmark_label="map_and_batch")
116
117  def benchmark_interleave(self):
118    a = self._benchmark_interleave(autotune=False)
119    b = self._benchmark_interleave(autotune=True, autotune_buffers=False)
120    c = self._benchmark_interleave(autotune=True, autotune_buffers=True)
121    print("autotune parallelism vs no autotuning speedup: {}".format(a / b))
122    print("autotune parallelism and buffer sizes vs no autotuning speedup: {}"
123          .format(a / c))
124
125  def _benchmark_interleave(self, autotune, autotune_buffers=False):
126    k = 1024 * 1024
127    dataset = dataset_ops.Dataset.from_tensors(
128        (np.random.rand(1, 4 * k), np.random.rand(4 * k, 1))).repeat()
129    dataset = dataset.map(math_ops.matmul)
130    dataset = dataset_ops.Dataset.range(1).repeat().interleave(
131        lambda _: dataset,
132        cycle_length=10,
133        num_parallel_calls=dataset_ops.AUTOTUNE)
134    return self._run_benchmark(
135        dataset=dataset,
136        autotune=autotune,
137        autotune_buffers=autotune_buffers,
138        benchmark_iters=10000,
139        benchmark_label="interleave")
140
141  def benchmark_map_and_interleave(self):
142    a = self._benchmark_map_and_interleave(autotune=False)
143    b = self._benchmark_map_and_interleave(
144        autotune=True, autotune_buffers=False)
145    c = self._benchmark_map_and_interleave(autotune=True, autotune_buffers=True)
146    print("autotune parallelism vs no autotuning speedup: {}".format(a / b))
147    print("autotune parallelism and buffer sizes vs no autotuning speedup: {}"
148          .format(a / c))
149
150  def _benchmark_map_and_interleave(self, autotune, autotune_buffers=False):
151    k = 1024 * 1024
152    a = (np.random.rand(1, 8 * k), np.random.rand(8 * k, 1))
153    b = (np.random.rand(1, 4 * k), np.random.rand(4 * k, 1))
154    c = (np.random.rand(1, 2 * k), np.random.rand(2 * k, 1))
155    dataset_a = dataset_ops.Dataset.from_tensors(a).repeat()
156    dataset_b = dataset_ops.Dataset.from_tensors(b).repeat()
157    dataset_c = dataset_ops.Dataset.from_tensors(c).repeat()
158
159    def f1(x, y):
160      return math_ops.matmul(x, y)
161
162    def f2(a, b):
163      x, y = b
164      return a, math_ops.matmul(x, y)
165
166    dataset = dataset_a
167    dataset = dataset.map(f1, num_parallel_calls=dataset_ops.AUTOTUNE)
168    dataset = dataset_ops.Dataset.range(1).repeat().interleave(
169        lambda _: dataset,
170        num_parallel_calls=dataset_ops.AUTOTUNE,
171        cycle_length=2)
172
173    dataset = dataset_ops.Dataset.zip((dataset, dataset_b))
174    dataset = dataset.map(f2, num_parallel_calls=dataset_ops.AUTOTUNE)
175    dataset = dataset_ops.Dataset.range(1).repeat().interleave(
176        lambda _: dataset,
177        num_parallel_calls=dataset_ops.AUTOTUNE,
178        cycle_length=2)
179
180    dataset = dataset_ops.Dataset.zip((dataset, dataset_c))
181    dataset = dataset.map(f2, num_parallel_calls=dataset_ops.AUTOTUNE)
182    return self._run_benchmark(
183        dataset=dataset,
184        autotune=autotune,
185        autotune_buffers=autotune_buffers,
186        benchmark_iters=10000,
187        benchmark_label="map_and_interleave")
188
189  def benchmark_map_batch_and_interleave(self):
190    a = self._benchmark_map_batch_and_interleave(autotune=False)
191    b = self._benchmark_map_batch_and_interleave(
192        autotune=True, autotune_buffers=False)
193    c = self._benchmark_map_batch_and_interleave(
194        autotune=True, autotune_buffers=True)
195    print("autotune parallelism vs no autotuning speedup: {}".format(a / b))
196    print("autotune parallelism and buffer sizes vs no autotuning speedup: {}"
197          .format(a / c))
198
199  def _benchmark_map_batch_and_interleave(self,
200                                          autotune,
201                                          autotune_buffers=False):
202    batch_size = 16
203    k = 1024 * 1024
204    a = (np.random.rand(1, 8 * k), np.random.rand(8 * k, 1))
205    b = (np.random.rand(1, 4 * k), np.random.rand(4 * k, 1))
206    c = (np.random.rand(1, 2 * k), np.random.rand(2 * k, 1))
207    dataset_a = dataset_ops.Dataset.from_tensors(a).repeat()
208    dataset_b = dataset_ops.Dataset.from_tensors(b).repeat()
209    dataset_c = dataset_ops.Dataset.from_tensors(c).repeat()
210
211    dataset = dataset_a
212    dataset = dataset.map(
213        math_ops.matmul, num_parallel_calls=dataset_ops.AUTOTUNE)
214    dataset = dataset.batch(batch_size=batch_size)
215    dataset = dataset_ops.Dataset.range(1).repeat().interleave(
216        lambda _: dataset,
217        num_parallel_calls=dataset_ops.AUTOTUNE,
218        cycle_length=2)
219
220    dataset = dataset_ops.Dataset.zip((dataset, dataset_b))
221    dataset = dataset_ops.Dataset.range(1).repeat().interleave(
222        lambda _: dataset,
223        num_parallel_calls=dataset_ops.AUTOTUNE,
224        cycle_length=2)
225
226    dataset_c = dataset_c.map(
227        math_ops.matmul, num_parallel_calls=dataset_ops.AUTOTUNE)
228    dataset_c = dataset_c.batch(batch_size=batch_size)
229    dataset = dataset_ops.Dataset.zip((dataset, dataset_c))
230    return self._run_benchmark(
231        dataset=dataset,
232        autotune=autotune,
233        autotune_buffers=autotune_buffers,
234        benchmark_iters=1000,
235        benchmark_label="map_batch_and_interleave")
236
237
# Script entry point: when executed directly, hand control to the `main`
# function reachable as `benchmark_base.test` (presumably the TensorFlow test
# runner re-exported by the benchmark base module -- confirm there).
if __name__ == "__main__":
  benchmark_base.test.main()
240