1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_CORE_GRAPPLER_COSTS_OP_LEVEL_COST_ESTIMATOR_H_
17 #define TENSORFLOW_CORE_GRAPPLER_COSTS_OP_LEVEL_COST_ESTIMATOR_H_
18 
19 #include <numeric>
20 
21 #include "tensorflow/core/grappler/costs/cost_estimator.h"
22 #include "tensorflow/core/grappler/costs/op_context.h"
23 #include "tensorflow/core/grappler/costs/op_performance_data.pb.h"
24 #include "tensorflow/core/lib/core/status.h"
25 #include "tensorflow/core/util/padding.h"
26 
27 namespace tensorflow {
28 namespace grappler {
29 
30 bool GetTensorShapeProtoFromTensorProto(const TensorProto& tensor_proto,
31                                         TensorShapeProto* tensor_shape_proto);
32 TensorShapeProto MaybeGetMinimumShape(const TensorShapeProto& original_shape,
33                                       int rank, bool* found_unknown_shapes);
34 
35 // Node costs; an intermediate structure used within op level cost estimator.
36 struct NodeCosts {
37   // If this FLAG is true, override calculated compute time with a minimum
38   // value, instead of calculating it from num_compute_ops and compute ops/sec.
39   // For example, PredictIdentity, PredictVariable, PredictMetadata set this
40   // FLAG.
41   bool minimum_cost_op = false;
42 
43   // Compute ops.
44   int64 num_compute_ops = 0;
45 
46   // Memory bytes accessed; note that these may be different to the size of
47   // tensors.
48   std::vector<int64> num_input_bytes_accessed;   // ordered by input tensors.
49   std::vector<int64> num_output_bytes_accessed;  // ordered by output ports.
50   int64 internal_read_bytes = 0;
51   int64 internal_write_bytes = 0;
52 
53   // Convenience functions.
num_total_input_bytesNodeCosts54   int64 num_total_input_bytes() const {
55     return std::accumulate(num_input_bytes_accessed.begin(),
56                            num_input_bytes_accessed.end(), 0LL);
57   }
num_total_read_bytesNodeCosts58   int64 num_total_read_bytes() const {
59     return num_total_input_bytes() + internal_read_bytes;
60   }
num_total_output_bytesNodeCosts61   int64 num_total_output_bytes() const {
62     return std::accumulate(num_output_bytes_accessed.begin(),
63                            num_output_bytes_accessed.end(), 0LL);
64   }
num_total_write_bytesNodeCosts65   int64 num_total_write_bytes() const {
66     return num_total_output_bytes() + internal_write_bytes;
67   }
num_bytes_accessedNodeCosts68   int64 num_bytes_accessed() const {
69     return num_total_read_bytes() + num_total_write_bytes();
70   }
71 
72   // Memory usage.
73   int64 max_memory = 0;
74   int64 persistent_memory = 0;
75   int64 temporary_memory = 0;
76 
77   // Stats.
78   int64 num_nodes = 1;
79   int64 num_nodes_with_unknown_shapes = 0;
80   int64 num_nodes_with_unknown_op_type = 0;
81   int64 num_nodes_with_pure_memory_op = 0;
82   bool inaccurate = false;
83 
84   // TODO(dyoon): this is added for compatibility; some old code is hard to
85   // migrate; hence, using these as a backup. Once we clean up, we'll delete
86   // these fields. New code should not use these.
87   bool has_costs = false;
88   Costs costs;
89 };
90 
91 class OpLevelCostEstimator {
92  public:
93   OpLevelCostEstimator();
~OpLevelCostEstimator()94   virtual ~OpLevelCostEstimator() {}
95 
96   virtual Costs PredictCosts(const OpContext& op_context) const;
97 
98   // Returns basic device performance info.
99   virtual DeviceInfo GetDeviceInfo(const DeviceProperties& device) const;
100 
101  protected:
102   // TODO(dyoon): Consider to remove PredictOpCountBasedCosts() with OpInfo.
103   // Naive cost estimate based on the given operations count and total
104   // input/output tensor sizes of the given op_info combined.
105   Costs PredictOpCountBasedCost(double operations, const OpInfo& op_info) const;
106 
107   // Naive cost estimate based on the given operations count and the given total
108   // io size in bytes. Sizes of op_info inputs and outputs are not taken into
109   // consideration.
110   Costs PredictOpCountBasedCost(double operations, double input_io_bytes,
111                                 double output_io_bytes,
112                                 const OpInfo& op_info) const;
113 
114   // Top-level method cost function (PredictCosts calls this method to get
115   // NodeCosts, and then converts it to Costs). PredictNodeCosts() calls other
116   // Predict methods depending on op types.
117   Status PredictNodeCosts(const OpContext& op_context,
118                           NodeCosts* node_costs) const;
119 
120   // Predict cost of an op for which no accurate estimator is defined.
121   Status PredictCostOfAnUnknownOp(const OpContext& op_context,
122                                   NodeCosts* node_costs) const;
123 
124   // This family of routines predicts the costs to
125   // perform the specified TensorFlow Op on the
126   // device represented by a subclass. The default
127   // implementation just divides the operations to
128   // perform the op (from the "Count" routines,
129   // above) by the device peak operations per
130   // second.
131   // Implementation of costs other than
132   // execution_time is optional, depending on the
133   // device.
134   Status PredictNaryOp(const OpContext& op_context,
135                        NodeCosts* node_costs) const;
136   Status PredictConv2D(const OpContext& op_context,
137                        NodeCosts* node_costs) const;
138   Status PredictCwiseOp(const OpContext& op_context,
139                         NodeCosts* node_costs) const;
140   Status PredictConv2DBackpropInput(const OpContext& op_context,
141                                     NodeCosts* node_costs) const;
142   Status PredictConv2DBackpropFilter(const OpContext& op_context,
143                                      NodeCosts* node_costs) const;
144   Status PredictFusedConv2DBiasActivation(const OpContext& op_context,
145                                           NodeCosts* node_costs) const;
146   Status PredictMatMul(const OpContext& op_context,
147                        NodeCosts* node_costs) const;
148   Status PredictSparseTensorDenseMatMul(const OpContext& op_context,
149                                         NodeCosts* node_costs) const;
150   Status PredictNoOp(const OpContext& op_context, NodeCosts* node_costs) const;
151   Status PredictIdentity(const OpContext& op_context,
152                          NodeCosts* node_costs) const;
153   Status PredictVariable(const OpContext& op_context,
154                          NodeCosts* node_costs) const;
155   Status PredictBatchMatMul(const OpContext& op_context,
156                             NodeCosts* node_costs) const;
157   Status PredictMetadata(const OpContext& op_context,
158                          NodeCosts* node_costs) const;
159   Status PredictGatherOrSlice(const OpContext& op_context,
160                               NodeCosts* node_costs) const;
161   Status PredictScatter(const OpContext& op_context,
162                         NodeCosts* node_costs) const;
163   Status PredictMaxPool(const OpContext& op_context,
164                         NodeCosts* node_costs) const;
165   Status PredictMaxPoolGrad(const OpContext& op_context,
166                             NodeCosts* node_costs) const;
167   Status PredictAvgPool(const OpContext& op_context,
168                         NodeCosts* node_costs) const;
169   Status PredictAvgPoolGrad(const OpContext& op_context,
170                             NodeCosts* node_costs) const;
171   Status PredictFusedBatchNorm(const OpContext& op_context,
172                                NodeCosts* node_costs) const;
173   Status PredictFusedBatchNormGrad(const OpContext& op_context,
174                                    NodeCosts* node_costs) const;
175   Status PredictEinsum(const OpContext& op_context,
176                        NodeCosts* node_costs) const;
177   Status PredictAssignVariableOps(const OpContext& op_context,
178                                   NodeCosts* node_costs) const;
179   Status PredictPureMemoryOp(const OpContext& op_context,
180                              NodeCosts* node_costs) const;
181   Status PredictSoftmax(const OpContext& op_context,
182                         NodeCosts* node_costs) const;
183   Status PredictResizeBilinear(const OpContext& op_context,
184                                NodeCosts* node_costs) const;
185   Status PredictCropAndResize(const OpContext& op_context,
186                               NodeCosts* node_costs) const;
187 
188   // Generic cost prediction method for fused operations.
189   Status PredictFusedOp(const OpContext& op_context,
190                         const std::vector<OpContext>& fused_op_contexts,
191                         NodeCosts* node_costs) const;
192 
193   // Utility function for safe division. Returns 0
194   // if rhs is 0 or negative.
SafeDiv(const double lhs,const double rhs)195   static double SafeDiv(const double lhs, const double rhs) {
196     if (rhs > 0) {
197       return lhs / rhs;
198     } else {
199       return 0.0;
200     }
201   }
202 
203   // This family of routines counts the number of operations to perform the
204   // specified TensorFlow Op.
205   struct MatMulDimensions {
206     int m;
207     int n;
208     int k;
209   };
210   struct BatchMatMulDimensions {
211     std::vector<int> batch_dims;
212     MatMulDimensions matmul_dims;
213   };
214   struct ConvolutionDimensions {
215     int64 batch;  // Batch size.
216     int64 ix;     // Input size x.
217     int64 iy;     // Input size y.
218     int64 iz;     // Input depth.
219     int64 kx;     // Kernel x.
220     int64 ky;     // Kernel y.
221     int64 kz;     // Kernel depth (in case of group convolution, this will be
222                   // smaller than input depth).
223     int64 oz;     // Output depth.
224     int64 ox;     // Output size x.
225     int64 oy;     // Output size y.
226     int64 sx;     // Stride x.
227     int64 sy;     // Stride y.
228     Padding padding;  // SAME or VALID.
229   };
230   static int64 CountConv2DOperations(const OpInfo& op_info,
231                                      bool* found_unknown_shapes);
232   static int64 CountConv2DOperations(const OpInfo& op_info,
233                                      ConvolutionDimensions* conv_info,
234                                      bool* found_unknown_shapes);
235   static int64 CountMatMulOperations(const OpInfo& op_info,
236                                      bool* found_unknown_shapes);
237   static int64 CountMatMulOperations(const OpInfo& op_info,
238                                      MatMulDimensions* mat_mul,
239                                      bool* found_unknown_shapes);
240   bool GenerateBatchMatmulContextFromEinsum(const OpContext& einsum_context,
241                                             OpContext* batch_matmul_context,
242                                             bool* found_unknown_shapes) const;
243   static int64 CountBatchMatMulOperations(const OpInfo& op_info,
244                                           bool* found_unknown_shapes);
245   static int64 CountBatchMatMulOperations(const OpInfo& op_info,
246                                           BatchMatMulDimensions* batch_mat_mul,
247                                           bool* found_unknown_shapes);
248   static int64 CountConv2DBackpropInputOperations(
249       const OpInfo& op_info, ConvolutionDimensions* returned_conv_dims,
250       bool* found_unknown_shapes);
251   static int64 CountConv2DBackpropFilterOperations(
252       const OpInfo& op_info, ConvolutionDimensions* returned_conv_dims,
253       bool* found_unknown_shapes);
254 
255   // Calculate the element count of an input/output tensor.
256   static int64 CalculateTensorElementCount(
257       const OpInfo::TensorProperties& tensor, bool* found_unknown_shapes);
258 
259   // Calculate the total size in bytes of an input/output tensor.
260   static int64 CalculateTensorSize(const OpInfo::TensorProperties& tensor,
261                                    bool* found_unknown_shapes);
262 
263   // Calculate the element count of the largest
264   // input of specified TensorFlow op.
265   static int64 CalculateLargestInputCount(const OpInfo& op_info,
266                                           bool* found_unknown_shapes);
267 
268   // Calculate the total size in bytes of the all
269   // the inputs of specified TensorFlow op.
270   static int64 CalculateInputSize(const OpInfo& op_info,
271                                   bool* found_unknown_shapes);
272 
273   // Same, but a vector format: one for each input.
274   static std::vector<int64> CalculateInputTensorSize(
275       const OpInfo& op_info, bool* found_unknown_shapes);
276 
277   // Calculate the total size in bytes of the all
278   // the outputs of specified TensorFlow op.
279   static int64 CalculateOutputSize(const OpInfo& op_info,
280                                    bool* found_unknown_shapes);
281 
282   // Same, but a vector format: one for each output.
283   static std::vector<int64> CalculateOutputTensorSize(
284       const OpInfo& op_info, bool* found_unknown_shapes);
285 
286   // For convolution and its grad ops.
287   static ConvolutionDimensions ConvolutionDimensionsFromInputs(
288       const TensorShapeProto& original_image_shape,
289       const TensorShapeProto& original_filter_shape, const OpInfo& op_info,
290       bool* found_unknown_shapes);
291 
292   // For Pooling, FusedBatchNorm, and their grad ops.
293   static ConvolutionDimensions OpDimensionsFromInputs(
294       const TensorShapeProto& original_image_shape, const OpInfo& op_info,
295       bool* found_unknown_shapes);
296 
297   // Helper to construct child operation contexts for the component operations
298   // of fused ops.
299   static OpContext FusedChildContext(
300       const OpContext& parent, const string& op_name,
301       const OpInfo::TensorProperties& output,
302       const std::vector<OpInfo::TensorProperties>& inputs);
303 
304   // Helper to construct tensor shapes.
305   static OpInfo::TensorProperties DescribeTensor(
306       DataType type, const std::vector<int64>& dims);
307 
308   // Helper method for building common case NodeCosts.
309   static Status PredictDefaultNodeCosts(const int64 num_compute_ops,
310                                         const OpContext& op_context,
311                                         bool* found_unknown_shapes,
312                                         NodeCosts* node_costs);
313 
314  protected:
315   std::map<string, int> elementwise_ops_;
316   typedef std::function<Status(const OpContext& op_context, NodeCosts*)>
317       CostImpl;
318   std::map<string, CostImpl> device_cost_impl_;
319   // If true, assume compute and memory overlap; hence, the op cost is max of
320   // compute_time and memory_time, instead of sum of those two.
321   bool compute_memory_overlap_;
322   std::set<string> persistent_ops_;
323 
324  private:
325   friend class OpLevelCostEstimatorTest;
326 };
327 
328 }  // end namespace grappler
329 }  // end namespace tensorflow
330 
331 #endif  // TENSORFLOW_CORE_GRAPPLER_COSTS_OP_LEVEL_COST_ESTIMATOR_H_
332