1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/lite/delegates/gpu/common/tasks/conv_buffer_1x1.h"
17 
18 #include <array>
19 #include <string>
20 #include <utility>
21 
22 #include "tensorflow/lite/delegates/gpu/common/status.h"
23 #include "tensorflow/lite/delegates/gpu/common/task/util.h"
24 #include "tensorflow/lite/delegates/gpu/common/task/work_group_picking.h"
25 
26 namespace tflite {
27 namespace gpu {
28 namespace {
29 
30 // element_size must be 1, 2 or 4
31 // 1 - is FLT4
32 // 2 - is FLT8
33 // 4 - is FLT16
34 // This function generates code for arithmetic part of convolution
GetComputationPart(const int3 & block_size,int element_size,CalculationsPrecision precision)35 std::string GetComputationPart(const int3& block_size, int element_size,
36                                CalculationsPrecision precision) {
37   const std::string hexes[16] = {"0", "1", "2", "3", "4", "5", "6", "7",
38                                  "8", "9", "a", "b", "c", "d", "e", "f"};
39   std::string c;
40   for (int z = 0; z < block_size.z; ++z) {
41     const std::string z_s = std::to_string(z);
42     c += "    FLT16 W" + z_s + " = weights_cache[" + z_s + "];\n";
43     for (int y = 0; y < block_size.y; ++y) {
44       for (int x = 0; x < block_size.x; ++x) {
45         std::string s_index = std::to_string(y * block_size.x + x);
46         for (int e = 0; e < element_size; ++e) {
47           std::string r_index =
48               z_s + std::to_string(y) + std::to_string(x * element_size + e);
49           const std::string f0 = "W" + z_s + ".s0123";
50           const std::string f1 = "W" + z_s + ".s4567";
51           const std::string f2 = "W" + z_s + ".s89ab";
52           const std::string f3 = "W" + z_s + ".scdef";
53           switch (precision) {
54             case CalculationsPrecision::F32:
55             case CalculationsPrecision::F16:
56               c += "    r" + r_index + " += " + f0 + " * s" + s_index + ".s" +
57                    hexes[e * 4 + 0] + ";\n";
58               c += "    r" + r_index + " += " + f1 + " * s" + s_index + ".s" +
59                    hexes[e * 4 + 1] + ";\n";
60               c += "    r" + r_index + " += " + f2 + " * s" + s_index + ".s" +
61                    hexes[e * 4 + 2] + ";\n";
62               c += "    r" + r_index + " += " + f3 + " * s" + s_index + ".s" +
63                    hexes[e * 4 + 3] + ";\n";
64               break;
65             case CalculationsPrecision::F32_F16:
66               c += "    r" + r_index + " += convert_float4(" + f0 + " * s" +
67                    s_index + ".s" + hexes[e * 4 + 0] + " + " + f1 + " * s" +
68                    s_index + ".s" + hexes[e * 4 + 1] + " + " + f2 + " * s" +
69                    s_index + ".s" + hexes[e * 4 + 2] + " + " + f3 + " * s" +
70                    s_index + ".s" + hexes[e * 4 + 3] + ");\n";
71               break;
72           }
73         }
74       }
75     }
76   }
77   return c;
78 }
79 
GetBestParams(const GpuInfo & gpu_info,const OperationDef & definition,const BHWC & shape,int src_depth,int dst_depth)80 ConvBuffer1x1::ConvParams GetBestParams(const GpuInfo& gpu_info,
81                                         const OperationDef& definition,
82                                         const BHWC& shape, int src_depth,
83                                         int dst_depth) {
84   ConvBuffer1x1::ConvParams conv_params;
85   conv_params.element_size = 4;
86   conv_params.block_size = int3(1, 1, 1);
87   if (!gpu_info.IsMali()) {
88     return conv_params;
89   }
90   bool can_use_flt8 = (shape.w * shape.b) % 2 == 0 &&
91                       definition.precision != CalculationsPrecision::F32;
92   bool is_midgard = gpu_info.IsMali() && gpu_info.mali_info.IsMidgard();
93   if (is_midgard) {
94     if (can_use_flt8) {
95       conv_params.element_size = 8;
96     }
97     if (definition.precision == CalculationsPrecision::F16 || !can_use_flt8) {
98       conv_params.block_size.x = 2;
99     }
100     return conv_params;
101   }
102 
103   int task_size = shape.w * shape.b * shape.h * dst_depth;
104   int block_size =
105       GetRecommendedBlockSizeForConv(gpu_info, definition.precision, task_size);
106 
107   if (!can_use_flt8 && block_size > 4) {
108     block_size = 4;
109   }
110 
111   if (can_use_flt8 && block_size >= 2) {
112     conv_params.element_size = 8;
113     block_size /= 2;
114   }
115   if (block_size == 4) {
116     conv_params.block_size.x = 2;
117     if (definition.precision == CalculationsPrecision::F32 && dst_depth < 32) {
118       conv_params.block_size.y = 2;
119     } else {
120       conv_params.block_size.z = 2;
121     }
122   } else if (block_size == 2) {
123     if (dst_depth >= 32) {
124       conv_params.block_size.z = 2;
125     } else {
126       conv_params.block_size.x = 2;
127     }
128   }
129 
130   return conv_params;
131 }
132 
GetBestParams(const GpuInfo & gpu_info,const OperationDef & definition,int src_depth,int dst_depth)133 ConvBuffer1x1::ConvParams GetBestParams(const GpuInfo& gpu_info,
134                                         const OperationDef& definition,
135                                         int src_depth, int dst_depth) {
136   ConvBuffer1x1::ConvParams conv_params;
137   conv_params.element_size = 4;
138   conv_params.block_size = int3(1, 1, 1);
139   if (gpu_info.IsMali() && definition.precision == CalculationsPrecision::F16 &&
140       gpu_info.GetComputeUnitsCount() <= 4) {
141     conv_params.block_size.x *= 2;
142   }
143   return conv_params;
144 }
145 
146 }  // namespace
147 
ConvBuffer1x1(const OperationDef & definition,const ConvParams & conv_params)148 ConvBuffer1x1::ConvBuffer1x1(const OperationDef& definition,
149                              const ConvParams& conv_params)
150     : GPUOperation(definition), conv_params_(conv_params) {
151   code_ = GenerateConvBuffer1x1(definition_, conv_params_, &args_);
152   work_group_size_ = int3(2, 4, 1);
153 }
154 
ConvBuffer1x1(ConvBuffer1x1 && operation)155 ConvBuffer1x1::ConvBuffer1x1(ConvBuffer1x1&& operation)
156     : GPUOperation(std::move(operation)),
157       conv_params_(std::move(operation.conv_params_)) {}
158 
operator =(ConvBuffer1x1 && operation)159 ConvBuffer1x1& ConvBuffer1x1::operator=(ConvBuffer1x1&& operation) {
160   if (this != &operation) {
161     std::swap(conv_params_, operation.conv_params_);
162     GPUOperation::operator=(std::move(operation));
163   }
164   return *this;
165 }
166 
GenerateConvBuffer1x1(const OperationDef & op_def,const ConvBuffer1x1::ConvParams & conv_params,Arguments * args)167 std::string ConvBuffer1x1::GenerateConvBuffer1x1(
168     const OperationDef& op_def, const ConvBuffer1x1::ConvParams& conv_params,
169     Arguments* args) {
170   auto src_desc = op_def.src_tensors[0];
171   if (op_def.IsBatchSupported()) {
172     src_desc.SetStateVar("BatchedWidth", "true");
173   }
174   if (conv_params_.element_size == 8) {
175     src_desc.SetStateVar("ElementsX2", "true");
176   } else if (conv_params_.element_size == 16) {
177     src_desc.SetStateVar("ElementsX4", "true");
178   }
179   AddSrcTensor("src_tensor", src_desc);
180   if (op_def.src_tensors.size() == 2) {
181     // dynamic weights
182     BufferDescriptor desc;
183     desc.element_type = op_def.src_tensors[1].data_type;
184     desc.element_size = 16;
185     desc.memory_type = MemoryType::GLOBAL;
186     AddSrcBuffer("weights", desc);
187   }
188 
189   auto dst_desc = op_def.dst_tensors[0];
190   if (op_def.IsBatchSupported()) {
191     dst_desc.SetStateVar("BatchedWidth", "true");
192   }
193   AddDstTensor("dst_tensor", dst_desc);
194 
195   std::string c;
196   switch (op_def.precision) {
197     case CalculationsPrecision::F32:
198       c += "#define FLT8 float8\n";
199       c += "#define FLT16 float16\n";
200       break;
201     case CalculationsPrecision::F32_F16:
202     case CalculationsPrecision::F16:
203       c += "#define FLT8 half8\n";
204       c += "#define FLT16 half16\n";
205       break;
206   }
207 
208   const int3 block_size = conv_params.block_size;
209   const int element_size = conv_params.element_size / 4;
210 
211   c += "__kernel void main_function(\n";
212   c += "$0) {\n";
213   c += "  int X = get_global_id(0) * " +
214        std::to_string(block_size.x * element_size) + ";\n";
215   c += "  int X_SRC = get_global_id(0) * " + std::to_string(block_size.x) +
216        ";\n";
217   c += "  int Y = get_global_id(1) * " + std::to_string(block_size.y) + ";\n";
218   c += "  int Z = get_global_id(2) * " + std::to_string(block_size.z) + ";\n";
219   c += "  if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() || "
220        "Z >= args.dst_tensor.Slices()) return;\n";
221   if (conv_params.different_weights_for_height) {
222     c += "  __global FLT16* weights_cache = args.weights.GetPtr() + (Z * "
223          "args.src_tensor.Height() + "
224          "Y * " +
225          std::to_string(block_size.z) +
226          ") * "
227          "args.src_tensor.Slices();\n";
228   } else {
229     c += "  __global FLT16* weights_cache = args.weights.GetPtr() + Z * "
230          "args.src_tensor.Slices();\n";
231   }
232   for (int z = 0; z < block_size.z; ++z) {
233     const std::string z_s = std::to_string(z);
234     c += "  ACCUM_FLT4 bias_val_" + z_s +
235          " = TO_ACCUM_TYPE(args.biases.Read(Z + " + z_s + "));\n";
236     for (int y = 0; y < block_size.y; ++y) {
237       for (int x = 0; x < block_size.x * element_size; ++x) {
238         c += "  ACCUM_FLT4 r" + z_s + std::to_string(y) + std::to_string(x) +
239              " = bias_val_" + z_s + ";\n";
240       }
241     }
242   }
243   for (int x = 0; x < block_size.x; ++x) {
244     std::string x_s = std::to_string(x);
245     c += "  int xc" + x_s + " = min(X_SRC + " + std::to_string(x) +
246          ", args.src_tensor.Width() - 1);\n";
247   }
248   for (int y = 0; y < block_size.y; ++y) {
249     std::string y_s = std::to_string(y);
250     c += "  int yc" + y_s + " = min(Y + " + y_s +
251          ", args.src_tensor.Height() - 1);\n";
252   }
253   for (int y = 0; y < block_size.y; ++y) {
254     std::string y_s = std::to_string(y);
255     for (int x = 0; x < block_size.x; ++x) {
256       std::string x_s = std::to_string(x);
257       std::string i_s = std::to_string(y * block_size.x + x);
258       c += "  int src_addr_" + i_s + " = (yc" + y_s +
259            ") * args.src_tensor.Width() + (xc" + x_s + ");\n";
260     }
261   }
262   c += "  for (int s = 0; s < args.src_tensor.Slices(); ++s) {\n";
263   for (int y = 0; y < block_size.y; ++y) {
264     std::string y_s = std::to_string(y);
265     for (int x = 0; x < block_size.x; ++x) {
266       std::string x_s = std::to_string(x);
267       std::string i_s = std::to_string(y * block_size.x + x);
268       c += "    FLT" + std::to_string(element_size * 4) + " s" + i_s +
269            " = args.src_tensor.Read(src_addr_" + i_s + ");\n";
270     }
271   }
272   c += GetComputationPart(block_size, element_size, op_def.precision);
273   for (int i = 0; i < block_size.x * block_size.y; ++i) {
274     std::string i_s = std::to_string(i);
275     c += "    src_addr_" + i_s + " += args.src_tensor.SliceStride();\n";
276   }
277   c += "    weights_cache += " + std::to_string(block_size.z) + ";\n";
278   c += "  }\n";  // SRC_SLICES
279 
280   for (int z = 0; z < block_size.z; ++z) {
281     const std::string z_s = std::to_string(z);
282     if (z != 0) {
283       c += "  if (Z + " + z_s + " >= args.dst_tensor.Slices()) return;\n";
284     }
285     for (int y = 0; y < block_size.y; ++y) {
286       const std::string y_s = std::to_string(y);
287       for (int x = 0; x < block_size.x * element_size; ++x) {
288         const std::string x_s = std::to_string(x);
289         c += "  if (X + " + x_s + " < args.dst_tensor.Width() && Y + " + y_s +
290              " < args.dst_tensor.Height()) {\n";
291         c += "    FLT4 res = TO_FLT4(r" + z_s + y_s + x_s + ");\n";
292         c += "    args.dst_tensor.Write(res, X + " + x_s + ", Y + " + y_s +
293              ", Z + " + z_s + ");\n";
294         c += "  }\n";
295       }
296     }
297   }
298   c += "}\n";
299   return c;
300 }
301 
GetGridSize() const302 int3 ConvBuffer1x1::GetGridSize() const {
303   const int dst_width_elements = DivideRoundUp(
304       dst_[0]->Width() * dst_[0]->Batch(), (conv_params_.element_size / 4));
305   const int grid_x =
306       DivideRoundUp(dst_width_elements, conv_params_.block_size.x);
307   const int grid_y =
308       DivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y);
309   const int grid_z =
310       DivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.z);
311   return int3(grid_x, grid_y, grid_z);
312 }
313 
GetPossibleKernelWorkGroups(TuningType tuning_type,const GpuInfo & gpu_info,const KernelInfo & kernel_info,std::vector<int3> * work_groups) const314 void ConvBuffer1x1::GetPossibleKernelWorkGroups(
315     TuningType tuning_type, const GpuInfo& gpu_info,
316     const KernelInfo& kernel_info, std::vector<int3>* work_groups) const {
317   GetPossibleWorkGroupsConv(tuning_type, gpu_info, kernel_info, grid_size_,
318                             work_groups);
319 }
320 
IsConvBuffer1x1Supported(const OperationDef & definition,const Convolution2DAttributes & attr)321 bool IsConvBuffer1x1Supported(const OperationDef& definition,
322                               const Convolution2DAttributes& attr) {
323   auto src_storage_type = definition.src_tensors[0].storage_type;
324   return src_storage_type == TensorStorageType::BUFFER &&
325          attr.weights.shape.w == 1 && attr.weights.shape.h == 1 &&
326          attr.dilations.w == 1 && attr.dilations.h == 1 &&
327          attr.strides.w == 1 && attr.strides.h == 1 &&
328          attr.padding.prepended.w == 0 && attr.padding.prepended.h == 0 &&
329          attr.padding.appended.w == 0 && attr.padding.appended.h == 0;
330 }
331 
IsConvBuffer1x1Supported(const OperationDef & definition,const BHWC & weights_shape,const Convolution2DAttributes & attr)332 bool IsConvBuffer1x1Supported(const OperationDef& definition,
333                               const BHWC& weights_shape,
334                               const Convolution2DAttributes& attr) {
335   auto src_storage_type = definition.src_tensors[0].storage_type;
336   return src_storage_type == TensorStorageType::BUFFER &&
337          weights_shape.w == 1 && weights_shape.h == 1 &&
338          attr.dilations.w == 1 && attr.dilations.h == 1 &&
339          attr.strides.w == 1 && attr.strides.h == 1 &&
340          attr.padding.prepended.w == 0 && attr.padding.prepended.h == 0 &&
341          attr.padding.appended.w == 0 && attr.padding.appended.h == 0;
342 }
343 
CreateConvBuffer1x1(const GpuInfo & gpu_info,const OperationDef & definition,const Convolution2DAttributes & attr,const BHWC * shape)344 ConvBuffer1x1 CreateConvBuffer1x1(const GpuInfo& gpu_info,
345                                   const OperationDef& definition,
346                                   const Convolution2DAttributes& attr,
347                                   const BHWC* shape) {
348   const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
349   const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
350   ConvBuffer1x1::ConvParams conv_params;
351   if (shape) {
352     conv_params =
353         GetBestParams(gpu_info, definition, *shape, src_depth, dst_depth);
354   } else {
355     conv_params = GetBestParams(gpu_info, definition, src_depth, dst_depth);
356   }
357   ConvBuffer1x1 result(definition, conv_params);
358   result.UploadData(attr.weights, attr.bias);
359   return result;
360 }
361 
CreateConvBuffer1x1(const GpuInfo & gpu_info,const OperationDef & definition,const FullyConnectedAttributes & attr,const BHWC * shape)362 ConvBuffer1x1 CreateConvBuffer1x1(const GpuInfo& gpu_info,
363                                   const OperationDef& definition,
364                                   const FullyConnectedAttributes& attr,
365                                   const BHWC* shape) {
366   const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
367   const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
368   ConvBuffer1x1::ConvParams conv_params;
369   if (shape) {
370     conv_params =
371         GetBestParams(gpu_info, definition, *shape, src_depth, dst_depth);
372   } else {
373     conv_params = GetBestParams(gpu_info, definition, src_depth, dst_depth);
374   }
375   conv_params.block_size.x *= conv_params.block_size.y;
376   conv_params.block_size.y = 1;
377   ConvBuffer1x1 result(definition, conv_params);
378   result.UploadData(attr.weights, attr.bias);
379   return result;
380 }
381 
CreateConvBuffer1x1Wino4x4To6x6(const GpuInfo & gpu_info,const OperationDef & definition,const Convolution2DAttributes & attr,const BHWC * shape)382 ConvBuffer1x1 CreateConvBuffer1x1Wino4x4To6x6(
383     const GpuInfo& gpu_info, const OperationDef& definition,
384     const Convolution2DAttributes& attr, const BHWC* shape) {
385   const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
386   const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
387   ConvBuffer1x1::ConvParams conv_params;
388   if (shape) {
389     conv_params =
390         GetBestParams(gpu_info, definition, *shape, src_depth, dst_depth);
391   } else {
392     conv_params = GetBestParams(gpu_info, definition, src_depth, dst_depth);
393   }
394   conv_params.block_size.x *= conv_params.block_size.y;
395   conv_params.block_size.y = 1;
396   conv_params.different_weights_for_height = true;
397   ConvBuffer1x1 result(definition, conv_params);
398   result.UploadDataForWinograd4x4To6x6(attr.weights);
399   return result;
400 }
401 
CreateConvBuffer1x1DynamicWeights(const GpuInfo & gpu_info,const OperationDef & definition,const Convolution2DAttributes & attr,const BHWC & weights_shape,const BHWC * dst_shape)402 ConvBuffer1x1 CreateConvBuffer1x1DynamicWeights(
403     const GpuInfo& gpu_info, const OperationDef& definition,
404     const Convolution2DAttributes& attr, const BHWC& weights_shape,
405     const BHWC* dst_shape) {
406   const int dst_depth = DivideRoundUp(weights_shape.b, 4);
407   const int src_depth = DivideRoundUp(weights_shape.c, 4);
408   ConvBuffer1x1::ConvParams conv_params;
409   if (dst_shape) {
410     conv_params =
411         GetBestParams(gpu_info, definition, *dst_shape, src_depth, dst_depth);
412   } else {
413     conv_params = GetBestParams(gpu_info, definition, src_depth, dst_depth);
414   }
415   ConvBuffer1x1 result(definition, conv_params);
416   result.UploadBiases(attr.bias);
417   return result;
418 }
419 
420 }  // namespace gpu
421 }  // namespace tflite
422