1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/lite/delegates/gpu/common/tasks/convolution_transposed_4x4.h"
17 
18 #include <string>
19 #include <utility>
20 #include <vector>
21 
22 #include "tensorflow/lite/delegates/gpu/common/task/work_group_picking.h"
23 
24 namespace tflite {
25 namespace gpu {
26 
27 namespace {
GetBestWeightsUploadType(const GpuInfo & gpu_info)28 ConvolutionTransposed4x4::WeightsUploadType GetBestWeightsUploadType(
29     const GpuInfo& gpu_info) {
30   ConvolutionTransposed4x4::WeightsUploadType weights_upload_type =
31       ConvolutionTransposed4x4::WeightsUploadType::GLOBAL_MEM;
32   if (gpu_info.IsPowerVR()) {
33     weights_upload_type =
34         ConvolutionTransposed4x4::WeightsUploadType::LOCAL_MEM_ASYNC;
35   } else if (gpu_info.IsNvidia() || gpu_info.IsIntel()) {
36     weights_upload_type =
37         ConvolutionTransposed4x4::WeightsUploadType::LOCAL_MEM_BY_THREADS;
38   } else if (gpu_info.IsAMD()) {
39     weights_upload_type =
40         ConvolutionTransposed4x4::WeightsUploadType::CONSTANT_MEM;
41   } else {
42     weights_upload_type =
43         ConvolutionTransposed4x4::WeightsUploadType::GLOBAL_MEM;
44   }
45   return weights_upload_type;
46 }
47 }  // namespace
48 
ConvolutionTransposed4x4(const OperationDef & definition,const GpuInfo & gpu_info)49 ConvolutionTransposed4x4::ConvolutionTransposed4x4(
50     const OperationDef& definition, const GpuInfo& gpu_info)
51     : GPUOperation(definition) {
52   work_group_size_ = int3(8, 4, 1);
53   if (gpu_info.IsApple()) {
54     work_group_launch_order_ = int3(2, 0, 1);
55   }
56 
57   if (gpu_info.IsApple()) {
58     weights_layout_ = WeightsLayout::kOICustomSpatialO4I4;
59   } else {
60     weights_layout_ = WeightsLayout::kOICustomSpatialI4O4;
61   }
62 
63   code_ = GenerateConvolutionTransposedCode(gpu_info, definition_,
64                                             GetBestWeightsUploadType(gpu_info));
65   if (definition_.precision == CalculationsPrecision::F16 &&
66       gpu_info.IsPowerVR()) {
67     compiler_options_.push_back(CompilerOptions::kClPowervrFp16);
68   }
69 }
70 
GenerateConvolutionTransposedCode(const GpuInfo & gpu_info,const OperationDef & op_def,WeightsUploadType weights_upload_type)71 std::string ConvolutionTransposed4x4::GenerateConvolutionTransposedCode(
72     const GpuInfo& gpu_info, const OperationDef& op_def,
73     WeightsUploadType weights_upload_type) {
74   auto src_desc = op_def.src_tensors[0];
75   src_desc.SetAddressMode(AddressMode::kZero);
76   if (op_def.IsBatchSupported()) {
77     src_desc.SetStateVar("BatchedWidth", "true");
78   }
79   AddSrcTensor("src_tensor", src_desc);
80 
81   auto dst_desc = op_def.dst_tensors[0];
82   if (op_def.IsBatchSupported()) {
83     dst_desc.SetStateVar("BatchedWidth", "true");
84   }
85   AddDstTensor("dst_tensor", dst_desc);
86 
87   if (op_def.src_tensors.size() == 2) {
88     // dynamic weights
89     BufferDescriptor desc;
90     desc.element_type = op_def.src_tensors[1].data_type;
91     desc.element_size = 4;
92     desc.memory_type =
93         weights_upload_type ==
94                 ConvolutionTransposed4x4::WeightsUploadType::CONSTANT_MEM
95             ? MemoryType::CONSTANT
96             : MemoryType::GLOBAL;
97     AddSrcBuffer("weights", desc);
98   }
99 
100   args_.AddInt("filter_offset");
101 
102   const bool need_local_mem =
103       weights_upload_type ==
104           ConvolutionTransposed4x4::WeightsUploadType::LOCAL_MEM_BY_THREADS ||
105       weights_upload_type ==
106           ConvolutionTransposed4x4::WeightsUploadType::LOCAL_MEM_ASYNC;
107 
108   const int wg_total_size =
109       work_group_size_.x * work_group_size_.y * work_group_size_.z;
110   const std::string barrier =
111       wg_total_size == 32 && gpu_info.IsWaveSizeEqualTo32()
112           ? "SIMD_LOCAL_MEM_BARRIER"
113           : "LOCAL_MEM_BARRIER";
114 
115   std::string c;
116   if (GetWeightsDescription().IsI4O4()) {
117     switch (op_def.precision) {
118       case CalculationsPrecision::F32:
119       case CalculationsPrecision::F16:
120         c += "#define CONV(R, SRC, F) \\\n";
121         c += "  R += SRC.x * weights_cache[F]; \\\n";
122         c += "  R += SRC.y * weights_cache[F + 1]; \\\n";
123         c += "  R += SRC.z * weights_cache[F + 2]; \\\n";
124         c += "  R += SRC.w * weights_cache[F + 3];   \n";
125         break;
126       case CalculationsPrecision::F32_F16:
127         c += "#define CONV(R, SRC, F) \\\n";
128         c += "  R += TO_ACCUM_TYPE(SRC.x * weights_cache[F] + SRC.y * "
129              "weights_cache[F + 1] + SRC.z * weights_cache[F + 2] + SRC.w * "
130              "weights_cache[F + 3]);\n";
131         break;
132     }
133   } else {
134     // O4I4
135     c += "#define CONV(R, SRC, F) \\\n";
136     c += "  R.x += dot(SRC, weights_cache[F]); \\\n";
137     c += "  R.y += dot(SRC, weights_cache[F + 1]); \\\n";
138     c += "  R.z += dot(SRC, weights_cache[F + 2]); \\\n";
139     c += "  R.w += dot(SRC, weights_cache[F + 3]);   \n";
140   }
141 
142   const std::string weights_space =
143       weights_upload_type ==
144               ConvolutionTransposed4x4::WeightsUploadType::CONSTANT_MEM
145           ? "__constant"
146           : "__global";
147 
148   const std::string pixel_stride =
149       op_def.IsBatchSupported() ? "args.dst_tensor.Batch()" : "1";
150   if (gpu_info.IsApiOpenCl()) {
151     c += "__attribute__((reqd_work_group_size(8, 4, 1)))\n";
152   }
153   c += "MAIN_FUNCTION($0) {\n";
154   std::string grid_coords[3];
155   int3 launch_remap;
156   launch_remap[work_group_launch_order_.x] = 0;
157   launch_remap[work_group_launch_order_.y] = 1;
158   launch_remap[work_group_launch_order_.z] = 2;
159   if (work_group_launch_order_[0] == 0) {
160     grid_coords[0] = "GLOBAL_ID_0";
161   } else {
162     grid_coords[0] = "(GROUP_ID_" + std::to_string(launch_remap[0]) +
163                      " * GROUP_SIZE_0 + LOCAL_ID_0);\n";
164   }
165   if (work_group_launch_order_[1] == 1) {
166     grid_coords[1] = "GLOBAL_ID_1";
167   } else {
168     grid_coords[1] = "(GROUP_ID_" + std::to_string(launch_remap[1]) +
169                      " * GROUP_SIZE_1 + LOCAL_ID_1);\n";
170   }
171   if (work_group_launch_order_[2] == 2) {
172     grid_coords[2] = "GLOBAL_ID_2";
173   } else {
174     grid_coords[2] = "(GROUP_ID_" + std::to_string(launch_remap[2]) +
175                      " * GROUP_SIZE_2 + LOCAL_ID_2);\n";
176   }
177   if (op_def.IsBatchSupported()) {
178     c += "  int linear_id = " + grid_coords[0] + ";\n";
179     c += "  int X0 = linear_id / args.dst_tensor.Batch();\n";
180     c += "  int B = linear_id % args.dst_tensor.Batch();\n";
181   }
182   c += "  int X = " + grid_coords[0] + ";\n";
183   c += "  int Y = " + grid_coords[1] + ";\n";
184   c += "  int Z = " + grid_coords[2] + ";\n";
185   if (!need_local_mem) {
186     if (op_def.IsBatchSupported()) {
187       c += "  if (X0 * 2 * args.dst_tensor.Batch() > args.dst_tensor.Width() "
188            "|| Y * 2 > args.dst_tensor.Height() || Z "
189            ">= args.dst_tensor.Slices()) return;\n";
190     } else {
191       c += "  if (X * 2 > args.dst_tensor.Width() || Y * 2 > "
192            "args.dst_tensor.Height() || Z >= args.dst_tensor.Slices()) "
193            "return;\n";
194     }
195   }
196   c += "  ACCUM_FLT4 r0 = INIT_ACCUM_FLT4(0.0f);\n";
197   c += "  ACCUM_FLT4 r1 = INIT_ACCUM_FLT4(0.0f);\n";
198   c += "  ACCUM_FLT4 r2 = INIT_ACCUM_FLT4(0.0f);\n";
199   c += "  ACCUM_FLT4 r3 = INIT_ACCUM_FLT4(0.0f);\n";
200   c += "  int f_offset = Z * args.filter_offset;\n";
201   if (need_local_mem) {
202     c += "  __local FLT4 weights_cache[64];\n";
203   }
204   if (weights_upload_type ==
205       ConvolutionTransposed4x4::WeightsUploadType::LOCAL_MEM_BY_THREADS) {
206     c += "  int local_id = LOCAL_ID_1 * 8 + LOCAL_ID_0;\n";
207   }
208   const std::string prev_x = "X - " + pixel_stride;
209   if (!src_desc.SupportsZeroClamp(Axis::WIDTH)) {
210     c += "  bool in_x0 = " + prev_x + " >= 0 && " + prev_x +
211          " < args.src_tensor.Width();\n";
212     c += "  bool in_x1 = X >= 0 && X < args.src_tensor.Width();\n";
213   }
214   if (!src_desc.SupportsZeroClamp(Axis::HEIGHT)) {
215     c += "  bool in_y0 = Y - 1 >= 0 && Y - 1 < args.src_tensor.Height();\n";
216     c += "  bool in_y1 = Y >= 0 && Y < args.src_tensor.Height();\n";
217   }
218   auto generate_check = [&](int x, int y) {
219     std::string check;
220     const std::vector<Axis> axes{Axis::WIDTH, Axis::HEIGHT};
221     const std::vector<std::string> names{"in_x" + std::to_string(x),
222                                          "in_y" + std::to_string(y)};
223     for (int i = 0; i < axes.size(); ++i) {
224       const auto& axis = axes[i];
225       if (src_desc.HasAxis(axis) && !src_desc.SupportsZeroClamp(axis)) {
226         if (!check.empty()) {
227           check += " && ";
228         }
229         check += names[i];
230       }
231     }
232     return check;
233   };
234   if (src_desc.IsLinear()) {
235     if (src_desc.ReturnsZeroForNegOneRead()) {
236       c += "  args.src_tensor.GetAddress(addr_0, " + prev_x + ", Y - 1, 0);\n";
237       c += "  args.src_tensor.GetAddress(addr_1, X, Y - 1, 0);\n";
238       c += "  args.src_tensor.GetAddress(addr_2, " + prev_x + ", Y, 0);\n";
239       c += "  args.src_tensor.GetAddress(addr_3, X, Y, 0);\n";
240       c += "  addr_0 = select(-1, addr_0, (in_x0 && in_y0));\n";
241       c += "  addr_1 = select(-1, addr_1, (in_x1 && in_y0));\n";
242       c += "  addr_2 = select(-1, addr_2, (in_x0 && in_y1));\n";
243       c += "  addr_3 = select(-1, addr_3, (in_x1 && in_y1));\n";
244       c += "  int dz_0 = select(0, args.src_tensor.SliceStride(), (in_x0 && "
245            "in_y0));\n";
246       c += "  int dz_1 = select(0, args.src_tensor.SliceStride(), (in_x1 && "
247            "in_y0));\n";
248       c += "  int dz_2 = select(0, args.src_tensor.SliceStride(), (in_x0 && "
249            "in_y1));\n";
250       c += "  int dz_3 = select(0, args.src_tensor.SliceStride(), (in_x1 && "
251            "in_y1));\n";
252     } else {
253       c += "  int xc0 = clamp(" + prev_x +
254            ", 0, args.src_tensor.Width() - 1);\n";
255       c += "  int xc1 = clamp(X, 0, args.src_tensor.Width() - 1);\n";
256       c += "  int yc0 = clamp(Y - 1, 0, args.src_tensor.Height() - 1);\n";
257       c += "  int yc1 = clamp(Y, 0, args.src_tensor.Height() - 1);\n";
258       c += "  args.src_tensor.GetAddress(addr_0, xc0, yc0, 0);\n";
259       c += "  args.src_tensor.GetAddress(addr_1, xc1, yc0, 0);\n";
260       c += "  args.src_tensor.GetAddress(addr_2, xc0, yc1, 0);\n";
261       c += "  args.src_tensor.GetAddress(addr_3, xc1, yc1, 0);\n";
262       c += "  int dz = args.src_tensor.SliceStride();\n";
263     }
264   }
265   auto read_src = [&](int x, int y) {
266     if (src_desc.IsLinear()) {
267       const std::string id = std::to_string(y * 2 + x);
268       const std::string addr = "addr_" + std::to_string(y * 2 + x);
269       if (src_desc.ReturnsZeroForNegOneRead()) {
270         return "args.src_tensor.Read(" + addr + "); " + addr + " += dz_" + id +
271                ";";
272       } else {
273         return "args.src_tensor.Read(" + addr + ") * INIT_FLT(in_x" +
274                std::to_string(x) + " && in_y" + std::to_string(y) + "); " +
275                addr + " += dz;";
276       }
277     } else {
278       std::string check = generate_check(x, y);
279       if (!check.empty()) {
280         check = " * INIT_FLT(" + check + ")";
281       }
282       return "args.src_tensor.Read(X + " + std::to_string(x - 1) + " * " +
283              pixel_stride + ", Y + " + std::to_string(y - 1) + ", s)" + check +
284              ";";
285     }
286   };
287   c += "  for (int s = 0; s < args.src_tensor.Slices(); ++s) {\n";
288   if (need_local_mem) {
289     c += "    " + barrier + ";\n";
290   }
291   if (weights_upload_type ==
292       ConvolutionTransposed4x4::WeightsUploadType::LOCAL_MEM_ASYNC) {
293     c += "    async_work_group_copy(weights_cache, "
294          "args.weights.GetPtr(f_offset), 64, "
295          "0);\n";
296   } else if (weights_upload_type ==
297              ConvolutionTransposed4x4::WeightsUploadType::
298                  LOCAL_MEM_BY_THREADS) {
299     c += "    weights_cache[local_id] = args.weights.Read(f_offset + "
300          "local_id);\n";
301     c += "    weights_cache[local_id + 32] = args.weights.Read(f_offset + "
302          "local_id + "
303          "32);\n";
304   } else {  // GLOBAL_MEM
305     c += "    " + weights_space +
306          " FLT4* weights_cache = args.weights.GetPtr(f_offset);\n";
307   }
308   c += "    FLT4 src0 = " + read_src(0, 0) + ";\n";
309   c += "    FLT4 src1 = " + read_src(1, 0) + ";\n";
310   c += "    FLT4 src2 = " + read_src(0, 1) + ";\n";
311   c += "    FLT4 src3 = " + read_src(1, 1) + ";\n";
312   c += "    f_offset += 64;\n";
313   if (need_local_mem) {
314     c += "    " + barrier + ";\n";
315   }
316   c += "    CONV(r0, src0, 0);\n";
317   c += "    CONV(r1, src0, 4);\n";
318   c += "    CONV(r2, src0, 8);\n";
319   c += "    CONV(r3, src0, 12);\n";
320   c += "    CONV(r0, src1, 16);\n";
321   c += "    CONV(r1, src1, 20);\n";
322   c += "    CONV(r2, src1, 24);\n";
323   c += "    CONV(r3, src1, 28);\n";
324   c += "    CONV(r0, src2, 32);\n";
325   c += "    CONV(r1, src2, 36);\n";
326   c += "    CONV(r2, src2, 40);\n";
327   c += "    CONV(r3, src2, 44);\n";
328   c += "    CONV(r0, src3, 48);\n";
329   c += "    CONV(r1, src3, 52);\n";
330   c += "    CONV(r2, src3, 56);\n";
331   c += "    CONV(r3, src3, 60);\n";
332   c += "  }\n";
333   c += "\n";
334   if (need_local_mem) {
335     if (op_def.IsBatchSupported()) {
336       c += "  if (X0 * 2 * args.dst_tensor.Batch() > args.dst_tensor.Width() "
337            "|| Y * 2 > args.dst_tensor.Height() || Z "
338            ">= args.dst_tensor.Slices()) return;\n";
339     } else {
340       c += "  if (X * 2 > args.dst_tensor.Width() || Y * 2 > "
341            "args.dst_tensor.Height() || Z >= args.dst_tensor.Slices()) "
342            "return;\n";
343     }
344   }
345   if (op_def.IsBatchSupported()) {
346     c += "  X = X0 * 2 * args.dst_tensor.Batch() + B - "
347          "args.dst_tensor.Batch();\n";
348   } else {
349     c += "  X = X * 2 - 1;\n";
350   }
351   c += "  Y = Y * 2 - 1;\n";
352   c += "\n";
353   c += "  FLT4 bias_val = args.biases.Read(Z);\n";
354   c += "  if (X >= 0 && Y >= 0) {\n";
355   c += "    FLT4 result = TO_FLT4(r0) + bias_val;\n";
356   c += "    args.dst_tensor.Write(result, X, Y, Z);\n";
357   c += "  }\n";
358   c +=
359       "  if (X + " + pixel_stride + " < args.dst_tensor.Width() && Y >= 0) {\n";
360   c += "    FLT4 result = TO_FLT4(r1) + bias_val;\n";
361   c += "    args.dst_tensor.Write(result, X + " + pixel_stride + ", Y, Z);\n";
362   c += "  }\n";
363   c += "  if (X >= 0 && Y + 1 < args.dst_tensor.Height()) {\n";
364   c += "    FLT4 result = TO_FLT4(r2) + bias_val;\n";
365   c += "    args.dst_tensor.Write(result, X, Y + 1, Z);\n";
366   c += "  }\n";
367   c += "  if (X + " + pixel_stride +
368        " < args.dst_tensor.Width() && Y + 1 < args.dst_tensor.Height()) {\n";
369   c += "    FLT4 result = TO_FLT4(r3) + bias_val;\n";
370   c += "    args.dst_tensor.Write(result, X + " + pixel_stride + ", Y+1, Z);\n";
371   c += "  }\n";
372   c += "}\n";
373   return c;
374 }
375 
BindArguments(ArgumentsBinder * args)376 absl::Status ConvolutionTransposed4x4::BindArguments(ArgumentsBinder* args) {
377   return args->SetInt("filter_offset", 4 * 16 * src_[0]->Slices());
378 }
379 
GetGridSize() const380 int3 ConvolutionTransposed4x4::GetGridSize() const {
381   const int grid_x = DivideRoundUp(dst_[0]->Width() + 2, 2) * dst_[0]->Batch();
382   const int grid_y = DivideRoundUp(dst_[0]->Height() + 2, 2);
383   const int grid_z = dst_[0]->Slices();
384   return int3(grid_x, grid_y, grid_z);
385 }
386 
GetSpatialWeightsRemap() const387 std::vector<int> ConvolutionTransposed4x4::GetSpatialWeightsRemap() const {
388   return std::vector<int>{10, 11, 14, 15, 8, 9, 12, 13, 2, 3, 6, 7, 0, 1, 4, 5};
389 }
390 
UploadWeights(const tflite::gpu::Tensor<OHWI,DataType::FLOAT32> & weights,WeightsUploadType weights_upload_type)391 void ConvolutionTransposed4x4::UploadWeights(
392     const tflite::gpu::Tensor<OHWI, DataType::FLOAT32>& weights,
393     WeightsUploadType weights_upload_type) {
394   const int flt_count =
395       GetTotalElementsCountForLayout(GetWeightsDescription(), weights.shape);
396 
397   DataType weights_type = definition_.precision == CalculationsPrecision::F32
398                               ? DataType::FLOAT32
399                               : DataType::FLOAT16;
400 
401   BufferDescriptor desc;
402   desc.element_type = weights_type;
403   desc.element_size = 4;
404   desc.memory_type =
405       weights_upload_type ==
406               ConvolutionTransposed4x4::WeightsUploadType::CONSTANT_MEM
407           ? MemoryType::CONSTANT
408           : MemoryType::GLOBAL;
409   desc.size = flt_count * SizeOf(desc.element_type);
410   desc.data.resize(desc.size);
411 
412   RearrangeWeights(weights, GetWeightsDescription(), weights_type,
413                    absl::MakeSpan(desc.data));
414   args_.AddObject("weights",
415                   absl::make_unique<BufferDescriptor>(std::move(desc)));
416 }
417 
IsConvolutionTransposed4x4Supported(const OperationDef & definition,const ConvolutionTransposedAttributes & attr)418 bool IsConvolutionTransposed4x4Supported(
419     const OperationDef& definition,
420     const ConvolutionTransposedAttributes& attr) {
421   return attr.weights.shape.w == 4 && attr.weights.shape.h == 4 &&
422          attr.stride.w == 2 && attr.stride.h == 2 &&
423          attr.padding.prepended.w == 1 && attr.padding.prepended.h == 1;
424 }
425 
CreateConvolutionTransposed4x4(const GpuInfo & gpu_info,const OperationDef & definition,const ConvolutionTransposedAttributes & attr)426 ConvolutionTransposed4x4 CreateConvolutionTransposed4x4(
427     const GpuInfo& gpu_info, const OperationDef& definition,
428     const ConvolutionTransposedAttributes& attr) {
429   ConvolutionTransposed4x4 result(definition, gpu_info);
430   result.UploadWeights(attr.weights, GetBestWeightsUploadType(gpu_info));
431 
432   TensorLinearDescriptor desc;
433   desc.storage_type = gpu_info.IsApple() || !gpu_info.SupportsImages()
434                           ? LinearStorageType::BUFFER
435                           : LinearStorageType::TEXTURE_2D;
436   desc.element_type = definition.GetDataType();
437   desc.UploadLinearData(attr.bias);
438   result.args_.AddObject(
439       "biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
440   return result;
441 }
442 
CreateConvolutionTransposed4x4DynamicWeights(const GpuInfo & gpu_info,const OperationDef & definition,const ConvolutionTransposedAttributes & attr)443 ConvolutionTransposed4x4 CreateConvolutionTransposed4x4DynamicWeights(
444     const GpuInfo& gpu_info, const OperationDef& definition,
445     const ConvolutionTransposedAttributes& attr) {
446   OperationDef new_def = definition;
447   new_def.src_tensors = {
448       definition.src_tensors[0]};  // leaving only src_tensor def, weights defs
449                                    // will be added later
450   const DataType weights_type = definition.GetDataType();
451   // add 1 src_tensor(buffer) for weights
452   new_def.src_tensors.push_back(
453       {weights_type, TensorStorageType::BUFFER, Layout::HWC});
454 
455   ConvolutionTransposed4x4 result(new_def, gpu_info);
456 
457   TensorLinearDescriptor desc;
458   desc.storage_type = gpu_info.IsApple() || !gpu_info.SupportsImages()
459                           ? LinearStorageType::BUFFER
460                           : LinearStorageType::TEXTURE_2D;
461   desc.element_type = new_def.GetDataType();
462   desc.UploadLinearData(attr.bias);
463   result.args_.AddObject(
464       "biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
465   return result;
466 }
467 
468 }  // namespace gpu
469 }  // namespace tflite
470