1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/lite/delegates/gpu/common/tasks/convolution_transposed_thin.h"
17 
18 #include <string>
19 #include <utility>
20 #include <vector>
21 
22 #include "tensorflow/lite/delegates/gpu/common/task/work_group_picking.h"
23 
24 namespace tflite {
25 namespace gpu {
26 
ConvolutionTransposedThin(const OperationDef & definition,const ConvolutionTransposedAttributes & attr,const GpuInfo & gpu_info)27 ConvolutionTransposedThin::ConvolutionTransposedThin(
28     const OperationDef& definition, const ConvolutionTransposedAttributes& attr,
29     const GpuInfo& gpu_info)
30     : GPUOperation(definition) {
31   code_ = GenerateConvolutionTransposedCode(
32       definition_, DivideRoundUp(attr.weights.shape.i, 4), attr.weights.shape.o,
33       int2(attr.weights.shape.w, attr.weights.shape.h));
34   if (definition_.precision == CalculationsPrecision::F16 &&
35       gpu_info.IsAdreno() && gpu_info.adreno_info.IsAdreno3xx()) {
36     compiler_options_.push_back(CompilerOptions::kAdrenoFullSimd);
37   }
38 }
39 
ConvolutionTransposedThin(ConvolutionTransposedThin && operation)40 ConvolutionTransposedThin::ConvolutionTransposedThin(
41     ConvolutionTransposedThin&& operation)
42     : GPUOperation(std::move(operation)) {}
43 
operator =(ConvolutionTransposedThin && operation)44 ConvolutionTransposedThin& ConvolutionTransposedThin::operator=(
45     ConvolutionTransposedThin&& operation) {
46   if (this != &operation) {
47     GPUOperation::operator=(std::move(operation));
48   }
49   return *this;
50 }
51 
GenerateConvolutionTransposedCode(const OperationDef & op_def,int src_depth,int dst_channels,const int2 & kernel_size)52 std::string ConvolutionTransposedThin::GenerateConvolutionTransposedCode(
53     const OperationDef& op_def, int src_depth, int dst_channels,
54     const int2& kernel_size) {
55   AddSrcTensor("src_tensor", op_def.src_tensors[0]);
56   AddDstTensor("dst_tensor", op_def.dst_tensors[0]);
57 
58   const std::string channel_x = dst_channels == 1 ? "" : ".x";
59   const std::vector<std::string> postfix = {channel_x, ".y", ".z", ".w"};
60   const std::vector<std::string> channel = {".x", ".y", ".z", ".w"};
61 
62   const std::string type_postfix =
63       dst_channels == 1 ? "" : std::to_string(dst_channels);
64 
65   std::string accum_type;
66 
67   switch (op_def.precision) {
68     case CalculationsPrecision::F32:
69     case CalculationsPrecision::F32_F16:
70       accum_type = "float" + type_postfix;
71       break;
72     case CalculationsPrecision::F16:
73       accum_type = "half" + type_postfix;
74       break;
75   }
76 
77   std::string c;
78   c += "MAIN_FUNCTION($0) {\n";
79   if (op_def.IsBatchSupported()) {
80     c += "  int linear_id = GLOBAL_ID_0;\n";
81     c += "  int X = linear_id / args.dst_tensor.Batch();\n";
82     c += "  int B = linear_id % args.dst_tensor.Batch();\n";
83     c += "  args.dst_tensor.SetBatchRef(B);\n";
84     c += "  args.src_tensor.SetBatchRef(B);\n";
85   } else {
86     c += "  int X = GLOBAL_ID_0;\n";
87   }
88   c += "  int Y = GLOBAL_ID_1;\n";
89   c += "  if (X >= args.src_tensor.Width() || Y >= args.src_tensor.Height()) "
90        "return;\n";
91   c += "  " + accum_type + " r[" + std::to_string(kernel_size.y) + "][" +
92        std::to_string(kernel_size.x) + "];\n";
93   c += "  {\n";
94   c += "  FLT4 src = args.src_tensor.Read(X, Y, 0);\n";
95   int index = 0;
96   for (int y = 0; y < kernel_size.y; ++y) {
97     for (int x = 0; x < kernel_size.x; ++x) {
98       std::string r_s =
99           "  r[" + std::to_string(y) + "][" + std::to_string(x) + "]";
100       for (int d = 0; d < dst_channels; ++d) {
101         c += r_s + postfix[d] + " = dot(src, args.weights.Read(" +
102              std::to_string(index) + "));\n";
103         index++;
104       }
105     }
106   }
107   c += "  }\n";
108   for (int i = 1; i < src_depth; ++i) {
109     c += "  if (X > " + std::to_string(-i) +
110          ") {  // always true, to reduce registers usage\n";
111     c +=
112         "  FLT4 src = args.src_tensor.Read(X, Y, " + std::to_string(i) + ");\n";
113     for (int y = 0; y < kernel_size.y; ++y) {
114       for (int x = 0; x < kernel_size.x; ++x) {
115         std::string r_s =
116             "  r[" + std::to_string(y) + "][" + std::to_string(x) + "]";
117         for (int d = 0; d < dst_channels; ++d) {
118           c += r_s + postfix[d] + " += dot(src, args.weights.Read(" +
119                std::to_string(index) + "));\n";
120           index++;
121         }
122       }
123     }
124     c += "  }\n";
125   }
126   c += "  X *= " + std::to_string(kernel_size.x) + ";\n";
127   c += "  Y *= " + std::to_string(kernel_size.y) + ";\n";
128   for (int y = 0; y < kernel_size.y; ++y) {
129     for (int x = 0; x < kernel_size.x; ++x) {
130       const std::string x_coord = "X + " + std::to_string(x);
131       const std::string y_coord = "Y + " + std::to_string(y);
132       c += "  if (" + x_coord + " < args.dst_tensor.Width() && " + y_coord +
133            " < args.dst_tensor.Height()) {\n";
134       c += "    FLT4 result = args.weights.Read(" + std::to_string(index) +
135            ");\n";
136       for (int d = 0; d < dst_channels; ++d) {
137         c += "    result" + channel[d] + " += r[" + std::to_string(y) + "][" +
138              std::to_string(x) + "]" + postfix[d] + ";\n";
139       }
140       c += "    args.dst_tensor.Write(result, " + x_coord + ", " + y_coord +
141            ", 0);\n";
142       c += "  }\n";
143     }
144   }
145   c += "}\n";
146 
147   return c;
148 }
149 
GetGridSize() const150 int3 ConvolutionTransposedThin::GetGridSize() const {
151   const int grid_x = src_[0]->Width() * dst_[0]->Batch();
152   const int grid_y = src_[0]->Height();
153   const int grid_z = 1;
154   return int3(grid_x, grid_y, grid_z);
155 }
156 
IsConvolutionTransposedThinSupported(const ConvolutionTransposedAttributes & attr)157 bool IsConvolutionTransposedThinSupported(
158     const ConvolutionTransposedAttributes& attr) {
159   return attr.weights.shape.o <= 4 && attr.weights.shape.w == attr.stride.w &&
160          attr.weights.shape.h == attr.stride.h &&
161          attr.padding.prepended.w == 0 && attr.padding.prepended.h == 0 &&
162          attr.padding.appended.w == 0 && attr.padding.appended.h == 0;
163 }
164 
CreateConvolutionTransposedThin(const GpuInfo & gpu_info,const OperationDef & definition,const ConvolutionTransposedAttributes & attr)165 ConvolutionTransposedThin CreateConvolutionTransposedThin(
166     const GpuInfo& gpu_info, const OperationDef& definition,
167     const ConvolutionTransposedAttributes& attr) {
168   ConvolutionTransposedThin result(definition, attr, gpu_info);
169   result.UploadData(attr.weights, attr.bias);
170   return result;
171 }
172 
173 }  // namespace gpu
174 }  // namespace tflite
175