1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/lite/delegates/gpu/common/tasks/convolution_transposed_3x3_thin.h"
17 
18 #include <string>
19 #include <utility>
20 #include <vector>
21 
22 #include "tensorflow/lite/delegates/gpu/common/task/work_group_picking.h"
23 
24 namespace tflite {
25 namespace gpu {
26 
ConvolutionTransposed3x3Thin(const GpuInfo & gpu_info,const OperationDef & definition,const ConvolutionTransposedAttributes & attr)27 ConvolutionTransposed3x3Thin::ConvolutionTransposed3x3Thin(
28     const GpuInfo& gpu_info, const OperationDef& definition,
29     const ConvolutionTransposedAttributes& attr)
30     : GPUOperation(definition) {
31   if (gpu_info.IsApple()) {
32     weights_layout_ = WeightsLayout::kOICustomSpatialO4I4;
33   } else {
34     weights_layout_ = WeightsLayout::kOICustomSpatialI4O4;
35   }
36   code_ = GenerateConvolutionTransposedCode(
37       definition_, DivideRoundUp(attr.weights.shape.i, 4),
38       DivideRoundUp(attr.weights.shape.o, 4));
39 }
40 
GenerateConvolutionTransposedCode(const OperationDef & op_def,int src_depth,int dst_depth)41 std::string ConvolutionTransposed3x3Thin::GenerateConvolutionTransposedCode(
42     const OperationDef& op_def, int src_depth, int dst_depth) {
43   auto src_desc = op_def.src_tensors[0];
44   src_desc.SetAddressMode(AddressMode::kZero);
45   AddSrcTensor("src_tensor", src_desc);
46   AddDstTensor("dst_tensor", op_def.dst_tensors[0]);
47 
48   if (op_def.src_tensors.size() == 2) {
49     // dynamic weights
50     BufferDescriptor desc;
51     desc.element_type = op_def.src_tensors[1].data_type;
52     desc.element_size = 4;
53     desc.memory_type = MemoryType::CONSTANT;
54     AddSrcBuffer("weights", desc);
55   }
56 
57   const auto src_tensor_type = op_def.src_tensors[0].storage_type;
58 
59   std::string c;
60 
61   if (GetWeightsDescription().IsI4O4()) {
62     switch (op_def.precision) {
63       case CalculationsPrecision::F32:
64       case CalculationsPrecision::F16:
65         c += "#define CONV(R, SRC, F, i) \\\n";
66         c += "  R += SRC.x * F[i + 0]; \\\n";
67         c += "  R += SRC.y * F[i + 1]; \\\n";
68         c += "  R += SRC.z * F[i + 2]; \\\n";
69         c += "  R += SRC.w * F[i + 3];   \n";
70         break;
71       case CalculationsPrecision::F32_F16:
72         c += "#define CONV(R, SRC, F, i) \\\n";
73         c += "  R += TO_ACCUM_TYPE(SRC.x * F[i + 0] + SRC.y * F[i + 1]";
74         c += "+ SRC.z * F[i + 2] + SRC.w * F[i + 3]);\n";
75         break;
76     }
77   } else {
78     // O4I4
79     c += "#define CONV(R, SRC, F, i) \\\n";
80     c += "  R.x += dot(SRC, F[i + 0]); \\\n";
81     c += "  R.y += dot(SRC, F[i + 1]); \\\n";
82     c += "  R.z += dot(SRC, F[i + 2]); \\\n";
83     c += "  R.w += dot(SRC, F[i + 3]);   \n";
84   }
85 
86   c += "MAIN_FUNCTION($0) {\n";
87   if (op_def.IsBatchSupported()) {
88     c += "  int linear_id = GLOBAL_ID_0;\n";
89     c += "  int X = linear_id / args.dst_tensor.Batch();\n";
90     c += "  int B = linear_id % args.dst_tensor.Batch();\n";
91     c += "  args.dst_tensor.SetBatchRef(B);\n";
92     c += "  args.src_tensor.SetBatchRef(B);\n";
93   } else {
94     c += "  int X = GLOBAL_ID_0;\n";
95   }
96   c += "  int Y = GLOBAL_ID_1;\n";
97   c += "  if (X >= args.src_tensor.Width() || Y >= args.src_tensor.Height()) "
98        "return;\n";
99   for (int d = 0; d < dst_depth; ++d) {
100     const std::string layer = std::to_string(d);
101     c += "  ACCUM_FLT4 r" + layer + "[2][2];\n";
102     c += "  r" + layer + "[0][0] = INIT_ACCUM_FLT4(0.0f);\n";
103     c += "  r" + layer + "[0][1] = INIT_ACCUM_FLT4(0.0f);\n";
104     c += "  r" + layer + "[1][0] = INIT_ACCUM_FLT4(0.0f);\n";
105     c += "  r" + layer + "[1][1] = INIT_ACCUM_FLT4(0.0f);\n";
106   }
107   int filters_index = 0;
108   for (int s = 0; s < src_depth; ++s) {
109     const std::string z = std::to_string(s);
110     c += "  {\n";
111     if (src_tensor_type == TensorStorageType::BUFFER) {
112       c += "  bool x_in = X + 1 < args.src_tensor.Width();\n";
113       c += "  bool y_in = Y + 1 < args.src_tensor.Height();\n";
114       c += "  FLT4 src0 = args.src_tensor.Read(X, Y, " + z + ");\n";
115       c += "  FLT4 src1 = INIT_FLT4(0.0);\n";
116       c += "  FLT4 src2 = INIT_FLT4(0.0);\n";
117       c += "  FLT4 src3 = INIT_FLT4(0.0);\n";
118       c += "  if (x_in) {\n";
119       c += "    src1 = args.src_tensor.Read(X + 1, Y, " + z + ");\n";
120       c += "  }\n";
121       c += "  if (y_in) {\n";
122       c += "    src2 = args.src_tensor.Read(X, Y + 1, " + z + ");\n";
123       c += "  }\n";
124       c += "  if (x_in && y_in) {\n";
125       c += "    src3 = args.src_tensor.Read(X + 1, Y + 1, " + z + ");\n";
126       c += "  }\n";
127     } else if (src_tensor_type == TensorStorageType::IMAGE_BUFFER) {
128       c += "  args.src_tensor.GetAddress(c0, X, Y, " + z + ");\n";
129       c += "  args.src_tensor.GetAddress(c1, X + 1, Y, " + z + ");\n";
130       c += "  args.src_tensor.GetAddress(c2, X, Y + 1, " + z + ");\n";
131       c += "  args.src_tensor.GetAddress(c3, X + 1, Y + 1, " + z + ");\n";
132       c += "  bool x_in = X + 1 < args.src_tensor.Width();\n";
133       c += "  bool y_in = Y + 1 < args.src_tensor.Height();\n";
134       c += "  c1 = select(-1, c1, x_in);\n";
135       c += "  c2 = select(-1, c2, y_in);\n";
136       c += "  c3 = select(-1, c3, x_in && y_in);\n";
137       c += "  FLT4 src0 = args.src_tensor.Read(c0);\n";
138       c += "  FLT4 src1 = args.src_tensor.Read(c1);\n";
139       c += "  FLT4 src2 = args.src_tensor.Read(c2);\n";
140       c += "  FLT4 src3 = args.src_tensor.Read(c3);\n";
141     } else {
142       c += "  FLT4 src0 = args.src_tensor.Read(X, Y, " + z + ");\n";
143       c += "  FLT4 src1 = args.src_tensor.Read(X + 1, Y, " + z + ");\n";
144       c += "  FLT4 src2 = args.src_tensor.Read(X, Y + 1, " + z + ");\n";
145       c += "  FLT4 src3 = args.src_tensor.Read(X + 1, Y + 1, " + z + ");\n";
146     }
147     for (int d = 0; d < dst_depth; ++d) {
148       const std::string layer = std::to_string(d);
149       const std::string f_offset = std::to_string(filters_index);
150       filters_index++;
151       c += "  {\n";
152       c += "  __constant FLT4* L0 = args.weights.GetPtr() + 36 * " + f_offset +
153            ";\n";
154       c += "  CONV(r" + layer + "[0][0], src0, L0, 0);\n";
155       c += "  CONV(r" + layer + "[0][1], src0, L0, 4);\n";
156       c += "  CONV(r" + layer + "[0][1], src1, L0, 8);\n";
157       c += "  CONV(r" + layer + "[1][0], src0, L0, 12);\n";
158       c += "  CONV(r" + layer + "[1][0], src2, L0, 16);\n";
159       c += "  CONV(r" + layer + "[1][1], src0, L0, 20);\n";
160       c += "  CONV(r" + layer + "[1][1], src1, L0, 24);\n";
161       c += "  CONV(r" + layer + "[1][1], src2, L0, 28);\n";
162       c += "  CONV(r" + layer + "[1][1], src3, L0, 32);\n";
163       c += "  }\n";
164     }
165     c += "  }\n";
166   }
167   c += "  X *= 2;\n";
168   c += "  Y *= 2;\n";
169   for (int d = 0; d < dst_depth; ++d) {
170     const std::string layer = std::to_string(d);
171     c += "  {\n";
172     c += "  FLT4 bias_val = args.biases.Read(" + layer + ");\n";
173     for (int y = 0; y < 2; ++y) {
174       for (int x = 0; x < 2; ++x) {
175         const std::string x_coord = "X + " + std::to_string(x);
176         const std::string y_coord = "Y + " + std::to_string(y);
177         c += "  {\n";
178         c += "    FLT4 result = TO_FLT4(r" + layer + "[" + std::to_string(y) +
179              "][" + std::to_string(x) + "]) + bias_val;\n";
180         c += "    args.dst_tensor.Write(result, " + x_coord + ", " + y_coord +
181              ", " + layer + ");\n";
182         c += "  }\n";
183       }
184     }
185     c += "  }\n";
186   }
187   c += "}\n";
188 
189   return c;
190 }
191 
GetGridSize() const192 int3 ConvolutionTransposed3x3Thin::GetGridSize() const {
193   const int grid_x = src_[0]->Width() * dst_[0]->Batch();
194   const int grid_y = src_[0]->Height();
195   const int grid_z = 1;
196   return int3(grid_x, grid_y, grid_z);
197 }
198 
GetSpatialWeightsRemap() const199 std::vector<int> ConvolutionTransposed3x3Thin::GetSpatialWeightsRemap() const {
200   return std::vector<int>{4, 5, 3, 7, 1, 8, 6, 2, 0};
201 }
202 
UploadWeights(const tflite::gpu::Tensor<OHWI,DataType::FLOAT32> & weights)203 void ConvolutionTransposed3x3Thin::UploadWeights(
204     const tflite::gpu::Tensor<OHWI, DataType::FLOAT32>& weights) {
205   const int flt_count =
206       GetTotalElementsCountForLayout(GetWeightsDescription(), weights.shape);
207 
208   DataType weights_type = definition_.precision == CalculationsPrecision::F32
209                               ? DataType::FLOAT32
210                               : DataType::FLOAT16;
211 
212   BufferDescriptor desc;
213   desc.element_type = weights_type;
214   desc.element_size = 4;
215   desc.memory_type = MemoryType::CONSTANT;
216   desc.size = flt_count * SizeOf(desc.element_type);
217   desc.data.resize(desc.size);
218 
219   RearrangeWeights(weights, GetWeightsDescription(), weights_type,
220                    absl::MakeSpan(desc.data));
221 
222   args_.AddObject("weights",
223                   absl::make_unique<BufferDescriptor>(std::move(desc)));
224 }
225 
IsConvolutionTransposed3x3ThinSupported(const ConvolutionTransposedAttributes & attr)226 bool IsConvolutionTransposed3x3ThinSupported(
227     const ConvolutionTransposedAttributes& attr) {
228   return attr.weights.shape.o <= 8 && attr.weights.shape.w == 3 &&
229          attr.weights.shape.h == 3 && attr.stride.w == 2 &&
230          attr.stride.h == 2 && attr.padding.prepended.w == 1 &&
231          attr.padding.prepended.h == 1 && attr.padding.appended.w == 1 &&
232          attr.padding.appended.h == 1;
233 }
234 
CreateConvolutionTransposed3x3Thin(const GpuInfo & gpu_info,const OperationDef & definition,const ConvolutionTransposedAttributes & attr)235 ConvolutionTransposed3x3Thin CreateConvolutionTransposed3x3Thin(
236     const GpuInfo& gpu_info, const OperationDef& definition,
237     const ConvolutionTransposedAttributes& attr) {
238   ConvolutionTransposed3x3Thin result(gpu_info, definition, attr);
239   result.UploadWeights(attr.weights);
240 
241   TensorLinearDescriptor desc;
242   desc.storage_type = LinearStorageType::TEXTURE_2D;
243   desc.element_type = definition.GetDataType();
244   desc.UploadLinearData(attr.bias);
245   result.args_.AddObject(
246       "biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
247   return result;
248 }
249 
CreateConvolutionTransposed3x3ThinDynamicWeights(const GpuInfo & gpu_info,const OperationDef & definition,const ConvolutionTransposedAttributes & attr)250 ConvolutionTransposed3x3Thin CreateConvolutionTransposed3x3ThinDynamicWeights(
251     const GpuInfo& gpu_info, const OperationDef& definition,
252     const ConvolutionTransposedAttributes& attr) {
253   OperationDef new_def = definition;
254   new_def.src_tensors = {
255       definition.src_tensors[0]};  // leaving only src_tensor def, weights defs
256                                    // will be added later
257   const DataType weights_type = definition.GetDataType();
258   // add 1 src_tensor(buffer) for weights
259   new_def.src_tensors.push_back(
260       {weights_type, TensorStorageType::BUFFER, Layout::HWC});
261   ConvolutionTransposed3x3Thin result(gpu_info, new_def, attr);
262 
263   TensorLinearDescriptor desc;
264   desc.storage_type = LinearStorageType::TEXTURE_2D;
265   desc.element_type = new_def.GetDataType();
266   desc.UploadLinearData(attr.bias);
267   result.args_.AddObject(
268       "biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
269   return result;
270 }
271 
272 }  // namespace gpu
273 }  // namespace tflite
274