/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/tasks/depthwise_conv.h"

#include <string>
#include <utility>
#include <vector>

#include "tensorflow/lite/delegates/gpu/common/task/tensor_linear_desc.h"
#include "tensorflow/lite/delegates/gpu/common/task/util.h"
#include "tensorflow/lite/delegates/gpu/common/task/work_group_picking.h"

namespace tflite {
namespace gpu {

namespace {

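// Channel multipliers of 1, 2 and 4 get dedicated code paths in GetSrcValue();
// every other multiplier falls back to the generic path driven by the
// "ch_multiplier" argument.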
bool IsSpecializedCase(int channel_multiplier) {
  return channel_multiplier == 1 || channel_multiplier == 2 ||
         channel_multiplier == 4;
}

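// Emits the shader snippet that loads the source value used for destination
// slice S, replicating source channels as required by the channel multiplier
// so that it can be multiplied element-wise with the filter value.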
std::string GetSrcValue(int channel_multiplier, const std::string coords) {
  std::string c;
  if (channel_multiplier == 1) {
    c += "      FLT4 src_final = args.src_tensor.Read(" + coords + ", S);\n";
  } else if (channel_multiplier == 2) {
    c += "      int s_layer = S / 2;\n";
    c += "      FLT4 src = args.src_tensor.Read(" + coords + ", s_layer);\n";
    c += "      FLT2 t0 = S % 2 == 0 ? src.xy : src.zw;\n";
    c += "      FLT4 src_final = INIT_FLT4v4(t0.x, t0.x, t0.y, t0.y);\n";
  } else if (channel_multiplier == 4) {
    c += "      int s_layer = S / 4;\n";
    c += "      FLT4 src = args.src_tensor.Read(" + coords + ", s_layer);\n";
    c += "      FLT t0 = src.x;\n";
    c += "      int remainder = S % 4;\n";
    c += "      if (remainder == 1) t0 = src.y;\n";
    c += "      if (remainder == 2) t0 = src.z;\n";
    c += "      if (remainder == 3) t0 = src.w;\n";
    c += "      FLT4 src_final = INIT_FLT4v4(t0, t0, t0, t0);\n";
  } else {
    c += "      int s_layer = S / args.ch_multiplier;\n";
    c += "      FLT4 src = args.src_tensor.Read(" + coords + ", s_layer);\n";
    c += "      int s_offset = (S % args.ch_multiplier) * 4;\n";
    c += "      FLT4 src_final;\n";
    c += "      FLT temp_arr[4] = {src.x, src.y, src.z, src.w};\n";
    c += "      src_final.x = temp_arr[(s_offset + 0) / args.ch_multiplier];\n";
    c += "      src_final.y = temp_arr[(s_offset + 1) / args.ch_multiplier];\n";
    c += "      src_final.z = temp_arr[(s_offset + 2) / args.ch_multiplier];\n";
    c += "      src_final.w = temp_arr[(s_offset + 3) / args.ch_multiplier];\n";
  }

  return c;
}

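// Assembles the depthwise convolution shader: one work item per output
// (X, Y[, Z], S) position accumulates over the kernel window, adds the bias
// and writes the result. Boundary checks are emitted only for axes that do
// not support zero clamping.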
std::string GenerateDepthwiseConvolutionCode(
    const OperationDef& op_def, bool stride_correction, int channel_multiplier,
    bool weights_are_buffer, bool dynamic_weights, GPUOperation* op) {
  auto src_desc = op_def.src_tensors[0];
  src_desc.SetAddressMode(AddressMode::kZero);
  if (op_def.IsBatchSupported()) {
    src_desc.SetStateVar("BatchedWidth", "true");
  }
  op->AddSrcTensor("src_tensor", src_desc);
  if (dynamic_weights) {
    op->AddSrcTensor("weights", op_def.src_tensors[1]);
  }

  auto dst_desc = op_def.dst_tensors[0];
  if (op_def.IsBatchSupported()) {
    dst_desc.SetStateVar("BatchedWidth", "true");
  }
  op->AddDstTensor("dst_tensor", dst_desc);

  std::string c;

  c += "MAIN_FUNCTION(\n";
  c += "$0) {\n";
  c += "  int X = GLOBAL_ID_0;\n";
  if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) {
    c += "  int linear_id_1 = GLOBAL_ID_1;\n";
    c += "  int Y = linear_id_1 / args.dst_tensor.Depth();\n";
    c += "  int Z = linear_id_1 % args.dst_tensor.Depth();\n";
  } else {
    c += "  int Y = GLOBAL_ID_1;\n";
  }
  c += "  int S = GLOBAL_ID_2;\n";
  c += "  if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() || "
       "S >= args.dst_tensor.Slices()) { \n";
  c += "    return; \n";
  c += "  } \n";
  c += "  ACCUM_FLT4 r = INIT_ACCUM_FLT4(0.0f);\n";
  if (stride_correction) {
    c += "  int x_offseted = " +
         GetXStrideCorrectedV2("X", "args.src_tensor.Batch()", "args.stride_x",
                               "args.padding_x") +
         ";\n";
  } else {
    if (op_def.IsBatchSupported()) {
      c += "  int x_offseted = X * args.stride_x + args.padding_x * "
           "args.src_tensor.Batch();\n";
    } else {
      c += "  int x_offseted = X * args.stride_x + args.padding_x;\n";
    }
  }
  c += "  int y_offseted = Y * args.stride_y + args.padding_y;\n";
  if (!dynamic_weights) {
    std::string weights_offset = "args.kernel_size_x * args.kernel_size_y";
    if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) {
      c += "  int z_offseted = Z * args.stride_z + args.padding_z;\n";
      weights_offset += " * args.kernel_size_z";
    }
    if (weights_are_buffer) {
      c += "  int fx_c = S * " + weights_offset + ";\n";
    } else {
      c += "  int fx_c = 0;\n";
    }
  }
  std::string kernel_size_x =
      dynamic_weights ? "args.weights.Width()" : "args.kernel_size_x";
  std::string kernel_size_y =
      dynamic_weights ? "args.weights.Height()" : "args.kernel_size_y";
  std::string kernel_size_z =
      dynamic_weights ? "args.weights.Depth()" : "args.kernel_size_z";

  auto generate_check = [&]() {
    std::string check;
    const std::vector<Axis> axes{Axis::WIDTH, Axis::HEIGHT, Axis::DEPTH};
    const std::vector<std::string> names{"outside_x", "outside_y", "outside_z"};
    for (int i = 0; i < axes.size(); ++i) {
      const auto& axis = axes[i];
      if (src_desc.HasAxis(axis) && !src_desc.SupportsZeroClamp(axis)) {
        if (!check.empty()) {
          check += " && ";
        }
        check += "!" + names[i];
      }
    }
    return check;
  };
  auto generate_coords = [&]() {
    std::string coords;
    const std::vector<Axis> axes{Axis::WIDTH, Axis::HEIGHT, Axis::DEPTH};
    const std::vector<std::string> names{"x_c", "y_c", "z_c"};
    for (int i = 0; i < axes.size(); ++i) {
      const auto& axis = axes[i];
      if (src_desc.HasAxis(axis)) {
        if (!coords.empty()) {
          coords += ", ";
        }
        coords += names[i];
      }
    }
    return coords;
  };
  const std::string check = generate_check();
  const std::string coords = generate_coords();

  if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) {
    c += "  for (int kz = 0; kz < " + kernel_size_z + "; ++kz) {\n";
    c += "    int z_c = z_offseted + kz * args.dilation_z;\n";
    if (!src_desc.SupportsZeroClamp(Axis::DEPTH)) {
      c += "    bool outside_z = z_c < 0 || z_c >= args.src_tensor.Depth();\n";
    }
  }
  if (op_def.dst_tensors[0].HasAxis(Axis::HEIGHT)) {
    c += "  for (int ky = 0; ky < " + kernel_size_y + "; ++ky) {\n";
    c += "    int y_c = y_offseted + ky * args.dilation_y;\n";
    if (!src_desc.SupportsZeroClamp(Axis::HEIGHT)) {
      c += "    bool outside_y = y_c < 0 || y_c >= args.src_tensor.Height();\n";
    }
  }
  if (op_def.dst_tensors[0].HasAxis(Axis::WIDTH)) {
    c += "  for (int kx = 0; kx < " + kernel_size_x + "; ++kx) {\n";
    const std::string dilation_x =
        op_def.IsBatchSupported() ? "args.dilation_x * args.src_tensor.Batch()"
                                  : "args.dilation_x";
    c += "    int x_c = x_offseted + kx * " + dilation_x + ";\n";
    if (!src_desc.SupportsZeroClamp(Axis::WIDTH)) {
      c += "    bool outside_x = x_c < 0 || x_c >= args.src_tensor.Width();\n";
    }
  }
  if (!check.empty()) {
    c += "    if (" + check + ") {\n";
  }
  if (dynamic_weights) {
    c += "      FLT4 f = args.weights.Read(kx, ky, S);\n";
  } else {
    if (weights_are_buffer) {
      c += "      FLT4 f = args.weights.Read(fx_c);\n";
    } else {
      c += "      FLT4 f = args.weights.Read(fx_c, S);\n";
    }
  }
  c += GetSrcValue(channel_multiplier, coords);
  c += "      r += TO_ACCUM_TYPE(src_final * f);\n";
  if (!check.empty()) {
    c += "    }\n";
  }
  if (!dynamic_weights) {
    c += "    fx_c++;\n";
  }
  if (op_def.dst_tensors[0].HasAxis(Axis::WIDTH)) {
    c += "  }\n";
  }
  if (op_def.dst_tensors[0].HasAxis(Axis::HEIGHT)) {
    c += "  }\n";
  }
  if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) {
    c += "  }\n";
  }
  c += "  FLT4 res0 = TO_FLT4(r) + args.biases.Read(S);\n";
  if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) {
    c += "  args.dst_tensor.Write(res0, X, Y, Z, S);\n";
  } else {
    c += "  args.dst_tensor.Write(res0, X, Y, S);\n";
  }
  c += "}\n";
  return c;
}
}  // namespace

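// Depthwise convolution with constant weights uploaded at creation time.
// Weights and biases are stored in buffers on GPUs without image support,
// on Mali and on Apple GPUs, and in textures otherwise.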
GPUOperation CreateDepthwiseConvolution2D(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const DepthwiseConvolution2DAttributes& attr) {
  bool weights_are_buffer =
      !gpu_info.SupportsImages() || gpu_info.IsMali() || gpu_info.IsApple();
  GPUOperation op(definition);
  op.args_.AddInt("kernel_size_x", attr.weights.shape.w);
  op.args_.AddInt("stride_x", attr.strides.w);
  op.args_.AddInt("padding_x", -attr.padding.prepended.w);
  op.args_.AddInt("dilation_x", attr.dilations.w);
  op.args_.AddInt("kernel_size_y", attr.weights.shape.h);
  op.args_.AddInt("stride_y", attr.strides.h);
  op.args_.AddInt("padding_y", -attr.padding.prepended.h);
  op.args_.AddInt("dilation_y", attr.dilations.h);
  if (!IsSpecializedCase(attr.weights.shape.o)) {
    op.args_.AddInt("ch_multiplier", attr.weights.shape.o);
  }
  const bool stride_correction =
      definition.IsBatchSupported() && attr.strides.w != 1;
  op.code_ = GenerateDepthwiseConvolutionCode(definition, stride_correction,
                                              attr.weights.shape.o,
                                              weights_are_buffer, false, &op);
  UploadWeightsForDWConv2D(attr.weights, weights_are_buffer,
                           definition.precision, &op);
  op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_SToZ;

  TensorLinearDescriptor desc;
  desc.storage_type = weights_are_buffer ? LinearStorageType::BUFFER
                                         : LinearStorageType::TEXTURE_2D;
  desc.element_type = definition.GetDataType();
  desc.UploadLinearData(attr.bias);
  op.args_.AddObject(
      "biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
  return op;
}

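// Variant that reads the weights from a second runtime tensor
// (op_def.src_tensors[1]) instead of uploading them; the channel multiplier
// is fixed to 1.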
GPUOperation CreateDepthwiseConvolution2DDynamicWeights(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const DepthwiseConvolution2DAttributes& attr) {
  GPUOperation op(definition);
  op.args_.AddInt("stride_x", attr.strides.w);
  op.args_.AddInt("padding_x", -attr.padding.prepended.w);
  op.args_.AddInt("dilation_x", attr.dilations.w);
  op.args_.AddInt("stride_y", attr.strides.h);
  op.args_.AddInt("padding_y", -attr.padding.prepended.h);
  op.args_.AddInt("dilation_y", attr.dilations.h);
  const bool stride_correction =
      definition.IsBatchSupported() && attr.strides.w != 1;
  op.code_ = GenerateDepthwiseConvolutionCode(definition, stride_correction, 1,
                                              false, true, &op);
  op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_SToZ;

  TensorLinearDescriptor desc;
  desc.storage_type =
      !gpu_info.SupportsImages() || gpu_info.IsMali() || gpu_info.IsApple()
          ? LinearStorageType::BUFFER
          : LinearStorageType::TEXTURE_2D;
  desc.element_type = definition.GetDataType();
  desc.UploadLinearData(attr.bias);
  op.args_.AddObject(
      "biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
  return op;
}

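// 3D counterpart of CreateDepthwiseConvolution2D; additionally wires up the
// kernel size, stride, padding and dilation along the depth axis.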
GPUOperation CreateDepthwiseConvolution3D(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const DepthwiseConvolution3DAttributes& attr) {
  bool weights_are_buffer =
      !gpu_info.SupportsImages() || gpu_info.IsMali() || gpu_info.IsApple();
  GPUOperation op(definition);
  op.args_.AddInt("kernel_size_x", attr.weights.shape.w);
  op.args_.AddInt("stride_x", attr.strides.w);
  op.args_.AddInt("padding_x", -attr.padding.prepended.w);
  op.args_.AddInt("dilation_x", attr.dilations.w);
  op.args_.AddInt("kernel_size_y", attr.weights.shape.h);
  op.args_.AddInt("stride_y", attr.strides.h);
  op.args_.AddInt("padding_y", -attr.padding.prepended.h);
  op.args_.AddInt("dilation_y", attr.dilations.h);
  op.args_.AddInt("kernel_size_z", attr.weights.shape.d);
  op.args_.AddInt("stride_z", attr.strides.d);
  op.args_.AddInt("padding_z", -attr.padding.prepended.d);
  op.args_.AddInt("dilation_z", attr.dilations.d);
  if (!IsSpecializedCase(attr.weights.shape.o)) {
    op.args_.AddInt("ch_multiplier", attr.weights.shape.o);
  }
  const bool stride_correction =
      definition.IsBatchSupported() && attr.strides.w != 1;
  op.code_ = GenerateDepthwiseConvolutionCode(definition, stride_correction,
                                              attr.weights.shape.o,
                                              weights_are_buffer, false, &op);
  UploadWeightsForDWConv3D(attr.weights, weights_are_buffer,
                           definition.precision, &op);
  op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_SToZ;

  TensorLinearDescriptor desc;
  desc.storage_type = weights_are_buffer ? LinearStorageType::BUFFER
                                         : LinearStorageType::TEXTURE_2D;
  desc.element_type = definition.GetDataType();
  desc.UploadLinearData(attr.bias);
  op.args_.AddObject(
      "biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
  return op;
}

}  // namespace gpu
}  // namespace tflite