1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/lite/delegates/gpu/common/tasks/conv_constants.h"
17 
18 #include <string>
19 #include <utility>
20 
21 #include "absl/strings/str_cat.h"
22 #include "tensorflow/lite/delegates/gpu/common/task/util.h"
23 #include "tensorflow/lite/delegates/gpu/common/task/work_group_picking.h"
24 
25 namespace tflite {
26 namespace gpu {
27 
28 namespace {
29 // Adreno can provide up to ~3-4KB of constant memory, but in some cases even
30 // 3KB can have very bad performance.
GetAdrenoOptimalMaxConstantSize(const AdrenoInfo & adreno_info)31 int GetAdrenoOptimalMaxConstantSize(const AdrenoInfo& adreno_info) {
32   if (adreno_info.IsAdreno3xx() || adreno_info.IsAdreno4xx() ||
33       adreno_info.IsAdreno5xx()) {
34     return 256 * 10;  // 2.5KB
35   } else {
36     return 256 * 14;  // 3.5KB
37   }
38 }
39 
GetOptimalMaxConstantSize(const GpuInfo & info)40 int GetOptimalMaxConstantSize(const GpuInfo& info) {
41   if (!info.IsAdreno()) {
42     // In general we do not expect that this kernel will be used with non Adreno
43     // so as it tuned for __constant memory that have big profit on Adreno
44     return 1024;  // 1KB
45   } else {
46     return GetAdrenoOptimalMaxConstantSize(info.adreno_info);
47   }
48 }
49 
50 // src_size and dst_size must be <= 4;
GenerateConv(int src_size,int dst_size,bool use_dot_conv,int const_mem_offset,CalculationsPrecision precision,const std::string & dst,const std::string & src)51 std::string GenerateConv(int src_size, int dst_size, bool use_dot_conv,
52                          int const_mem_offset, CalculationsPrecision precision,
53                          const std::string& dst, const std::string& src) {
54   std::string result;
55   const std::string postfixes[] = {".x", ".y", ".z", ".w"};
56   if (use_dot_conv) {
57     const std::string src_postfixes[] = {".x", ".xy", ".xyz", ""};
58     const std::string src_postfix = src_postfixes[src_size - 1];
59     for (int i = 0; i < dst_size; ++i) {
60       result += "    " + dst + postfixes[i] + " += dot(" + src +
61                 ", constants[" + std::to_string(const_mem_offset + i) + "]" +
62                 src_postfix + ");\n";
63     }
64   } else {
65     const std::string dst_postfixes[] = {".x", ".xy", ".xyz", ""};
66     const std::string dst_postfix = dst_postfixes[dst_size - 1];
67     if (precision == CalculationsPrecision::F32_F16) {
68       for (int i = 0; i < src_size; ++i) {
69         if (i != 0) {
70           result += " + ";
71         }
72         std::string src_name = src;
73         if (src_size != 1) {
74           src_name += postfixes[i];
75         }
76         result += src_name + " * constants[" +
77                   std::to_string(const_mem_offset + i) + "]" + dst_postfix;
78       }
79       std::string size = dst_size == 1 ? "" : std::to_string(dst_size);
80       result = "    " + dst + dst_postfix + " += convert_float" + size + "(" +
81                result + ");\n";
82     } else {
83       for (int i = 0; i < src_size; ++i) {
84         std::string src_name = src;
85         if (src_size != 1) {
86           src_name += postfixes[i];
87         }
88         result += "    " + dst + dst_postfix + " += " + src_name +
89                   " * constants[" + std::to_string(const_mem_offset + i) + "]" +
90                   dst_postfix + ";\n";
91       }
92     }
93   }
94   return result;
95 }
96 
GenerateConvolutionConstantCode(const OperationDef & op_def,const OHWI & weights_shape,bool stride_correction,bool use_dot_conv,GPUOperation * op)97 std::string GenerateConvolutionConstantCode(const OperationDef& op_def,
98                                             const OHWI& weights_shape,
99                                             bool stride_correction,
100                                             bool use_dot_conv,
101                                             GPUOperation* op) {
102   auto src_desc = op_def.src_tensors[0];
103   src_desc.SetAddressMode(AddressMode::kZero);
104   if (op_def.IsBatchSupported()) {
105     src_desc.SetStateVar("BatchedWidth", "true");
106   }
107   op->AddSrcTensor("src_tensor", src_desc);
108 
109   auto dst_desc = op_def.dst_tensors[0];
110   if (op_def.IsBatchSupported()) {
111     dst_desc.SetStateVar("BatchedWidth", "true");
112   }
113   op->AddDstTensor("dst_tensor", dst_desc);
114 
115   const int out_z = DivideRoundUp(weights_shape.o, 4);
116   const std::string kOutZ = std::to_string(out_z);
117   const int src_depth = DivideRoundUp(weights_shape.i, 4);
118 
119   const std::string postfixes[] = {".x", ".xy", ".xyz", ""};
120 
121   std::string c;
122   c += "__kernel void main_function(\n";
123   c += "$0) {\n";
124   c += "  int X = get_global_id(0);\n";
125   c += "  int Y = get_global_id(1);\n";
126   c += "  if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height()) "
127        "return;\n";
128   if (stride_correction) {
129     c += "  int start_x = " +
130          GetXStrideCorrectedV2("X", "args.src_tensor.Batch()", "args.stride_x",
131                                "args.padding_x") +
132          ";\n";
133   } else {
134     if (op_def.IsBatchSupported()) {
135       c += "  int start_x = X * args.stride_x + args.padding_x * "
136            "args.src_tensor.Batch();\n";
137     } else {
138       c += "  int start_x = X * args.stride_x + args.padding_x;\n";
139     }
140   }
141   c += "  int start_y = Y * args.stride_y + args.padding_y;\n";
142   c += "  __constant FLT4* constants = args.weights.GetPtr();\n";
143   for (int i = 0; i < out_z; ++i) {
144     c += "  ACCUM_FLT4 r" + std::to_string(i) +
145          " = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n";
146   }
147   auto generate_check = [&]() {
148     std::string check;
149     const std::vector<Axis> axes{Axis::WIDTH, Axis::HEIGHT, Axis::DEPTH};
150     const std::vector<std::string> names{"x_out", "y_out", "z_out"};
151     for (int i = 0; i < axes.size(); ++i) {
152       const auto& axis = axes[i];
153       if (src_desc.HasAxis(axis) && !src_desc.SupportsZeroClamp(axis)) {
154         if (!check.empty()) {
155           check += " || ";
156         }
157         check += names[i];
158       }
159     }
160     return check;
161   };
162   const std::string check = generate_check();
163   int filters_counter = 0;
164   for (int s = 0; s < src_depth; ++s) {
165     const int src_ch_count = std::min(4, weights_shape.i - s * 4);
166     const std::string s_count =
167         src_ch_count == 1 ? "" : std::to_string(src_ch_count);
168     const std::string s_type = absl::StrCat("FLT", s_count);
169     const std::string s_postfix = postfixes[src_ch_count - 1];
170     const std::string dilation_x =
171         op_def.IsBatchSupported() ? "args.dilation_x * args.src_tensor.Batch()"
172                                   : "args.dilation_x";
173     for (int ky = 0; ky < weights_shape.h; ++ky) {
174       std::string s_y = absl::StrCat("(start_y + ", ky, " * args.dilation_y)");
175       if (!src_desc.SupportsZeroClamp(Axis::HEIGHT)) {
176         c += "  {\n";
177         c += "  bool y_out = " + s_y + " < 0 || " + s_y +
178              " >= args.src_tensor.Height();\n";
179       }
180       for (int kx = 0; kx < weights_shape.w; ++kx) {
181         c += "  {\n";
182         std::string s_x =
183             absl::StrCat("(start_x + ", kx, " * " + dilation_x + ")");
184         if (!src_desc.SupportsZeroClamp(Axis::WIDTH)) {
185           c += "    bool x_out = " + s_x + " < 0 || " + s_x +
186                ">= args.src_tensor.Width();\n";
187         }
188         if (check.empty()) {
189           c += "    " + s_type + " src = args.src_tensor.Read(" + s_x + ", " +
190                s_y + ", " + std::to_string(s) + ")" + s_postfix + ";\n";
191         } else {
192           c += "    " + s_type + " src = x_out || y_out ? ";
193           c += "(" + s_type + ")(0.0) : args.src_tensor.Read(" + s_x + ", " +
194                s_y + ", " + std::to_string(s) + ")" + s_postfix + ";\n";
195         }
196         for (int d = 0; d < out_z; ++d) {
197           const int dst_ch_count = std::min(4, weights_shape.o - d * 4);
198           c += GenerateConv(src_ch_count, dst_ch_count, use_dot_conv,
199                             filters_counter, op_def.precision,
200                             "r" + std::to_string(d), "src");
201           filters_counter += use_dot_conv ? dst_ch_count : src_ch_count;
202         }
203         c += "  }\n";
204       }
205       if (!src_desc.SupportsZeroClamp(Axis::HEIGHT)) {
206         c += "  }\n";
207       }
208     }
209   }
210   for (int i = 0; i < out_z; ++i) {
211     std::string s_i = std::to_string(i);
212     c += "  {\n";
213     c += "    FLT4 res = TO_FLT4(r" + s_i + ") + args.biases.Read(" + s_i +
214          ");\n";
215     c += "    args.dst_tensor.Write(res, X, Y, " + s_i + ");\n";
216     c += "  }\n";
217   }
218   c += "}\n";
219   return c;
220 }
221 
IsDotConvBetter(int src_channels,int dst_channels)222 bool IsDotConvBetter(int src_channels, int dst_channels) {
223   if (dst_channels % 4 == 0) {
224     return false;
225   }
226 
227   // dst_channels % 4 != 0
228   if (src_channels % 4 == 0) {
229     return true;
230   }
231 
232   // dst_channels % 4 != 0 && src_channels % 4 != 0
233   const int src_depth = DivideRoundUp(src_channels, 4);
234   const int dst_depth = DivideRoundUp(dst_channels, 4);
235   return dst_channels * src_depth < src_channels * dst_depth;
236 }
237 
238 }  // namespace
239 
IsConvConstantsSupported(const GpuInfo & gpu_info,const OperationDef & definition,const Convolution2DAttributes & attr)240 bool IsConvConstantsSupported(const GpuInfo& gpu_info,
241                               const OperationDef& definition,
242                               const Convolution2DAttributes& attr) {
243   if (gpu_info.IsAMD() && definition.precision != CalculationsPrecision::F32 &&
244       definition.src_tensors[0].storage_type != TensorStorageType::BUFFER) {
245     // BUG, some AMD GPUs crash without it
246     return false;
247   }
248 
249   const bool use_dot_conv =
250       IsDotConvBetter(attr.weights.shape.i, attr.weights.shape.o);
251   const auto& w_shape = attr.weights.shape;
252   const int src_depth = DivideRoundUp(w_shape.i, 4);
253   const int dst_depth = DivideRoundUp(w_shape.o, 4);
254   const int aligned_ch_count =
255       use_dot_conv ? w_shape.o * src_depth * 4 : w_shape.i * dst_depth * 4;
256   const int filters_count = aligned_ch_count * w_shape.h * w_shape.w;
257   const int float_size = definition.precision == CalculationsPrecision::F32
258                              ? sizeof(float)
259                              : sizeof(half);
260   const int filters_buffer_size = filters_count * float_size;
261   const int kConstantMaxSize = GetOptimalMaxConstantSize(gpu_info);
262   const int flt4_registers = DivideRoundUp(w_shape.o, 4);
263   return filters_buffer_size <= kConstantMaxSize && flt4_registers <= 8;
264 }
265 
CreateConvConstants(const GpuInfo & gpu_info,const OperationDef & definition,const Convolution2DAttributes & attr)266 GPUOperation CreateConvConstants(const GpuInfo& gpu_info,
267                                  const OperationDef& definition,
268                                  const Convolution2DAttributes& attr) {
269   const bool use_dot_conv =
270       IsDotConvBetter(attr.weights.shape.i, attr.weights.shape.o);
271   GPUOperation op(definition);
272   UploadWeightsForConvConstants(attr.weights, definition.precision,
273                                 use_dot_conv, &op);
274   op.args_.AddInt("stride_x", attr.strides.w);
275   op.args_.AddInt("stride_y", attr.strides.h);
276   op.args_.AddInt("padding_x", -attr.padding.prepended.w);
277   op.args_.AddInt("padding_y", -attr.padding.prepended.h);
278   op.args_.AddInt("dilation_x", attr.dilations.w);
279   op.args_.AddInt("dilation_y", attr.dilations.h);
280   op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_ZIs1;
281 
282   const bool stride_correction =
283       definition.IsBatchSupported() && attr.strides.w != 1;
284 
285   op.code_ = GenerateConvolutionConstantCode(
286       definition, attr.weights.shape, stride_correction, use_dot_conv, &op);
287   if (definition.precision == CalculationsPrecision::F16 &&
288       gpu_info.IsAdreno() && gpu_info.adreno_info.IsAdreno3xx()) {
289     op.compiler_options_.push_back(CompilerOptions::kAdrenoFullSimd);
290   }
291   if (definition.precision != CalculationsPrecision::F32 &&
292       gpu_info.IsPowerVR()) {
293     // BUG, some PowerVRs (GE8320) produce incorrect result without it
294     op.compiler_options_.push_back(CompilerOptions::kClDisableOptimizations);
295   }
296 
297   TensorLinearDescriptor desc;
298   desc.storage_type = LinearStorageType::BUFFER;
299   desc.element_type = definition.GetDataType();
300   desc.memory_type = MemoryType::CONSTANT;
301   desc.UploadLinearData(attr.bias);
302   op.args_.AddObject(
303       "biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
304   return op;
305 }
306 
307 }  // namespace gpu
308 }  // namespace tflite
309