1 /* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/lite/delegates/gpu/common/tasks/winograd.h"
17 
18 #include <string>
19 #include <vector>
20 
21 #include "absl/strings/str_format.h"
22 #include "tensorflow/lite/delegates/gpu/common/data_type.h"
23 #include "tensorflow/lite/delegates/gpu/common/shape.h"
24 #include "tensorflow/lite/delegates/gpu/common/status.h"
25 #include "tensorflow/lite/delegates/gpu/common/task/work_group_picking.h"
26 #include "tensorflow/lite/delegates/gpu/common/winograd_util.h"
27 
28 namespace tflite {
29 namespace gpu {
30 namespace {
GetKernelWinograd4x4To36()31 std::string GetKernelWinograd4x4To36() {
32   std::string c;
33   auto bt_mat = BtMatrixForWinograd4x4To6x6();
34   c += "__constant FLT Bt[36] = {\n";
35   for (int y = 0; y < 6; ++y) {
36     c += "\t";
37     for (int x = 0; x < 6; ++x) {
38       c += absl::StrFormat("%.10f", bt_mat[y * 6 + x]) + "f, ";
39     }
40     c += "\n";
41   }
42   c += "};\n";
43   c += R"(
44 MAIN_FUNCTION($0) {
45   int X = GLOBAL_ID_0 * 4;
46   int Y = GLOBAL_ID_1 * 4;
47   int S = GLOBAL_ID_2;
48 
49   if (GLOBAL_ID_0 >= args.tiles_x || GLOBAL_ID_1 >= args.tiles_y) return;
50 
51   FLT4 I[6][6];
52   for (int y = 0; y < 6; ++y) {
53     for (int x = 0; x < 6; ++x) {
54       I[y][x] = INIT_FLT4(0.0f);
55     }
56   }
57   const int src_base = S * args.src_tensor.Height() * args.src_tensor.Width();
58 )";
59   for (int y = 0; y < 6; ++y) {
60     const std::string s_y = std::to_string(y);
61     c += "  {\n";
62     c += "    int coord_y = Y + " + s_y + " + args.padding_y;\n";
63     c += "    bool in_y = coord_y >= 0 && coord_y < "
64          "args.src_tensor.Height();\n";
65     c += "    coord_y = clamp(coord_y, 0, args.src_tensor.Height() - 1);\n";
66     c += "    const int src_adress_y = src_base + coord_y * "
67          "args.src_tensor.Width();\n";
68     for (int x = 0; x < 6; ++x) {
69       const std::string s_x = std::to_string(x);
70       c += "    {\n";
71       c += "      int coord_x = X + " + s_x + " + args.padding_x;\n";
72       c += "      bool in_x = coord_x >= 0 && coord_x < "
73            "args.src_tensor.Width();\n";
74       c += "      FLT mult = INIT_FLT(in_y && in_x);\n";
75       c += "      coord_x = clamp(coord_x, 0, args.src_tensor.Width() - 1);\n";
76       c += "      FLT4 src = args.src_tensor.Read(src_adress_y + coord_x) * "
77            "mult;\n";
78       c += "      I[0][" + s_x + "] += Bt[" + std::to_string(y) + "] * src;\n";
79       c += "      I[1][" + s_x + "] += Bt[" + std::to_string(y + 6) +
80            "] * src;\n";
81       c += "      I[2][" + s_x + "] += Bt[" + std::to_string(y + 12) +
82            "] * src;\n";
83       c += "      I[3][" + s_x + "] += Bt[" + std::to_string(y + 18) +
84            "] * src;\n";
85       c += "      I[4][" + s_x + "] += Bt[" + std::to_string(y + 24) +
86            "] * src;\n";
87       c += "      I[5][" + s_x + "] += Bt[" + std::to_string(y + 30) +
88            "] * src;\n";
89       c += "    }\n";
90     }
91     c += "  }\n";
92   }
93   c += R"(
94 
95   int dst_x = GLOBAL_ID_1 * args.tiles_x + GLOBAL_ID_0;
96   args.dst_tensor.GetAddress(dst_adress, dst_x, 0, S);
97   for (int y = 0; y < 6; ++y) {
98     FLT4 value = I[y][0] + Bt[2] * I[y][2] + Bt[4] * I[y][4];
99     args.dst_tensor.WriteLinear(value, dst_adress);
100     dst_adress += args.dst_tensor.Width();
101     value = Bt[7] * I[y][1] + Bt[8] * I[y][2] + Bt[9] * I[y][3] + Bt[10] * I[y][4];
102     args.dst_tensor.WriteLinear(value, dst_adress);
103     dst_adress += args.dst_tensor.Width();
104     value = Bt[13] * I[y][1] + Bt[14] * I[y][2] + Bt[15] * I[y][3] + Bt[16] * I[y][4];
105     args.dst_tensor.WriteLinear(value, dst_adress);
106     dst_adress += args.dst_tensor.Width();
107     value = Bt[19] * I[y][1] + Bt[20] * I[y][2] + Bt[21] * I[y][3] + Bt[22] * I[y][4];
108     args.dst_tensor.WriteLinear(value, dst_adress);
109     dst_adress += args.dst_tensor.Width();
110     value = Bt[25] * I[y][1] + Bt[26] * I[y][2] + Bt[27] * I[y][3] + Bt[28] * I[y][4];
111     args.dst_tensor.WriteLinear(value, dst_adress);
112     dst_adress += args.dst_tensor.Width();
113     value = Bt[31] * I[y][1] + Bt[33] * I[y][3] + I[y][5];
114     args.dst_tensor.WriteLinear(value, dst_adress);
115     dst_adress += args.dst_tensor.Width();
116   }
117 }
118 )";
119   return c;
120 }
121 
GetKernelWinograd36To4x4()122 std::string GetKernelWinograd36To4x4() {
123   std::string c;
124   auto at_mat = AtMatrixForWinograd4x4To6x6();
125   c += "__constant FLT At[24] = {\n";
126   for (int y = 0; y < 4; ++y) {
127     c += "\t";
128     for (int x = 0; x < 6; ++x) {
129       c += absl::StrFormat("%.10f", at_mat[y * 6 + x]) + "f, ";
130     }
131     c += "\n";
132   }
133   c += "};\n";
134   c += R"(
135 MAIN_FUNCTION($0) {
136   int tile_id = GLOBAL_ID_0;
137   int Z = GLOBAL_ID_2;
138   int tiles_count_x = (args.dst_tensor.Width() + 3) / 4;
139   int tile_x = (tile_id % tiles_count_x) * 4;
140   int tile_y = (tile_id / tiles_count_x) * 4;
141   if (tile_x >= args.dst_tensor.Width() || tile_y >= args.dst_tensor.Height()) return;
142 
143   int src_adress = Z * args.src_tensor.Height() * args.src_tensor.Width() + tile_id;
144   FLT4 I[4][6];
145   for (int y = 0; y < 4; ++y) {
146     for (int x = 0; x < 6; ++x) {
147       I[y][x] = INIT_FLT4(0.0f);
148     }
149   }
150   for (int y = 0; y < 6; ++y) {
151     for (int x = 0; x < 6; ++x, src_adress += args.src_tensor.Width()) {
152       FLT4 src = args.src_tensor.Read(src_adress);
153       I[0][x] += src * At[y];
154       I[1][x] += src * At[y + 6];
155       I[2][x] += src * At[y + 12];
156       I[3][x] += src * At[y + 18];
157     }
158   }
159 
160   FLT4 bias_val = args.biases.Read(Z);
161   for (int y = 0; y < 4 && tile_y + y < args.dst_tensor.Height(); ++y) {
162     FLT4 t0 = I[y][1] + I[y][2];
163     FLT4 t1 = I[y][3] + I[y][4];
164     if (tile_x < args.dst_tensor.Width()) {
165       FLT4 value = I[y][0] + t0 + t1 + bias_val;
166       args.dst_tensor.Write(value, tile_x, tile_y + y, Z);
167     }
168     FLT4 t2 = I[y][1] - I[y][2];
169     FLT4 t3 = I[y][3] - I[y][4];
170     if (tile_x + 1 < args.dst_tensor.Width()) {
171       FLT4 value = t2 * At[7] + t3 * At[9] + bias_val;
172       args.dst_tensor.Write(value, tile_x + 1, tile_y + y, Z);
173     }
174     if (tile_x + 2 < args.dst_tensor.Width()) {
175       FLT4 value = t0 * At[13] + t1 * At[15] + bias_val;
176       args.dst_tensor.Write(value, tile_x + 2, tile_y + y, Z);
177     }
178     if (tile_x + 3 < args.dst_tensor.Width()) {
179       FLT4 value = t2 * At[19] + t3 * At[21] + I[y][5] + bias_val;
180       args.dst_tensor.Write(value, tile_x + 3, tile_y + y, Z);
181     }
182   }
183 }
184 )";
185   return c;
186 }
187 }  // namespace
188 
GetGridSize() const189 int3 Winograd4x4To36::GetGridSize() const {
190   int new_width =
191       src_[0]->Width() + padding_.prepended.w + padding_.appended.w - 2;
192   int new_height =
193       src_[0]->Height() + padding_.prepended.h + padding_.appended.h - 2;
194   int tiles_x = DivideRoundUp(new_width, 4);
195   int tiles_y = DivideRoundUp(new_height, 4);
196   return int3(tiles_x, tiles_y, src_[0]->Slices());
197 }
198 
BindArguments(ArgumentsBinder * args)199 absl::Status Winograd4x4To36::BindArguments(ArgumentsBinder* args) {
200   int new_width =
201       src_[0]->Width() + padding_.prepended.w + padding_.appended.w - 2;
202   int new_height =
203       src_[0]->Height() + padding_.prepended.h + padding_.appended.h - 2;
204   int tiles_x = DivideRoundUp(new_width, 4);
205   int tiles_y = DivideRoundUp(new_height, 4);
206   RETURN_IF_ERROR(args->SetInt("tiles_x", tiles_x));
207   RETURN_IF_ERROR(args->SetInt("tiles_y", tiles_y));
208   return absl::OkStatus();
209 }
210 
CreateWinograd4x4To36(const OperationDef & definition,const Padding2D & padding)211 Winograd4x4To36 CreateWinograd4x4To36(const OperationDef& definition,
212                                       const Padding2D& padding) {
213   Winograd4x4To36 desc(definition, padding);
214   desc.code_ = GetKernelWinograd4x4To36();
215 
216   desc.AddSrcTensor("src_tensor", definition.src_tensors[0]);
217   desc.AddDstTensor("dst_tensor", definition.dst_tensors[0]);
218 
219   desc.args_.AddInt("padding_x", -padding.prepended.w);
220   desc.args_.AddInt("padding_y", -padding.prepended.h);
221   desc.args_.AddInt("tiles_x");
222   desc.args_.AddInt("tiles_y");
223 
224   desc.work_group_size_ = int3(8, 4, 1);
225   return desc;
226 }
227 
Winograd4x4To36TileX6(const OperationDef & definition,const Padding2D & padding,const GpuInfo & gpu_info)228 Winograd4x4To36TileX6::Winograd4x4To36TileX6(const OperationDef& definition,
229                                              const Padding2D& padding,
230                                              const GpuInfo& gpu_info)
231     : GPUOperation(definition), padding_(padding) {
232   work_group_size_ = int3(32, 1, 1);
233   code_ = GetWinograd4x4To36TileX6Code(definition_);
234   if (gpu_info.IsAdreno()) {
235     compiler_options_.push_back(CompilerOptions::kAdrenoMoreWaves);
236   }
237   if (definition_.precision == CalculationsPrecision::F16 &&
238       gpu_info.IsPowerVR()) {
239     compiler_options_.push_back(CompilerOptions::kClPowervrFp16);
240   }
241 }
242 
GetWinograd4x4To36TileX6Code(const OperationDef & op_def)243 std::string Winograd4x4To36TileX6::GetWinograd4x4To36TileX6Code(
244     const OperationDef& op_def) {
245   std::string c;
246 
247   const auto src_tensor_type = op_def.src_tensors[0].storage_type;
248   const bool is_image_buffer =
249       src_tensor_type == TensorStorageType::IMAGE_BUFFER;
250   const bool is_buffer = src_tensor_type == TensorStorageType::BUFFER;
251 
252   switch (op_def.precision) {
253     case CalculationsPrecision::F32:
254     case CalculationsPrecision::F32_F16:
255       c += "#define ACCUM_FLT float\n";
256       break;
257     case CalculationsPrecision::F16:
258       c += "#define ACCUM_FLT half\n";
259       break;
260   }
261 
262   const DataType accum_type = op_def.precision == CalculationsPrecision::F16
263                                   ? DataType::FLOAT16
264                                   : DataType::FLOAT32;
265 
266   auto bt_mat = BtMatrixForWinograd4x4To6x6();
267   c += "constant ACCUM_FLT Bt[36] = {\n";
268   for (int y = 0; y < 6; ++y) {
269     c += "\t";
270     for (int x = 0; x < 6; ++x) {
271       c += absl::StrFormat("%.10f", bt_mat[y * 6 + x]) + "f, ";
272     }
273     c += "\n";
274   }
275   c += "};\n";
276 
277   std::string cl_type = accum_type == DataType::FLOAT16 ? "half" : "float";
278   auto src_desc = op_def.src_tensors[0];
279   src_desc.SetStateVar("ACCUM_FLT", cl_type);
280   AddSrcTensor("src_tensor", src_desc);
281   AddDstTensor("dst_tensor", op_def.dst_tensors[0]);
282   args_.AddInt("padding_x");
283   args_.AddInt("padding_y");
284   args_.AddInt("tiles_total");
285   args_.AddInt("tiles_x");
286 
287   c += "MAIN_FUNCTION($0) {\n";
288   c += "  int DST_X = GLOBAL_ID_0;\n";
289   c += "  int DST_Y = GLOBAL_ID_1;\n";
290   c += "  int DST_Z = GLOBAL_ID_2;\n";
291   c += "  if (DST_X >= args.tiles_total || DST_Y >= 6 || DST_Z >= "
292        "args.dst_tensor.Slices()) {\n";
293   c += "    return; \n";
294   c += "  }\n";
295   c += "  int tile_x = (DST_X % args.tiles_x) * 4;\n";
296   c += "  int tile_y = (DST_X / args.tiles_x) * 4;\n";
297   c += "  ACCUM_FLT4 I0, I1, I2, I3, I4, I5;\n";
298   c += "  ACCUM_FLT bt_ar[6];\n";
299   c += "  ACCUM_FLT4 t0 = TO_ACCUM_TYPE(args.bt.Read(DST_Y * 2 + 0));\n";
300   c += "  ACCUM_FLT4 t1 = TO_ACCUM_TYPE(args.bt.Read(DST_Y * 2 + 1));\n";
301   c += "  DST_Y *= 6;\n";
302   c += "  bt_ar[0] = t0.x;\n";
303   c += "  bt_ar[1] = t0.y;\n";
304   c += "  bt_ar[2] = t0.z;\n";
305   c += "  bt_ar[3] = t0.w;\n";
306   c += "  bt_ar[4] = t1.x;\n";
307   c += "  bt_ar[5] = t1.y;\n";
308   auto read_src = [&](const std::string& src, const std::string& xs) {
309     if (is_image_buffer) {
310       c += "    ACCUM_FLT4 " + src +
311            " = args.src_tensor.Read<ACCUM_FLT>(src_a_" + xs + " + offset);\n";
312     } else if (is_buffer) {
313       c += "    ACCUM_FLT4 " + src +
314            " = args.src_tensor.Read<ACCUM_FLT>(src_a_" + xs + " + offset) * m" +
315            xs + "_x;\n";
316     } else {
317       c += "    ACCUM_FLT4 " + src +
318            " = args.src_tensor.Read<ACCUM_FLT>(tile_x + args.padding_x + " +
319            xs + ", yc, DST_Z);\n";
320     }
321   };
322   if (is_buffer || is_image_buffer) {
323     for (int x = 0; x < 6; ++x) {
324       const std::string xs = std::to_string(x);
325       c += "  int xc" + xs + " = tile_x + args.padding_x + " + xs + ";\n";
326       c += "  ACCUM_FLT m" + xs + "_x = TO_ACCUM_FLT(xc" + xs + " >= 0 && xc" +
327            xs + " < args.src_tensor.Width());\n";
328       c += "  bool inx" + xs + " = (xc" + xs + " >= 0 && xc" + xs +
329            " < args.src_tensor.Width());\n";
330       c += "  xc" + xs + " = clamp(xc" + xs +
331            ", 0, args.src_tensor.Width() - 1);\n";
332       c += "  args.src_tensor.GetAddress(src_a_" + xs + ", xc" + xs +
333            ", 0, DST_Z);\n";
334       if (is_image_buffer) {
335         c += "  src_a_" + xs +
336              " = select(-args.src_tensor.Width() * args.src_tensor.Height(), "
337              "src_a_" +
338              xs + ", inx" + xs + ");\n";
339       }
340     }
341   }
342   c += "  {\n";
343   c += "    int yc = tile_y + args.padding_y;\n";
344   if (is_buffer || is_image_buffer) {
345     c += "    bool iny = (yc >= 0 && yc < args.src_tensor.Height());\n";
346     c += "    int offset = select(0, yc * args.src_tensor.Width(), iny);\n";
347     c += "    ACCUM_FLT bt = bt_ar[0] * TO_ACCUM_FLT(iny);\n";
348   } else {
349     c += "    ACCUM_FLT bt = bt_ar[0];\n";
350   }
351   for (int x = 0; x < 6; ++x) {
352     const std::string xs = std::to_string(x);
353     const std::string src = "src" + xs;
354     read_src(src, xs);
355     c += "    I" + xs + " = bt * " + src + ";\n";
356   }
357   c += "  }\n";
358   for (int y = 1; y < 6; ++y) {
359     const std::string ys = std::to_string(y);
360     c += "  {\n";
361     c += "    int yc = tile_y + args.padding_y + (" + ys + ");\n";
362     if (is_buffer || is_image_buffer) {
363       c += "    bool iny = (yc >= 0 && yc < args.src_tensor.Height());\n";
364       c += "    int offset = select(0, yc * args.src_tensor.Width(), iny);\n";
365       c += "    ACCUM_FLT bt = bt_ar[" + ys + "] * TO_ACCUM_FLT(iny);\n";
366     } else {
367       c += "    ACCUM_FLT bt = bt_ar[" + ys + "];\n";
368     }
369     for (int x = 0; x < 6; ++x) {
370       const std::string xs = std::to_string(x);
371       const std::string src = "src" + xs;
372       read_src(src, xs);
373       c += "    I" + xs + " += bt * " + src + ";\n";
374     }
375     c += "  }\n";
376   }
377   c += "  {\n";
378   c += "    FLT4 r0 = TO_FLT4(I0 + Bt[2] * I2 + Bt[4] * I4);\n";
379   c += "    args.dst_tensor.Write(r0, DST_X, DST_Y, DST_Z);\n";
380   c += "    DST_Y++;\n";
381   c += "  }\n";
382   c += "  {\n";
383   c += "    FLT4 r0 = TO_FLT4(Bt[7] * I1 + Bt[8] * I2 + Bt[9] * I3 + Bt[10] * "
384        "I4);\n";
385   c += "    args.dst_tensor.Write(r0, DST_X, DST_Y, DST_Z);\n";
386   c += "    DST_Y++;\n";
387   c += "  }\n";
388   c += "  {\n";
389   c += "    FLT4 r0 = TO_FLT4(Bt[13] * I1 + Bt[14] * I2 + Bt[15] * I3 + Bt[16] "
390        "* "
391        "I4);\n";
392   c += "    args.dst_tensor.Write(r0, DST_X, DST_Y, DST_Z);\n";
393   c += "    DST_Y++;\n";
394   c += "  }\n";
395   c += "  {\n";
396   c += "    FLT4 r0 = TO_FLT4(Bt[19] * I1 + Bt[20] * I2 + Bt[21] * I3 + Bt[22] "
397        "* "
398        "I4);\n";
399   c += "    args.dst_tensor.Write(r0, DST_X, DST_Y, DST_Z);\n";
400   c += "    DST_Y++;\n";
401   c += "  }\n";
402   c += "  {\n";
403   c += "    FLT4 r0 = TO_FLT4(Bt[25] * I1 + Bt[26] * I2 + Bt[27] * I3 + Bt[28] "
404        "* "
405        "I4);\n";
406   c += "    args.dst_tensor.Write(r0, DST_X, DST_Y, DST_Z);\n";
407   c += "    DST_Y++;\n";
408   c += "  }\n";
409   c += "  {\n";
410   c += "    FLT4 r0 = TO_FLT4(Bt[31] * I1 + Bt[33] * I3 + I5);\n";
411   c += "    args.dst_tensor.Write(r0, DST_X, DST_Y, DST_Z);\n";
412   c += "    DST_Y++;\n";
413   c += "  }\n";
414   c += "}\n";
415   return c;
416 }
417 
UploadBt()418 void Winograd4x4To36TileX6::UploadBt() {
419   tflite::gpu::Tensor<Linear, DataType::FLOAT32> bt_aligned;
420   bt_aligned.shape = Linear(6 * 8);
421   bt_aligned.data.resize(6 * 8);
422   auto bt_mat = BtMatrixForWinograd4x4To6x6();
423   for (int y = 0; y < 6; ++y) {
424     for (int x = 0; x < 6; ++x) {
425       bt_aligned.data[y * 8 + x] = bt_mat[y * 6 + x];
426     }
427     bt_aligned.data[y * 8 + 6] = 0.0f;
428     bt_aligned.data[y * 8 + 7] = 0.0f;
429   }
430 
431   TensorLinearDescriptor desc;
432   desc.storage_type = LinearStorageType::TEXTURE_2D;
433   desc.element_type = definition_.GetDataType();
434   desc.UploadLinearData(bt_aligned);
435   args_.AddObject("bt",
436                   absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
437 }
438 
SelectBestWorkGroup(const KernelInfo & kernel_info) const439 int3 Winograd4x4To36TileX6::SelectBestWorkGroup(
440     const KernelInfo& kernel_info) const {
441   const std::vector<int3> wgs = {{8, 6, 4}, {8, 6, 2}, {4, 6, 2},
442                                  {4, 6, 2}, {2, 6, 2}, {2, 6, 1},
443                                  {1, 6, 1}, {1, 3, 1}, {1, 1, 1}};
444   return GetFirstSuitableWorkGroup(wgs, kernel_info.max_work_group_size);
445 }
446 
BindArguments(ArgumentsBinder * args)447 absl::Status Winograd4x4To36TileX6::BindArguments(ArgumentsBinder* args) {
448   const int tiles_x = DivideRoundUp(
449       src_[0]->Width() + padding_.prepended.w + padding_.appended.w - 2, 4);
450   const int tiles_y = DivideRoundUp(
451       src_[0]->Height() + padding_.prepended.h + padding_.appended.h - 2, 4);
452   const int tiles_total = tiles_x * tiles_y;
453   RETURN_IF_ERROR(args->SetInt("padding_x", -padding_.prepended.w));
454   RETURN_IF_ERROR(args->SetInt("padding_y", -padding_.prepended.h));
455   RETURN_IF_ERROR(args->SetInt("tiles_total", tiles_total));
456   RETURN_IF_ERROR(args->SetInt("tiles_x", tiles_x));
457   return absl::OkStatus();
458 }
459 
GetGridSize() const460 int3 Winograd4x4To36TileX6::GetGridSize() const {
461   const int grid_x = dst_[0]->Width() * dst_[0]->Batch();
462   const int grid_y = 6;
463   const int grid_z = dst_[0]->Slices();
464   return int3(grid_x, grid_y, grid_z);
465 }
466 
GetPossibleKernelWorkGroups(TuningType tuning_type,const GpuInfo & gpu_info,const KernelInfo & kernel_info,std::vector<int3> * work_groups) const467 void Winograd4x4To36TileX6::GetPossibleKernelWorkGroups(
468     TuningType tuning_type, const GpuInfo& gpu_info,
469     const KernelInfo& kernel_info, std::vector<int3>* work_groups) const {
470   if (gpu_info.IsIntel()) {
471     work_groups->push_back(int3(4, 6, 1));
472     return;
473   }
474   switch (tuning_type) {
475     case TuningType::kExhaustive:
476       GetPossibleWorkGroups(tuning_type, gpu_info, kernel_info, grid_size_,
477                             work_groups);
478       return;
479     case TuningType::kFast:
480     default:
481       work_groups->push_back(SelectBestWorkGroup(kernel_info));
482       return;
483   }
484 }
485 
CreateWinograd4x4To36TileX6(const GpuInfo & gpu_info,const OperationDef & definition,const Padding2D & padding)486 Winograd4x4To36TileX6 CreateWinograd4x4To36TileX6(
487     const GpuInfo& gpu_info, const OperationDef& definition,
488     const Padding2D& padding) {
489   Winograd4x4To36TileX6 result(definition, padding, gpu_info);
490   result.UploadBt();
491   return result;
492 }
493 
GetGridSize() const494 int3 Winograd36To4x4::GetGridSize() const {
495   return int3(src_[0]->Width(), 1, src_[0]->Slices());
496 }
497 
CreateWinograd36To4x4(const OperationDef & definition,const tflite::gpu::Tensor<Linear,DataType::FLOAT32> & biases)498 Winograd36To4x4 CreateWinograd36To4x4(
499     const OperationDef& definition,
500     const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& biases) {
501   Winograd36To4x4 desc(definition);
502   desc.code_ = GetKernelWinograd36To4x4();
503 
504   desc.AddSrcTensor("src_tensor", definition.src_tensors[0]);
505   desc.AddDstTensor("dst_tensor", definition.dst_tensors[0]);
506 
507   TensorLinearDescriptor bias_desc;
508   bias_desc.storage_type = LinearStorageType::BUFFER;
509   bias_desc.element_type = definition.GetDataType();
510   bias_desc.UploadLinearData(biases);
511   desc.args_.AddObject("biases", absl::make_unique<TensorLinearDescriptor>(
512                                      std::move(bias_desc)));
513 
514   desc.work_group_size_ = int3(32, 1, 1);
515   return desc;
516 }
517 
Winograd36To4x4Tile4x1(const OperationDef & definition,const GpuInfo & gpu_info)518 Winograd36To4x4Tile4x1::Winograd36To4x4Tile4x1(const OperationDef& definition,
519                                                const GpuInfo& gpu_info)
520     : GPUOperation(definition) {
521   work_group_size_ = int3(32, 1, 1);
522   if (definition_.precision == CalculationsPrecision::F16 &&
523       gpu_info.IsPowerVR()) {
524     compiler_options_.push_back(CompilerOptions::kClPowervrFp16);
525   }
526   code_ = GetWinograd36To4x4Tile4x1Code(definition_);
527 }
528 
GetWinograd36To4x4Tile4x1Code(const OperationDef & op_def)529 std::string Winograd36To4x4Tile4x1::GetWinograd36To4x4Tile4x1Code(
530     const OperationDef& op_def) {
531   std::string c;
532 
533   switch (op_def.precision) {
534     case CalculationsPrecision::F32:
535     case CalculationsPrecision::F32_F16:
536       c += "#define ACCUM_FLT float\n";
537       break;
538     case CalculationsPrecision::F16:
539       c += "#define ACCUM_FLT half\n";
540       break;
541   }
542 
543   const DataType accum_type = op_def.precision == CalculationsPrecision::F16
544                                   ? DataType::FLOAT16
545                                   : DataType::FLOAT32;
546 
547   std::string cl_type = accum_type == DataType::FLOAT16 ? "half" : "float";
548   auto src_desc = op_def.src_tensors[0];
549   src_desc.SetStateVar("ACCUM_FLT", cl_type);
550   AddSrcTensor("src_tensor", src_desc);
551   AddDstTensor("dst_tensor", op_def.dst_tensors[0]);
552   args_.AddInt("tiles_x");
553 
554   auto at_mat = AtMatrixForWinograd4x4To6x6();
555   c += "constant ACCUM_FLT At[24] = {\n";
556   for (int y = 0; y < 4; ++y) {
557     c += "\t";
558     for (int x = 0; x < 6; ++x) {
559       c += absl::StrFormat("%.10f", at_mat[y * 6 + x]) + "f, ";
560     }
561     c += "\n";
562   }
563   c += "};\n";
564 
565   c += "MAIN_FUNCTION($0) {\n";
566   c += "  int tile_id = GLOBAL_ID_0;\n";
567   c += "  int DST_Y = GLOBAL_ID_1;\n";
568   c += "  int DST_Z = GLOBAL_ID_2;\n";
569   c += "  int tile_x = (tile_id % args.tiles_x) * 4;\n";
570   c += "  int tile_y = (tile_id / args.tiles_x) * 4 + DST_Y;\n";
571 
572   c += "  if (tile_x >= args.dst_tensor.Width() || tile_y >= "
573        "args.dst_tensor.Height() || DST_Z >= args.dst_tensor.Slices()) {\n";
574   c += "    return; \n";
575   c += "  }\n";
576   c += "  ACCUM_FLT4 I0, I1, I2, I3, I4, I5;\n";
577   c += "  ACCUM_FLT at_ar[6];\n";
578   c += "  ACCUM_FLT4 t00 = TO_ACCUM_TYPE(args.at.Read(DST_Y * 2 + 0));\n";
579   c += "  ACCUM_FLT4 t01 = TO_ACCUM_TYPE(args.at.Read(DST_Y * 2 + 1));\n";
580   c += "  at_ar[0] = t00.x;\n";
581   c += "  at_ar[1] = t00.y;\n";
582   c += "  at_ar[2] = t00.z;\n";
583   c += "  at_ar[3] = t00.w;\n";
584   c += "  at_ar[4] = t01.x;\n";
585   c += "  at_ar[5] = t01.y;\n";
586   c += "  {\n";
587   c += "    ACCUM_FLT at = at_ar[0];\n";
588   for (int x = 0; x < 6; ++x) {
589     const std::string yc = std::to_string(x);
590     const std::string src = "src" + std::to_string(x);
591     c += "    ACCUM_FLT4 " + src +
592          " = args.src_tensor.Read<ACCUM_FLT>(tile_id, " + yc + ", DST_Z);\n";
593     c += "    I" + std::to_string(x) + " = at * " + src + ";\n";
594   }
595   c += "  }\n";
596   for (int y = 1; y < 6; ++y) {
597     c += "  {\n";
598     c += "    ACCUM_FLT at = at_ar[" + std::to_string(y) + "];\n";
599     for (int x = 0; x < 6; ++x) {
600       const std::string yc = std::to_string(y * 6 + x);
601       const std::string src = "src" + std::to_string(x);
602       c += "    ACCUM_FLT4 " + src +
603            " = args.src_tensor.Read<ACCUM_FLT>(tile_id, " + yc + ", DST_Z);\n";
604       c += "    I" + std::to_string(x) + " += at * " + src + ";\n";
605     }
606     c += "  }\n";
607   }
608   c += "  ACCUM_FLT4 t0 = I1 + I2;\n";
609   c += "  ACCUM_FLT4 t1 = I3 + I4;\n";
610   c += "  FLT4 bias_val = args.biases.Read(DST_Z);\n";
611   c += "  {\n";
612   c += "    FLT4 r0 = TO_FLT4(I0 + t0 + t1) + bias_val;\n";
613   c += "    args.dst_tensor.Write(r0, tile_x, tile_y, DST_Z);\n";
614   c += "    tile_x++;\n";
615   c += "  }\n";
616   c += "  ACCUM_FLT4 t2 = I1 - I2;\n";
617   c += "  ACCUM_FLT4 t3 = I3 - I4;\n";
618   c += "  if (tile_x < args.dst_tensor.Width()) {\n";
619   c += "    FLT4 r0 = TO_FLT4(t2 * At[7] + t3 * At[9]) + bias_val;\n";
620   c += "    args.dst_tensor.Write(r0, tile_x, tile_y, DST_Z);\n";
621   c += "    tile_x++;\n";
622   c += "  }\n";
623   c += "  if (tile_x < args.dst_tensor.Width()) {\n";
624   c += "    FLT4 r0 = TO_FLT4(t0 * At[13] + t1 * At[15]) + bias_val;\n";
625   c += "    args.dst_tensor.Write(r0, tile_x, tile_y, DST_Z);\n";
626   c += "    tile_x++;\n";
627   c += "  }\n";
628   c += "  if (tile_x < args.dst_tensor.Width()) {\n";
629   c += "    FLT4 r0 = TO_FLT4(t2 * At[19] + t3 * At[21] + I5) + bias_val;\n";
630   c += "    args.dst_tensor.Write(r0, tile_x, tile_y, DST_Z);\n";
631   c += "    tile_x++;\n";
632   c += "  }\n";
633   c += "}\n";
634   return c;
635 }
636 
UploadAt()637 void Winograd36To4x4Tile4x1::UploadAt() {
638   tflite::gpu::Tensor<Linear, DataType::FLOAT32> at_aligned;
639   at_aligned.shape = Linear(4 * 8);
640   at_aligned.data.resize(4 * 8);
641   auto at_mat = AtMatrixForWinograd4x4To6x6();
642   for (int y = 0; y < 4; ++y) {
643     for (int x = 0; x < 6; ++x) {
644       at_aligned.data[y * 8 + x] = at_mat[y * 6 + x];
645     }
646     at_aligned.data[y * 8 + 6] = 0.0f;
647     at_aligned.data[y * 8 + 7] = 0.0f;
648   }
649 
650   TensorLinearDescriptor desc;
651   desc.storage_type = LinearStorageType::TEXTURE_2D;
652   desc.element_type = definition_.GetDataType();
653   desc.UploadLinearData(at_aligned);
654   args_.AddObject("at",
655                   absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
656 }
657 
SelectBestWorkGroup(const KernelInfo & kernel_info) const658 int3 Winograd36To4x4Tile4x1::SelectBestWorkGroup(
659     const KernelInfo& kernel_info) const {
660   const std::vector<int3> wgs = {{32, 4, 2}, {16, 4, 2}, {16, 4, 1},
661                                  {8, 4, 1},  {4, 4, 1},  {2, 4, 1},
662                                  {1, 4, 1},  {1, 2, 1},  {1, 1, 1}};
663   return GetFirstSuitableWorkGroup(wgs, kernel_info.max_work_group_size);
664 }
665 
BindArguments(ArgumentsBinder * args)666 absl::Status Winograd36To4x4Tile4x1::BindArguments(ArgumentsBinder* args) {
667   const int tiles_x = DivideRoundUp(dst_[0]->Width(), 4);
668   RETURN_IF_ERROR(args->SetInt("tiles_x", tiles_x));
669   return absl::OkStatus();
670 }
671 
GetGridSize() const672 int3 Winograd36To4x4Tile4x1::GetGridSize() const {
673   const int tiles_x = DivideRoundUp(dst_[0]->Width(), 4);
674   const int tiles_y = DivideRoundUp(dst_[0]->Height(), 4);
675   const int grid_x = tiles_x * tiles_y * dst_[0]->Batch();
676   const int grid_y = 4;
677   const int grid_z = dst_[0]->Slices();
678   return int3(grid_x, grid_y, grid_z);
679 }
680 
GetPossibleKernelWorkGroups(TuningType tuning_type,const GpuInfo & gpu_info,const KernelInfo & kernel_info,std::vector<int3> * work_groups) const681 void Winograd36To4x4Tile4x1::GetPossibleKernelWorkGroups(
682     TuningType tuning_type, const GpuInfo& gpu_info,
683     const KernelInfo& kernel_info, std::vector<int3>* work_groups) const {
684   if (gpu_info.IsIntel()) {
685     work_groups->push_back(int3(8, 4, 1));
686     return;
687   }
688   switch (tuning_type) {
689     case TuningType::kExhaustive:
690       GetPossibleWorkGroups(tuning_type, gpu_info, kernel_info, grid_size_,
691                             work_groups);
692       return;
693     case TuningType::kFast:
694     default:
695       work_groups->push_back(SelectBestWorkGroup(kernel_info));
696       return;
697   }
698 }
699 
CreateWinograd36To4x4Tile4x1(const GpuInfo & gpu_info,const OperationDef & definition,const tflite::gpu::Tensor<Linear,DataType::FLOAT32> & biases)700 Winograd36To4x4Tile4x1 CreateWinograd36To4x4Tile4x1(
701     const GpuInfo& gpu_info, const OperationDef& definition,
702     const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& biases) {
703   Winograd36To4x4Tile4x1 result(definition, gpu_info);
704   TensorLinearDescriptor desc;
705   desc.storage_type = LinearStorageType::TEXTURE_2D;
706   desc.element_type = definition.GetDataType();
707   desc.UploadLinearData(biases);
708   result.args_.AddObject(
709       "biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
710   result.UploadAt();
711   return result;
712 }
713 
714 }  // namespace gpu
715 }  // namespace tflite
716