1 /* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/lite/delegates/gpu/common/task/work_group_picking.h"
17 
18 #include <algorithm>
19 #include <limits>
20 #include <set>
21 #include <vector>
22 
23 #include "tensorflow/lite/delegates/gpu/common/util.h"
24 
25 namespace tflite {
26 namespace gpu {
27 
28 namespace {
29 
Get2DWorkgroupsEqualTo128()30 std::vector<int2> Get2DWorkgroupsEqualTo128() {
31   return {{128, 1}, {64, 2}, {32, 4}, {16, 8},
32           {8, 16},  {4, 32}, {2, 64}, {1, 128}};
33 }
34 
GenerateWorkGroupSizesXYMultipleOf(int multiplier,int3 grid,const KernelInfo & kernel_info,const GpuInfo & gpu_info,WorkGroupSizeAlignment z_alignment)35 std::vector<int3> GenerateWorkGroupSizesXYMultipleOf(
36     int multiplier, int3 grid, const KernelInfo& kernel_info,
37     const GpuInfo& gpu_info, WorkGroupSizeAlignment z_alignment) {
38   std::vector<int3> work_groups;
39   work_groups.reserve(32);
40 
41   std::vector<int> possible_z_sizes = GetPossibleSizes(grid.z, z_alignment);
42 
43   for (int x = 1; x <= kernel_info.max_work_group_size; x *= 2) {
44     for (int y = 1; y <= kernel_info.max_work_group_size; y *= 2) {
45       int work_group_size_xy = x * y;
46       if (work_group_size_xy % multiplier != 0 ||
47           work_group_size_xy > kernel_info.max_work_group_size) {
48         continue;
49       }
50       for (auto z : possible_z_sizes) {
51         if (work_group_size_xy * z > kernel_info.max_work_group_size) {
52           continue;
53         }
54         if (x <= gpu_info.GetMaxWorkGroupSizeForX() &&
55             y <= gpu_info.GetMaxWorkGroupSizeForY() &&
56             z <= gpu_info.GetMaxWorkGroupSizeForZ()) {
57           work_groups.push_back({x, y, z});
58         }
59       }
60     }
61   }
62   return work_groups;
63 }
64 
GenerateWorkGroupSizesXMultipleOf(int multiplier,int3 grid,const KernelInfo & kernel_info,const GpuInfo & gpu_info,WorkGroupSizeAlignment z_alignment)65 std::vector<int3> GenerateWorkGroupSizesXMultipleOf(
66     int multiplier, int3 grid, const KernelInfo& kernel_info,
67     const GpuInfo& gpu_info, WorkGroupSizeAlignment z_alignment) {
68   std::vector<int3> work_groups;
69   work_groups.reserve(32);
70 
71   std::vector<int> possible_z_sizes = GetPossibleSizes(grid.z, z_alignment);
72   std::vector<int> possible_y_sizes =
73       GetPossibleSizes(grid.y, WorkGroupSizeAlignment::PRECISE);
74 
75   for (int x = multiplier;
76        x <= kernel_info.max_work_group_size && x < grid.x + multiplier;
77        x += multiplier) {
78     for (auto y : possible_y_sizes) {
79       for (auto z : possible_z_sizes) {
80         if (x <= gpu_info.GetMaxWorkGroupSizeForX() &&
81             y <= gpu_info.GetMaxWorkGroupSizeForY() &&
82             z <= gpu_info.GetMaxWorkGroupSizeForZ() &&
83             x * y * z <= kernel_info.max_work_group_size) {
84           work_groups.push_back({x, y, z});
85         }
86       }
87     }
88   }
89   return work_groups;
90 }
91 
GetWorkGroupsAlignedToGrid(const GpuInfo & gpu_info,const KernelInfo & kernel_info,const int3 & grid,std::vector<int3> * work_groups)92 void GetWorkGroupsAlignedToGrid(const GpuInfo& gpu_info,
93                                 const KernelInfo& kernel_info, const int3& grid,
94                                 std::vector<int3>* work_groups) {
95   int3 max_wg_size;
96   max_wg_size.x = gpu_info.GetMaxWorkGroupSizeForX();
97   max_wg_size.y = gpu_info.GetMaxWorkGroupSizeForY();
98   max_wg_size.z = gpu_info.GetMaxWorkGroupSizeForZ();
99   GenerateWorkGroupSizesAlignedToGrid(
100       grid, max_wg_size, kernel_info.max_work_group_size, work_groups);
101 }
102 
GetPenalty(int grid_size,int group_size)103 int GetPenalty(int grid_size, int group_size) {
104   const int reminder = grid_size % group_size;
105   return reminder == 0 ? 0 : group_size - reminder;
106 }
107 
GetPenalty(int2 grid_size,int2 group_size)108 int GetPenalty(int2 grid_size, int2 group_size) {
109   const int p_x = GetPenalty(grid_size.x, group_size.x);
110   const int p_y = GetPenalty(grid_size.y, group_size.y);
111   return p_x * grid_size.y + p_y * grid_size.x + p_x * p_y;
112 }
113 
GetMaxSizeWithMinPenalty(int size,int max_size)114 int GetMaxSizeWithMinPenalty(int size, int max_size) {
115   int best_size = 128;
116   int min_penalty = GetPenalty(size, best_size);
117   for (int i = 2; i * 128 <= max_size; ++i) {
118     if (GetPenalty(size, i * 128) == min_penalty) {
119       best_size = i * 128;
120     }
121   }
122   return best_size;
123 }
124 
GetMaxSizeWithMinPenalty(int2 size,int max_size)125 int2 GetMaxSizeWithMinPenalty(int2 size, int max_size) {
126   std::vector<int2> base_groups = Get2DWorkgroupsEqualTo128();
127   int min_penalty = std::numeric_limits<int>::max();
128   for (const auto& group : base_groups) {
129     min_penalty = std::min(GetPenalty(size, group), min_penalty);
130   }
131   for (const auto& group : base_groups) {
132     for (int y = 1; y * group.y <= max_size; ++y) {
133       int new_group_y = y * group.y;
134       for (int x = 1; x * group.x <= max_size; ++x) {
135         int new_group_x = x * group.x;
136         if (new_group_x * new_group_y > max_size) {
137           break;
138         }
139         if (GetPenalty(size, int2(new_group_x, new_group_y)) == min_penalty) {
140           return int2(new_group_x, new_group_y);
141         }
142       }
143     }
144   }
145   return int2(0, 0);
146 }
147 
GetBiggestDividerWithPriority(int number,int max_divider)148 int GetBiggestDividerWithPriority(int number, int max_divider) {
149   if (number % 8 == 0 && 8 <= max_divider) {
150     return 8;
151   }
152   if (number % 4 == 0 && 4 <= max_divider) {
153     return 4;
154   }
155   if (number % 2 == 0 && 2 <= max_divider) {
156     return 2;
157   }
158   for (int i = max_divider; i != 0; i--) {
159     if (number % i == 0) {
160       return i;
161     }
162   }
163   return 1;
164 }
165 
GetBiggestDivider(int number,int max_divider)166 int GetBiggestDivider(int number, int max_divider) {
167   for (int i = max_divider; i != 0; i--) {
168     if (number % i == 0) {
169       return i;
170     }
171   }
172   return 1;
173 }
174 
GetOptimalSizeForApple(int grid_size)175 int GetOptimalSizeForApple(int grid_size) {
176   if (grid_size % 8 == 0 || grid_size % 8 >= 4 || grid_size >= 16) {
177     return 8;
178   }
179   if (grid_size % 4 == 0 || grid_size % 4 >= 2 || grid_size >= 8) {
180     return 4;
181   }
182   if (grid_size % 2 == 0 || grid_size >= 4) {
183     return 2;
184   }
185   return 1;
186 }
187 
GetWorkGroupSizeForApple(const uint3 & grid_size)188 int3 GetWorkGroupSizeForApple(const uint3& grid_size) {
189   int x_size = GetOptimalSizeForApple(grid_size.x);
190   int y_size = GetOptimalSizeForApple(grid_size.y);
191   int z_size = std::max(1, 32 / (x_size * y_size));
192   z_size = std::min(z_size, static_cast<int>(grid_size.z));
193   return {x_size, y_size, z_size};
194 }
195 
196 }  // namespace
197 
GetWorkGroupXY128ConvLinear(const int3 & grid)198 int3 GetWorkGroupXY128ConvLinear(const int3& grid) {
199   int grid_z = GetBiggestDividerWithPriority(grid.z, 4);
200   if (grid.x <= 128) {
201     return int3(128, 1, grid_z);
202   }
203   int grid_x = GetMaxSizeWithMinPenalty(grid.x, 512 / grid_z);
204   return {grid_x, 1, grid_z};
205 }
206 
GetWorkGroupXY128Conv(const int3 & grid)207 int3 GetWorkGroupXY128Conv(const int3& grid) {
208   int grid_z = GetBiggestDividerWithPriority(grid.z, 4);
209   if (grid.x <= 16 && grid.y <= 8) {
210     return int3(16, 8, grid_z);
211   }
212   int2 grid_xy = GetMaxSizeWithMinPenalty(int2(grid.x, grid.y), 512 / grid_z);
213   return int3(grid_xy.x, grid_xy.y, grid_z);
214 }
215 
GetWorkGroupXY128Simple(const int3 & grid)216 int3 GetWorkGroupXY128Simple(const int3& grid) { return int3(16, 8, 1); }
217 
GetWorkGroup(const int3 & grid,int max_size)218 int3 GetWorkGroup(const int3& grid, int max_size) {
219   int wg_z = GetBiggestDividerWithPriority(grid.z, 8);
220   int wg_xy_size = max_size / wg_z;
221   int wg_x = std::min(DivideRoundUp(grid.x, 2), wg_xy_size);
222   int wg_y = std::min(wg_xy_size / wg_x, grid.y);
223   return int3(wg_x, wg_y, wg_z);
224 }
225 
GetWorkGroupConv(const int3 & grid,int max_size,int max_z_size)226 int3 GetWorkGroupConv(const int3& grid, int max_size, int max_z_size) {
227   int wg_z = GetBiggestDivider(grid.z, max_z_size);
228   int wg_xy_size = std::min(256, max_size) / wg_z;
229   int wg_x = std::min(grid.x, wg_xy_size);
230   int wg_y = std::min(wg_xy_size / wg_x, grid.y);
231   if (wg_y == grid.y && grid.y % 2 == 0) {
232     wg_y = grid.y / 2;
233   }
234   return int3(wg_x, wg_y, wg_z);
235 }
236 
GetPossibleWorkGroupsXYMultipleOf(int multiplier,const GpuInfo & gpu_info,const KernelInfo & kernel_info,const int3 & grid,WorkGroupSizeAlignment z_alignment,std::vector<int3> * work_groups)237 void GetPossibleWorkGroupsXYMultipleOf(int multiplier, const GpuInfo& gpu_info,
238                                        const KernelInfo& kernel_info,
239                                        const int3& grid,
240                                        WorkGroupSizeAlignment z_alignment,
241                                        std::vector<int3>* work_groups) {
242   *work_groups = GenerateWorkGroupSizesXYMultipleOf(
243       multiplier, grid, kernel_info, gpu_info, z_alignment);
244 }
245 
GetPossibleWorkGroupsXMultipleOf(int multiplier,const GpuInfo & gpu_info,const KernelInfo & kernel_info,const int3 & grid,WorkGroupSizeAlignment z_alignment,std::vector<int3> * work_groups)246 void GetPossibleWorkGroupsXMultipleOf(int multiplier, const GpuInfo& gpu_info,
247                                       const KernelInfo& kernel_info,
248                                       const int3& grid,
249                                       WorkGroupSizeAlignment z_alignment,
250                                       std::vector<int3>* work_groups) {
251   *work_groups = GenerateWorkGroupSizesXMultipleOf(
252       multiplier, grid, kernel_info, gpu_info, z_alignment);
253 }
254 
XY128RequiresMoreWorkGroupsThenXY128Linear(int width,int height)255 bool XY128RequiresMoreWorkGroupsThenXY128Linear(int width, int height) {
256   int planar_work_groups = DivideRoundUp(width * height, 128);
257   auto base_work_groups = Get2DWorkgroupsEqualTo128();
258   bool have_equal_work_groups = false;
259   for (auto& work_group : base_work_groups) {
260     int x_groups = DivideRoundUp(width, work_group.x);
261     int y_groups = DivideRoundUp(height, work_group.y);
262     int xy_groups = x_groups * y_groups;
263     if (xy_groups == planar_work_groups) {
264       have_equal_work_groups = true;
265       break;
266     }
267   }
268   return !have_equal_work_groups;
269 }
270 
GetPossibleWorkGroups(TuningType tuning_type,const GpuInfo & gpu_info,const KernelInfo & kernel_info,const int3 & grid,std::vector<int3> * work_groups)271 void GetPossibleWorkGroups(TuningType tuning_type, const GpuInfo& gpu_info,
272                            const KernelInfo& kernel_info, const int3& grid,
273                            std::vector<int3>* work_groups) {
274   if (gpu_info.IsApple()) {
275     work_groups->push_back(GetWorkGroupSizeForApple(grid));
276     return;
277   }
278   switch (tuning_type) {
279     case TuningType::kFast:
280       work_groups->push_back(
281           GetWorkGroup(grid, kernel_info.max_work_group_size));
282       return;
283     case TuningType::kExhaustive: {
284       GetWorkGroupsAlignedToGrid(gpu_info, kernel_info, grid, work_groups);
285       return;
286     }
287     default:
288       work_groups->push_back({8, 4, 1});
289       return;
290   }
291 }
292 
GetPossibleWorkGroupsConv(TuningType tuning_type,const GpuInfo & gpu_info,const KernelInfo & kernel_info,const int3 & grid,std::vector<int3> * work_groups)293 void GetPossibleWorkGroupsConv(TuningType tuning_type, const GpuInfo& gpu_info,
294                                const KernelInfo& kernel_info, const int3& grid,
295                                std::vector<int3>* work_groups) {
296   if (gpu_info.IsApple()) {
297     work_groups->push_back(GetWorkGroupSizeForApple(grid));
298     return;
299   }
300   switch (tuning_type) {
301     case TuningType::kFast: {
302       int max_z_size = 16;
303       if (gpu_info.IsAdreno()) {
304         max_z_size = gpu_info.adreno_info.IsAdreno3xx() ? 16 : 64;
305       }
306       max_z_size = std::min(max_z_size, gpu_info.GetMaxWorkGroupSizeForZ());
307       work_groups->push_back(
308           GetWorkGroupConv(grid, kernel_info.max_work_group_size, max_z_size));
309       return;
310     }
311     case TuningType::kExhaustive: {
312       GetWorkGroupsAlignedToGrid(gpu_info, kernel_info, grid, work_groups);
313       return;
314     }
315     default:
316       work_groups->push_back({8, 4, 1});
317       return;
318   }
319 }
320 
GetFirstSuitableWorkGroup(const std::vector<int3> & wgs,int max_wg_size)321 int3 GetFirstSuitableWorkGroup(const std::vector<int3>& wgs, int max_wg_size) {
322   for (const auto& wg : wgs) {
323     const int wg_size = wg.x * wg.y * wg.z;
324     if (wg_size <= max_wg_size) {
325       return wg;
326     }
327   }
328   return {1, 1, 1};
329 }
330 
331 }  // namespace gpu
332 }  // namespace tflite
333