1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_CONV_POWERVR_H_
17 #define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_CONV_POWERVR_H_
18 
19 #include <cstring>
20 #include <vector>
21 
22 #include "tensorflow/lite/delegates/gpu/common/data_type.h"
23 #include "tensorflow/lite/delegates/gpu/common/operations.h"
24 #include "tensorflow/lite/delegates/gpu/common/shape.h"
25 #include "tensorflow/lite/delegates/gpu/common/status.h"
26 #include "tensorflow/lite/delegates/gpu/common/task/buffer_desc.h"
27 #include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
28 #include "tensorflow/lite/delegates/gpu/common/task/tensor_desc.h"
29 #include "tensorflow/lite/delegates/gpu/common/task/tensor_linear_desc.h"
30 #include "tensorflow/lite/delegates/gpu/common/task/texture2d_desc.h"
31 #include "tensorflow/lite/delegates/gpu/common/task/weights_conversion.h"
32 #include "tensorflow/lite/delegates/gpu/common/task/weights_layout.h"
33 #include "tensorflow/lite/delegates/gpu/common/tensor.h"
34 #include "tensorflow/lite/delegates/gpu/common/types.h"
35 #include "tensorflow/lite/delegates/gpu/common/winograd_util.h"
36 
37 namespace tflite {
38 namespace gpu {
39 
// Convolution operation built on GPUOperation.
//
// The constructors that take attributes are private; instances are obtained
// through the friend Create* factory functions declared after this class.
// The type is move-only (copy construction/assignment are deleted).
class ConvPowerVR : public GPUOperation {
 public:
  ConvPowerVR() = default;

  // GPUOperation overrides; the definitions are not part of this header.
  void GetPossibleKernelWorkGroups(
      TuningType tuning_type, const GpuInfo& gpu_info,
      const KernelInfo& kernel_info,
      std::vector<int3>* work_groups) const override;
  absl::Status BindArguments(ArgumentsBinder* args) override;
  int3 GetGridSize() const override;

  // Returns a WeightsDescription whose layout is conv_params_.weights_layout
  // and whose output_group_size is the 4th (w) component of
  // conv_params_.block_size. All other fields keep their default values.
  WeightsDescription GetWeightsDescription() const {
    WeightsDescription desc;
    desc.layout = conv_params_.weights_layout;
    desc.output_group_size = conv_params_.block_size.w;
    return desc;
  }

  // Move only
  ConvPowerVR(ConvPowerVR&& operation);
  ConvPowerVR& operator=(ConvPowerVR&& operation);
  ConvPowerVR(const ConvPowerVR&) = delete;
  ConvPowerVR& operator=(const ConvPowerVR&) = delete;

 private:
  // Selects how the weights are made available to the kernel. As far as this
  // header shows:
  //  - CONSTANT_MEM makes the weights/bias buffers use MemoryType::CONSTANT,
  //    every other value uses MemoryType::GLOBAL for them;
  //  - TEXTURES_MEM_X4 stores weights in four 2D textures instead of a buffer.
  // The kernel-side meaning of the remaining values lives in the code
  // generator, which is not visible here.
  enum class WeightsUploadType {
    LOCAL_MEM_ASYNC_SUBGROUP,  // we use it for PowerVR with workgroup size = 32
    LOCAL_MEM_BY_THREADS,
    GLOBAL_MEM,
    CONSTANT_MEM,
    PRIVATE_MEM_SIMD_BROADCAST,
    TEXTURES_MEM_X4,  // 4 textures for weights
  };

  // Tuning parameters chosen by the GuessBestParams* helpers.
  // NOTE(review): only simd_size has a default member initializer; the other
  // scalar fields are left indeterminate by default initialization (and
  // ConvPowerVR() is defaulted) - confirm every path assigns them before use.
  struct ConvParams {
    // Usually we use these combinations for CalculationPrecision:
    // F32: all F32
    // F16: all F16
    // F32_F16: everything besides the accumulator is F16, including weights
    // But for PowerVR we can achieve better performance in F32_F16 with F32
    // weights, so for PowerVR in this kernel we have F32 weights for
    // F32_F16 precision mode
    DataType weights_data_type;  // used for weights and biases
    int4 block_size;             // WHDS
    bool fixed_work_group_size;
    bool linear_spatial;  // spatial dimensions are Width/Height/Depth
    bool linear_all;  // linear_spatial & linear_all can not be used together,
                      // linear_all can not be used with WeightsUploadTypes
                      // that use workgroups(subgroups) for
                      // uploading(LOCAL_MEM_BY_THREADS for example).
    bool different_weights_for_height;
    int src_depth_loop_size;
    WeightsUploadType weights_upload_type;
    // NOTE(review): presumably true when the kernel extent along that axis is
    // 1 - confirm against GuessBestParams.
    bool x_kernel_is_1;
    bool y_kernel_is_1;
    bool z_kernel_is_1;
    WeightsLayout weights_layout;

    // used only with PRIVATE_MEM_SIMD_BROADCAST
    int simd_size = 1;

    // True for every upload type except TEXTURES_MEM_X4, i.e. whenever the
    // weights are stored in a buffer rather than in textures.
    bool AreWeightsBuffer() const {
      return weights_upload_type != WeightsUploadType::TEXTURES_MEM_X4;
    }

    // True only for PRIVATE_MEM_SIMD_BROADCAST.
    bool IsPrivateMemBroadcast() const {
      return weights_upload_type ==
             WeightsUploadType::PRIVATE_MEM_SIMD_BROADCAST;
    }
  };

  // Private constructors, one per supported attribute type. They are reached
  // through the friend factory functions below. dst_shape is optional and may
  // be nullptr.
  ConvPowerVR(const OperationDef& definition,
              const Convolution2DAttributes& attr, const GpuInfo& gpu_info,
              const BHWC* dst_shape = nullptr);
  ConvPowerVR(const OperationDef& definition,
              const Convolution2DAttributes& attr, const BHWC& weights_shape,
              const GpuInfo& gpu_info, const BHWC* dst_shape = nullptr);
  ConvPowerVR(const OperationDef& definition,
              const FullyConnectedAttributes& attr, const GpuInfo& gpu_info,
              const BHWC* dst_shape = nullptr);
  explicit ConvPowerVR(const OperationDef& definition);
  ConvPowerVR(const OperationDef& definition,
              const Convolution3DAttributes& attr, const GpuInfo& gpu_info,
              const BHWDC* dst_shape = nullptr);

  // Defined out of line; presumably produces the kernel source for the
  // current conv_params_ - confirm in the implementation file.
  void GenerateCode(const GpuInfo& gpu_info);

  // Uploads weights and then biases (see the template definitions below the
  // class).
  template <DataType T>
  void UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
                  const tflite::gpu::Tensor<Linear, T>& biases);
  // Rearranges the weights for Winograd 4x4->6x6, uploads them and uploads an
  // all-zero bias with one entry per output channel.
  template <DataType T>
  void UploadDataForWinograd4x4To6x6(
      const tflite::gpu::Tensor<OHWI, T>& weights);

  // Registers the 2D weights as the "weights" buffer, or as the textures
  // "weights0".."weights3" when the upload type is TEXTURES_MEM_X4.
  template <DataType T>
  void UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights);

  // Same as above for 3D (OHWDI) weights.
  template <DataType T>
  void UploadWeights(const tflite::gpu::Tensor<OHWDI, T>& weights);

  // Registers the zero-padded bias values as the "biases" buffer.
  template <DataType T>
  void UploadBias(const tflite::gpu::Tensor<Linear, T>& bias);

  // Factory functions; friends because the constructors above are private.
  friend ConvPowerVR CreateConvPowerVR(const GpuInfo& gpu_info,
                                       const OperationDef& definition,
                                       const Convolution2DAttributes& attr,
                                       const BHWC* dst_shape);

  friend ConvPowerVR CreateConvPowerVR(const GpuInfo& gpu_info,
                                       const OperationDef& definition,
                                       const FullyConnectedAttributes& attr,
                                       const BHWC* dst_shape);

  friend ConvPowerVR CreateConvPowerVRDynamicWeights(
      const GpuInfo& gpu_info, const OperationDef& definition,
      const Convolution2DAttributes& attr, const BHWC& weights_shape,
      const BHWC* dst_shape);

  friend ConvPowerVR CreateConvPowerVRWino4x4To6x6(
      const GpuInfo& gpu_info, const OperationDef& definition,
      const Convolution2DAttributes& attr, const BHWC* dst_shape);

  friend ConvPowerVR CreateConvPowerVR3D(const GpuInfo& gpu_info,
                                         const OperationDef& definition,
                                         const Convolution3DAttributes& attr,
                                         const BHWDC* dst_shape);

  // Overloads that return the ConvParams to use for a given device and
  // operation. They are defined out of line, so the selection heuristics are
  // not visible in this header.
  ConvParams GuessBestParams(const GpuInfo& gpu_info,
                             const OperationDef& definition,
                             const Convolution2DAttributes& attr,
                             const BHWC* dst_shape = nullptr);
  ConvParams GuessBestParams(const GpuInfo& gpu_info,
                             const OperationDef& definition,
                             const Convolution2DAttributes& attr,
                             const BHWC& weights_shape,
                             const BHWC* dst_shape = nullptr);
  ConvParams GuessBestParams(const GpuInfo& gpu_info,
                             const OperationDef& definition,
                             const FullyConnectedAttributes& attr,
                             const BHWC* dst_shape = nullptr);
  ConvParams GuessBestParamsWinograd(const GpuInfo& gpu_info,
                                     const OperationDef& definition,
                                     const Convolution2DAttributes& attr,
                                     const BHWC* dst_shape = nullptr);
  ConvParams GuessBestParams(const GpuInfo& gpu_info,
                             const OperationDef& definition,
                             const Convolution3DAttributes& attr,
                             const BHWDC* dst_shape = nullptr);
  ConvParams GuessBestParams(const GpuInfo& gpu_info,
                             const OperationDef& definition, int src_depth,
                             int dst_depth, bool x_kernel_is_1,
                             bool y_kernel_is_1,
                             bool different_weights_for_height,
                             const BHWC* dst_shape = nullptr);

  // Returns generated source as a string for the given parameters (defined
  // out of line).
  std::string GenerateConv(const GpuInfo& gpu_info, const OperationDef& op_def,
                           bool stride_correction,
                           const ConvParams& conv_params);

  // Convolution geometry and tuning state.
  // NOTE(review): the component order of the int4 members presumably follows
  // the WHDS convention noted on ConvParams::block_size - confirm in the
  // constructors.
  int4 stride_;
  int4 padding_;
  int4 kernel_size_;
  int4 dilation_;
  ConvParams conv_params_;
};
204 
205 template <DataType T>
UploadData(const tflite::gpu::Tensor<OHWI,T> & weights,const tflite::gpu::Tensor<Linear,T> & biases)206 void ConvPowerVR::UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
207                              const tflite::gpu::Tensor<Linear, T>& biases) {
208   UploadWeights(weights);
209   UploadBias(biases);
210 }
211 
212 template <DataType T>
UploadDataForWinograd4x4To6x6(const tflite::gpu::Tensor<OHWI,T> & weights)213 void ConvPowerVR::UploadDataForWinograd4x4To6x6(
214     const tflite::gpu::Tensor<OHWI, T>& weights) {
215   tflite::gpu::Tensor<OHWI, T> wino_weights;
216   RearrangeWeightsToWinograd4x4To6x6Weights(weights, &wino_weights);
217   UploadWeights(wino_weights);
218   tflite::gpu::Tensor<Linear, DataType::FLOAT32> biases;
219   biases.shape = Linear(weights.shape.o);
220   biases.data.resize(weights.shape.o, 0.0f);
221   UploadBias(biases);
222 }
223 
224 template <DataType T>
UploadBias(const tflite::gpu::Tensor<Linear,T> & bias)225 void ConvPowerVR::UploadBias(const tflite::gpu::Tensor<Linear, T>& bias) {
226   BufferDescriptor desc;
227   desc.element_type = conv_params_.weights_data_type;
228   desc.element_size = 4;
229   desc.memory_type = conv_params_.weights_upload_type ==
230                              ConvPowerVR::WeightsUploadType::CONSTANT_MEM
231                          ? MemoryType::CONSTANT
232                          : MemoryType::GLOBAL;
233   const int float_size = conv_params_.weights_data_type == DataType::FLOAT32
234                              ? sizeof(float)
235                              : sizeof(half);
236   int aligned_channels = AlignByN(bias.shape.v, 4 * conv_params_.block_size.w);
237   desc.size = float_size * aligned_channels;
238   desc.data.resize(desc.size);
239   if (conv_params_.weights_data_type == DataType::FLOAT32) {
240     float* gpu_data = reinterpret_cast<float*>(desc.data.data());
241     for (int i = 0; i < aligned_channels; ++i) {
242       gpu_data[i] = i < bias.shape.v ? bias.data[i] : 0.0f;
243     }
244   } else {
245     half* gpu_data = reinterpret_cast<half*>(desc.data.data());
246     for (int i = 0; i < aligned_channels; ++i) {
247       gpu_data[i] = i < bias.shape.v ? bias.data[i] : 0.0f;
248     }
249   }
250   args_.AddObject("biases",
251                   absl::make_unique<BufferDescriptor>(std::move(desc)));
252 }
253 
254 template <DataType T>
UploadWeights(const tflite::gpu::Tensor<OHWI,T> & weights)255 void ConvPowerVR::UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights) {
256   const int flt_count =
257       GetTotalElementsCountForLayout(GetWeightsDescription(), weights.shape);
258   DataType weights_type = conv_params_.weights_data_type;
259 
260   std::vector<uint8_t> weights_data(flt_count * SizeOf(weights_type));
261   RearrangeWeights(weights, GetWeightsDescription(), weights_type,
262                    absl::MakeSpan(weights_data));
263 
264   if (conv_params_.AreWeightsBuffer()) {
265     BufferDescriptor desc;
266     desc.element_type = weights_type;
267     desc.element_size = 4;
268     desc.memory_type = conv_params_.weights_upload_type ==
269                                ConvPowerVR::WeightsUploadType::CONSTANT_MEM
270                            ? MemoryType::CONSTANT
271                            : MemoryType::GLOBAL;
272     desc.size = weights_data.size();
273     desc.data = std::move(weights_data);
274     args_.AddObject("weights",
275                     absl::make_unique<BufferDescriptor>(std::move(desc)));
276   } else {
277     const int dst_depth =
278         AlignByN(DivideRoundUp(weights.shape.o, 4), conv_params_.block_size.w);
279     const int src_depth = DivideRoundUp(weights.shape.i, 4);
280     const int kernel_x = weights.shape.w;
281     const int kernel_y = weights.shape.h;
282     int texture_width = dst_depth;
283     int texture_height = src_depth * kernel_x * kernel_y;
284     int sub_size = SizeOf(weights_type) * 4 * texture_width * texture_height;
285     for (int i = 0; i < 4; ++i) {
286       Texture2DDescriptor desc;
287       desc.element_type = weights_type;
288       desc.size = int2(texture_width, texture_height);
289       desc.data.resize(sub_size);
290       memcpy(desc.data.data(), weights_data.data() + sub_size * i, sub_size);
291       const std::string name = "weights" + std::to_string(i);
292       args_.AddObject(name,
293                       absl::make_unique<Texture2DDescriptor>(std::move(desc)));
294     }
295   }
296 }
297 
298 template <DataType T>
UploadWeights(const tflite::gpu::Tensor<OHWDI,T> & weights)299 void ConvPowerVR::UploadWeights(const tflite::gpu::Tensor<OHWDI, T>& weights) {
300   const int block_size = conv_params_.block_size.w;
301   const int dst_slices =
302       AlignByN(DivideRoundUp(weights.shape.o, 4), block_size);
303   const int src_slices = DivideRoundUp(weights.shape.i, 4);
304 
305   const int elements_count = weights.shape.d * weights.shape.h *
306                              weights.shape.w * src_slices * dst_slices * 4;
307   const bool f32_weights = definition_.precision == CalculationsPrecision::F32;
308 
309   const int float4_size = f32_weights ? 16 : 8;
310 
311   std::vector<uint8_t> data(float4_size * elements_count);
312 
313   if (f32_weights) {
314     float4* ptr = reinterpret_cast<float4*>(data.data());
315     if (conv_params_.AreWeightsBuffer()) {
316       RearrangeWeightsToODHWIOGroupI4O4(weights, conv_params_.block_size.w,
317                                         absl::MakeSpan(ptr, elements_count));
318     } else {
319       RearrangeWeightsToI4DHWIOOGroupO4(weights, conv_params_.block_size.w,
320                                         absl::MakeSpan(ptr, elements_count));
321     }
322   } else {
323     half4* ptr = reinterpret_cast<half4*>(data.data());
324     if (conv_params_.AreWeightsBuffer()) {
325       RearrangeWeightsToODHWIOGroupI4O4(weights, conv_params_.block_size.w,
326                                         absl::MakeSpan(ptr, elements_count));
327     } else {
328       RearrangeWeightsToI4DHWIOOGroupO4(weights, conv_params_.block_size.w,
329                                         absl::MakeSpan(ptr, elements_count));
330     }
331   }
332 
333   if (conv_params_.AreWeightsBuffer()) {
334     BufferDescriptor desc;
335     desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
336     desc.element_size = 4;
337     desc.size = float4_size * elements_count;
338     desc.data = std::move(data);
339     args_.AddObject("weights",
340                     absl::make_unique<BufferDescriptor>(std::move(desc)));
341   } else {
342     const int texture_width = dst_slices;
343     const int texture_height =
344         src_slices * weights.shape.d * weights.shape.h * weights.shape.w;
345     int sub_size = float4_size * texture_width * texture_height;
346     for (int i = 0; i < 4; ++i) {
347       Texture2DDescriptor desc;
348       desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
349       desc.size = int2(texture_width, texture_height);
350       desc.data.resize(sub_size);
351       memcpy(desc.data.data(), data.data() + sub_size * i, sub_size);
352       const std::string name = "weights" + std::to_string(i);
353       args_.AddObject(name,
354                       absl::make_unique<Texture2DDescriptor>(std::move(desc)));
355     }
356   }
357 }
358 
// Factory functions for ConvPowerVR. They are friends of the class, whose
// attribute-taking constructors are private. Only the declarations live in
// this header. In every overload dst_shape is optional and defaults to
// nullptr.

// Creates the operation from 2D convolution attributes.
ConvPowerVR CreateConvPowerVR(const GpuInfo& gpu_info,
                              const OperationDef& definition,
                              const Convolution2DAttributes& attr,
                              const BHWC* dst_shape = nullptr);

// Creates the operation from fully connected attributes.
ConvPowerVR CreateConvPowerVR(const GpuInfo& gpu_info,
                              const OperationDef& definition,
                              const FullyConnectedAttributes& attr,
                              const BHWC* dst_shape = nullptr);

// Creates the operation from 2D convolution attributes plus an explicit
// weights_shape.
// NOTE(review): presumably for weights supplied at runtime rather than taken
// from attr - confirm in the implementation.
ConvPowerVR CreateConvPowerVRDynamicWeights(const GpuInfo& gpu_info,
                                            const OperationDef& definition,
                                            const Convolution2DAttributes& attr,
                                            const BHWC& weights_shape,
                                            const BHWC* dst_shape = nullptr);

// Creates the Winograd 4x4->6x6 variant from 2D convolution attributes.
ConvPowerVR CreateConvPowerVRWino4x4To6x6(const GpuInfo& gpu_info,
                                          const OperationDef& definition,
                                          const Convolution2DAttributes& attr,
                                          const BHWC* dst_shape = nullptr);

// Creates the operation from 3D convolution attributes.
ConvPowerVR CreateConvPowerVR3D(const GpuInfo& gpu_info,
                                const OperationDef& definition,
                                const Convolution3DAttributes& attr,
                                const BHWDC* dst_shape = nullptr);
384 
385 }  // namespace gpu
386 }  // namespace tflite
387 
388 #endif  // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_CONV_POWERVR_H_
389