1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_CONV_BUFFER_1X1_H_
17 #define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_CONV_BUFFER_1X1_H_
18 
19 #include "tensorflow/lite/delegates/gpu/common/data_type.h"
20 #include "tensorflow/lite/delegates/gpu/common/operations.h"
21 #include "tensorflow/lite/delegates/gpu/common/shape.h"
22 #include "tensorflow/lite/delegates/gpu/common/status.h"
23 #include "tensorflow/lite/delegates/gpu/common/task/buffer_desc.h"
24 #include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
25 #include "tensorflow/lite/delegates/gpu/common/task/tensor_linear_desc.h"
26 #include "tensorflow/lite/delegates/gpu/common/task/weights_conversion.h"
27 #include "tensorflow/lite/delegates/gpu/common/task/weights_layout.h"
28 #include "tensorflow/lite/delegates/gpu/common/tensor.h"
29 #include "tensorflow/lite/delegates/gpu/common/types.h"
30 #include "tensorflow/lite/delegates/gpu/common/winograd_util.h"
31 
32 namespace tflite {
33 namespace gpu {
34 
35 class ConvBuffer1x1 : public GPUOperation {
36  public:
37   ConvBuffer1x1() = default;
38 
39   // Move only
40   ConvBuffer1x1(ConvBuffer1x1&& operation);
41   ConvBuffer1x1& operator=(ConvBuffer1x1&& operation);
42   ConvBuffer1x1(const ConvBuffer1x1&) = delete;
43   ConvBuffer1x1& operator=(const ConvBuffer1x1&) = delete;
44 
45   void GetPossibleKernelWorkGroups(
46       TuningType tuning_type, const GpuInfo& gpu_info,
47       const KernelInfo& kernel_info,
48       std::vector<int3>* work_groups) const override;
49   int3 GetGridSize() const override;
50 
GetWeightsDescription()51   WeightsDescription GetWeightsDescription() const {
52     WeightsDescription desc;
53     desc.layout = WeightsLayout::kOHWIOGroupI4O4;
54     desc.output_group_size = conv_params_.block_size.z;
55     return desc;
56   }
57 
58   struct ConvParams {
59     int3 block_size = int3(1, 1, 1);
60     int element_size = 4;  // can be 4, 8 or 16
61 
62     // By default in 2d convolution we have the same weights for WH dims, but in
63     // some cases we need separate weights for H dimension and convolution
64     // kernel requires very small modifications to support it.
65     bool different_weights_for_height = false;
66   };
67 
68  private:
69   ConvBuffer1x1(const OperationDef& definition, const ConvParams& conv_params);
70   friend ConvBuffer1x1 CreateConvBuffer1x1(const GpuInfo& gpu_info,
71                                            const OperationDef& definition,
72                                            const Convolution2DAttributes& attr,
73                                            const BHWC* shape);
74   friend ConvBuffer1x1 CreateConvBuffer1x1(const GpuInfo& gpu_info,
75                                            const OperationDef& definition,
76                                            const FullyConnectedAttributes& attr,
77                                            const BHWC* shape);
78   friend ConvBuffer1x1 CreateConvBuffer1x1Wino4x4To6x6(
79       const GpuInfo& gpu_info, const OperationDef& definition,
80       const Convolution2DAttributes& attr, const BHWC* shape);
81   friend ConvBuffer1x1 CreateConvBuffer1x1DynamicWeights(
82       const GpuInfo& gpu_info, const OperationDef& definition,
83       const Convolution2DAttributes& attr, const BHWC& weights_shape,
84       const BHWC* dst_shape);
85 
86   template <DataType T>
87   void UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
88                   const tflite::gpu::Tensor<Linear, T>& biases);
89   template <DataType T>
90   void UploadDataForWinograd4x4To6x6(
91       const tflite::gpu::Tensor<OHWI, T>& weights);
92 
93   template <DataType T>
94   void UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights);
95 
96   template <DataType T>
97   void UploadBiases(const tflite::gpu::Tensor<Linear, T>& biases);
98 
99   std::string GenerateConvBuffer1x1(
100       const OperationDef& op_def, const ConvBuffer1x1::ConvParams& conv_params,
101       Arguments* args);
102 
103   ConvParams conv_params_;
104 };
105 
106 template <DataType T>
UploadData(const tflite::gpu::Tensor<OHWI,T> & weights,const tflite::gpu::Tensor<Linear,T> & biases)107 void ConvBuffer1x1::UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
108                                const tflite::gpu::Tensor<Linear, T>& biases) {
109   UploadWeights(weights);
110   UploadBiases(biases);
111 }
112 
113 template <DataType T>
UploadDataForWinograd4x4To6x6(const tflite::gpu::Tensor<OHWI,T> & weights)114 void ConvBuffer1x1::UploadDataForWinograd4x4To6x6(
115     const tflite::gpu::Tensor<OHWI, T>& weights) {
116   tflite::gpu::Tensor<OHWI, T> wino_weights;
117   RearrangeWeightsToWinograd4x4To6x6Weights(weights, &wino_weights);
118   UploadWeights(wino_weights);
119   tflite::gpu::Tensor<Linear, DataType::FLOAT32> bias;
120   bias.shape = Linear(weights.shape.o);
121   bias.data.resize(weights.shape.o, 0.0f);
122   UploadBiases(bias);
123 }
124 
125 template <DataType T>
UploadWeights(const tflite::gpu::Tensor<OHWI,T> & weights)126 void ConvBuffer1x1::UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights) {
127   const int dst_depth = DivideRoundUp(weights.shape.o, 4);
128   const int src_depth = DivideRoundUp(weights.shape.i, 4);
129 
130   const bool f32_weights = definition_.precision == CalculationsPrecision::F32;
131   const int float4_size = f32_weights ? sizeof(float4) : sizeof(half4);
132 
133   const int dst_depth_aligned = AlignByN(dst_depth, conv_params_.block_size.z);
134   const int elements_count =
135       weights.shape.h * weights.shape.w * src_depth * dst_depth_aligned * 4;
136 
137   BufferDescriptor desc;
138   desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
139   desc.element_size = 16;
140   desc.memory_type = MemoryType::GLOBAL;
141   desc.size = float4_size * elements_count;
142   desc.data.resize(desc.size);
143 
144   if (f32_weights) {
145     float4* ptr = reinterpret_cast<float4*>(desc.data.data());
146     RearrangeWeightsToOHWIOGroupI4O4(weights, conv_params_.block_size.z,
147                                      absl::MakeSpan(ptr, elements_count));
148   } else {
149     half4* ptr = reinterpret_cast<half4*>(desc.data.data());
150     RearrangeWeightsToOHWIOGroupI4O4(weights, conv_params_.block_size.z,
151                                      absl::MakeSpan(ptr, elements_count));
152   }
153 
154   args_.AddObject("weights",
155                   absl::make_unique<BufferDescriptor>(std::move(desc)));
156 }
157 
158 template <DataType T>
UploadBiases(const tflite::gpu::Tensor<Linear,T> & biases)159 void ConvBuffer1x1::UploadBiases(const tflite::gpu::Tensor<Linear, T>& biases) {
160   TensorLinearDescriptor desc;
161   desc.storage_type = LinearStorageType::BUFFER;
162   desc.element_type = definition_.GetDataType();
163   int depth = AlignByN(biases.shape.v, 4 * conv_params_.block_size.z) / 4;
164   desc.UploadLinearData(biases, depth);
165   args_.AddObject("biases",
166                   absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
167 }
168 
169 bool IsConvBuffer1x1Supported(const OperationDef& definition,
170                               const Convolution2DAttributes& attr);
171 
172 bool IsConvBuffer1x1Supported(const OperationDef& definition,
173                               const BHWC& weights_shape,
174                               const Convolution2DAttributes& attr);
175 
176 ConvBuffer1x1 CreateConvBuffer1x1(const GpuInfo& gpu_info,
177                                   const OperationDef& definition,
178                                   const Convolution2DAttributes& attr,
179                                   const BHWC* shape = nullptr);
180 
181 ConvBuffer1x1 CreateConvBuffer1x1(const GpuInfo& gpu_info,
182                                   const OperationDef& definition,
183                                   const FullyConnectedAttributes& attr,
184                                   const BHWC* shape = nullptr);
185 
186 ConvBuffer1x1 CreateConvBuffer1x1DynamicWeights(
187     const GpuInfo& gpu_info, const OperationDef& definition,
188     const Convolution2DAttributes& attr, const BHWC& weights_shape,
189     const BHWC* dst_shape = nullptr);
190 
191 ConvBuffer1x1 CreateConvBuffer1x1Wino4x4To6x6(
192     const GpuInfo& gpu_info, const OperationDef& definition,
193     const Convolution2DAttributes& attr, const BHWC* shape = nullptr);
194 
195 }  // namespace gpu
196 }  // namespace tflite
197 
198 #endif  // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_CONV_BUFFER_1X1_H_
199