1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_DEPTHWISE_CONV_3X3_H_
17 #define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_DEPTHWISE_CONV_3X3_H_
18 
19 #include <memory>
20 #include <vector>
21 
22 #include "tensorflow/lite/delegates/gpu/common/data_type.h"
23 #include "tensorflow/lite/delegates/gpu/common/operations.h"
24 #include "tensorflow/lite/delegates/gpu/common/shape.h"
25 #include "tensorflow/lite/delegates/gpu/common/status.h"
26 #include "tensorflow/lite/delegates/gpu/common/task/buffer_desc.h"
27 #include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
28 #include "tensorflow/lite/delegates/gpu/common/task/tensor_desc.h"
29 #include "tensorflow/lite/delegates/gpu/common/task/texture2d_desc.h"
30 #include "tensorflow/lite/delegates/gpu/common/tensor.h"
31 #include "tensorflow/lite/delegates/gpu/common/types.h"
32 
33 namespace tflite {
34 namespace gpu {
35 
36 class DepthwiseConv3x3 : public GPUOperation {
37  public:
38   DepthwiseConv3x3() = default;
39   void GetPossibleKernelWorkGroups(
40       TuningType tuning_type, const GpuInfo& gpu_info,
41       const KernelInfo& kernel_info,
42       std::vector<int3>* work_groups) const override;
43   int3 GetGridSize() const override;
44 
45   // Move only
46   DepthwiseConv3x3(DepthwiseConv3x3&& operation);
47   DepthwiseConv3x3& operator=(DepthwiseConv3x3&& operation);
48   DepthwiseConv3x3(const DepthwiseConv3x3&) = delete;
49   DepthwiseConv3x3& operator=(const DepthwiseConv3x3&) = delete;
50 
51  private:
52   explicit DepthwiseConv3x3(const OperationDef& definition,
53                             bool weights_are_buffer, bool local_mem_uploads,
54                             const GpuInfo& gpu_info);
55   template <DataType T>
56   void UploadWeightsAndBiases(const tflite::gpu::Tensor<OHWI, T>& weights,
57                               const tflite::gpu::Tensor<Linear, T>& biases,
58                               bool weights_are_buffer);
59 
60   friend DepthwiseConv3x3 CreateDepthwiseConv3x3(
61       const GpuInfo& gpu_info, const OperationDef& definition,
62       const DepthwiseConvolution2DAttributes& attr);
63 
64   template <DataType S, typename T>
65   void RearrangeWeightsAndBiasesData(
66       const tflite::gpu::Tensor<OHWI, S>& weights,
67       const tflite::gpu::Tensor<Linear, S>& biases, absl::Span<T> dst);
68 
69   std::string GenerateDepthwiseConvCode(const OperationDef& op_def,
70                                         bool weights_are_buffer,
71                                         bool local_mem_uploads);
72 
73   bool local_mem_uploads_;
74 };
75 
76 template <DataType T>
UploadWeightsAndBiases(const tflite::gpu::Tensor<OHWI,T> & weights,const tflite::gpu::Tensor<Linear,T> & biases,bool weights_are_buffer)77 void DepthwiseConv3x3::UploadWeightsAndBiases(
78     const tflite::gpu::Tensor<OHWI, T>& weights,
79     const tflite::gpu::Tensor<Linear, T>& biases, bool weights_are_buffer) {
80   const int src_depth = DivideRoundUp(weights.shape.i, 4);
81   int texture_width = 10;  // 3x3 kernel + 1 bias
82   int texture_height = src_depth;
83   const int elements_count = texture_width * texture_height;
84   const bool fp32_weights = definition_.precision == CalculationsPrecision::F32;
85   const int float4_size = fp32_weights ? 16 : 8;
86 
87   std::vector<uint8_t> data(float4_size * elements_count);
88   if (fp32_weights) {
89     float4* ptr = reinterpret_cast<float4*>(data.data());
90     RearrangeWeightsAndBiasesData(weights, biases,
91                                   absl::MakeSpan(ptr, elements_count));
92   } else {
93     half4* ptr = reinterpret_cast<half4*>(data.data());
94     RearrangeWeightsAndBiasesData(weights, biases,
95                                   absl::MakeSpan(ptr, elements_count));
96   }
97 
98   if (weights_are_buffer) {
99     BufferDescriptor desc;
100     desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
101     desc.element_size = 4;
102     desc.size = float4_size * elements_count;
103     desc.data = std::move(data);
104     args_.AddObject("weights",
105                     absl::make_unique<BufferDescriptor>(std::move(desc)));
106   } else {
107     Texture2DDescriptor desc;
108     desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
109     desc.size = int2(texture_width, texture_height);
110     desc.data = std::move(data);
111     args_.AddObject("weights",
112                     absl::make_unique<Texture2DDescriptor>(std::move(desc)));
113   }
114 }
115 
116 template <DataType S, typename T>
RearrangeWeightsAndBiasesData(const tflite::gpu::Tensor<OHWI,S> & weights,const tflite::gpu::Tensor<Linear,S> & biases,absl::Span<T> dst)117 void DepthwiseConv3x3::RearrangeWeightsAndBiasesData(
118     const tflite::gpu::Tensor<OHWI, S>& weights,
119     const tflite::gpu::Tensor<Linear, S>& biases, absl::Span<T> dst) {
120   const int src_depth = DivideRoundUp(weights.shape.i, 4);
121 
122   int counter = 0;
123   for (int s = 0; s < src_depth; ++s) {
124     for (int y = 0; y < 3; ++y) {
125       for (int x = 0; x < 3; ++x) {
126         T filter_val;
127         for (int i = 0; i < 4; ++i) {
128           const int s_ch = s * 4 + i;
129           if (s_ch < weights.shape.i) {
130             const int f_index = weights.shape.LinearIndex({0, y, x, s_ch});
131             filter_val[i] = weights.data[f_index];
132           } else {
133             filter_val[i] = 0.0f;
134           }
135         }
136         dst[counter++] = filter_val;
137       }
138     }
139 
140     T bias_val;
141     for (int i = 0; i < 4; ++i) {
142       const int dst_ch = s * 4 + i;
143       bias_val[i] = dst_ch >= biases.shape.v ? 0.0f : biases.data[dst_ch];
144     }
145     dst[counter++] = bias_val;
146   }
147 }
148 
149 bool IsDepthwiseConv3x3Supported(const DepthwiseConvolution2DAttributes& attr);
150 
151 DepthwiseConv3x3 CreateDepthwiseConv3x3(
152     const GpuInfo& gpu_info, const OperationDef& definition,
153     const DepthwiseConvolution2DAttributes& attr);
154 
155 }  // namespace gpu
156 }  // namespace tflite
157 
158 #endif  // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_DEPTHWISE_CONV_3X3_H_
159