1 /* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_DEPTHWISE_CONV_3X3_STRIDE_H2_H_
17 #define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_DEPTHWISE_CONV_3X3_STRIDE_H2_H_
18 
19 #include <vector>
20 
21 #include "tensorflow/lite/delegates/gpu/common/operations.h"
22 #include "tensorflow/lite/delegates/gpu/common/task/buffer_desc.h"
23 #include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
24 #include "tensorflow/lite/delegates/gpu/common/task/texture2d_desc.h"
25 
26 namespace tflite {
27 namespace gpu {
28 
29 // Depth Wise Convolution for kernel 3x3
30 // require:
31 //   channels_multiplier = 1;
32 //   kernel_size = 3x3;
33 //   dilation.y = 1;
34 //   stride.y = 2;
35 class DepthWiseConv3x3StrideH2 : public GPUOperation {
36  public:
37   DepthWiseConv3x3StrideH2() = default;
38   void GetPossibleKernelWorkGroups(
39       TuningType tuning_type, const GpuInfo& gpu_info,
40       const KernelInfo& kernel_info,
41       std::vector<int3>* work_groups) const override;
42   int3 GetGridSize() const override;
43 
44   // Move only
45   DepthWiseConv3x3StrideH2(DepthWiseConv3x3StrideH2&& kernel) = default;
46   DepthWiseConv3x3StrideH2& operator=(DepthWiseConv3x3StrideH2&& kernel) =
47       default;
48   DepthWiseConv3x3StrideH2(const DepthWiseConv3x3StrideH2&) = delete;
49   DepthWiseConv3x3StrideH2& operator=(const DepthWiseConv3x3StrideH2&) = delete;
50 
51  private:
DepthWiseConv3x3StrideH2(const OperationDef & definition)52   explicit DepthWiseConv3x3StrideH2(const OperationDef& definition)
53       : GPUOperation(definition) {}
54   friend DepthWiseConv3x3StrideH2 CreateDepthWiseConv3x3StrideH2(
55       const OperationDef& definition,
56       const DepthwiseConvolution2DAttributes& attr, const GpuInfo& gpu_info);
57 
58   template <DataType T>
59   void UploadWeightsAndBiases(const tflite::gpu::Tensor<OHWI, T>& weights,
60                               const tflite::gpu::Tensor<Linear, T>& biases,
61                               bool weights_are_buffer);
62   template <DataType S, typename T>
63   void RearrangeWeightsAndBiasesData(
64       const tflite::gpu::Tensor<OHWI, S>& weights,
65       const tflite::gpu::Tensor<Linear, S>& biases, absl::Span<T> dst);
66 
67   bool local_mem_uploads_;
68 };
69 
70 template <DataType T>
UploadWeightsAndBiases(const tflite::gpu::Tensor<OHWI,T> & weights,const tflite::gpu::Tensor<Linear,T> & biases,bool weights_are_buffer)71 void DepthWiseConv3x3StrideH2::UploadWeightsAndBiases(
72     const tflite::gpu::Tensor<OHWI, T>& weights,
73     const tflite::gpu::Tensor<Linear, T>& biases, bool weights_are_buffer) {
74   const int src_depth = DivideRoundUp(weights.shape.i, 4);
75   int texture_width = 10;  // 3x3 kernel + 1 bias
76   int texture_height = src_depth;
77   const int elements_count = texture_width * texture_height;
78   const bool fp32_weights = definition_.precision == CalculationsPrecision::F32;
79   const int float4_size = fp32_weights ? 16 : 8;
80 
81   std::vector<uint8_t> data(float4_size * elements_count);
82   if (fp32_weights) {
83     float4* ptr = reinterpret_cast<float4*>(data.data());
84     RearrangeWeightsAndBiasesData(weights, biases,
85                                   absl::MakeSpan(ptr, elements_count));
86   } else {
87     half4* ptr = reinterpret_cast<half4*>(data.data());
88     RearrangeWeightsAndBiasesData(weights, biases,
89                                   absl::MakeSpan(ptr, elements_count));
90   }
91 
92   if (weights_are_buffer) {
93     BufferDescriptor desc;
94     desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
95     desc.element_size = 4;
96     desc.size = float4_size * elements_count;
97     desc.data = std::move(data);
98     args_.AddObject("weights",
99                     absl::make_unique<BufferDescriptor>(std::move(desc)));
100   } else {
101     Texture2DDescriptor desc;
102     desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
103     desc.size = int2(texture_width, texture_height);
104     desc.data = std::move(data);
105     args_.AddObject("weights",
106                     absl::make_unique<Texture2DDescriptor>(std::move(desc)));
107   }
108 }
109 
110 template <DataType S, typename T>
RearrangeWeightsAndBiasesData(const tflite::gpu::Tensor<OHWI,S> & weights,const tflite::gpu::Tensor<Linear,S> & biases,absl::Span<T> dst)111 void DepthWiseConv3x3StrideH2::RearrangeWeightsAndBiasesData(
112     const tflite::gpu::Tensor<OHWI, S>& weights,
113     const tflite::gpu::Tensor<Linear, S>& biases, absl::Span<T> dst) {
114   const int src_depth = DivideRoundUp(weights.shape.i, 4);
115 
116   int counter = 0;
117   for (int s = 0; s < src_depth; ++s) {
118     for (int y = 0; y < 3; ++y) {
119       for (int x = 0; x < 3; ++x) {
120         T filter_val;
121         for (int i = 0; i < 4; ++i) {
122           const int s_ch = s * 4 + i;
123           if (s_ch < weights.shape.i) {
124             const int f_index = weights.shape.LinearIndex({0, y, x, s_ch});
125             filter_val[i] = weights.data[f_index];
126           } else {
127             filter_val[i] = 0.0f;
128           }
129         }
130         dst[counter++] = filter_val;
131       }
132     }
133 
134     T bias_val;
135     for (int i = 0; i < 4; ++i) {
136       const int dst_ch = s * 4 + i;
137       bias_val[i] = dst_ch >= biases.shape.v ? 0.0f : biases.data[dst_ch];
138     }
139     dst[counter++] = bias_val;
140   }
141 }
142 
// Factory for DepthWiseConv3x3StrideH2. It is declared a friend of the class,
// which gives it access to the private OperationDef constructor and the
// private upload helpers. Only the declaration lives in this header.
// NOTE(review): presumably callers are expected to check
// IsDepthWiseConv3x3StrideH2Supported(attr) first -- confirm against the
// definition.
DepthWiseConv3x3StrideH2 CreateDepthWiseConv3x3StrideH2(
    const OperationDef& definition,
    const DepthwiseConvolution2DAttributes& attr, const GpuInfo& gpu_info);
146 
// Reports whether the given depthwise convolution attributes can be handled
// by DepthWiseConv3x3StrideH2. Only the declaration lives in this header.
// NOTE(review): presumably this tests the requirements listed in the class
// comment (channels_multiplier = 1, 3x3 kernel, dilation.y = 1, stride.y = 2)
// -- confirm against the definition.
bool IsDepthWiseConv3x3StrideH2Supported(
    const DepthwiseConvolution2DAttributes& attr);
149 
150 }  // namespace gpu
151 }  // namespace tflite
152 
153 #endif  // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_DEPTHWISE_CONV_3X3_STRIDE_H2_H_
154