1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_DEPTHWISE_CONV_H_
17 #define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_DEPTHWISE_CONV_H_
18 
19 #include <vector>
20 
21 #include "tensorflow/lite/delegates/gpu/common/data_type.h"
22 #include "tensorflow/lite/delegates/gpu/common/operations.h"
23 #include "tensorflow/lite/delegates/gpu/common/shape.h"
24 #include "tensorflow/lite/delegates/gpu/common/status.h"
25 #include "tensorflow/lite/delegates/gpu/common/task/buffer_desc.h"
26 #include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
27 #include "tensorflow/lite/delegates/gpu/common/task/tensor_desc.h"
28 #include "tensorflow/lite/delegates/gpu/common/task/tensor_linear_desc.h"
29 #include "tensorflow/lite/delegates/gpu/common/task/texture2d_desc.h"
30 #include "tensorflow/lite/delegates/gpu/common/tensor.h"
31 #include "tensorflow/lite/delegates/gpu/common/types.h"
32 
33 namespace tflite {
34 namespace gpu {
35 
36 template <DataType S, typename T>
RearrangeWeightsForDWConv2D(const tflite::gpu::Tensor<OHWI,S> & weights,absl::Span<T> dst)37 void RearrangeWeightsForDWConv2D(const tflite::gpu::Tensor<OHWI, S>& weights,
38                                  absl::Span<T> dst) {
39   const int dst_channels = weights.shape.i * weights.shape.o;
40   const int dst_depth = DivideRoundUp(dst_channels, 4);
41   const int kernel_x = weights.shape.w;
42   const int kernel_y = weights.shape.h;
43 
44   int counter = 0;
45   for (int d = 0; d < dst_depth; ++d) {
46     for (int y = 0; y < kernel_y; ++y) {
47       for (int x = 0; x < kernel_x; ++x) {
48         T filter_val;
49         for (int i = 0; i < 4; ++i) {
50           const int d_ch = d * 4 + i;
51           if (d_ch < dst_channels) {
52             const int f_index = weights.shape.LinearIndex(
53                 {d_ch % weights.shape.o, y, x, d_ch / weights.shape.o});
54             filter_val[i] = weights.data[f_index];
55           } else {
56             filter_val[i] = 0.0f;
57           }
58         }
59         dst[counter++] = filter_val;
60       }
61     }
62   }
63 }
64 
65 template <DataType T>
UploadWeightsForDWConv2D(const tflite::gpu::Tensor<OHWI,T> & weights,bool weights_are_buffer,CalculationsPrecision precision,GPUOperation * op)66 void UploadWeightsForDWConv2D(const tflite::gpu::Tensor<OHWI, T>& weights,
67                               bool weights_are_buffer,
68                               CalculationsPrecision precision,
69                               GPUOperation* op) {
70   const int dst_channels = weights.shape.i * weights.shape.o;
71   const int dst_slices = DivideRoundUp(dst_channels, 4);
72   const int kernel_x = weights.shape.w;
73   const int kernel_y = weights.shape.h;
74 
75   const int elements_count = kernel_x * kernel_y * dst_slices;
76 
77   const bool fp32_weights = precision == CalculationsPrecision::F32;
78   const int float4_size = fp32_weights ? 16 : 8;
79 
80   std::vector<uint8_t> data(float4_size * elements_count);
81 
82   if (fp32_weights) {
83     float4* ptr = reinterpret_cast<float4*>(data.data());
84     RearrangeWeightsForDWConv2D(weights, absl::MakeSpan(ptr, elements_count));
85   } else {
86     half4* ptr = reinterpret_cast<half4*>(data.data());
87     RearrangeWeightsForDWConv2D(weights, absl::MakeSpan(ptr, elements_count));
88   }
89 
90   if (weights_are_buffer) {
91     BufferDescriptor desc;
92     desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
93     desc.element_size = 4;
94     desc.size = float4_size * elements_count;
95     desc.data = std::move(data);
96     op->args_.AddObject("weights", absl::make_unique<BufferDescriptor>(desc));
97   } else {
98     Texture2DDescriptor desc;
99     desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
100     desc.size = int2(kernel_x * kernel_y, dst_slices);
101     desc.data = std::move(data);
102     op->args_.AddObject("weights",
103                         absl::make_unique<Texture2DDescriptor>(desc));
104   }
105 }
106 
107 template <DataType S, typename T>
RearrangeWeightsForDWConv3D(const tflite::gpu::Tensor<OHWDI,S> & weights,absl::Span<T> dst)108 void RearrangeWeightsForDWConv3D(const tflite::gpu::Tensor<OHWDI, S>& weights,
109                                  absl::Span<T> dst) {
110   const int dst_channels = weights.shape.i * weights.shape.o;
111   const int dst_slices = DivideRoundUp(dst_channels, 4);
112   const int kernel_x = weights.shape.w;
113   const int kernel_y = weights.shape.h;
114   const int kernel_z = weights.shape.d;
115 
116   int counter = 0;
117   for (int d = 0; d < dst_slices; ++d) {
118     for (int z = 0; z < kernel_z; ++z) {
119       for (int y = 0; y < kernel_y; ++y) {
120         for (int x = 0; x < kernel_x; ++x) {
121           T filter_val;
122           for (int i = 0; i < 4; ++i) {
123             const int d_ch = d * 4 + i;
124             if (d_ch < dst_channels) {
125               const int f_index = weights.shape.LinearIndex(
126                   {d_ch % weights.shape.o, y, x, z, d_ch / weights.shape.o});
127               filter_val[i] = weights.data[f_index];
128             } else {
129               filter_val[i] = 0.0f;
130             }
131           }
132           dst[counter++] = filter_val;
133         }
134       }
135     }
136   }
137 }
138 
139 template <DataType T>
UploadWeightsForDWConv3D(const tflite::gpu::Tensor<OHWDI,T> & weights,bool weights_are_buffer,CalculationsPrecision precision,GPUOperation * op)140 void UploadWeightsForDWConv3D(const tflite::gpu::Tensor<OHWDI, T>& weights,
141                               bool weights_are_buffer,
142                               CalculationsPrecision precision,
143                               GPUOperation* op) {
144   const int dst_channels = weights.shape.i * weights.shape.o;
145   const int dst_slices = DivideRoundUp(dst_channels, 4);
146   const int kernel_x = weights.shape.w;
147   const int kernel_y = weights.shape.h;
148   const int kernel_z = weights.shape.d;
149 
150   const int elements_count = kernel_x * kernel_y * kernel_z * dst_slices;
151 
152   const bool fp32_weights = precision == CalculationsPrecision::F32;
153   const int float4_size = fp32_weights ? 16 : 8;
154 
155   std::vector<uint8_t> data(float4_size * elements_count);
156 
157   if (fp32_weights) {
158     float4* ptr = reinterpret_cast<float4*>(data.data());
159     RearrangeWeightsForDWConv3D(weights, absl::MakeSpan(ptr, elements_count));
160   } else {
161     half4* ptr = reinterpret_cast<half4*>(data.data());
162     RearrangeWeightsForDWConv3D(weights, absl::MakeSpan(ptr, elements_count));
163   }
164 
165   if (weights_are_buffer) {
166     BufferDescriptor desc;
167     desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
168     desc.element_size = 4;
169     desc.size = float4_size * elements_count;
170     desc.data = std::move(data);
171     op->args_.AddObject("weights",
172                         absl::make_unique<BufferDescriptor>(std::move(desc)));
173   } else {
174     Texture2DDescriptor desc;
175     desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
176     desc.size = int2(kernel_x * kernel_y * kernel_z, dst_slices);
177     desc.data = std::move(data);
178     op->args_.AddObject(
179         "weights", absl::make_unique<Texture2DDescriptor>(std::move(desc)));
180   }
181 }
182 
183 GPUOperation CreateDepthwiseConvolution2D(
184     const GpuInfo& gpu_info, const OperationDef& definition,
185     const DepthwiseConvolution2DAttributes& attr);
186 
187 GPUOperation CreateDepthwiseConvolution2DDynamicWeights(
188     const GpuInfo& gpu_info, const OperationDef& definition,
189     const DepthwiseConvolution2DAttributes& attr);
190 
191 GPUOperation CreateDepthwiseConvolution3D(
192     const GpuInfo& gpu_info, const OperationDef& definition,
193     const DepthwiseConvolution3DAttributes& attr);
194 
195 }  // namespace gpu
196 }  // namespace tflite
197 
198 #endif  // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_DEPTHWISE_CONV_H_
199