1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_FULLY_CONNECTED_H_
17 #define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_FULLY_CONNECTED_H_
18 
19 #include <stdint.h>
20 
21 #include <string>
22 #include <utility>
23 #include <vector>
24 
25 #include "absl/memory/memory.h"
26 #include "tensorflow/lite/delegates/gpu/common/data_type.h"
27 #include "tensorflow/lite/delegates/gpu/common/operations.h"
28 #include "tensorflow/lite/delegates/gpu/common/shape.h"
29 #include "tensorflow/lite/delegates/gpu/common/task/buffer_desc.h"
30 #include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
31 #include "tensorflow/lite/delegates/gpu/common/task/texture2d_desc.h"
32 #include "tensorflow/lite/delegates/gpu/common/tensor.h"
33 #include "tensorflow/lite/delegates/gpu/common/types.h"
34 #include "tensorflow/lite/delegates/gpu/common/util.h"
35 
36 namespace tflite {
37 namespace gpu {
38 
39 template <DataType T, typename S>
RearrangeFCWeightsToIOO4I4(const tflite::gpu::Tensor<OHWI,T> & weights,S * dst)40 void RearrangeFCWeightsToIOO4I4(const tflite::gpu::Tensor<OHWI, T>& weights,
41                                 S* dst) {
42   const int src_channels = weights.shape.i;
43   const int padded_src_channels = AlignByN(src_channels, 4);
44   const int dst_channels = weights.shape.o;
45   const int padded_dst_channels = AlignByN(dst_channels, 4);
46 
47   // Change the travelsal order of the weight matrix in the following way:
48   // The matrix is segmented to blocks of 4x4. If (any) dimension of the matrix
49   // size is not divisible by 4, then pad with zeros. Each block is stored
50   // contigously. The 16 elements within a block are ordered as 4 elements of
51   // the first column, 4 elems of the second, etc. Blocks then traversed as
52   // columns first, rows last. As an example, an 8x8 matrix would be traversed
53   // as below.
54   //
55   //  |  0  4  8 12 32 36 40 44 |
56   //  |  1  5  9 13 33 37 41 45 |
57   //  |  2  6 10 14 34 38 42 46 |
58   //  |  3  7 11 15 35 39 43 47 |
59   //  | 16 20 24 28 48 52 56 60 |
60   //  | 17 21 25 29 49 53 57 61 |
61   //  | 18 22 26 30 50 54 58 62 |
62   //  | 19 23 27 31 51 55 59 63 |
63   //
64   // The benefit of doing this is that reading contigous 16 elements gives a 4x4
65   // block of the matrix, where the first 4 elements is the first row of the
66   // block, second 4 elements is the second row of the block, etc. Subsequent
67   // blocks contain elements of the same 4 columns.
68 
69   for (int block_y = 0; 4 * block_y < padded_dst_channels; block_y++) {
70     for (int y_in_block = 0; y_in_block < 4; y_in_block++) {
71       for (int block_x = 0; 4 * block_x < padded_src_channels; block_x++) {
72         for (int x_in_block = 0; x_in_block < 4; x_in_block++) {
73           int y = 4 * block_y + y_in_block;
74           int x = 4 * block_x + x_in_block;
75           // Consider destination as an array with extents
76           // [padded_src_channels/4][padded_dst_channels/4][4][4]
77           int dst_index = block_x * padded_dst_channels * 4 + block_y * 16 +
78                           x_in_block * 4 + y_in_block;
79           if (x < src_channels && y < dst_channels) {
80             dst[dst_index] = weights.data[src_channels * y + x];
81           } else {
82             dst[dst_index] = 0.0f;
83           }
84         }
85       }
86     }
87   }
88 }
89 
90 template <DataType T, typename S>
RearrangeFCWeightsToOIO4I4(const tflite::gpu::Tensor<OHWI,T> & weights,S * dst)91 void RearrangeFCWeightsToOIO4I4(const tflite::gpu::Tensor<OHWI, T>& weights,
92                                 S* dst) {
93   const int src_channels = weights.shape.i;
94   const int src_depth = DivideRoundUp(src_channels, 4);
95   const int dst_channels = weights.shape.o;
96   const int dst_depth = DivideRoundUp(dst_channels, 4);
97 
98   int counter = 0;
99   for (int d = 0; d < dst_depth; ++d) {
100     for (int s = 0; s < src_depth; ++s) {
101       for (int i = 0; i < 4; ++i) {
102         const int src_ch = s * 4 + i;
103         for (int j = 0; j < 4; ++j) {
104           const int dst_ch = d * 4 + j;
105           if (src_ch < src_channels && dst_ch < dst_channels) {
106             dst[counter++] = weights.data[dst_ch * src_channels + src_ch];
107           } else {
108             dst[counter++] = 0.0f;
109           }
110         }
111       }
112     }
113   }
114 }
115 
116 class FullyConnected : public GPUOperation {
117  public:
118   FullyConnected() = default;
GetPossibleKernelWorkGroups(TuningType tuning_type,const GpuInfo & gpu_info,const KernelInfo & kernel_info,std::vector<int3> * work_groups)119   void GetPossibleKernelWorkGroups(
120       TuningType tuning_type, const GpuInfo& gpu_info,
121       const KernelInfo& kernel_info,
122       std::vector<int3>* work_groups) const override {
123     work_groups->push_back(work_group_size_);
124   }
125   int3 GetGridSize() const override;
126 
127   // Move only
128   FullyConnected(FullyConnected&& kernel);
129   FullyConnected& operator=(FullyConnected&& kernel);
130   FullyConnected(const FullyConnected&) = delete;
131   FullyConnected& operator=(const FullyConnected&) = delete;
132 
133  private:
134   FullyConnected(const OperationDef& definition, const GpuInfo& gpu_info);
135   friend FullyConnected CreateFullyConnected(
136       const GpuInfo& gpu_info, const OperationDef& definition,
137       const FullyConnectedAttributes& attr);
138 
139   template <DataType T>
140   void UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights,
141                      bool weights_are_buffer);
142 
143   std::string GetFullyConnectedKernelCode(const OperationDef& op_def,
144                                           const GpuInfo& gpu_info);
145 };
146 
147 template <DataType T>
UploadWeights(const tflite::gpu::Tensor<OHWI,T> & weights,bool weights_are_buffer)148 void FullyConnected::UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights,
149                                    bool weights_are_buffer) {
150   const int src_depth = DivideRoundUp(weights.shape.i, 4);
151   const int dst_depth = DivideRoundUp(weights.shape.o, 4);
152 
153   const int elements_count = src_depth * dst_depth * 4;
154   const bool f32_weights = definition_.precision == CalculationsPrecision::F32;
155 
156   const int float4_size = f32_weights ? 16 : 8;
157 
158   if (weights_are_buffer) {
159     BufferDescriptor desc;
160     desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
161     desc.element_size = 16;
162     desc.size = float4_size * elements_count;
163     desc.data.resize(desc.size);
164 
165     if (f32_weights) {
166       float* ptr = reinterpret_cast<float*>(desc.data.data());
167       RearrangeFCWeightsToIOO4I4(weights, ptr);
168     } else {
169       half* ptr = reinterpret_cast<half*>(desc.data.data());
170       RearrangeFCWeightsToIOO4I4(weights, ptr);
171     }
172 
173     args_.AddObject("weights",
174                     absl::make_unique<BufferDescriptor>(std::move(desc)));
175   } else {
176     Texture2DDescriptor desc;
177     desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
178     desc.size = int2(src_depth * 4, dst_depth);
179     desc.data.resize(float4_size * elements_count);
180 
181     if (f32_weights) {
182       float* ptr = reinterpret_cast<float*>(desc.data.data());
183       RearrangeFCWeightsToOIO4I4(weights, ptr);
184     } else {
185       half* ptr = reinterpret_cast<half*>(desc.data.data());
186       RearrangeFCWeightsToOIO4I4(weights, ptr);
187     }
188 
189     args_.AddObject("weights",
190                     absl::make_unique<Texture2DDescriptor>(std::move(desc)));
191   }
192 }
193 
// Factory for FullyConnected operations; returns the operation by value.
// It is declared a friend of FullyConnected so that it can call that class's
// private (OperationDef, GpuInfo) constructor. Only the declaration lives in
// this header; the definition is elsewhere.
FullyConnected CreateFullyConnected(const GpuInfo& gpu_info,
                                    const OperationDef& definition,
                                    const FullyConnectedAttributes& attr);
197 
198 }  // namespace gpu
199 }  // namespace tflite
200 
201 #endif  // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_FULLY_CONNECTED_H_
202