1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/lite/delegates/gpu/common/tasks/fully_connected.h"
17 
18 #include <string>
19 #include <utility>
20 #include <vector>
21 
22 #include "absl/memory/memory.h"
23 #include "tensorflow/lite/delegates/gpu/common/operations.h"
24 #include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
25 #include "tensorflow/lite/delegates/gpu/common/task/storage_type_util.h"
26 #include "tensorflow/lite/delegates/gpu/common/task/tensor_desc.h"
27 #include "tensorflow/lite/delegates/gpu/common/task/tensor_linear_desc.h"
28 #include "tensorflow/lite/delegates/gpu/common/types.h"
29 
30 namespace tflite {
31 namespace gpu {
32 
33 namespace {
UseBufferForWeights(const GpuInfo & gpu_info)34 bool UseBufferForWeights(const GpuInfo& gpu_info) {
35   return gpu_info.IsAdreno() || gpu_info.IsAMD() || gpu_info.IsMali() ||
36          gpu_info.IsApple();
37 }
38 }  // namespace
39 
FullyConnected(const OperationDef & definition,const GpuInfo & gpu_info)40 FullyConnected::FullyConnected(const OperationDef& definition,
41                                const GpuInfo& gpu_info)
42     : GPUOperation(definition) {
43   if (gpu_info.IsAdreno()) {
44     if (gpu_info.adreno_info.IsAdreno3xx()) {
45       work_group_size_ = int3(16, 4, 1);
46     } else if (gpu_info.adreno_info.IsAdreno4xx()) {
47       work_group_size_ = int3(32, 4, 1);
48     } else {
49       work_group_size_ = int3(32, 4, 1);
50     }
51   } else if (gpu_info.IsIntel() || gpu_info.IsNvidia() ||
52              gpu_info.IsPowerVR() || gpu_info.IsApple()) {
53     work_group_size_ = int3(8, 4, 1);
54   } else {
55     work_group_size_ = int3(16, 4, 1);
56   }
57   code_ = GetFullyConnectedKernelCode(definition_, gpu_info);
58 }
59 
FullyConnected(FullyConnected && kernel)60 FullyConnected::FullyConnected(FullyConnected&& kernel)
61     : GPUOperation(std::move(kernel)) {}
62 
operator =(FullyConnected && kernel)63 FullyConnected& FullyConnected::operator=(FullyConnected&& kernel) {
64   if (this != &kernel) {
65     GPUOperation::operator=(std::move(kernel));
66   }
67   return *this;
68 }
69 
// In a basic vector-matrix multiply every thread computes one full
// vector-vector dot product. Here each dot product is split into 4 parts to
// create more threads: the thread with local index tid.y processes every
// 4th element of the dot product.
// This gives good results for sizes around 1024 x 1024; more optimized
// shaders could be written for other sizes.
75 
GetFullyConnectedKernelCode(const OperationDef & op_def,const GpuInfo & gpu_info)76 std::string FullyConnected::GetFullyConnectedKernelCode(
77     const OperationDef& op_def, const GpuInfo& gpu_info) {
78   const int wg_total_size = work_group_size_.x * work_group_size_.y;
79   const std::string barrier =
80       wg_total_size == 32 && gpu_info.IsWaveSizeEqualTo32()
81           ? "SIMD_LOCAL_MEM_BARRIER"
82           : "LOCAL_MEM_BARRIER";
83   AddSrcTensor("src_tensor", op_def.src_tensors[0]);
84   AddDstTensor("dst_tensor", op_def.dst_tensors[0]);
85 
86   const bool weights_are_buffer = UseBufferForWeights(gpu_info);
87 
88   std::string c;
89   switch (op_def.precision) {
90     case CalculationsPrecision::F32:
91       c += "#define FLT16 float16\n";
92       break;
93     case CalculationsPrecision::F32_F16:
94     case CalculationsPrecision::F16:
95       c += "#define FLT16 half16\n";
96       break;
97   }
98 
99   c += "#define WG_X " + std::to_string(work_group_size_.x) + "\n";
100   c += "#define WG_Y " + std::to_string(work_group_size_.y) + "\n";
101 
102   c += R"(MAIN_FUNCTION($0) {
103   int gid = GLOBAL_ID_0;
104   int2 tid = INIT_INT2v2(LOCAL_ID_0, LOCAL_ID_1);
105   ACCUM_FLT4 s = INIT_ACCUM_FLT4(0.0f);
106   if (gid < args.dst_tensor.Slices()) {
107     for (int c = tid.y; c < args.src_tensor.Slices(); c += WG_Y) {
108       FLT4 v = args.src_tensor.Read(0, 0, c);
109 )";
110   if (weights_are_buffer) {
111     c += R"(FLT16 w = args.weights.Read(c * args.dst_tensor.Slices() + gid);
112       FLT4 partial = v.x * FLT16_0123(w);
113       partial += v.y * FLT16_4567(w);
114       partial += v.z * FLT16_89ab(w);
115       partial += v.w * FLT16_cdef(w);
116       s += TO_ACCUM_TYPE(partial);
117 )";
118   } else {
119     c += R"(FLT4 w0 = args.weights.Read(c * 4 + 0, gid);
120       FLT4 w1 = args.weights.Read(c * 4 + 1, gid);
121       FLT4 w2 = args.weights.Read(c * 4 + 2, gid);
122       FLT4 w3 = args.weights.Read(c * 4 + 3, gid);
123       FLT4 partial = v.x * w0;
124       partial += v.y * w1;
125       partial += v.z * w2;
126       partial += v.w * w3;
127       s += TO_ACCUM_TYPE(partial);
128 )";
129   }
130   c += R"(    }
131   }
132   __local ACCUM_FLT4 temp[WG_X][WG_Y];
133   temp[tid.x][tid.y] = s;
134 )";
135   c += "  " + barrier + ";\n";
136   c += R"(
137   if (gid >= args.dst_tensor.Slices()) {
138     return;
139   }
140   if (tid.y == 0) {
141 )";
142   for (int i = 1; i < work_group_size_.y; ++i) {
143     c += "    s += temp[tid.x][" + std::to_string(i) + "];\n";
144   }
145   c += R"(    FLT4 r0 = TO_FLT4(s) + args.biases.Read(gid);
146     args.dst_tensor.Write(r0, 0, 0, gid);
147   }
148 })";
149 
150   return c;
151 }
152 
GetGridSize() const153 int3 FullyConnected::GetGridSize() const {
154   return int3(dst_[0]->Slices(), 1, 1);
155 }
156 
CreateFullyConnected(const GpuInfo & gpu_info,const OperationDef & definition,const FullyConnectedAttributes & attr)157 FullyConnected CreateFullyConnected(const GpuInfo& gpu_info,
158                                     const OperationDef& definition,
159                                     const FullyConnectedAttributes& attr) {
160   FullyConnected result(definition, gpu_info);
161   result.UploadWeights(attr.weights, UseBufferForWeights(gpu_info));
162 
163   TensorLinearDescriptor desc;
164   desc.storage_type = gpu_info.SupportsImages() ? LinearStorageType::TEXTURE_2D
165                                                 : LinearStorageType::BUFFER;
166   if (gpu_info.IsApple()) {
167     desc.storage_type =
168         DeduceLinearStorageType(definition.GetPrimaryStorageType());
169   }
170   desc.element_type = definition.GetDataType();
171   desc.UploadLinearData(attr.bias);
172   result.args_.AddObject(
173       "biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
174 
175   return result;
176 }
177 
178 }  // namespace gpu
179 }  // namespace tflite
180