1 /* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
#include "tensorflow/lite/delegates/gpu/common/tasks/mean_stddev_normalization.h"

#include <algorithm>
#include <string>

#include "tensorflow/lite/delegates/gpu/common/task/work_group_picking.h"
21 
22 namespace tflite {
23 namespace gpu {
24 
25 namespace {
26 
// Returns the kernel-side helper `reduce_vector`, which collapses a float4
// into a single float by taking its dot product with an all-ones vector,
// i.e. the sum of its four components.
std::string GetVectorReduceCode() {
  std::string helper = "float reduce_vector(float4 v) {\n";
  helper += "  return dot(v, INIT_FLOAT4(1.0f));\n";
  helper += "}";
  return helper;
}
32 
GetReduceCode(const GpuInfo & gpu_info,int reduction_size)33 std::string GetReduceCode(const GpuInfo& gpu_info, int reduction_size) {
34   // If it is supported, use the built-in work_group_reduce_add function.
35   // Otherwise, implement a reduction using __local memory.
36 
37   // In the reduction step add upper half of the still-to-be-summed vector to
38   // the lower half, while taking care of odd sizes and rounding. E.g.:
39   // Number of items still to be summed before: 5
40   // Local memory before: [a, b, c, d, e];
41   // Local memory after: [a+d, b+e, c, d, e];
42   // Threads doing work: id < 2 = floor(5/2)
43   // Offset to the added items: 3 = ceil(5/2)
44   // Number of items still to be summed after: 3 = ceil(5/2)
45   std::string result;
46   if (gpu_info.IsApiOpenCl()) {
47     result += R"(
48 #if (__OPENCL_C_VERSION__ >= 200) && (__OPENCL_C_VERSION__ < 300) && \
49   !defined(__opencl_c_work_group_collective_functions)
50   #define __opencl_c_work_group_collective_functions 1
51 #endif
52 )";
53   }
54   result += R"(
55 #ifdef __opencl_c_work_group_collective_functions
56 #define local_reduce(item, tmp, local_id) work_group_reduce_add(item)
57 #else  // !defined(__opencl_c_work_group_collective_functions)
58 float local_reduce(float item, __local float* tmp, int local_id) {
59   tmp[local_id] = item;
60   LOCAL_MEM_BARRIER;
61   // The number of items still need to be summed
62 )";
63   result += "  int reduction_size = " + std::to_string(reduction_size) + ";\n";
64   result += R"(  while (reduction_size > 1) {
65     const int active_thread_limit = reduction_size / 2;
66     const int offset = (reduction_size + 1) / 2;
67     if (local_id < active_thread_limit) {
68       item += tmp[local_id + offset];
69       tmp[local_id] = item;
70     }
71     LOCAL_MEM_BARRIER;
72     reduction_size = offset;
73   }
74   return tmp[0];
75 }
76 #endif  // defined(__opencl_c_work_group_collective_functions)
77 )";
78   return result;
79 }
80 
// Returns the kernel-side helper `filter_outside_tensor`, which replaces with
// zero every lane of `x` whose channel index (slice * 4 + lane) is greater
// than or equal to `num_channels`.
std::string GetFilterCode() {
  static const char kFilterSource[] =
      "\n"
      "float4 filter_outside_tensor(float4 x, int num_channels, int slice) {\n"
      "  return select(x, INIT_FLOAT4(0.0f), slice * 4 + INIT_INT4v4(0, 1, 2, "
      "3) >= num_channels);\n"
      "}\n";
  return std::string(kFilterSource);
}
88 }  // namespace
89 
MeanStdDevNormalization(const OperationDef & definition,const GpuInfo & gpu_info,const int tensor_slices)90 MeanStdDevNormalization::MeanStdDevNormalization(const OperationDef& definition,
91                                                  const GpuInfo& gpu_info,
92                                                  const int tensor_slices)
93     : GPUOperation(definition) {
94   // The kernel code does not inherently need a fixed size, but in order to not
95   // hardcode the __local array's size for the reductions, we would need to pass
96   // that size to the kernel at runtime, and that is currently not supported.
97   // For now, fix workgroup size to the biggest supported by the device, but not
98   // larger than the number of tensor slices.
99   int desired_work_group_size =
100       std::min(tensor_slices, gpu_info.GetMaxWorkGroupSizeForX());
101   if (gpu_info.IsMali()) {
102     // Don't use more than 64 work items per work group on ARM Mali. They
103     // implement local memory using the global memory, larger workgroups have
104     // severe performance penalty.
105     desired_work_group_size = 64;
106   }
107   if (gpu_info.IsAdreno()) {
108     AdrenoInfo info = gpu_info.adreno_info;
109     if (info.IsAdreno3xx()) {
110       if (info.adreno_gpu == AdrenoGpu::kAdreno320 ||
111           info.adreno_gpu == AdrenoGpu::kAdreno330) {
112         desired_work_group_size = 128;
113       } else {
114         desired_work_group_size = 64;
115       }
116     } else if (info.IsAdreno4xx()) {
117       if (info.adreno_gpu == AdrenoGpu::kAdreno430) {
118         desired_work_group_size = 256;
119       } else {
120         desired_work_group_size = 128;
121       }
122     } else if (info.IsAdreno5xx()) {
123       if (info.adreno_gpu == AdrenoGpu::kAdreno530 ||
124           info.adreno_gpu == AdrenoGpu::kAdreno540) {
125         desired_work_group_size = 256;
126       } else {
127         desired_work_group_size = 128;
128       }
129     }
130   }
131   if (gpu_info.IsPowerVR()) {
132     desired_work_group_size = 64;
133   }
134   if (gpu_info.IsApple()) {
135     desired_work_group_size = 64;
136   }
137   while (desired_work_group_size >= tensor_slices * 2) {
138     desired_work_group_size /= 2;
139   }
140   work_group_size_.x = desired_work_group_size;
141   work_group_size_.y = 1;  // Required
142   work_group_size_.z = 1;  // Required
143   code_ = GetNormalizationCode(gpu_info);
144   if (gpu_info.IsCL30OrHigher()) {
145     compiler_options_.push_back(CompilerOptions::kCl30);
146   } else if (gpu_info.IsCL20OrHigher()) {
147     compiler_options_.push_back(CompilerOptions::kCl20);
148   }
149 }
150 
GetNormalizationCode(const GpuInfo & gpu_info)151 std::string MeanStdDevNormalization::GetNormalizationCode(
152     const GpuInfo& gpu_info) {
153   AddSrcTensor("src_tensor", definition_.src_tensors[0]);
154   AddDstTensor("dst_tensor", definition_.dst_tensors[0]);
155 
156   std::string c;
157   c += GetVectorReduceCode();
158   c += GetReduceCode(gpu_info, work_group_size_.x);
159   c += GetFilterCode();
160   if (gpu_info.IsApiOpenCl()) {
161     c += "__attribute__((reqd_work_group_size(" +
162          std::to_string(work_group_size_.x) + ", 1, 1)))\n";
163   }
164   if (gpu_info.IsApiMetal()) {
165     c += "#define native_rsqrt(value) rsqrt(value)\n";
166   }
167   c += R"(MAIN_FUNCTION($0) {
168 #ifndef __opencl_c_work_group_collective_functions
169   __local float tmp[)" +
170        std::to_string(work_group_size_.x) + R"(];
171 #endif
172   int B = GLOBAL_ID_1;
173   // Calculate the total sum of the input tensor.
174   // First, get a local sum of input[local_id_x + N*local_size_x] for all N.
175   float4 private_sum4 = INIT_FLOAT4(0.0f);
176   for (int S = LOCAL_ID_0; S < args.src_tensor.Slices(); S += GROUP_SIZE_0) {
177     float4 t = args.src_tensor.Read<float>(0, 0, S, B);
178     private_sum4 += filter_outside_tensor(t, args.src_tensor.Channels(), S);
179   }
180   // Reduce the vector to a single float and do a workgroup reduce.
181   float private_sum = reduce_vector(private_sum4);
182   float sum = local_reduce(private_sum, tmp, LOCAL_ID_0);
183   // Calculate the mean
184   float mean = sum / args.src_tensor.Channels();
185   // Calculate the squared sum of the difference from the mean.
186   float4 private_sum_diff_sq4 = INIT_FLOAT4(0.0f);
187   for (int S = LOCAL_ID_0; S < args.src_tensor.Slices(); S += GROUP_SIZE_0) {
188     float4 t = args.src_tensor.Read<float>(0, 0, S, B);
189     float4 diff = filter_outside_tensor(t - mean, args.src_tensor.Channels(), S);
190     private_sum_diff_sq4 += diff * diff;
191   }
192   // Reduce
193   float private_sum_diff_sq = reduce_vector(private_sum_diff_sq4);
194   float sum_diff_sq = local_reduce(private_sum_diff_sq, tmp, LOCAL_ID_0);
195   // Calculate 1/stddev (with the 'regulazing constant' as in tensor_utils.cc)
196   float variance = sum_diff_sq / args.src_tensor.Channels();
197   float stddev_inv = native_rsqrt(variance + 1.0e-8f);
198   // Calculate (t-mean)/stddev for each element
199   for (int S = LOCAL_ID_0; S < args.src_tensor.Slices(); S += GROUP_SIZE_0) {
200     float4 t = args.src_tensor.Read<float>(0, 0, S, B);
201     FLT4 result = TO_FLT4((t - mean) * stddev_inv);
202     args.dst_tensor.Write(result, 0, 0, S, B);
203   }
204 })";
205   return c;
206 }
207 
GetGridSize() const208 int3 MeanStdDevNormalization::GetGridSize() const {
209   // To avoid dealing with global reductions, we restrict the grid size to the
210   // work group size in the first dimension.
211   const int grid_x = work_group_size_.x;
212   const int grid_y = src_[0]->Batch();
213   const int grid_z = 1;
214   return int3(grid_x, grid_y, grid_z);
215 }
216 
CreateMeanStdDevNormalization(const OperationDef & definition,const GpuInfo & gpu_info,const int tensor_slices)217 MeanStdDevNormalization CreateMeanStdDevNormalization(
218     const OperationDef& definition, const GpuInfo& gpu_info,
219     const int tensor_slices) {
220   return MeanStdDevNormalization(definition, gpu_info, tensor_slices);
221 }
222 
223 }  // namespace gpu
224 }  // namespace tflite
225