1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"
17 
18 #include <array>
19 #include <map>
20 #include <string>
21 #include <vector>
22 
23 #include "absl/strings/str_cat.h"
24 #include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
25 #include "tensorflow/lite/delegates/gpu/cl/cl_event.h"
26 #include "tensorflow/lite/delegates/gpu/cl/util.h"
27 #include "tensorflow/lite/delegates/gpu/common/status.h"
28 #include "tensorflow/lite/delegates/gpu/common/types.h"
29 
30 namespace tflite {
31 namespace gpu {
32 namespace cl {
33 
CLCommandQueue(cl_command_queue queue,bool has_ownership)34 CLCommandQueue::CLCommandQueue(cl_command_queue queue, bool has_ownership)
35     : queue_(queue), has_ownership_(has_ownership) {}
36 
CLCommandQueue(CLCommandQueue && queue)37 CLCommandQueue::CLCommandQueue(CLCommandQueue&& queue)
38     : queue_(queue.queue_), has_ownership_(queue.has_ownership_) {
39   queue.queue_ = nullptr;
40 }
41 
operator =(CLCommandQueue && queue)42 CLCommandQueue& CLCommandQueue::operator=(CLCommandQueue&& queue) {
43   if (this != &queue) {
44     Release();
45     std::swap(queue_, queue.queue_);
46     has_ownership_ = queue.has_ownership_;
47   }
48   return *this;
49 }
50 
~CLCommandQueue()51 CLCommandQueue::~CLCommandQueue() { Release(); }
52 
Release()53 void CLCommandQueue::Release() {
54   if (has_ownership_ && queue_) {
55     clReleaseCommandQueue(queue_);
56     queue_ = nullptr;
57   }
58 }
59 
Dispatch(const CLKernel & kernel,const int3 & work_groups_count,const int3 & work_group_size,CLEvent * event)60 absl::Status CLCommandQueue::Dispatch(const CLKernel& kernel,
61                                       const int3& work_groups_count,
62                                       const int3& work_group_size,
63                                       CLEvent* event) {
64   std::array<size_t, 3> local;
65   std::array<size_t, 3> global;
66   for (int i = 0; i < 3; ++i) {
67     local[i] = work_group_size[i];
68     global[i] = work_groups_count[i] * work_group_size[i];
69   }
70   cl_event resulting_event;
71   const int error_code = clEnqueueNDRangeKernel(
72       queue_, kernel.kernel(), 3, nullptr, global.data(), local.data(), 0,
73       nullptr, event ? &resulting_event : nullptr);
74   if (event) {
75     *event = CLEvent(resulting_event);
76   }
77   if (error_code != CL_SUCCESS) {
78     return absl::UnknownError(
79         absl::StrCat("Failed to clEnqueueNDRangeKernel - ",
80                      CLErrorCodeToString(error_code)));
81   }
82   return absl::OkStatus();
83 }
84 
Dispatch(const CLKernel & kernel,const int3 & work_groups_count,const int3 & work_group_size)85 absl::Status CLCommandQueue::Dispatch(const CLKernel& kernel,
86                                       const int3& work_groups_count,
87                                       const int3& work_group_size) {
88   return Dispatch(kernel, work_groups_count, work_group_size, nullptr);
89 }
90 
EnqueueEvent(CLEvent * event)91 absl::Status CLCommandQueue::EnqueueEvent(CLEvent* event) {
92   cl_event resulting_event;
93   const int error_code = clEnqueueMarker(queue_, &resulting_event);
94   *event = CLEvent(resulting_event);
95   if (error_code != CL_SUCCESS) {
96     return absl::UnknownError(absl::StrCat("Failed to clEnqueueMarker - ",
97                                            CLErrorCodeToString(error_code)));
98   }
99   return absl::OkStatus();
100 }
101 
EnqueueWriteImage(cl_mem memory,int3 region,const void * data)102 absl::Status CLCommandQueue::EnqueueWriteImage(cl_mem memory, int3 region,
103                                                const void* data) {
104   const size_t origin[] = {0, 0, 0};
105   const size_t r[] = {static_cast<size_t>(region.x),
106                       static_cast<size_t>(region.y),
107                       static_cast<size_t>(region.z)};
108   auto error_code = clEnqueueWriteImage(queue_, memory, CL_TRUE, origin, r, 0,
109                                         0, data, 0, nullptr, nullptr);
110   if (error_code != CL_SUCCESS) {
111     return absl::UnknownError(
112         absl::StrCat("Failed to upload data to GPU (clEnqueueWriteImage) - ",
113                      CLErrorCodeToString(error_code)));
114   }
115 
116   return absl::OkStatus();
117 }
118 
EnqueueReadImage(cl_mem memory,int3 region,void * data)119 absl::Status CLCommandQueue::EnqueueReadImage(cl_mem memory, int3 region,
120                                               void* data) {
121   const size_t origin[] = {0, 0, 0};
122   const size_t r[] = {static_cast<size_t>(region.x),
123                       static_cast<size_t>(region.y),
124                       static_cast<size_t>(region.z)};
125   auto error_code = clEnqueueReadImage(queue_, memory, CL_TRUE, origin, r, 0, 0,
126                                        data, 0, nullptr, nullptr);
127   if (error_code != CL_SUCCESS) {
128     return absl::UnknownError(
129         absl::StrCat("Failed to read data from GPU (clEnqueueReadImage) - ",
130                      CLErrorCodeToString(error_code)));
131   }
132 
133   return absl::OkStatus();
134 }
135 
EnqueueWriteBuffer(cl_mem memory,size_t size_in_bytes,const void * data)136 absl::Status CLCommandQueue::EnqueueWriteBuffer(cl_mem memory,
137                                                 size_t size_in_bytes,
138                                                 const void* data) {
139   auto error_code = clEnqueueWriteBuffer(
140       queue_, memory, CL_TRUE, 0, size_in_bytes, data, 0, nullptr, nullptr);
141   if (error_code != CL_SUCCESS) {
142     return absl::UnknownError(
143         absl::StrCat("Failed to upload data to GPU (clEnqueueWriteBuffer) - ",
144                      CLErrorCodeToString(error_code)));
145   }
146   return absl::OkStatus();
147 }
148 
EnqueueReadBuffer(cl_mem memory,size_t size_in_bytes,void * data)149 absl::Status CLCommandQueue::EnqueueReadBuffer(cl_mem memory,
150                                                size_t size_in_bytes,
151                                                void* data) {
152   auto error_code = clEnqueueReadBuffer(
153       queue_, memory, CL_TRUE, 0, size_in_bytes, data, 0, nullptr, nullptr);
154   if (error_code != CL_SUCCESS) {
155     return absl::UnknownError(
156         absl::StrCat("Failed to read data from GPU (clEnqueueReadBuffer) - ",
157                      CLErrorCodeToString(error_code)));
158   }
159   return absl::OkStatus();
160 }
161 
WaitForCompletion()162 absl::Status CLCommandQueue::WaitForCompletion() {
163   auto error_code = clFinish(queue_);
164   if (error_code != CL_SUCCESS) {
165     return absl::UnknownError(
166         absl::StrCat("Failed to clFinish - ", CLErrorCodeToString(error_code)));
167   }
168   return absl::OkStatus();
169 }
170 
ProfilingCommandQueue(cl_command_queue queue)171 ProfilingCommandQueue::ProfilingCommandQueue(cl_command_queue queue)
172     : CLCommandQueue(queue, true) {
173   events_.reserve(128);
174 }
175 
ProfilingCommandQueue(ProfilingCommandQueue && queue)176 ProfilingCommandQueue::ProfilingCommandQueue(ProfilingCommandQueue&& queue)
177     : CLCommandQueue(std::move(queue)),
178       events_(std::move(queue.events_)),
179       current_label_(std::move(queue.current_label_)) {}
180 
operator =(ProfilingCommandQueue && queue)181 ProfilingCommandQueue& ProfilingCommandQueue::operator=(
182     ProfilingCommandQueue&& queue) {
183   if (this != &queue) {
184     events_ = std::move(queue.events_);
185     current_label_ = std::move(queue.current_label_);
186     CLCommandQueue::operator=(std::move(queue));
187   }
188   return *this;
189 }
190 
SetEventsLabel(const std::string & name)191 void ProfilingCommandQueue::SetEventsLabel(const std::string& name) {
192   current_label_ = name;
193 }
194 
ResetMeasurements()195 void ProfilingCommandQueue::ResetMeasurements() { events_.clear(); }
196 
Dispatch(const CLKernel & kernel,const int3 & work_groups_count,const int3 & work_group_size)197 absl::Status ProfilingCommandQueue::Dispatch(const CLKernel& kernel,
198                                              const int3& work_groups_count,
199                                              const int3& work_group_size) {
200   events_.push_back(CLEvent());
201   RETURN_IF_ERROR(CLCommandQueue::Dispatch(kernel, work_groups_count,
202                                            work_group_size,
203                                            &events_[events_.size() - 1]));
204   events_.back().SetName(current_label_);
205   return absl::OkStatus();
206 }
207 
GetProfilingInfo() const208 ProfilingInfo ProfilingCommandQueue::GetProfilingInfo() const {
209   ProfilingInfo result;
210   result.dispatches.resize(events_.size());
211   for (int i = 0; i < events_.size(); ++i) {
212     result.dispatches[i].label = events_[i].GetName();
213     result.dispatches[i].duration =
214         absl::Nanoseconds(events_[i].GetEventTimeNs());
215   }
216   return result;
217 }
218 
GetBestWorkGroupIndex(const CLKernel & kernel,const GpuInfo & gpu_info,const std::vector<int3> & work_groups_count,const std::vector<int3> & work_group_sizes,int * index)219 absl::Status ProfilingCommandQueue::GetBestWorkGroupIndex(
220     const CLKernel& kernel, const GpuInfo& gpu_info,
221     const std::vector<int3>& work_groups_count,
222     const std::vector<int3>& work_group_sizes, int* index) {
223   // Some Adreno 3xx can have wrong numbers for some events
224   const bool possible_bug_with_events =
225       gpu_info.IsAdreno() && gpu_info.adreno_info.IsAdreno3xx();
226   events_.resize(work_group_sizes.size());
227   for (int i = 0; i < work_group_sizes.size(); ++i) {
228     RETURN_IF_ERROR(CLCommandQueue::Dispatch(kernel, work_groups_count[i],
229                                              work_group_sizes[i], &events_[i]));
230 
231     // reducing the speed of memory leak on Mali for some kernels
232     if (gpu_info.IsMali() && i % 8 == 7) {
233       events_[i - 7].Wait();
234     }
235     if (possible_bug_with_events) {
236       // We are trying to increase probability for correct result.
237       RETURN_IF_ERROR(WaitForCompletion());
238     }
239   }
240 
241   RETURN_IF_ERROR(WaitForCompletion());
242 
243   // To release memory of some kernel pool on Mali.
244   if (gpu_info.IsMali()) {
245     RETURN_IF_ERROR(kernel.ReInit());
246   }
247 
248   int minimum_index = 0;
249   double minimum_time = std::numeric_limits<double>::max();
250   if (possible_bug_with_events) {  // we will try to cut out suspicious results
251     double average_time = 0.0;
252     int average_samples_count = 0;
253     for (int i = 0; i < work_group_sizes.size(); ++i) {
254       if (events_[i].GetEventTimeMs() < 100 * 1000) {  // 100 sec
255         average_time += events_[i].GetEventTimeMs();
256         average_samples_count++;
257       }
258     }
259     average_time /= average_samples_count;
260     for (int i = 0; i < work_group_sizes.size(); ++i) {
261       double time = events_[i].GetEventTimeMs();
262       if (time < minimum_time && time >= 0.1 * average_time) {
263         minimum_index = i;
264         minimum_time = time;
265       }
266     }
267   } else {
268     for (int i = 0; i < work_group_sizes.size(); ++i) {
269       double time = events_[i].GetEventTimeMs();
270       if (time < minimum_time) {
271         minimum_index = i;
272         minimum_time = time;
273       }
274     }
275   }
276 
277   *index = minimum_index;
278 
279   return absl::OkStatus();
280 }
281 
CreateCLCommandQueue(const CLDevice & device,const CLContext & context,CLCommandQueue * result)282 absl::Status CreateCLCommandQueue(const CLDevice& device,
283                                   const CLContext& context,
284                                   CLCommandQueue* result) {
285   int error_code;
286   cl_command_queue queue =
287       clCreateCommandQueue(context.context(), device.id(), 0, &error_code);
288   if (!queue) {
289     return absl::UnknownError(
290         absl::StrCat("Failed to create a command queue - ",
291                      CLErrorCodeToString(error_code)));
292   }
293   *result = CLCommandQueue(queue, true);
294   return absl::OkStatus();
295 }
296 
GetQueueExecutionTimeMs() const297 double ProfilingCommandQueue::GetQueueExecutionTimeMs() const {
298   const uint64_t start = events_.front().GetStartedTimeNs();
299   const uint64_t end = events_.back().GetFinishedTimeNs();
300   const uint64_t time_ns = (end - start);
301 
302   return static_cast<double>(time_ns) / 1000000.0;
303 }
304 
GetSumOfEventsTimeMs() const305 double ProfilingCommandQueue::GetSumOfEventsTimeMs() const {
306   double sum = 0.0;
307   for (int i = 0; i < events_.size(); ++i) {
308     sum += events_[i].GetEventTimeMs();
309   }
310   return sum;
311 }
312 
CreateProfilingCommandQueue(const CLDevice & device,const CLContext & context,ProfilingCommandQueue * result)313 absl::Status CreateProfilingCommandQueue(const CLDevice& device,
314                                          const CLContext& context,
315                                          ProfilingCommandQueue* result) {
316   int error_code;
317   cl_command_queue queue = clCreateCommandQueue(
318       context.context(), device.id(), CL_QUEUE_PROFILING_ENABLE, &error_code);
319   if (!queue) {
320     return absl::UnknownError(
321         absl::StrCat("Failed to create a command queue - ",
322                      CLErrorCodeToString(error_code)));
323   }
324 
325   *result = ProfilingCommandQueue(queue);
326   return absl::OkStatus();
327 }
328 
329 }  // namespace cl
330 }  // namespace gpu
331 }  // namespace tflite
332