1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_COMMAND_QUEUE_H_
17 #define TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_COMMAND_QUEUE_H_
18 
19 #include <cstdint>
20 #include <string>
21 #include <vector>
22 
23 #include "tensorflow/lite/delegates/gpu/cl/cl_context.h"
24 #include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
25 #include "tensorflow/lite/delegates/gpu/cl/cl_event.h"
26 #include "tensorflow/lite/delegates/gpu/cl/cl_kernel.h"
27 #include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
28 #include "tensorflow/lite/delegates/gpu/common/status.h"
29 #include "tensorflow/lite/delegates/gpu/common/task/profiling_info.h"
30 #include "tensorflow/lite/delegates/gpu/common/types.h"
31 
32 namespace tflite {
33 namespace gpu {
34 namespace cl {
35 
36 // A wrapper around opencl command queue
37 class CLCommandQueue {
38  public:
CLCommandQueue()39   CLCommandQueue() {}
40   CLCommandQueue(cl_command_queue queue, bool has_ownership);
41 
42   // Move only
43   CLCommandQueue(CLCommandQueue&& queue);
44   CLCommandQueue& operator=(CLCommandQueue&& queue);
45   CLCommandQueue(const CLCommandQueue&) = delete;
46   CLCommandQueue& operator=(const CLCommandQueue&) = delete;
47 
48   virtual ~CLCommandQueue();
49 
queue()50   cl_command_queue queue() const { return queue_; }
51 
52   virtual absl::Status Dispatch(const CLKernel& kernel,
53                                 const int3& work_groups_count,
54                                 const int3& work_group_size);
55 
56   absl::Status Dispatch(const CLKernel& kernel, const int3& work_groups_count,
57                         const int3& work_group_size, CLEvent* event);
58 
59   absl::Status EnqueueEvent(CLEvent* event);
60 
61   absl::Status EnqueueWriteImage(cl_mem memory, int3 region, const void* data);
62   absl::Status EnqueueReadImage(cl_mem memory, int3 region, void* data);
63 
64   absl::Status EnqueueWriteBuffer(cl_mem memory, size_t size_in_bytes,
65                                   const void* data);
66   absl::Status EnqueueReadBuffer(cl_mem memory, size_t size_in_bytes,
67                                  void* data);
68 
69   absl::Status WaitForCompletion();
70 
71  protected:
72   void Release();
73 
74   cl_command_queue queue_ = nullptr;
75   bool has_ownership_ = false;
76 };
77 
78 class ProfilingCommandQueue : public CLCommandQueue {
79  public:
ProfilingCommandQueue()80   ProfilingCommandQueue() {}
81   explicit ProfilingCommandQueue(cl_command_queue queue);
82 
83   // Move only
84   ProfilingCommandQueue(ProfilingCommandQueue&& queue);
85   ProfilingCommandQueue& operator=(ProfilingCommandQueue&& queue);
86   ProfilingCommandQueue(const ProfilingCommandQueue&) = delete;
87   ProfilingCommandQueue& operator=(const ProfilingCommandQueue&) = delete;
88 
89   absl::Status Dispatch(const CLKernel& kernel, const int3& work_groups_count,
90                         const int3& work_group_size) override;
91 
92   // will write index for fastest work_group among work_group_sizes
93   absl::Status GetBestWorkGroupIndex(const CLKernel& kernel,
94                                      const GpuInfo& gpu_info,
95                                      const std::vector<int3>& work_groups_count,
96                                      const std::vector<int3>& work_group_sizes,
97                                      int* index);
98 
99   // call ResetMeasurements() to start new seriese of measurements
100   void ResetMeasurements();
101 
102   double GetQueueExecutionTimeMs() const;
103 
104   // Difference from GetQueueExecutionTimeMs is that this number doesn't include
105   // time between kernels(kernels launches or preparing) on GPU. Usually, this
106   // time should be 5-10% better than GetQueueExecutionTimeMs, because 5-10%
107   // spend on something else(maybe kernels launches or preparing)
108   double GetSumOfEventsTimeMs() const;
109 
110   // This label will be used for all subsequent dispatches.
111   void SetEventsLabel(const std::string& name);
112 
113   ProfilingInfo GetProfilingInfo() const;
114 
115  private:
116   std::vector<CLEvent> events_;
117   std::string current_label_;
118 };
119 
120 absl::Status CreateCLCommandQueue(const CLDevice& device,
121                                   const CLContext& context,
122                                   CLCommandQueue* result);
123 
124 absl::Status CreateProfilingCommandQueue(const CLDevice& device,
125                                          const CLContext& context,
126                                          ProfilingCommandQueue* result);
127 
128 }  // namespace cl
129 }  // namespace gpu
130 }  // namespace tflite
131 
132 #endif  // TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_COMMAND_QUEUE_H_
133