1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14 // Third party copyrights are property of their respective owners.
15 //
16 // Redistribution and use in source and binary forms, with or without modification,
17 // are permitted provided that the following conditions are met:
18 //
19 //   * Redistribution's of source code must retain the above copyright notice,
20 //     this list of conditions and the following disclaimer.
21 //
22 //   * Redistribution's in binary form must reproduce the above copyright notice,
23 //     this list of conditions and the following disclaimer in the documentation
24 //     and/or other materials provided with the distribution.
25 //
26 //   * The name of the copyright holders may not be used to endorse or promote products
27 //     derived from this software without specific prior written permission.
28 //
29 // This software is provided by the copyright holders and contributors "as is" and
30 // any express or implied warranties, including, but not limited to, the implied
31 // warranties of merchantability and fitness for a particular purpose are disclaimed.
32 // In no event shall the OpenCV Foundation or contributors be liable for any direct,
33 // indirect, incidental, special, exemplary, or consequential damages
34 // (including, but not limited to, procurement of substitute goods or services;
35 // loss of use, data, or profits; or business interruption) however caused
36 // and on any theory of liability, whether in contract, strict liability,
37 // or tort (including negligence or otherwise) arising in any way out of
38 // the use of this software, even if advised of the possibility of such damage.
39 //
40 //M*/
41 
42 #include "precomp.hpp"
43 #include <list>
44 #include <map>
45 #include <string>
46 #include <sstream>
47 #include <iostream> // std::cerr
48 
49 #define CV_OPENCL_ALWAYS_SHOW_BUILD_LOG 0
50 #define CV_OPENCL_SHOW_RUN_ERRORS       0
51 #define CV_OPENCL_SHOW_SVM_ERROR_LOG    1
52 #define CV_OPENCL_SHOW_SVM_LOG          0
53 
54 #include "opencv2/core/bufferpool.hpp"
55 #ifndef LOG_BUFFER_POOL
56 # if 0
57 #   define LOG_BUFFER_POOL printf
58 # else
59 #   define LOG_BUFFER_POOL(...)
60 # endif
61 #endif
62 
63 
64 // TODO Move to some common place
getBoolParameter(const char * name,bool defaultValue)65 static bool getBoolParameter(const char* name, bool defaultValue)
66 {
67 /*
68  * If your system doesn't support getenv(), define NO_GETENV to disable
69  * this feature.
70  */
71 #ifdef NO_GETENV
72     const char* envValue = NULL;
73 #else
74     const char* envValue = getenv(name);
75 #endif
76     if (envValue == NULL)
77     {
78         return defaultValue;
79     }
80     cv::String value = envValue;
81     if (value == "1" || value == "True" || value == "true" || value == "TRUE")
82     {
83         return true;
84     }
85     if (value == "0" || value == "False" || value == "false" || value == "FALSE")
86     {
87         return false;
88     }
89     CV_ErrorNoReturn(cv::Error::StsBadArg, cv::format("Invalid value for %s parameter: %s", name, value.c_str()));
90 }
91 
92 
93 // TODO Move to some common place
getConfigurationParameterForSize(const char * name,size_t defaultValue)94 static size_t getConfigurationParameterForSize(const char* name, size_t defaultValue)
95 {
96 #ifdef NO_GETENV
97     const char* envValue = NULL;
98 #else
99     const char* envValue = getenv(name);
100 #endif
101     if (envValue == NULL)
102     {
103         return defaultValue;
104     }
105     cv::String value = envValue;
106     size_t pos = 0;
107     for (; pos < value.size(); pos++)
108     {
109         if (!isdigit(value[pos]))
110             break;
111     }
112     cv::String valueStr = value.substr(0, pos);
113     cv::String suffixStr = value.substr(pos, value.length() - pos);
114     int v = atoi(valueStr.c_str());
115     if (suffixStr.length() == 0)
116         return v;
117     else if (suffixStr == "MB" || suffixStr == "Mb" || suffixStr == "mb")
118         return v * 1024 * 1024;
119     else if (suffixStr == "KB" || suffixStr == "Kb" || suffixStr == "kb")
120         return v * 1024;
121     CV_ErrorNoReturn(cv::Error::StsBadArg, cv::format("Invalid value for %s parameter: %s", name, value.c_str()));
122 }
123 
124 #if CV_OPENCL_SHOW_SVM_LOG
125 // TODO add timestamp logging
126 #define CV_OPENCL_SVM_TRACE_P printf("line %d (ocl.cpp): ", __LINE__); printf
127 #else
128 #define CV_OPENCL_SVM_TRACE_P(...)
129 #endif
130 
131 #if CV_OPENCL_SHOW_SVM_ERROR_LOG
132 // TODO add timestamp logging
133 #define CV_OPENCL_SVM_TRACE_ERROR_P printf("Error on line %d (ocl.cpp): ", __LINE__); printf
134 #else
135 #define CV_OPENCL_SVM_TRACE_ERROR_P(...)
136 #endif
137 
138 #include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp"
139 #include "opencv2/core/opencl/runtime/opencl_clamdfft.hpp"
140 
141 #ifdef HAVE_OPENCL
142 #include "opencv2/core/opencl/runtime/opencl_core.hpp"
143 #else
144 // TODO FIXIT: This file can't be build without OPENCL
145 
146 /*
147   Part of the file is an extract from the standard OpenCL headers from Khronos site.
148   Below is the original copyright.
149 */
150 
151 /*******************************************************************************
152  * Copyright (c) 2008 - 2012 The Khronos Group Inc.
153  *
154  * Permission is hereby granted, free of charge, to any person obtaining a
155  * copy of this software and/or associated documentation files (the
156  * "Materials"), to deal in the Materials without restriction, including
157  * without limitation the rights to use, copy, modify, merge, publish,
158  * distribute, sublicense, and/or sell copies of the Materials, and to
159  * permit persons to whom the Materials are furnished to do so, subject to
160  * the following conditions:
161  *
162  * The above copyright notice and this permission notice shall be included
163  * in all copies or substantial portions of the Materials.
164  *
165  * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
166  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
167  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
168  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
169  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
170  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
171  * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
172  ******************************************************************************/
173 
174 #if 0 //defined __APPLE__
175 #define HAVE_OPENCL 1
176 #else
177 #undef HAVE_OPENCL
178 #endif
179 
180 #define OPENCV_CL_NOT_IMPLEMENTED -1000
181 
182 #ifdef HAVE_OPENCL
183 
184 #if defined __APPLE__
185 #include <OpenCL/opencl.h>
186 #else
187 #include <CL/opencl.h>
188 #endif
189 
190 static const bool g_haveOpenCL = true;
191 
192 #else
193 
194 extern "C" {
195 
196 struct _cl_platform_id { int dummy; };
197 struct _cl_device_id { int dummy; };
198 struct _cl_context { int dummy; };
199 struct _cl_command_queue { int dummy; };
200 struct _cl_mem { int dummy; };
201 struct _cl_program { int dummy; };
202 struct _cl_kernel { int dummy; };
203 struct _cl_event { int dummy; };
204 struct _cl_sampler { int dummy; };
205 
206 typedef struct _cl_platform_id *    cl_platform_id;
207 typedef struct _cl_device_id *      cl_device_id;
208 typedef struct _cl_context *        cl_context;
209 typedef struct _cl_command_queue *  cl_command_queue;
210 typedef struct _cl_mem *            cl_mem;
211 typedef struct _cl_program *        cl_program;
212 typedef struct _cl_kernel *         cl_kernel;
213 typedef struct _cl_event *          cl_event;
214 typedef struct _cl_sampler *        cl_sampler;
215 
216 typedef int cl_int;
217 typedef unsigned cl_uint;
218 #if defined (_WIN32) && defined(_MSC_VER)
219     typedef __int64 cl_long;
220     typedef unsigned __int64 cl_ulong;
221 #else
222     typedef long cl_long;
223     typedef unsigned long cl_ulong;
224 #endif
225 
226 typedef cl_uint             cl_bool; /* WARNING!  Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. */
227 typedef cl_ulong            cl_bitfield;
228 typedef cl_bitfield         cl_device_type;
229 typedef cl_uint             cl_platform_info;
230 typedef cl_uint             cl_device_info;
231 typedef cl_bitfield         cl_device_fp_config;
232 typedef cl_uint             cl_device_mem_cache_type;
233 typedef cl_uint             cl_device_local_mem_type;
234 typedef cl_bitfield         cl_device_exec_capabilities;
235 typedef cl_bitfield         cl_command_queue_properties;
236 typedef intptr_t            cl_device_partition_property;
237 typedef cl_bitfield         cl_device_affinity_domain;
238 
239 typedef intptr_t            cl_context_properties;
240 typedef cl_uint             cl_context_info;
241 typedef cl_uint             cl_command_queue_info;
242 typedef cl_uint             cl_channel_order;
243 typedef cl_uint             cl_channel_type;
244 typedef cl_bitfield         cl_mem_flags;
245 typedef cl_uint             cl_mem_object_type;
246 typedef cl_uint             cl_mem_info;
247 typedef cl_bitfield         cl_mem_migration_flags;
248 typedef cl_uint             cl_image_info;
249 typedef cl_uint             cl_buffer_create_type;
250 typedef cl_uint             cl_addressing_mode;
251 typedef cl_uint             cl_filter_mode;
252 typedef cl_uint             cl_sampler_info;
253 typedef cl_bitfield         cl_map_flags;
254 typedef cl_uint             cl_program_info;
255 typedef cl_uint             cl_program_build_info;
256 typedef cl_uint             cl_program_binary_type;
257 typedef cl_int              cl_build_status;
258 typedef cl_uint             cl_kernel_info;
259 typedef cl_uint             cl_kernel_arg_info;
260 typedef cl_uint             cl_kernel_arg_address_qualifier;
261 typedef cl_uint             cl_kernel_arg_access_qualifier;
262 typedef cl_bitfield         cl_kernel_arg_type_qualifier;
263 typedef cl_uint             cl_kernel_work_group_info;
264 typedef cl_uint             cl_event_info;
265 typedef cl_uint             cl_command_type;
266 typedef cl_uint             cl_profiling_info;
267 
268 
269 typedef struct _cl_image_format {
270     cl_channel_order        image_channel_order;
271     cl_channel_type         image_channel_data_type;
272 } cl_image_format;
273 
274 typedef struct _cl_image_desc {
275     cl_mem_object_type      image_type;
276     size_t                  image_width;
277     size_t                  image_height;
278     size_t                  image_depth;
279     size_t                  image_array_size;
280     size_t                  image_row_pitch;
281     size_t                  image_slice_pitch;
282     cl_uint                 num_mip_levels;
283     cl_uint                 num_samples;
284     cl_mem                  buffer;
285 } cl_image_desc;
286 
287 typedef struct _cl_buffer_region {
288     size_t                  origin;
289     size_t                  size;
290 } cl_buffer_region;
291 
292 
293 //////////////////////////////////////////////////////////
294 
295 #define CL_SUCCESS                                  0
296 #define CL_DEVICE_NOT_FOUND                         -1
297 #define CL_DEVICE_NOT_AVAILABLE                     -2
298 #define CL_COMPILER_NOT_AVAILABLE                   -3
299 #define CL_MEM_OBJECT_ALLOCATION_FAILURE            -4
300 #define CL_OUT_OF_RESOURCES                         -5
301 #define CL_OUT_OF_HOST_MEMORY                       -6
302 #define CL_PROFILING_INFO_NOT_AVAILABLE             -7
303 #define CL_MEM_COPY_OVERLAP                         -8
304 #define CL_IMAGE_FORMAT_MISMATCH                    -9
305 #define CL_IMAGE_FORMAT_NOT_SUPPORTED               -10
306 #define CL_BUILD_PROGRAM_FAILURE                    -11
307 #define CL_MAP_FAILURE                              -12
308 #define CL_MISALIGNED_SUB_BUFFER_OFFSET             -13
309 #define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14
310 #define CL_COMPILE_PROGRAM_FAILURE                  -15
311 #define CL_LINKER_NOT_AVAILABLE                     -16
312 #define CL_LINK_PROGRAM_FAILURE                     -17
313 #define CL_DEVICE_PARTITION_FAILED                  -18
314 #define CL_KERNEL_ARG_INFO_NOT_AVAILABLE            -19
315 
316 #define CL_INVALID_VALUE                            -30
317 #define CL_INVALID_DEVICE_TYPE                      -31
318 #define CL_INVALID_PLATFORM                         -32
319 #define CL_INVALID_DEVICE                           -33
320 #define CL_INVALID_CONTEXT                          -34
321 #define CL_INVALID_QUEUE_PROPERTIES                 -35
322 #define CL_INVALID_COMMAND_QUEUE                    -36
323 #define CL_INVALID_HOST_PTR                         -37
324 #define CL_INVALID_MEM_OBJECT                       -38
325 #define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR          -39
326 #define CL_INVALID_IMAGE_SIZE                       -40
327 #define CL_INVALID_SAMPLER                          -41
328 #define CL_INVALID_BINARY                           -42
329 #define CL_INVALID_BUILD_OPTIONS                    -43
330 #define CL_INVALID_PROGRAM                          -44
331 #define CL_INVALID_PROGRAM_EXECUTABLE               -45
332 #define CL_INVALID_KERNEL_NAME                      -46
333 #define CL_INVALID_KERNEL_DEFINITION                -47
334 #define CL_INVALID_KERNEL                           -48
335 #define CL_INVALID_ARG_INDEX                        -49
336 #define CL_INVALID_ARG_VALUE                        -50
337 #define CL_INVALID_ARG_SIZE                         -51
338 #define CL_INVALID_KERNEL_ARGS                      -52
339 #define CL_INVALID_WORK_DIMENSION                   -53
340 #define CL_INVALID_WORK_GROUP_SIZE                  -54
341 #define CL_INVALID_WORK_ITEM_SIZE                   -55
342 #define CL_INVALID_GLOBAL_OFFSET                    -56
343 #define CL_INVALID_EVENT_WAIT_LIST                  -57
344 #define CL_INVALID_EVENT                            -58
345 #define CL_INVALID_OPERATION                        -59
346 #define CL_INVALID_GL_OBJECT                        -60
347 #define CL_INVALID_BUFFER_SIZE                      -61
348 #define CL_INVALID_MIP_LEVEL                        -62
349 #define CL_INVALID_GLOBAL_WORK_SIZE                 -63
350 #define CL_INVALID_PROPERTY                         -64
351 #define CL_INVALID_IMAGE_DESCRIPTOR                 -65
352 #define CL_INVALID_COMPILER_OPTIONS                 -66
353 #define CL_INVALID_LINKER_OPTIONS                   -67
354 #define CL_INVALID_DEVICE_PARTITION_COUNT           -68
355 
356 /*#define CL_VERSION_1_0                              1
357 #define CL_VERSION_1_1                              1
358 #define CL_VERSION_1_2                              1*/
359 
360 #define CL_FALSE                                    0
361 #define CL_TRUE                                     1
362 #define CL_BLOCKING                                 CL_TRUE
363 #define CL_NON_BLOCKING                             CL_FALSE
364 
365 #define CL_PLATFORM_PROFILE                         0x0900
366 #define CL_PLATFORM_VERSION                         0x0901
367 #define CL_PLATFORM_NAME                            0x0902
368 #define CL_PLATFORM_VENDOR                          0x0903
369 #define CL_PLATFORM_EXTENSIONS                      0x0904
370 
371 #define CL_DEVICE_TYPE_DEFAULT                      (1 << 0)
372 #define CL_DEVICE_TYPE_CPU                          (1 << 1)
373 #define CL_DEVICE_TYPE_GPU                          (1 << 2)
374 #define CL_DEVICE_TYPE_ACCELERATOR                  (1 << 3)
375 #define CL_DEVICE_TYPE_CUSTOM                       (1 << 4)
376 #define CL_DEVICE_TYPE_ALL                          0xFFFFFFFF
377 #define CL_DEVICE_TYPE                              0x1000
378 #define CL_DEVICE_VENDOR_ID                         0x1001
379 #define CL_DEVICE_MAX_COMPUTE_UNITS                 0x1002
380 #define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS          0x1003
381 #define CL_DEVICE_MAX_WORK_GROUP_SIZE               0x1004
382 #define CL_DEVICE_MAX_WORK_ITEM_SIZES               0x1005
383 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR       0x1006
384 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT      0x1007
385 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT        0x1008
386 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG       0x1009
387 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT      0x100A
388 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE     0x100B
389 #define CL_DEVICE_MAX_CLOCK_FREQUENCY               0x100C
390 #define CL_DEVICE_ADDRESS_BITS                      0x100D
391 #define CL_DEVICE_MAX_READ_IMAGE_ARGS               0x100E
392 #define CL_DEVICE_MAX_WRITE_IMAGE_ARGS              0x100F
393 #define CL_DEVICE_MAX_MEM_ALLOC_SIZE                0x1010
394 #define CL_DEVICE_IMAGE2D_MAX_WIDTH                 0x1011
395 #define CL_DEVICE_IMAGE2D_MAX_HEIGHT                0x1012
396 #define CL_DEVICE_IMAGE3D_MAX_WIDTH                 0x1013
397 #define CL_DEVICE_IMAGE3D_MAX_HEIGHT                0x1014
398 #define CL_DEVICE_IMAGE3D_MAX_DEPTH                 0x1015
399 #define CL_DEVICE_IMAGE_SUPPORT                     0x1016
400 #define CL_DEVICE_MAX_PARAMETER_SIZE                0x1017
401 #define CL_DEVICE_MAX_SAMPLERS                      0x1018
402 #define CL_DEVICE_MEM_BASE_ADDR_ALIGN               0x1019
403 #define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE          0x101A
404 #define CL_DEVICE_SINGLE_FP_CONFIG                  0x101B
405 #define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE             0x101C
406 #define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE         0x101D
407 #define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE             0x101E
408 #define CL_DEVICE_GLOBAL_MEM_SIZE                   0x101F
409 #define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE          0x1020
410 #define CL_DEVICE_MAX_CONSTANT_ARGS                 0x1021
411 #define CL_DEVICE_LOCAL_MEM_TYPE                    0x1022
412 #define CL_DEVICE_LOCAL_MEM_SIZE                    0x1023
413 #define CL_DEVICE_ERROR_CORRECTION_SUPPORT          0x1024
414 #define CL_DEVICE_PROFILING_TIMER_RESOLUTION        0x1025
415 #define CL_DEVICE_ENDIAN_LITTLE                     0x1026
416 #define CL_DEVICE_AVAILABLE                         0x1027
417 #define CL_DEVICE_COMPILER_AVAILABLE                0x1028
418 #define CL_DEVICE_EXECUTION_CAPABILITIES            0x1029
419 #define CL_DEVICE_QUEUE_PROPERTIES                  0x102A
420 #define CL_DEVICE_NAME                              0x102B
421 #define CL_DEVICE_VENDOR                            0x102C
422 #define CL_DRIVER_VERSION                           0x102D
423 #define CL_DEVICE_PROFILE                           0x102E
424 #define CL_DEVICE_VERSION                           0x102F
425 #define CL_DEVICE_EXTENSIONS                        0x1030
426 #define CL_DEVICE_PLATFORM                          0x1031
427 #define CL_DEVICE_DOUBLE_FP_CONFIG                  0x1032
428 #define CL_DEVICE_HALF_FP_CONFIG                    0x1033
429 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF       0x1034
430 #define CL_DEVICE_HOST_UNIFIED_MEMORY               0x1035
431 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR          0x1036
432 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT         0x1037
433 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT           0x1038
434 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG          0x1039
435 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT         0x103A
436 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE        0x103B
437 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF          0x103C
438 #define CL_DEVICE_OPENCL_C_VERSION                  0x103D
439 #define CL_DEVICE_LINKER_AVAILABLE                  0x103E
440 #define CL_DEVICE_BUILT_IN_KERNELS                  0x103F
441 #define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE             0x1040
442 #define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE              0x1041
443 #define CL_DEVICE_PARENT_DEVICE                     0x1042
444 #define CL_DEVICE_PARTITION_MAX_SUB_DEVICES         0x1043
445 #define CL_DEVICE_PARTITION_PROPERTIES              0x1044
446 #define CL_DEVICE_PARTITION_AFFINITY_DOMAIN         0x1045
447 #define CL_DEVICE_PARTITION_TYPE                    0x1046
448 #define CL_DEVICE_REFERENCE_COUNT                   0x1047
449 #define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC       0x1048
450 #define CL_DEVICE_PRINTF_BUFFER_SIZE                0x1049
451 #define CL_DEVICE_IMAGE_PITCH_ALIGNMENT             0x104A
452 #define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT      0x104B
453 
454 #define CL_FP_DENORM                                (1 << 0)
455 #define CL_FP_INF_NAN                               (1 << 1)
456 #define CL_FP_ROUND_TO_NEAREST                      (1 << 2)
457 #define CL_FP_ROUND_TO_ZERO                         (1 << 3)
458 #define CL_FP_ROUND_TO_INF                          (1 << 4)
459 #define CL_FP_FMA                                   (1 << 5)
460 #define CL_FP_SOFT_FLOAT                            (1 << 6)
461 #define CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT         (1 << 7)
462 
463 #define CL_NONE                                     0x0
464 #define CL_READ_ONLY_CACHE                          0x1
465 #define CL_READ_WRITE_CACHE                         0x2
466 #define CL_LOCAL                                    0x1
467 #define CL_GLOBAL                                   0x2
468 #define CL_EXEC_KERNEL                              (1 << 0)
469 #define CL_EXEC_NATIVE_KERNEL                       (1 << 1)
470 #define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE      (1 << 0)
471 #define CL_QUEUE_PROFILING_ENABLE                   (1 << 1)
472 
473 #define CL_CONTEXT_REFERENCE_COUNT                  0x1080
474 #define CL_CONTEXT_DEVICES                          0x1081
475 #define CL_CONTEXT_PROPERTIES                       0x1082
476 #define CL_CONTEXT_NUM_DEVICES                      0x1083
477 #define CL_CONTEXT_PLATFORM                         0x1084
478 #define CL_CONTEXT_INTEROP_USER_SYNC                0x1085
479 
480 #define CL_DEVICE_PARTITION_EQUALLY                 0x1086
481 #define CL_DEVICE_PARTITION_BY_COUNTS               0x1087
482 #define CL_DEVICE_PARTITION_BY_COUNTS_LIST_END      0x0
483 #define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN      0x1088
484 #define CL_DEVICE_AFFINITY_DOMAIN_NUMA                     (1 << 0)
485 #define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE                 (1 << 1)
486 #define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE                 (1 << 2)
487 #define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE                 (1 << 3)
488 #define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE                 (1 << 4)
489 #define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE       (1 << 5)
490 #define CL_QUEUE_CONTEXT                            0x1090
491 #define CL_QUEUE_DEVICE                             0x1091
492 #define CL_QUEUE_REFERENCE_COUNT                    0x1092
493 #define CL_QUEUE_PROPERTIES                         0x1093
494 #define CL_MEM_READ_WRITE                           (1 << 0)
495 #define CL_MEM_WRITE_ONLY                           (1 << 1)
496 #define CL_MEM_READ_ONLY                            (1 << 2)
497 #define CL_MEM_USE_HOST_PTR                         (1 << 3)
498 #define CL_MEM_ALLOC_HOST_PTR                       (1 << 4)
499 #define CL_MEM_COPY_HOST_PTR                        (1 << 5)
500 // reserved                                         (1 << 6)
501 #define CL_MEM_HOST_WRITE_ONLY                      (1 << 7)
502 #define CL_MEM_HOST_READ_ONLY                       (1 << 8)
503 #define CL_MEM_HOST_NO_ACCESS                       (1 << 9)
504 #define CL_MIGRATE_MEM_OBJECT_HOST                  (1 << 0)
505 #define CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED     (1 << 1)
506 
507 #define CL_R                                        0x10B0
508 #define CL_A                                        0x10B1
509 #define CL_RG                                       0x10B2
510 #define CL_RA                                       0x10B3
511 #define CL_RGB                                      0x10B4
512 #define CL_RGBA                                     0x10B5
513 #define CL_BGRA                                     0x10B6
514 #define CL_ARGB                                     0x10B7
515 #define CL_INTENSITY                                0x10B8
516 #define CL_LUMINANCE                                0x10B9
517 #define CL_Rx                                       0x10BA
518 #define CL_RGx                                      0x10BB
519 #define CL_RGBx                                     0x10BC
520 #define CL_DEPTH                                    0x10BD
521 #define CL_DEPTH_STENCIL                            0x10BE
522 
523 #define CL_SNORM_INT8                               0x10D0
524 #define CL_SNORM_INT16                              0x10D1
525 #define CL_UNORM_INT8                               0x10D2
526 #define CL_UNORM_INT16                              0x10D3
527 #define CL_UNORM_SHORT_565                          0x10D4
528 #define CL_UNORM_SHORT_555                          0x10D5
529 #define CL_UNORM_INT_101010                         0x10D6
530 #define CL_SIGNED_INT8                              0x10D7
531 #define CL_SIGNED_INT16                             0x10D8
532 #define CL_SIGNED_INT32                             0x10D9
533 #define CL_UNSIGNED_INT8                            0x10DA
534 #define CL_UNSIGNED_INT16                           0x10DB
535 #define CL_UNSIGNED_INT32                           0x10DC
536 #define CL_HALF_FLOAT                               0x10DD
537 #define CL_FLOAT                                    0x10DE
538 #define CL_UNORM_INT24                              0x10DF
539 
540 #define CL_MEM_OBJECT_BUFFER                        0x10F0
541 #define CL_MEM_OBJECT_IMAGE2D                       0x10F1
542 #define CL_MEM_OBJECT_IMAGE3D                       0x10F2
543 #define CL_MEM_OBJECT_IMAGE2D_ARRAY                 0x10F3
544 #define CL_MEM_OBJECT_IMAGE1D                       0x10F4
545 #define CL_MEM_OBJECT_IMAGE1D_ARRAY                 0x10F5
546 #define CL_MEM_OBJECT_IMAGE1D_BUFFER                0x10F6
547 
548 #define CL_MEM_TYPE                                 0x1100
549 #define CL_MEM_FLAGS                                0x1101
550 #define CL_MEM_SIZE                                 0x1102
551 #define CL_MEM_HOST_PTR                             0x1103
552 #define CL_MEM_MAP_COUNT                            0x1104
553 #define CL_MEM_REFERENCE_COUNT                      0x1105
554 #define CL_MEM_CONTEXT                              0x1106
555 #define CL_MEM_ASSOCIATED_MEMOBJECT                 0x1107
556 #define CL_MEM_OFFSET                               0x1108
557 
558 #define CL_IMAGE_FORMAT                             0x1110
559 #define CL_IMAGE_ELEMENT_SIZE                       0x1111
560 #define CL_IMAGE_ROW_PITCH                          0x1112
561 #define CL_IMAGE_SLICE_PITCH                        0x1113
562 #define CL_IMAGE_WIDTH                              0x1114
563 #define CL_IMAGE_HEIGHT                             0x1115
564 #define CL_IMAGE_DEPTH                              0x1116
565 #define CL_IMAGE_ARRAY_SIZE                         0x1117
566 #define CL_IMAGE_BUFFER                             0x1118
567 #define CL_IMAGE_NUM_MIP_LEVELS                     0x1119
568 #define CL_IMAGE_NUM_SAMPLES                        0x111A
569 
570 #define CL_ADDRESS_NONE                             0x1130
571 #define CL_ADDRESS_CLAMP_TO_EDGE                    0x1131
572 #define CL_ADDRESS_CLAMP                            0x1132
573 #define CL_ADDRESS_REPEAT                           0x1133
574 #define CL_ADDRESS_MIRRORED_REPEAT                  0x1134
575 
576 #define CL_FILTER_NEAREST                           0x1140
577 #define CL_FILTER_LINEAR                            0x1141
578 
579 #define CL_SAMPLER_REFERENCE_COUNT                  0x1150
580 #define CL_SAMPLER_CONTEXT                          0x1151
581 #define CL_SAMPLER_NORMALIZED_COORDS                0x1152
582 #define CL_SAMPLER_ADDRESSING_MODE                  0x1153
583 #define CL_SAMPLER_FILTER_MODE                      0x1154
584 
585 #define CL_MAP_READ                                 (1 << 0)
586 #define CL_MAP_WRITE                                (1 << 1)
587 #define CL_MAP_WRITE_INVALIDATE_REGION              (1 << 2)
588 
589 #define CL_PROGRAM_REFERENCE_COUNT                  0x1160
590 #define CL_PROGRAM_CONTEXT                          0x1161
591 #define CL_PROGRAM_NUM_DEVICES                      0x1162
592 #define CL_PROGRAM_DEVICES                          0x1163
593 #define CL_PROGRAM_SOURCE                           0x1164
594 #define CL_PROGRAM_BINARY_SIZES                     0x1165
595 #define CL_PROGRAM_BINARIES                         0x1166
596 #define CL_PROGRAM_NUM_KERNELS                      0x1167
597 #define CL_PROGRAM_KERNEL_NAMES                     0x1168
598 #define CL_PROGRAM_BUILD_STATUS                     0x1181
599 #define CL_PROGRAM_BUILD_OPTIONS                    0x1182
600 #define CL_PROGRAM_BUILD_LOG                        0x1183
601 #define CL_PROGRAM_BINARY_TYPE                      0x1184
602 #define CL_PROGRAM_BINARY_TYPE_NONE                 0x0
603 #define CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT      0x1
604 #define CL_PROGRAM_BINARY_TYPE_LIBRARY              0x2
605 #define CL_PROGRAM_BINARY_TYPE_EXECUTABLE           0x4
606 
607 #define CL_BUILD_SUCCESS                            0
608 #define CL_BUILD_NONE                               -1
609 #define CL_BUILD_ERROR                              -2
610 #define CL_BUILD_IN_PROGRESS                        -3
611 
612 #define CL_KERNEL_FUNCTION_NAME                     0x1190
613 #define CL_KERNEL_NUM_ARGS                          0x1191
614 #define CL_KERNEL_REFERENCE_COUNT                   0x1192
615 #define CL_KERNEL_CONTEXT                           0x1193
616 #define CL_KERNEL_PROGRAM                           0x1194
617 #define CL_KERNEL_ATTRIBUTES                        0x1195
618 #define CL_KERNEL_ARG_ADDRESS_QUALIFIER             0x1196
619 #define CL_KERNEL_ARG_ACCESS_QUALIFIER              0x1197
620 #define CL_KERNEL_ARG_TYPE_NAME                     0x1198
621 #define CL_KERNEL_ARG_TYPE_QUALIFIER                0x1199
622 #define CL_KERNEL_ARG_NAME                          0x119A
623 #define CL_KERNEL_ARG_ADDRESS_GLOBAL                0x119B
624 #define CL_KERNEL_ARG_ADDRESS_LOCAL                 0x119C
625 #define CL_KERNEL_ARG_ADDRESS_CONSTANT              0x119D
626 #define CL_KERNEL_ARG_ADDRESS_PRIVATE               0x119E
627 #define CL_KERNEL_ARG_ACCESS_READ_ONLY              0x11A0
628 #define CL_KERNEL_ARG_ACCESS_WRITE_ONLY             0x11A1
629 #define CL_KERNEL_ARG_ACCESS_READ_WRITE             0x11A2
630 #define CL_KERNEL_ARG_ACCESS_NONE                   0x11A3
631 #define CL_KERNEL_ARG_TYPE_NONE                     0
632 #define CL_KERNEL_ARG_TYPE_CONST                    (1 << 0)
633 #define CL_KERNEL_ARG_TYPE_RESTRICT                 (1 << 1)
634 #define CL_KERNEL_ARG_TYPE_VOLATILE                 (1 << 2)
635 #define CL_KERNEL_WORK_GROUP_SIZE                   0x11B0
636 #define CL_KERNEL_COMPILE_WORK_GROUP_SIZE           0x11B1
637 #define CL_KERNEL_LOCAL_MEM_SIZE                    0x11B2
638 #define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3
639 #define CL_KERNEL_PRIVATE_MEM_SIZE                  0x11B4
640 #define CL_KERNEL_GLOBAL_WORK_SIZE                  0x11B5
641 
642 #define CL_EVENT_COMMAND_QUEUE                      0x11D0
643 #define CL_EVENT_COMMAND_TYPE                       0x11D1
644 #define CL_EVENT_REFERENCE_COUNT                    0x11D2
645 #define CL_EVENT_COMMAND_EXECUTION_STATUS           0x11D3
646 #define CL_EVENT_CONTEXT                            0x11D4
647 
648 #define CL_COMMAND_NDRANGE_KERNEL                   0x11F0
649 #define CL_COMMAND_TASK                             0x11F1
650 #define CL_COMMAND_NATIVE_KERNEL                    0x11F2
651 #define CL_COMMAND_READ_BUFFER                      0x11F3
652 #define CL_COMMAND_WRITE_BUFFER                     0x11F4
653 #define CL_COMMAND_COPY_BUFFER                      0x11F5
654 #define CL_COMMAND_READ_IMAGE                       0x11F6
655 #define CL_COMMAND_WRITE_IMAGE                      0x11F7
656 #define CL_COMMAND_COPY_IMAGE                       0x11F8
657 #define CL_COMMAND_COPY_IMAGE_TO_BUFFER             0x11F9
658 #define CL_COMMAND_COPY_BUFFER_TO_IMAGE             0x11FA
659 #define CL_COMMAND_MAP_BUFFER                       0x11FB
660 #define CL_COMMAND_MAP_IMAGE                        0x11FC
661 #define CL_COMMAND_UNMAP_MEM_OBJECT                 0x11FD
662 #define CL_COMMAND_MARKER                           0x11FE
663 #define CL_COMMAND_ACQUIRE_GL_OBJECTS               0x11FF
664 #define CL_COMMAND_RELEASE_GL_OBJECTS               0x1200
665 #define CL_COMMAND_READ_BUFFER_RECT                 0x1201
666 #define CL_COMMAND_WRITE_BUFFER_RECT                0x1202
667 #define CL_COMMAND_COPY_BUFFER_RECT                 0x1203
668 #define CL_COMMAND_USER                             0x1204
669 #define CL_COMMAND_BARRIER                          0x1205
670 #define CL_COMMAND_MIGRATE_MEM_OBJECTS              0x1206
671 #define CL_COMMAND_FILL_BUFFER                      0x1207
672 #define CL_COMMAND_FILL_IMAGE                       0x1208
673 
674 #define CL_COMPLETE                                 0x0
675 #define CL_RUNNING                                  0x1
676 #define CL_SUBMITTED                                0x2
677 #define CL_QUEUED                                   0x3
678 #define CL_BUFFER_CREATE_TYPE_REGION                0x1220
679 
680 #define CL_PROFILING_COMMAND_QUEUED                 0x1280
681 #define CL_PROFILING_COMMAND_SUBMIT                 0x1281
682 #define CL_PROFILING_COMMAND_START                  0x1282
683 #define CL_PROFILING_COMMAND_END                    0x1283
684 
685 #define CL_CALLBACK CV_STDCALL
686 
687 static volatile bool g_haveOpenCL = false;
688 static const char* oclFuncToCheck = "clEnqueueReadBufferRect";
689 
690 #if defined(__APPLE__)
691 #include <dlfcn.h>
692 
initOpenCLAndLoad(const char * funcname)693 static void* initOpenCLAndLoad(const char* funcname)
694 {
695     static bool initialized = false;
696     static void* handle = 0;
697     if (!handle)
698     {
699         if(!initialized)
700         {
701             const char* oclpath = getenv("OPENCV_OPENCL_RUNTIME");
702             oclpath = oclpath && strlen(oclpath) > 0 ? oclpath :
703                 "/System/Library/Frameworks/OpenCL.framework/Versions/Current/OpenCL";
704             handle = dlopen(oclpath, RTLD_LAZY);
705             initialized = true;
706             g_haveOpenCL = handle != 0 && dlsym(handle, oclFuncToCheck) != 0;
707             if( g_haveOpenCL )
708                 fprintf(stderr, "Successfully loaded OpenCL v1.1+ runtime from %s\n", oclpath);
709             else
710                 fprintf(stderr, "Failed to load OpenCL runtime\n");
711         }
712         if(!handle)
713             return 0;
714     }
715 
716     return funcname && handle ? dlsym(handle, funcname) : 0;
717 }
718 
719 #elif defined WIN32 || defined _WIN32
720 
721 #ifndef _WIN32_WINNT           // This is needed for the declaration of TryEnterCriticalSection in winbase.h with Visual Studio 2005 (and older?)
722   #define _WIN32_WINNT 0x0400  // http://msdn.microsoft.com/en-us/library/ms686857(VS.85).aspx
723 #endif
724 #include <windows.h>
725 #if (_WIN32_WINNT >= 0x0602)
726   #include <synchapi.h>
727 #endif
728 #undef small
729 #undef min
730 #undef max
731 #undef abs
732 
initOpenCLAndLoad(const char * funcname)733 static void* initOpenCLAndLoad(const char* funcname)
734 {
735     static bool initialized = false;
736     static HMODULE handle = 0;
737     if (!handle)
738     {
739 #ifndef WINRT
740         if(!initialized)
741         {
742             handle = LoadLibraryA("OpenCL.dll");
743             initialized = true;
744             g_haveOpenCL = handle != 0 && GetProcAddress(handle, oclFuncToCheck) != 0;
745         }
746 #endif
747         if(!handle)
748             return 0;
749     }
750 
751     return funcname ? (void*)GetProcAddress(handle, funcname) : 0;
752 }
753 
754 #elif defined(__linux)
755 
756 #include <dlfcn.h>
757 #include <stdio.h>
758 
initOpenCLAndLoad(const char * funcname)759 static void* initOpenCLAndLoad(const char* funcname)
760 {
761     static bool initialized = false;
762     static void* handle = 0;
763     if (!handle)
764     {
765         if(!initialized)
766         {
767             handle = dlopen("libOpenCL.so", RTLD_LAZY);
768             if(!handle)
769                 handle = dlopen("libCL.so", RTLD_LAZY);
770             initialized = true;
771             g_haveOpenCL = handle != 0 && dlsym(handle, oclFuncToCheck) != 0;
772         }
773         if(!handle)
774             return 0;
775     }
776 
777     return funcname ? (void*)dlsym(handle, funcname) : 0;
778 }
779 
780 #else
781 
initOpenCLAndLoad(const char *)782 static void* initOpenCLAndLoad(const char*)
783 {
784     return 0;
785 }
786 
787 #endif
788 
789 
790 #define OCL_FUNC(rettype, funcname, argsdecl, args) \
791     typedef rettype (CV_STDCALL * funcname##_t) argsdecl; \
792     static rettype funcname argsdecl \
793     { \
794         static funcname##_t funcname##_p = 0; \
795         if( !funcname##_p ) \
796         { \
797             funcname##_p = (funcname##_t)initOpenCLAndLoad(#funcname); \
798             if( !funcname##_p ) \
799                 return OPENCV_CL_NOT_IMPLEMENTED; \
800         } \
801         return funcname##_p args; \
802     }
803 
804 
805 #define OCL_FUNC_P(rettype, funcname, argsdecl, args) \
806     typedef rettype (CV_STDCALL * funcname##_t) argsdecl; \
807     static rettype funcname argsdecl \
808     { \
809         static funcname##_t funcname##_p = 0; \
810         if( !funcname##_p ) \
811         { \
812             funcname##_p = (funcname##_t)initOpenCLAndLoad(#funcname); \
813             if( !funcname##_p ) \
814             { \
815                 if( errcode_ret ) \
816                     *errcode_ret = OPENCV_CL_NOT_IMPLEMENTED; \
817                 return 0; \
818             } \
819         } \
820         return funcname##_p args; \
821     }
822 
823 OCL_FUNC(cl_int, clGetPlatformIDs,
824     (cl_uint num_entries, cl_platform_id* platforms, cl_uint* num_platforms),
825     (num_entries, platforms, num_platforms))
826 
827 OCL_FUNC(cl_int, clGetPlatformInfo,
828     (cl_platform_id platform, cl_platform_info param_name,
829     size_t param_value_size, void * param_value,
830     size_t * param_value_size_ret),
831     (platform, param_name, param_value_size, param_value, param_value_size_ret))
832 
833 OCL_FUNC(cl_int, clGetDeviceInfo,
834          (cl_device_id device,
835           cl_device_info param_name,
836           size_t param_value_size,
837           void * param_value,
838           size_t * param_value_size_ret),
839          (device, param_name, param_value_size, param_value, param_value_size_ret))
840 
841 
842 OCL_FUNC(cl_int, clGetDeviceIDs,
843     (cl_platform_id platform,
844     cl_device_type device_type,
845     cl_uint num_entries,
846     cl_device_id * devices,
847     cl_uint * num_devices),
848     (platform, device_type, num_entries, devices, num_devices))
849 
850 OCL_FUNC_P(cl_context, clCreateContext,
851     (const cl_context_properties * properties,
852     cl_uint num_devices,
853     const cl_device_id * devices,
854     void (CL_CALLBACK * pfn_notify)(const char *, const void *, size_t, void *),
855     void * user_data,
856     cl_int * errcode_ret),
857     (properties, num_devices, devices, pfn_notify, user_data, errcode_ret))
858 
859 OCL_FUNC(cl_int, clReleaseContext, (cl_context context), (context))
860 
861 /*
862 OCL_FUNC(cl_int, clRetainContext, (cl_context context), (context))
863 
864 OCL_FUNC_P(cl_context, clCreateContextFromType,
865     (const cl_context_properties * properties,
866     cl_device_type device_type,
867     void (CL_CALLBACK * pfn_notify)(const char *, const void *, size_t, void *),
868     void * user_data,
869     cl_int * errcode_ret),
870     (properties, device_type, pfn_notify, user_data, errcode_ret))
871 
872 OCL_FUNC(cl_int, clGetContextInfo,
873     (cl_context context,
874     cl_context_info param_name,
875     size_t param_value_size,
876     void * param_value,
877     size_t * param_value_size_ret),
878     (context, param_name, param_value_size,
879     param_value, param_value_size_ret))
880 */
881 OCL_FUNC_P(cl_command_queue, clCreateCommandQueue,
882     (cl_context context,
883     cl_device_id device,
884     cl_command_queue_properties properties,
885     cl_int * errcode_ret),
886     (context, device, properties, errcode_ret))
887 
888 OCL_FUNC(cl_int, clReleaseCommandQueue, (cl_command_queue command_queue), (command_queue))
889 
890 OCL_FUNC_P(cl_mem, clCreateBuffer,
891     (cl_context context,
892     cl_mem_flags flags,
893     size_t size,
894     void * host_ptr,
895     cl_int * errcode_ret),
896     (context, flags, size, host_ptr, errcode_ret))
897 
898 /*
899 OCL_FUNC(cl_int, clRetainCommandQueue, (cl_command_queue command_queue), (command_queue))
900 
901 OCL_FUNC(cl_int, clGetCommandQueueInfo,
902  (cl_command_queue command_queue,
903  cl_command_queue_info param_name,
904  size_t param_value_size,
905  void * param_value,
906  size_t * param_value_size_ret),
907  (command_queue, param_name, param_value_size, param_value, param_value_size_ret))
908 
909 OCL_FUNC_P(cl_mem, clCreateSubBuffer,
910     (cl_mem buffer,
911     cl_mem_flags flags,
912     cl_buffer_create_type buffer_create_type,
913     const void * buffer_create_info,
914     cl_int * errcode_ret),
915     (buffer, flags, buffer_create_type, buffer_create_info, errcode_ret))
916 */
917 
918 OCL_FUNC_P(cl_mem, clCreateImage,
919     (cl_context context,
920     cl_mem_flags flags,
921     const cl_image_format * image_format,
922     const cl_image_desc * image_desc,
923     void * host_ptr,
924     cl_int * errcode_ret),
925     (context, flags, image_format, image_desc, host_ptr, errcode_ret))
926 
927 OCL_FUNC_P(cl_mem, clCreateImage2D,
928     (cl_context context,
929     cl_mem_flags flags,
930     const cl_image_format * image_format,
931     size_t image_width,
932     size_t image_height,
933     size_t image_row_pitch,
934     void * host_ptr,
935     cl_int *errcode_ret),
936     (context, flags, image_format, image_width, image_height, image_row_pitch, host_ptr, errcode_ret))
937 
938 OCL_FUNC(cl_int, clGetSupportedImageFormats,
939  (cl_context context,
940  cl_mem_flags flags,
941  cl_mem_object_type image_type,
942  cl_uint num_entries,
943  cl_image_format * image_formats,
944  cl_uint * num_image_formats),
945  (context, flags, image_type, num_entries, image_formats, num_image_formats))
946 
947 
948 /*
949 OCL_FUNC(cl_int, clGetMemObjectInfo,
950  (cl_mem memobj,
951  cl_mem_info param_name,
952  size_t param_value_size,
953  void * param_value,
954  size_t * param_value_size_ret),
955  (memobj, param_name, param_value_size, param_value, param_value_size_ret))
956 
957 OCL_FUNC(cl_int, clGetImageInfo,
958  (cl_mem image,
959  cl_image_info param_name,
960  size_t param_value_size,
961  void * param_value,
962  size_t * param_value_size_ret),
963  (image, param_name, param_value_size, param_value, param_value_size_ret))
964 
965 OCL_FUNC(cl_int, clCreateKernelsInProgram,
966  (cl_program program,
967  cl_uint num_kernels,
968  cl_kernel * kernels,
969  cl_uint * num_kernels_ret),
970  (program, num_kernels, kernels, num_kernels_ret))
971 
972 OCL_FUNC(cl_int, clRetainKernel, (cl_kernel kernel), (kernel))
973 
974 OCL_FUNC(cl_int, clGetKernelArgInfo,
975  (cl_kernel kernel,
976  cl_uint arg_indx,
977  cl_kernel_arg_info param_name,
978  size_t param_value_size,
979  void * param_value,
980  size_t * param_value_size_ret),
981  (kernel, arg_indx, param_name, param_value_size, param_value, param_value_size_ret))
982 
983 OCL_FUNC(cl_int, clEnqueueReadImage,
984  (cl_command_queue command_queue,
985  cl_mem image,
986  cl_bool blocking_read,
987  const size_t * origin[3],
988  const size_t * region[3],
989  size_t row_pitch,
990  size_t slice_pitch,
991  void * ptr,
992  cl_uint num_events_in_wait_list,
993  const cl_event * event_wait_list,
994  cl_event * event),
995  (command_queue, image, blocking_read, origin, region,
996  row_pitch, slice_pitch,
997  ptr,
998  num_events_in_wait_list,
999  event_wait_list,
1000  event))
1001 
1002 OCL_FUNC(cl_int, clEnqueueWriteImage,
1003  (cl_command_queue command_queue,
1004  cl_mem image,
1005  cl_bool blocking_write,
1006  const size_t * origin[3],
1007  const size_t * region[3],
1008  size_t input_row_pitch,
1009  size_t input_slice_pitch,
1010  const void * ptr,
1011  cl_uint num_events_in_wait_list,
1012  const cl_event * event_wait_list,
1013  cl_event * event),
1014  (command_queue, image, blocking_write, origin, region, input_row_pitch,
1015  input_slice_pitch, ptr, num_events_in_wait_list, event_wait_list, event))
1016 
1017 OCL_FUNC(cl_int, clEnqueueFillImage,
1018  (cl_command_queue command_queue,
1019  cl_mem image,
1020  const void * fill_color,
1021  const size_t * origin[3],
1022  const size_t * region[3],
1023  cl_uint num_events_in_wait_list,
1024  const cl_event * event_wait_list,
1025  cl_event * event),
1026  (command_queue, image, fill_color, origin, region,
1027  num_events_in_wait_list, event_wait_list, event))
1028 
1029 OCL_FUNC(cl_int, clEnqueueCopyImage,
1030  (cl_command_queue command_queue,
1031  cl_mem src_image,
1032  cl_mem dst_image,
1033  const size_t * src_origin[3],
1034  const size_t * dst_origin[3],
1035  const size_t * region[3],
1036  cl_uint num_events_in_wait_list,
1037  const cl_event * event_wait_list,
1038  cl_event * event),
1039  (command_queue, src_image, dst_image, src_origin, dst_origin,
1040  region, num_events_in_wait_list, event_wait_list, event))
1041 
1042 OCL_FUNC(cl_int, clEnqueueCopyImageToBuffer,
1043  (cl_command_queue command_queue,
1044  cl_mem src_image,
1045  cl_mem dst_buffer,
1046  const size_t * src_origin[3],
1047  const size_t * region[3],
1048  size_t dst_offset,
1049  cl_uint num_events_in_wait_list,
1050  const cl_event * event_wait_list,
1051  cl_event * event),
1052  (command_queue, src_image, dst_buffer, src_origin, region, dst_offset,
1053  num_events_in_wait_list, event_wait_list, event))
1054 */
1055 
1056 OCL_FUNC(cl_int, clEnqueueCopyBufferToImage,
1057  (cl_command_queue command_queue,
1058  cl_mem src_buffer,
1059  cl_mem dst_image,
1060  size_t src_offset,
1061  const size_t dst_origin[3],
1062  const size_t region[3],
1063  cl_uint num_events_in_wait_list,
1064  const cl_event * event_wait_list,
1065  cl_event * event),
1066  (command_queue, src_buffer, dst_image, src_offset, dst_origin,
1067  region, num_events_in_wait_list, event_wait_list, event))
1068 
1069  OCL_FUNC(cl_int, clFlush,
1070  (cl_command_queue command_queue),
1071  (command_queue))
1072 
1073 /*
1074 OCL_FUNC_P(void*, clEnqueueMapImage,
1075  (cl_command_queue command_queue,
1076  cl_mem image,
1077  cl_bool blocking_map,
1078  cl_map_flags map_flags,
1079  const size_t * origin[3],
1080  const size_t * region[3],
1081  size_t * image_row_pitch,
1082  size_t * image_slice_pitch,
1083  cl_uint num_events_in_wait_list,
1084  const cl_event * event_wait_list,
1085  cl_event * event,
1086  cl_int * errcode_ret),
1087  (command_queue, image, blocking_map, map_flags, origin, region,
1088  image_row_pitch, image_slice_pitch, num_events_in_wait_list,
1089  event_wait_list, event, errcode_ret))
1090 */
1091 
1092 /*
1093 OCL_FUNC(cl_int, clRetainProgram, (cl_program program), (program))
1094 
1095 OCL_FUNC(cl_int, clGetKernelInfo,
1096  (cl_kernel kernel,
1097  cl_kernel_info param_name,
1098  size_t param_value_size,
1099  void * param_value,
1100  size_t * param_value_size_ret),
1101  (kernel, param_name, param_value_size, param_value, param_value_size_ret))
1102 
1103 OCL_FUNC(cl_int, clRetainMemObject, (cl_mem memobj), (memobj))
1104 
1105 */
1106 
1107 OCL_FUNC(cl_int, clReleaseMemObject, (cl_mem memobj), (memobj))
1108 
1109 
1110 OCL_FUNC_P(cl_program, clCreateProgramWithSource,
1111     (cl_context context,
1112     cl_uint count,
1113     const char ** strings,
1114     const size_t * lengths,
1115     cl_int * errcode_ret),
1116     (context, count, strings, lengths, errcode_ret))
1117 
1118 OCL_FUNC_P(cl_program, clCreateProgramWithBinary,
1119     (cl_context context,
1120     cl_uint num_devices,
1121     const cl_device_id * device_list,
1122     const size_t * lengths,
1123     const unsigned char ** binaries,
1124     cl_int * binary_status,
1125     cl_int * errcode_ret),
1126     (context, num_devices, device_list, lengths, binaries, binary_status, errcode_ret))
1127 
1128 OCL_FUNC(cl_int, clReleaseProgram, (cl_program program), (program))
1129 
1130 OCL_FUNC(cl_int, clBuildProgram,
1131     (cl_program program,
1132     cl_uint num_devices,
1133     const cl_device_id * device_list,
1134     const char * options,
1135     void (CL_CALLBACK * pfn_notify)(cl_program, void *),
1136     void * user_data),
1137     (program, num_devices, device_list, options, pfn_notify, user_data))
1138 
1139 OCL_FUNC(cl_int, clGetProgramInfo,
1140     (cl_program program,
1141     cl_program_info param_name,
1142     size_t param_value_size,
1143     void * param_value,
1144     size_t * param_value_size_ret),
1145     (program, param_name, param_value_size, param_value, param_value_size_ret))
1146 
1147 OCL_FUNC(cl_int, clGetProgramBuildInfo,
1148     (cl_program program,
1149     cl_device_id device,
1150     cl_program_build_info param_name,
1151     size_t param_value_size,
1152     void * param_value,
1153     size_t * param_value_size_ret),
1154     (program, device, param_name, param_value_size, param_value, param_value_size_ret))
1155 
1156 OCL_FUNC_P(cl_kernel, clCreateKernel,
1157     (cl_program program,
1158     const char * kernel_name,
1159     cl_int * errcode_ret),
1160     (program, kernel_name, errcode_ret))
1161 
1162 OCL_FUNC(cl_int, clReleaseKernel, (cl_kernel kernel), (kernel))
1163 
1164 OCL_FUNC(cl_int, clSetKernelArg,
1165     (cl_kernel kernel,
1166     cl_uint arg_index,
1167     size_t arg_size,
1168     const void * arg_value),
1169     (kernel, arg_index, arg_size, arg_value))
1170 
1171 OCL_FUNC(cl_int, clGetKernelWorkGroupInfo,
1172     (cl_kernel kernel,
1173     cl_device_id device,
1174     cl_kernel_work_group_info param_name,
1175     size_t param_value_size,
1176     void * param_value,
1177     size_t * param_value_size_ret),
1178     (kernel, device, param_name, param_value_size, param_value, param_value_size_ret))
1179 
1180 OCL_FUNC(cl_int, clFinish, (cl_command_queue command_queue), (command_queue))
1181 
1182 OCL_FUNC(cl_int, clEnqueueReadBuffer,
1183     (cl_command_queue command_queue,
1184     cl_mem buffer,
1185     cl_bool blocking_read,
1186     size_t offset,
1187     size_t size,
1188     void * ptr,
1189     cl_uint num_events_in_wait_list,
1190     const cl_event * event_wait_list,
1191     cl_event * event),
1192     (command_queue, buffer, blocking_read, offset, size, ptr,
1193     num_events_in_wait_list, event_wait_list, event))
1194 
1195 OCL_FUNC(cl_int, clEnqueueReadBufferRect,
1196     (cl_command_queue command_queue,
1197     cl_mem buffer,
1198     cl_bool blocking_read,
1199     const size_t * buffer_offset,
1200     const size_t * host_offset,
1201     const size_t * region,
1202     size_t buffer_row_pitch,
1203     size_t buffer_slice_pitch,
1204     size_t host_row_pitch,
1205     size_t host_slice_pitch,
1206     void * ptr,
1207     cl_uint num_events_in_wait_list,
1208     const cl_event * event_wait_list,
1209     cl_event * event),
1210     (command_queue, buffer, blocking_read, buffer_offset, host_offset, region, buffer_row_pitch,
1211     buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, num_events_in_wait_list,
1212     event_wait_list, event))
1213 
1214 OCL_FUNC(cl_int, clEnqueueWriteBuffer,
1215     (cl_command_queue command_queue,
1216     cl_mem buffer,
1217     cl_bool blocking_write,
1218     size_t offset,
1219     size_t size,
1220     const void * ptr,
1221     cl_uint num_events_in_wait_list,
1222     const cl_event * event_wait_list,
1223     cl_event * event),
1224     (command_queue, buffer, blocking_write, offset, size, ptr,
1225     num_events_in_wait_list, event_wait_list, event))
1226 
1227 OCL_FUNC(cl_int, clEnqueueWriteBufferRect,
1228     (cl_command_queue command_queue,
1229     cl_mem buffer,
1230     cl_bool blocking_write,
1231     const size_t * buffer_offset,
1232     const size_t * host_offset,
1233     const size_t * region,
1234     size_t buffer_row_pitch,
1235     size_t buffer_slice_pitch,
1236     size_t host_row_pitch,
1237     size_t host_slice_pitch,
1238     const void * ptr,
1239     cl_uint num_events_in_wait_list,
1240     const cl_event * event_wait_list,
1241     cl_event * event),
1242     (command_queue, buffer, blocking_write, buffer_offset, host_offset,
1243     region, buffer_row_pitch, buffer_slice_pitch, host_row_pitch,
1244     host_slice_pitch, ptr, num_events_in_wait_list, event_wait_list, event))
1245 
1246 /*OCL_FUNC(cl_int, clEnqueueFillBuffer,
1247     (cl_command_queue command_queue,
1248     cl_mem buffer,
1249     const void * pattern,
1250     size_t pattern_size,
1251     size_t offset,
1252     size_t size,
1253     cl_uint num_events_in_wait_list,
1254     const cl_event * event_wait_list,
1255     cl_event * event),
1256     (command_queue, buffer, pattern, pattern_size, offset, size,
1257     num_events_in_wait_list, event_wait_list, event))*/
1258 
1259 OCL_FUNC(cl_int, clEnqueueCopyBuffer,
1260     (cl_command_queue command_queue,
1261     cl_mem src_buffer,
1262     cl_mem dst_buffer,
1263     size_t src_offset,
1264     size_t dst_offset,
1265     size_t size,
1266     cl_uint num_events_in_wait_list,
1267     const cl_event * event_wait_list,
1268     cl_event * event),
1269     (command_queue, src_buffer, dst_buffer, src_offset, dst_offset,
1270     size, num_events_in_wait_list, event_wait_list, event))
1271 
1272 OCL_FUNC(cl_int, clEnqueueCopyBufferRect,
1273     (cl_command_queue command_queue,
1274     cl_mem src_buffer,
1275     cl_mem dst_buffer,
1276     const size_t * src_origin,
1277     const size_t * dst_origin,
1278     const size_t * region,
1279     size_t src_row_pitch,
1280     size_t src_slice_pitch,
1281     size_t dst_row_pitch,
1282     size_t dst_slice_pitch,
1283     cl_uint num_events_in_wait_list,
1284     const cl_event * event_wait_list,
1285     cl_event * event),
1286     (command_queue, src_buffer, dst_buffer, src_origin, dst_origin,
1287     region, src_row_pitch, src_slice_pitch, dst_row_pitch, dst_slice_pitch,
1288     num_events_in_wait_list, event_wait_list, event))
1289 
1290 OCL_FUNC_P(void*, clEnqueueMapBuffer,
1291     (cl_command_queue command_queue,
1292     cl_mem buffer,
1293     cl_bool blocking_map,
1294     cl_map_flags map_flags,
1295     size_t offset,
1296     size_t size,
1297     cl_uint num_events_in_wait_list,
1298     const cl_event * event_wait_list,
1299     cl_event * event,
1300     cl_int * errcode_ret),
1301     (command_queue, buffer, blocking_map, map_flags, offset, size,
1302     num_events_in_wait_list, event_wait_list, event, errcode_ret))
1303 
1304 OCL_FUNC(cl_int, clEnqueueUnmapMemObject,
1305     (cl_command_queue command_queue,
1306     cl_mem memobj,
1307     void * mapped_ptr,
1308     cl_uint num_events_in_wait_list,
1309     const cl_event * event_wait_list,
1310     cl_event * event),
1311     (command_queue, memobj, mapped_ptr, num_events_in_wait_list, event_wait_list, event))
1312 
1313 OCL_FUNC(cl_int, clEnqueueNDRangeKernel,
1314     (cl_command_queue command_queue,
1315     cl_kernel kernel,
1316     cl_uint work_dim,
1317     const size_t * global_work_offset,
1318     const size_t * global_work_size,
1319     const size_t * local_work_size,
1320     cl_uint num_events_in_wait_list,
1321     const cl_event * event_wait_list,
1322     cl_event * event),
1323     (command_queue, kernel, work_dim, global_work_offset, global_work_size,
1324     local_work_size, num_events_in_wait_list, event_wait_list, event))
1325 
1326 OCL_FUNC(cl_int, clEnqueueTask,
1327     (cl_command_queue command_queue,
1328     cl_kernel kernel,
1329     cl_uint num_events_in_wait_list,
1330     const cl_event * event_wait_list,
1331     cl_event * event),
1332     (command_queue, kernel, num_events_in_wait_list, event_wait_list, event))
1333 
1334 OCL_FUNC(cl_int, clSetEventCallback,
1335     (cl_event event,
1336     cl_int command_exec_callback_type ,
1337     void (CL_CALLBACK  *pfn_event_notify) (cl_event event, cl_int event_command_exec_status, void *user_data),
1338     void *user_data),
1339     (event, command_exec_callback_type, pfn_event_notify, user_data))
1340 
1341 OCL_FUNC(cl_int, clReleaseEvent, (cl_event event), (event))
1342 
1343 }
1344 
1345 #endif
1346 
1347 #ifndef CL_VERSION_1_2
1348 #define CL_VERSION_1_2
1349 #endif
1350 
1351 #endif
1352 
1353 #ifdef _DEBUG
1354 #define CV_OclDbgAssert CV_DbgAssert
1355 #else
isRaiseError()1356 static bool isRaiseError()
1357 {
1358     static bool initialized = false;
1359     static bool value = false;
1360     if (!initialized)
1361     {
1362         value = getBoolParameter("OPENCV_OPENCL_RAISE_ERROR", false);
1363         initialized = true;
1364     }
1365     return value;
1366 }
1367 #define CV_OclDbgAssert(expr) do { if (isRaiseError()) { CV_Assert(expr); } else { (void)(expr); } } while ((void)0, 0)
1368 #endif
1369 
1370 #ifdef HAVE_OPENCL_SVM
1371 #include "opencv2/core/opencl/runtime/opencl_svm_20.hpp"
1372 #include "opencv2/core/opencl/runtime/opencl_svm_hsa_extension.hpp"
1373 #include "opencv2/core/opencl/opencl_svm.hpp"
1374 #endif
1375 
1376 namespace cv { namespace ocl {
1377 
1378 struct UMat2D
1379 {
UMat2Dcv::ocl::UMat2D1380     UMat2D(const UMat& m)
1381     {
1382         offset = (int)m.offset;
1383         step = (int)m.step;
1384         rows = m.rows;
1385         cols = m.cols;
1386     }
1387     int offset;
1388     int step;
1389     int rows;
1390     int cols;
1391 };
1392 
1393 struct UMat3D
1394 {
UMat3Dcv::ocl::UMat3D1395     UMat3D(const UMat& m)
1396     {
1397         offset = (int)m.offset;
1398         step = (int)m.step.p[1];
1399         slicestep = (int)m.step.p[0];
1400         slices = (int)m.size.p[0];
1401         rows = m.size.p[1];
1402         cols = m.size.p[2];
1403     }
1404     int offset;
1405     int slicestep;
1406     int step;
1407     int slices;
1408     int rows;
1409     int cols;
1410 };
1411 
1412 // Computes 64-bit "cyclic redundancy check" sum, as specified in ECMA-182
crc64(const uchar * data,size_t size,uint64 crc0=0)1413 static uint64 crc64( const uchar* data, size_t size, uint64 crc0=0 )
1414 {
1415     static uint64 table[256];
1416     static bool initialized = false;
1417 
1418     if( !initialized )
1419     {
1420         for( int i = 0; i < 256; i++ )
1421         {
1422             uint64 c = i;
1423             for( int j = 0; j < 8; j++ )
1424                 c = ((c & 1) ? CV_BIG_UINT(0xc96c5795d7870f42) : 0) ^ (c >> 1);
1425             table[i] = c;
1426         }
1427         initialized = true;
1428     }
1429 
1430     uint64 crc = ~crc0;
1431     for( size_t idx = 0; idx < size; idx++ )
1432         crc = table[(uchar)crc ^ data[idx]] ^ (crc >> 8);
1433 
1434     return ~crc;
1435 }
1436 
1437 struct HashKey
1438 {
1439     typedef uint64 part;
HashKeycv::ocl::HashKey1440     HashKey(part _a, part _b) : a(_a), b(_b) {}
1441     part a, b;
1442 };
1443 
operator ==(const HashKey & h1,const HashKey & h2)1444 inline bool operator == (const HashKey& h1, const HashKey& h2)
1445 {
1446     return h1.a == h2.a && h1.b == h2.b;
1447 }
1448 
operator <(const HashKey & h1,const HashKey & h2)1449 inline bool operator < (const HashKey& h1, const HashKey& h2)
1450 {
1451     return h1.a < h2.a || (h1.a == h2.a && h1.b < h2.b);
1452 }
1453 
1454 
haveOpenCL()1455 bool haveOpenCL()
1456 {
1457 #ifdef HAVE_OPENCL
1458     static bool g_isOpenCLInitialized = false;
1459     static bool g_isOpenCLAvailable = false;
1460 
1461     if (!g_isOpenCLInitialized)
1462     {
1463         try
1464         {
1465             cl_uint n = 0;
1466             g_isOpenCLAvailable = ::clGetPlatformIDs(0, NULL, &n) == CL_SUCCESS;
1467         }
1468         catch (...)
1469         {
1470             g_isOpenCLAvailable = false;
1471         }
1472         g_isOpenCLInitialized = true;
1473     }
1474     return g_isOpenCLAvailable;
1475 #else
1476     return false;
1477 #endif
1478 }
1479 
useOpenCL()1480 bool useOpenCL()
1481 {
1482     CoreTLSData* data = getCoreTlsData().get();
1483     if( data->useOpenCL < 0 )
1484     {
1485         try
1486         {
1487             data->useOpenCL = (int)haveOpenCL() && Device::getDefault().ptr() && Device::getDefault().available();
1488         }
1489         catch (...)
1490         {
1491             data->useOpenCL = 0;
1492         }
1493     }
1494     return data->useOpenCL > 0;
1495 }
1496 
setUseOpenCL(bool flag)1497 void setUseOpenCL(bool flag)
1498 {
1499     if( haveOpenCL() )
1500     {
1501         CoreTLSData* data = getCoreTlsData().get();
1502         data->useOpenCL = (flag && Device::getDefault().ptr() != NULL) ? 1 : 0;
1503     }
1504 }
1505 
1506 #ifdef HAVE_CLAMDBLAS
1507 
1508 class AmdBlasHelper
1509 {
1510 public:
getInstance()1511     static AmdBlasHelper & getInstance()
1512     {
1513         static AmdBlasHelper amdBlas;
1514         return amdBlas;
1515     }
1516 
isAvailable() const1517     bool isAvailable() const
1518     {
1519         return g_isAmdBlasAvailable;
1520     }
1521 
~AmdBlasHelper()1522     ~AmdBlasHelper()
1523     {
1524         try
1525         {
1526             clAmdBlasTeardown();
1527         }
1528         catch (...) { }
1529     }
1530 
1531 protected:
AmdBlasHelper()1532     AmdBlasHelper()
1533     {
1534         if (!g_isAmdBlasInitialized)
1535         {
1536             AutoLock lock(m);
1537 
1538             if (!g_isAmdBlasInitialized && haveOpenCL())
1539             {
1540                 try
1541                 {
1542                     g_isAmdBlasAvailable = clAmdBlasSetup() == clAmdBlasSuccess;
1543                 }
1544                 catch (...)
1545                 {
1546                     g_isAmdBlasAvailable = false;
1547                 }
1548             }
1549             else
1550                 g_isAmdBlasAvailable = false;
1551 
1552             g_isAmdBlasInitialized = true;
1553         }
1554     }
1555 
1556 private:
1557     static Mutex m;
1558     static bool g_isAmdBlasInitialized;
1559     static bool g_isAmdBlasAvailable;
1560 };
1561 
1562 bool AmdBlasHelper::g_isAmdBlasAvailable = false;
1563 bool AmdBlasHelper::g_isAmdBlasInitialized = false;
1564 Mutex AmdBlasHelper::m;
1565 
haveAmdBlas()1566 bool haveAmdBlas()
1567 {
1568     return AmdBlasHelper::getInstance().isAvailable();
1569 }
1570 
1571 #else
1572 
haveAmdBlas()1573 bool haveAmdBlas()
1574 {
1575     return false;
1576 }
1577 
1578 #endif
1579 
1580 #ifdef HAVE_CLAMDFFT
1581 
1582 class AmdFftHelper
1583 {
1584 public:
getInstance()1585     static AmdFftHelper & getInstance()
1586     {
1587         static AmdFftHelper amdFft;
1588         return amdFft;
1589     }
1590 
isAvailable() const1591     bool isAvailable() const
1592     {
1593         return g_isAmdFftAvailable;
1594     }
1595 
~AmdFftHelper()1596     ~AmdFftHelper()
1597     {
1598         try
1599         {
1600 //            clAmdFftTeardown();
1601         }
1602         catch (...) { }
1603     }
1604 
1605 protected:
AmdFftHelper()1606     AmdFftHelper()
1607     {
1608         if (!g_isAmdFftInitialized)
1609         {
1610             AutoLock lock(m);
1611 
1612             if (!g_isAmdFftInitialized && haveOpenCL())
1613             {
1614                 try
1615                 {
1616                     cl_uint major, minor, patch;
1617                     CV_Assert(clAmdFftInitSetupData(&setupData) == CLFFT_SUCCESS);
1618 
1619                     // it throws exception in case AmdFft binaries are not found
1620                     CV_Assert(clAmdFftGetVersion(&major, &minor, &patch) == CLFFT_SUCCESS);
1621                     g_isAmdFftAvailable = true;
1622                 }
1623                 catch (const Exception &)
1624                 {
1625                     g_isAmdFftAvailable = false;
1626                 }
1627             }
1628             else
1629                 g_isAmdFftAvailable = false;
1630 
1631             g_isAmdFftInitialized = true;
1632         }
1633     }
1634 
1635 private:
1636     static clAmdFftSetupData setupData;
1637     static Mutex m;
1638     static bool g_isAmdFftInitialized;
1639     static bool g_isAmdFftAvailable;
1640 };
1641 
1642 clAmdFftSetupData AmdFftHelper::setupData;
1643 bool AmdFftHelper::g_isAmdFftAvailable = false;
1644 bool AmdFftHelper::g_isAmdFftInitialized = false;
1645 Mutex AmdFftHelper::m;
1646 
haveAmdFft()1647 bool haveAmdFft()
1648 {
1649     return AmdFftHelper::getInstance().isAvailable();
1650 }
1651 
1652 #else
1653 
haveAmdFft()1654 bool haveAmdFft()
1655 {
1656     return false;
1657 }
1658 
1659 #endif
1660 
haveSVM()1661 bool haveSVM()
1662 {
1663 #ifdef HAVE_OPENCL_SVM
1664     return true;
1665 #else
1666     return false;
1667 #endif
1668 }
1669 
finish()1670 void finish()
1671 {
1672     Queue::getDefault().finish();
1673 }
1674 
1675 #define IMPLEMENT_REFCOUNTABLE() \
1676     void addref() { CV_XADD(&refcount, 1); } \
1677     void release() { if( CV_XADD(&refcount, -1) == 1 && !cv::__termination) delete this; } \
1678     int refcount
1679 
1680 /////////////////////////////////////////// Platform /////////////////////////////////////////////
1681 
1682 struct Platform::Impl
1683 {
Implcv::ocl::Platform::Impl1684     Impl()
1685     {
1686         refcount = 1;
1687         handle = 0;
1688         initialized = false;
1689     }
1690 
~Implcv::ocl::Platform::Impl1691     ~Impl() {}
1692 
initcv::ocl::Platform::Impl1693     void init()
1694     {
1695         if( !initialized )
1696         {
1697             //cl_uint num_entries
1698             cl_uint n = 0;
1699             if( clGetPlatformIDs(1, &handle, &n) != CL_SUCCESS || n == 0 )
1700                 handle = 0;
1701             if( handle != 0 )
1702             {
1703                 char buf[1000];
1704                 size_t len = 0;
1705                 CV_OclDbgAssert(clGetPlatformInfo(handle, CL_PLATFORM_VENDOR, sizeof(buf), buf, &len) == CL_SUCCESS);
1706                 buf[len] = '\0';
1707                 vendor = String(buf);
1708             }
1709 
1710             initialized = true;
1711         }
1712     }
1713 
1714     IMPLEMENT_REFCOUNTABLE();
1715 
1716     cl_platform_id handle;
1717     String vendor;
1718     bool initialized;
1719 };
1720 
Platform()1721 Platform::Platform()
1722 {
1723     p = 0;
1724 }
1725 
~Platform()1726 Platform::~Platform()
1727 {
1728     if(p)
1729         p->release();
1730 }
1731 
Platform(const Platform & pl)1732 Platform::Platform(const Platform& pl)
1733 {
1734     p = (Impl*)pl.p;
1735     if(p)
1736         p->addref();
1737 }
1738 
operator =(const Platform & pl)1739 Platform& Platform::operator = (const Platform& pl)
1740 {
1741     Impl* newp = (Impl*)pl.p;
1742     if(newp)
1743         newp->addref();
1744     if(p)
1745         p->release();
1746     p = newp;
1747     return *this;
1748 }
1749 
ptr() const1750 void* Platform::ptr() const
1751 {
1752     return p ? p->handle : 0;
1753 }
1754 
getDefault()1755 Platform& Platform::getDefault()
1756 {
1757     static Platform p;
1758     if( !p.p )
1759     {
1760         p.p = new Impl;
1761         p.p->init();
1762     }
1763     return p;
1764 }
1765 
1766 /////////////////////////////////////// Device ////////////////////////////////////////////
1767 
1768 // deviceVersion has format
1769 //   OpenCL<space><major_version.minor_version><space><vendor-specific information>
1770 // by specification
1771 //   http://www.khronos.org/registry/cl/sdk/1.1/docs/man/xhtml/clGetDeviceInfo.html
1772 //   http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clGetDeviceInfo.html
parseDeviceVersion(const String & deviceVersion,int & major,int & minor)1773 static void parseDeviceVersion(const String &deviceVersion, int &major, int &minor)
1774 {
1775     major = minor = 0;
1776     if (10 >= deviceVersion.length())
1777         return;
1778     const char *pstr = deviceVersion.c_str();
1779     if (0 != strncmp(pstr, "OpenCL ", 7))
1780         return;
1781     size_t ppos = deviceVersion.find('.', 7);
1782     if (String::npos == ppos)
1783         return;
1784     String temp = deviceVersion.substr(7, ppos - 7);
1785     major = atoi(temp.c_str());
1786     temp = deviceVersion.substr(ppos + 1);
1787     minor = atoi(temp.c_str());
1788 }
1789 
1790 struct Device::Impl
1791 {
Implcv::ocl::Device::Impl1792     Impl(void* d)
1793     {
1794         handle = (cl_device_id)d;
1795         refcount = 1;
1796 
1797         name_ = getStrProp(CL_DEVICE_NAME);
1798         version_ = getStrProp(CL_DEVICE_VERSION);
1799         doubleFPConfig_ = getProp<cl_device_fp_config, int>(CL_DEVICE_DOUBLE_FP_CONFIG);
1800         hostUnifiedMemory_ = getBoolProp(CL_DEVICE_HOST_UNIFIED_MEMORY);
1801         maxComputeUnits_ = getProp<cl_uint, int>(CL_DEVICE_MAX_COMPUTE_UNITS);
1802         maxWorkGroupSize_ = getProp<size_t, size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE);
1803         type_ = getProp<cl_device_type, int>(CL_DEVICE_TYPE);
1804         driverVersion_ = getStrProp(CL_DRIVER_VERSION);
1805 
1806         String deviceVersion_ = getStrProp(CL_DEVICE_VERSION);
1807         parseDeviceVersion(deviceVersion_, deviceVersionMajor_, deviceVersionMinor_);
1808 
1809         vendorName_ = getStrProp(CL_DEVICE_VENDOR);
1810         if (vendorName_ == "Advanced Micro Devices, Inc." ||
1811             vendorName_ == "AMD")
1812             vendorID_ = VENDOR_AMD;
1813         else if (vendorName_ == "Intel(R) Corporation" || vendorName_ == "Intel" || strstr(name_.c_str(), "Iris") != 0)
1814             vendorID_ = VENDOR_INTEL;
1815         else if (vendorName_ == "NVIDIA Corporation")
1816             vendorID_ = VENDOR_NVIDIA;
1817         else
1818             vendorID_ = UNKNOWN_VENDOR;
1819     }
1820 
1821     template<typename _TpCL, typename _TpOut>
getPropcv::ocl::Device::Impl1822     _TpOut getProp(cl_device_info prop) const
1823     {
1824         _TpCL temp=_TpCL();
1825         size_t sz = 0;
1826 
1827         return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
1828             sz == sizeof(temp) ? _TpOut(temp) : _TpOut();
1829     }
1830 
getBoolPropcv::ocl::Device::Impl1831     bool getBoolProp(cl_device_info prop) const
1832     {
1833         cl_bool temp = CL_FALSE;
1834         size_t sz = 0;
1835 
1836         return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
1837             sz == sizeof(temp) ? temp != 0 : false;
1838     }
1839 
getStrPropcv::ocl::Device::Impl1840     String getStrProp(cl_device_info prop) const
1841     {
1842         char buf[1024];
1843         size_t sz=0;
1844         return clGetDeviceInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
1845             sz < sizeof(buf) ? String(buf) : String();
1846     }
1847 
1848     IMPLEMENT_REFCOUNTABLE();
1849     cl_device_id handle;
1850 
1851     String name_;
1852     String version_;
1853     int doubleFPConfig_;
1854     bool hostUnifiedMemory_;
1855     int maxComputeUnits_;
1856     size_t maxWorkGroupSize_;
1857     int type_;
1858     int deviceVersionMajor_;
1859     int deviceVersionMinor_;
1860     String driverVersion_;
1861     String vendorName_;
1862     int vendorID_;
1863 };
1864 
1865 
Device()1866 Device::Device()
1867 {
1868     p = 0;
1869 }
1870 
Device(void * d)1871 Device::Device(void* d)
1872 {
1873     p = 0;
1874     set(d);
1875 }
1876 
Device(const Device & d)1877 Device::Device(const Device& d)
1878 {
1879     p = d.p;
1880     if(p)
1881         p->addref();
1882 }
1883 
operator =(const Device & d)1884 Device& Device::operator = (const Device& d)
1885 {
1886     Impl* newp = (Impl*)d.p;
1887     if(newp)
1888         newp->addref();
1889     if(p)
1890         p->release();
1891     p = newp;
1892     return *this;
1893 }
1894 
~Device()1895 Device::~Device()
1896 {
1897     if(p)
1898         p->release();
1899 }
1900 
set(void * d)1901 void Device::set(void* d)
1902 {
1903     if(p)
1904         p->release();
1905     p = new Impl(d);
1906 }
1907 
ptr() const1908 void* Device::ptr() const
1909 {
1910     return p ? p->handle : 0;
1911 }
1912 
name() const1913 String Device::name() const
1914 { return p ? p->name_ : String(); }
1915 
extensions() const1916 String Device::extensions() const
1917 { return p ? p->getStrProp(CL_DEVICE_EXTENSIONS) : String(); }
1918 
version() const1919 String Device::version() const
1920 { return p ? p->version_ : String(); }
1921 
vendorName() const1922 String Device::vendorName() const
1923 { return p ? p->vendorName_ : String(); }
1924 
vendorID() const1925 int Device::vendorID() const
1926 { return p ? p->vendorID_ : 0; }
1927 
OpenCL_C_Version() const1928 String Device::OpenCL_C_Version() const
1929 { return p ? p->getStrProp(CL_DEVICE_OPENCL_C_VERSION) : String(); }
1930 
OpenCLVersion() const1931 String Device::OpenCLVersion() const
1932 { return p ? p->getStrProp(CL_DEVICE_EXTENSIONS) : String(); }
1933 
deviceVersionMajor() const1934 int Device::deviceVersionMajor() const
1935 { return p ? p->deviceVersionMajor_ : 0; }
1936 
deviceVersionMinor() const1937 int Device::deviceVersionMinor() const
1938 { return p ? p->deviceVersionMinor_ : 0; }
1939 
driverVersion() const1940 String Device::driverVersion() const
1941 { return p ? p->driverVersion_ : String(); }
1942 
type() const1943 int Device::type() const
1944 { return p ? p->type_ : 0; }
1945 
addressBits() const1946 int Device::addressBits() const
1947 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_ADDRESS_BITS) : 0; }
1948 
available() const1949 bool Device::available() const
1950 { return p ? p->getBoolProp(CL_DEVICE_AVAILABLE) : false; }
1951 
compilerAvailable() const1952 bool Device::compilerAvailable() const
1953 { return p ? p->getBoolProp(CL_DEVICE_COMPILER_AVAILABLE) : false; }
1954 
linkerAvailable() const1955 bool Device::linkerAvailable() const
1956 #ifdef CL_VERSION_1_2
1957 { return p ? p->getBoolProp(CL_DEVICE_LINKER_AVAILABLE) : false; }
1958 #else
1959 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1960 #endif
1961 
doubleFPConfig() const1962 int Device::doubleFPConfig() const
1963 { return p ? p->doubleFPConfig_ : 0; }
1964 
singleFPConfig() const1965 int Device::singleFPConfig() const
1966 { return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_SINGLE_FP_CONFIG) : 0; }
1967 
halfFPConfig() const1968 int Device::halfFPConfig() const
1969 #ifdef CL_VERSION_1_2
1970 { return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_HALF_FP_CONFIG) : 0; }
1971 #else
1972 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1973 #endif
1974 
endianLittle() const1975 bool Device::endianLittle() const
1976 { return p ? p->getBoolProp(CL_DEVICE_ENDIAN_LITTLE) : false; }
1977 
errorCorrectionSupport() const1978 bool Device::errorCorrectionSupport() const
1979 { return p ? p->getBoolProp(CL_DEVICE_ERROR_CORRECTION_SUPPORT) : false; }
1980 
executionCapabilities() const1981 int Device::executionCapabilities() const
1982 { return p ? p->getProp<cl_device_exec_capabilities, int>(CL_DEVICE_EXECUTION_CAPABILITIES) : 0; }
1983 
globalMemCacheSize() const1984 size_t Device::globalMemCacheSize() const
1985 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE) : 0; }
1986 
globalMemCacheType() const1987 int Device::globalMemCacheType() const
1988 { return p ? p->getProp<cl_device_mem_cache_type, int>(CL_DEVICE_GLOBAL_MEM_CACHE_TYPE) : 0; }
1989 
globalMemCacheLineSize() const1990 int Device::globalMemCacheLineSize() const
1991 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE) : 0; }
1992 
globalMemSize() const1993 size_t Device::globalMemSize() const
1994 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_SIZE) : 0; }
1995 
localMemSize() const1996 size_t Device::localMemSize() const
1997 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_LOCAL_MEM_SIZE) : 0; }
1998 
localMemType() const1999 int Device::localMemType() const
2000 { return p ? p->getProp<cl_device_local_mem_type, int>(CL_DEVICE_LOCAL_MEM_TYPE) : 0; }
2001 
hostUnifiedMemory() const2002 bool Device::hostUnifiedMemory() const
2003 { return p ? p->hostUnifiedMemory_ : false; }
2004 
imageSupport() const2005 bool Device::imageSupport() const
2006 { return p ? p->getBoolProp(CL_DEVICE_IMAGE_SUPPORT) : false; }
2007 
imageFromBufferSupport() const2008 bool Device::imageFromBufferSupport() const
2009 {
2010     bool ret = false;
2011     if (p)
2012     {
2013         size_t pos = p->getStrProp(CL_DEVICE_EXTENSIONS).find("cl_khr_image2d_from_buffer");
2014         if (pos != String::npos)
2015         {
2016             ret = true;
2017         }
2018     }
2019     return ret;
2020 }
2021 
imagePitchAlignment() const2022 uint Device::imagePitchAlignment() const
2023 {
2024 #ifdef CL_DEVICE_IMAGE_PITCH_ALIGNMENT
2025     return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_PITCH_ALIGNMENT) : 0;
2026 #else
2027     return 0;
2028 #endif
2029 }
2030 
imageBaseAddressAlignment() const2031 uint Device::imageBaseAddressAlignment() const
2032 {
2033 #ifdef CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT
2034     return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT) : 0;
2035 #else
2036     return 0;
2037 #endif
2038 }
2039 
image2DMaxWidth() const2040 size_t Device::image2DMaxWidth() const
2041 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_WIDTH) : 0; }
2042 
image2DMaxHeight() const2043 size_t Device::image2DMaxHeight() const
2044 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_HEIGHT) : 0; }
2045 
image3DMaxWidth() const2046 size_t Device::image3DMaxWidth() const
2047 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_WIDTH) : 0; }
2048 
image3DMaxHeight() const2049 size_t Device::image3DMaxHeight() const
2050 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_HEIGHT) : 0; }
2051 
image3DMaxDepth() const2052 size_t Device::image3DMaxDepth() const
2053 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_DEPTH) : 0; }
2054 
imageMaxBufferSize() const2055 size_t Device::imageMaxBufferSize() const
2056 #ifdef CL_VERSION_1_2
2057 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_BUFFER_SIZE) : 0; }
2058 #else
2059 { CV_REQUIRE_OPENCL_1_2_ERROR; }
2060 #endif
2061 
imageMaxArraySize() const2062 size_t Device::imageMaxArraySize() const
2063 #ifdef CL_VERSION_1_2
2064 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_ARRAY_SIZE) : 0; }
2065 #else
2066 { CV_REQUIRE_OPENCL_1_2_ERROR; }
2067 #endif
2068 
maxClockFrequency() const2069 int Device::maxClockFrequency() const
2070 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CLOCK_FREQUENCY) : 0; }
2071 
maxComputeUnits() const2072 int Device::maxComputeUnits() const
2073 { return p ? p->maxComputeUnits_ : 0; }
2074 
maxConstantArgs() const2075 int Device::maxConstantArgs() const
2076 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CONSTANT_ARGS) : 0; }
2077 
maxConstantBufferSize() const2078 size_t Device::maxConstantBufferSize() const
2079 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) : 0; }
2080 
maxMemAllocSize() const2081 size_t Device::maxMemAllocSize() const
2082 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_MEM_ALLOC_SIZE) : 0; }
2083 
maxParameterSize() const2084 size_t Device::maxParameterSize() const
2085 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_PARAMETER_SIZE) : 0; }
2086 
maxReadImageArgs() const2087 int Device::maxReadImageArgs() const
2088 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_READ_IMAGE_ARGS) : 0; }
2089 
maxWriteImageArgs() const2090 int Device::maxWriteImageArgs() const
2091 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WRITE_IMAGE_ARGS) : 0; }
2092 
maxSamplers() const2093 int Device::maxSamplers() const
2094 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_SAMPLERS) : 0; }
2095 
maxWorkGroupSize() const2096 size_t Device::maxWorkGroupSize() const
2097 { return p ? p->maxWorkGroupSize_ : 0; }
2098 
maxWorkItemDims() const2099 int Device::maxWorkItemDims() const
2100 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS) : 0; }
2101 
maxWorkItemSizes(size_t * sizes) const2102 void Device::maxWorkItemSizes(size_t* sizes) const
2103 {
2104     if(p)
2105     {
2106         const int MAX_DIMS = 32;
2107         size_t retsz = 0;
2108         CV_OclDbgAssert(clGetDeviceInfo(p->handle, CL_DEVICE_MAX_WORK_ITEM_SIZES,
2109                 MAX_DIMS*sizeof(sizes[0]), &sizes[0], &retsz) == CL_SUCCESS);
2110     }
2111 }
2112 
memBaseAddrAlign() const2113 int Device::memBaseAddrAlign() const
2114 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MEM_BASE_ADDR_ALIGN) : 0; }
2115 
nativeVectorWidthChar() const2116 int Device::nativeVectorWidthChar() const
2117 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR) : 0; }
2118 
nativeVectorWidthShort() const2119 int Device::nativeVectorWidthShort() const
2120 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT) : 0; }
2121 
nativeVectorWidthInt() const2122 int Device::nativeVectorWidthInt() const
2123 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_INT) : 0; }
2124 
nativeVectorWidthLong() const2125 int Device::nativeVectorWidthLong() const
2126 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG) : 0; }
2127 
nativeVectorWidthFloat() const2128 int Device::nativeVectorWidthFloat() const
2129 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT) : 0; }
2130 
nativeVectorWidthDouble() const2131 int Device::nativeVectorWidthDouble() const
2132 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE) : 0; }
2133 
nativeVectorWidthHalf() const2134 int Device::nativeVectorWidthHalf() const
2135 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF) : 0; }
2136 
preferredVectorWidthChar() const2137 int Device::preferredVectorWidthChar() const
2138 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR) : 0; }
2139 
preferredVectorWidthShort() const2140 int Device::preferredVectorWidthShort() const
2141 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT) : 0; }
2142 
preferredVectorWidthInt() const2143 int Device::preferredVectorWidthInt() const
2144 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT) : 0; }
2145 
preferredVectorWidthLong() const2146 int Device::preferredVectorWidthLong() const
2147 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG) : 0; }
2148 
preferredVectorWidthFloat() const2149 int Device::preferredVectorWidthFloat() const
2150 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT) : 0; }
2151 
preferredVectorWidthDouble() const2152 int Device::preferredVectorWidthDouble() const
2153 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE) : 0; }
2154 
preferredVectorWidthHalf() const2155 int Device::preferredVectorWidthHalf() const
2156 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF) : 0; }
2157 
printfBufferSize() const2158 size_t Device::printfBufferSize() const
2159 #ifdef CL_VERSION_1_2
2160 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_PRINTF_BUFFER_SIZE) : 0; }
2161 #else
2162 { CV_REQUIRE_OPENCL_1_2_ERROR; }
2163 #endif
2164 
2165 
profilingTimerResolution() const2166 size_t Device::profilingTimerResolution() const
2167 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_PROFILING_TIMER_RESOLUTION) : 0; }
2168 
getDefault()2169 const Device& Device::getDefault()
2170 {
2171     const Context& ctx = Context::getDefault();
2172     int idx = getCoreTlsData().get()->device;
2173     const Device& device = ctx.device(idx);
2174     return device;
2175 }
2176 
2177 ////////////////////////////////////// Context ///////////////////////////////////////////////////
2178 
2179 template <typename Functor, typename ObjectType>
getStringInfo(Functor f,ObjectType obj,cl_uint name,std::string & param)2180 inline cl_int getStringInfo(Functor f, ObjectType obj, cl_uint name, std::string& param)
2181 {
2182     ::size_t required;
2183     cl_int err = f(obj, name, 0, NULL, &required);
2184     if (err != CL_SUCCESS)
2185         return err;
2186 
2187     param.clear();
2188     if (required > 0)
2189     {
2190         AutoBuffer<char> buf(required + 1);
2191         char* ptr = (char*)buf; // cleanup is not needed
2192         err = f(obj, name, required, ptr, NULL);
2193         if (err != CL_SUCCESS)
2194             return err;
2195         param = ptr;
2196     }
2197 
2198     return CL_SUCCESS;
2199 }
2200 
split(const std::string & s,char delim,std::vector<std::string> & elems)2201 static void split(const std::string &s, char delim, std::vector<std::string> &elems)
2202 {
2203     elems.clear();
2204     if (s.size() == 0)
2205         return;
2206     std::istringstream ss(s);
2207     std::string item;
2208     while (!ss.eof())
2209     {
2210         std::getline(ss, item, delim);
2211         elems.push_back(item);
2212     }
2213 }
2214 
2215 // Layout: <Platform>:<CPU|GPU|ACCELERATOR|nothing=GPU/CPU>:<deviceName>
2216 // Sample: AMD:GPU:
2217 // Sample: AMD:GPU:Tahiti
2218 // Sample: :GPU|CPU: = '' = ':' = '::'
parseOpenCLDeviceConfiguration(const std::string & configurationStr,std::string & platform,std::vector<std::string> & deviceTypes,std::string & deviceNameOrID)2219 static bool parseOpenCLDeviceConfiguration(const std::string& configurationStr,
2220         std::string& platform, std::vector<std::string>& deviceTypes, std::string& deviceNameOrID)
2221 {
2222     std::vector<std::string> parts;
2223     split(configurationStr, ':', parts);
2224     if (parts.size() > 3)
2225     {
2226         std::cerr << "ERROR: Invalid configuration string for OpenCL device" << std::endl;
2227         return false;
2228     }
2229     if (parts.size() > 2)
2230         deviceNameOrID = parts[2];
2231     if (parts.size() > 1)
2232     {
2233         split(parts[1], '|', deviceTypes);
2234     }
2235     if (parts.size() > 0)
2236     {
2237         platform = parts[0];
2238     }
2239     return true;
2240 }
2241 
2242 #ifdef WINRT
selectOpenCLDevice()2243 static cl_device_id selectOpenCLDevice()
2244 {
2245     return NULL;
2246 }
2247 #else
selectOpenCLDevice()2248 static cl_device_id selectOpenCLDevice()
2249 {
2250     std::string platform, deviceName;
2251     std::vector<std::string> deviceTypes;
2252 
2253     const char* configuration = getenv("OPENCV_OPENCL_DEVICE");
2254     if (configuration &&
2255             (strcmp(configuration, "disabled") == 0 ||
2256              !parseOpenCLDeviceConfiguration(std::string(configuration), platform, deviceTypes, deviceName)
2257             ))
2258         return NULL;
2259 
2260     bool isID = false;
2261     int deviceID = -1;
2262     if (deviceName.length() == 1)
2263     // We limit ID range to 0..9, because we want to write:
2264     // - '2500' to mean i5-2500
2265     // - '8350' to mean AMD FX-8350
2266     // - '650' to mean GeForce 650
2267     // To extend ID range change condition to '> 0'
2268     {
2269         isID = true;
2270         for (size_t i = 0; i < deviceName.length(); i++)
2271         {
2272             if (!isdigit(deviceName[i]))
2273             {
2274                 isID = false;
2275                 break;
2276             }
2277         }
2278         if (isID)
2279         {
2280             deviceID = atoi(deviceName.c_str());
2281             if (deviceID < 0)
2282                 return NULL;
2283         }
2284     }
2285 
2286     std::vector<cl_platform_id> platforms;
2287     {
2288         cl_uint numPlatforms = 0;
2289         CV_OclDbgAssert(clGetPlatformIDs(0, NULL, &numPlatforms) == CL_SUCCESS);
2290 
2291         if (numPlatforms == 0)
2292             return NULL;
2293         platforms.resize((size_t)numPlatforms);
2294         CV_OclDbgAssert(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms) == CL_SUCCESS);
2295         platforms.resize(numPlatforms);
2296     }
2297 
2298     int selectedPlatform = -1;
2299     if (platform.length() > 0)
2300     {
2301         for (size_t i = 0; i < platforms.size(); i++)
2302         {
2303             std::string name;
2304             CV_OclDbgAssert(getStringInfo(clGetPlatformInfo, platforms[i], CL_PLATFORM_NAME, name) == CL_SUCCESS);
2305             if (name.find(platform) != std::string::npos)
2306             {
2307                 selectedPlatform = (int)i;
2308                 break;
2309             }
2310         }
2311         if (selectedPlatform == -1)
2312         {
2313             std::cerr << "ERROR: Can't find OpenCL platform by name: " << platform << std::endl;
2314             goto not_found;
2315         }
2316     }
2317     if (deviceTypes.size() == 0)
2318     {
2319         if (!isID)
2320         {
2321             deviceTypes.push_back("GPU");
2322             if (configuration)
2323                 deviceTypes.push_back("CPU");
2324         }
2325         else
2326             deviceTypes.push_back("ALL");
2327     }
2328     for (size_t t = 0; t < deviceTypes.size(); t++)
2329     {
2330         int deviceType = 0;
2331         std::string tempStrDeviceType = deviceTypes[t];
2332         std::transform( tempStrDeviceType.begin(), tempStrDeviceType.end(), tempStrDeviceType.begin(), tolower );
2333 
2334         if (tempStrDeviceType == "gpu" || tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
2335             deviceType = Device::TYPE_GPU;
2336         else if (tempStrDeviceType == "cpu")
2337             deviceType = Device::TYPE_CPU;
2338         else if (tempStrDeviceType == "accelerator")
2339             deviceType = Device::TYPE_ACCELERATOR;
2340         else if (tempStrDeviceType == "all")
2341             deviceType = Device::TYPE_ALL;
2342         else
2343         {
2344             std::cerr << "ERROR: Unsupported device type for OpenCL device (GPU, CPU, ACCELERATOR): " << deviceTypes[t] << std::endl;
2345             goto not_found;
2346         }
2347 
2348         std::vector<cl_device_id> devices; // TODO Use clReleaseDevice to cleanup
2349         for (int i = selectedPlatform >= 0 ? selectedPlatform : 0;
2350                 (selectedPlatform >= 0 ? i == selectedPlatform : true) && (i < (int)platforms.size());
2351                 i++)
2352         {
2353             cl_uint count = 0;
2354             cl_int status = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &count);
2355             CV_OclDbgAssert(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND);
2356             if (count == 0)
2357                 continue;
2358             size_t base = devices.size();
2359             devices.resize(base + count);
2360             status = clGetDeviceIDs(platforms[i], deviceType, count, &devices[base], &count);
2361             CV_OclDbgAssert(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND);
2362         }
2363 
2364         for (size_t i = (isID ? deviceID : 0);
2365              (isID ? (i == (size_t)deviceID) : true) && (i < devices.size());
2366              i++)
2367         {
2368             std::string name;
2369             CV_OclDbgAssert(getStringInfo(clGetDeviceInfo, devices[i], CL_DEVICE_NAME, name) == CL_SUCCESS);
2370             cl_bool useGPU = true;
2371             if(tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
2372             {
2373                 cl_bool isIGPU = CL_FALSE;
2374                 clGetDeviceInfo(devices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(isIGPU), &isIGPU, NULL);
2375                 useGPU = tempStrDeviceType == "dgpu" ? !isIGPU : isIGPU;
2376             }
2377             if ( (isID || name.find(deviceName) != std::string::npos) && useGPU)
2378             {
2379                 // TODO check for OpenCL 1.1
2380                 return devices[i];
2381             }
2382         }
2383     }
2384 
2385 not_found:
2386     if (!configuration)
2387         return NULL; // suppress messages on stderr
2388 
2389     std::cerr << "ERROR: Requested OpenCL device not found, check configuration: " << (configuration == NULL ? "" : configuration) << std::endl
2390             << "    Platform: " << (platform.length() == 0 ? "any" : platform) << std::endl
2391             << "    Device types: ";
2392     for (size_t t = 0; t < deviceTypes.size(); t++)
2393         std::cerr << deviceTypes[t] << " ";
2394 
2395     std::cerr << std::endl << "    Device name: " << (deviceName.length() == 0 ? "any" : deviceName) << std::endl;
2396     return NULL;
2397 }
2398 #endif
2399 
2400 #ifdef HAVE_OPENCL_SVM
2401 namespace svm {
2402 
2403 enum AllocatorFlags { // don't use first 16 bits
2404         OPENCL_SVM_COARSE_GRAIN_BUFFER = 1 << 16, // clSVMAlloc + SVM map/unmap
2405         OPENCL_SVM_FINE_GRAIN_BUFFER = 2 << 16, // clSVMAlloc
2406         OPENCL_SVM_FINE_GRAIN_SYSTEM = 3 << 16, // direct access
2407         OPENCL_SVM_BUFFER_MASK = 3 << 16,
2408         OPENCL_SVM_BUFFER_MAP = 4 << 16
2409 };
2410 
checkForceSVMUmatUsage()2411 static bool checkForceSVMUmatUsage()
2412 {
2413     static bool initialized = false;
2414     static bool force = false;
2415     if (!initialized)
2416     {
2417         force = getBoolParameter("OPENCV_OPENCL_SVM_FORCE_UMAT_USAGE", false);
2418         initialized = true;
2419     }
2420     return force;
2421 }
checkDisableSVMUMatUsage()2422 static bool checkDisableSVMUMatUsage()
2423 {
2424     static bool initialized = false;
2425     static bool force = false;
2426     if (!initialized)
2427     {
2428         force = getBoolParameter("OPENCV_OPENCL_SVM_DISABLE_UMAT_USAGE", false);
2429         initialized = true;
2430     }
2431     return force;
2432 }
checkDisableSVM()2433 static bool checkDisableSVM()
2434 {
2435     static bool initialized = false;
2436     static bool force = false;
2437     if (!initialized)
2438     {
2439         force = getBoolParameter("OPENCV_OPENCL_SVM_DISABLE", false);
2440         initialized = true;
2441     }
2442     return force;
2443 }
2444 // see SVMCapabilities
getSVMCapabilitiesMask()2445 static unsigned int getSVMCapabilitiesMask()
2446 {
2447     static bool initialized = false;
2448     static unsigned int mask = 0;
2449     if (!initialized)
2450     {
2451         const char* envValue = getenv("OPENCV_OPENCL_SVM_CAPABILITIES_MASK");
2452         if (envValue == NULL)
2453         {
2454             return ~0U; // all bits 1
2455         }
2456         mask = atoi(envValue);
2457         initialized = true;
2458     }
2459     return mask;
2460 }
2461 } // namespace
2462 #endif
2463 
2464 struct Context::Impl
2465 {
getcv::ocl::Context::Impl2466     static Context::Impl* get(Context& context) { return context.p; }
2467 
__initcv::ocl::Context::Impl2468     void __init()
2469     {
2470         refcount = 1;
2471         handle = 0;
2472 #ifdef HAVE_OPENCL_SVM
2473         svmInitialized = false;
2474 #endif
2475     }
2476 
Implcv::ocl::Context::Impl2477     Impl()
2478     {
2479         __init();
2480     }
2481 
setDefaultcv::ocl::Context::Impl2482     void setDefault()
2483     {
2484         CV_Assert(handle == NULL);
2485 
2486         cl_device_id d = selectOpenCLDevice();
2487 
2488         if (d == NULL)
2489             return;
2490 
2491         cl_platform_id pl = NULL;
2492         CV_OclDbgAssert(clGetDeviceInfo(d, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &pl, NULL) == CL_SUCCESS);
2493 
2494         cl_context_properties prop[] =
2495         {
2496             CL_CONTEXT_PLATFORM, (cl_context_properties)pl,
2497             0
2498         };
2499 
2500         // !!! in the current implementation force the number of devices to 1 !!!
2501         cl_uint nd = 1;
2502         cl_int status;
2503 
2504         handle = clCreateContext(prop, nd, &d, 0, 0, &status);
2505 
2506         bool ok = handle != 0 && status == CL_SUCCESS;
2507         if( ok )
2508         {
2509             devices.resize(nd);
2510             devices[0].set(d);
2511         }
2512         else
2513             handle = NULL;
2514     }
2515 
Implcv::ocl::Context::Impl2516     Impl(int dtype0)
2517     {
2518         __init();
2519 
2520         cl_int retval = 0;
2521         cl_platform_id pl = (cl_platform_id)Platform::getDefault().ptr();
2522         cl_context_properties prop[] =
2523         {
2524             CL_CONTEXT_PLATFORM, (cl_context_properties)pl,
2525             0
2526         };
2527 
2528         cl_uint i, nd0 = 0, nd = 0;
2529         int dtype = dtype0 & 15;
2530         CV_OclDbgAssert(clGetDeviceIDs( pl, dtype, 0, 0, &nd0 ) == CL_SUCCESS);
2531 
2532         AutoBuffer<void*> dlistbuf(nd0*2+1);
2533         cl_device_id* dlist = (cl_device_id*)(void**)dlistbuf;
2534         cl_device_id* dlist_new = dlist + nd0;
2535         CV_OclDbgAssert(clGetDeviceIDs( pl, dtype, nd0, dlist, &nd0 ) == CL_SUCCESS);
2536         String name0;
2537 
2538         for(i = 0; i < nd0; i++)
2539         {
2540             Device d(dlist[i]);
2541             if( !d.available() || !d.compilerAvailable() )
2542                 continue;
2543             if( dtype0 == Device::TYPE_DGPU && d.hostUnifiedMemory() )
2544                 continue;
2545             if( dtype0 == Device::TYPE_IGPU && !d.hostUnifiedMemory() )
2546                 continue;
2547             String name = d.name();
2548             if( nd != 0 && name != name0 )
2549                 continue;
2550             name0 = name;
2551             dlist_new[nd++] = dlist[i];
2552         }
2553 
2554         if(nd == 0)
2555             return;
2556 
2557         // !!! in the current implementation force the number of devices to 1 !!!
2558         nd = 1;
2559 
2560         handle = clCreateContext(prop, nd, dlist_new, 0, 0, &retval);
2561         bool ok = handle != 0 && retval == CL_SUCCESS;
2562         if( ok )
2563         {
2564             devices.resize(nd);
2565             for( i = 0; i < nd; i++ )
2566                 devices[i].set(dlist_new[i]);
2567         }
2568     }
2569 
~Implcv::ocl::Context::Impl2570     ~Impl()
2571     {
2572         if(handle)
2573         {
2574             clReleaseContext(handle);
2575             handle = NULL;
2576         }
2577         devices.clear();
2578     }
2579 
getProgcv::ocl::Context::Impl2580     Program getProg(const ProgramSource& src,
2581                     const String& buildflags, String& errmsg)
2582     {
2583         String prefix = Program::getPrefix(buildflags);
2584         HashKey k(src.hash(), crc64((const uchar*)prefix.c_str(), prefix.size()));
2585         phash_t::iterator it = phash.find(k);
2586         if( it != phash.end() )
2587             return it->second;
2588         //String filename = format("%08x%08x_%08x%08x.clb2",
2589         Program prog(src, buildflags, errmsg);
2590         if(prog.ptr())
2591             phash.insert(std::pair<HashKey,Program>(k, prog));
2592         return prog;
2593     }
2594 
2595     IMPLEMENT_REFCOUNTABLE();
2596 
2597     cl_context handle;
2598     std::vector<Device> devices;
2599 
2600     typedef ProgramSource::hash_t hash_t;
2601 
2602     struct HashKey
2603     {
HashKeycv::ocl::Context::Impl::HashKey2604         HashKey(hash_t _a, hash_t _b) : a(_a), b(_b) {}
operator <cv::ocl::Context::Impl::HashKey2605         bool operator < (const HashKey& k) const { return a < k.a || (a == k.a && b < k.b); }
operator ==cv::ocl::Context::Impl::HashKey2606         bool operator == (const HashKey& k) const { return a == k.a && b == k.b; }
operator !=cv::ocl::Context::Impl::HashKey2607         bool operator != (const HashKey& k) const { return a != k.a || b != k.b; }
2608         hash_t a, b;
2609     };
2610     typedef std::map<HashKey, Program> phash_t;
2611     phash_t phash;
2612 
2613 #ifdef HAVE_OPENCL_SVM
2614     bool svmInitialized;
2615     bool svmAvailable;
2616     bool svmEnabled;
2617     svm::SVMCapabilities svmCapabilities;
2618     svm::SVMFunctions svmFunctions;
2619 
svmInitcv::ocl::Context::Impl2620     void svmInit()
2621     {
2622         CV_Assert(handle != NULL);
2623         const Device& device = devices[0];
2624         cl_device_svm_capabilities deviceCaps = 0;
2625         CV_Assert(((void)0, CL_DEVICE_SVM_CAPABILITIES == CL_DEVICE_SVM_CAPABILITIES_AMD)); // Check assumption
2626         cl_int status = clGetDeviceInfo((cl_device_id)device.ptr(), CL_DEVICE_SVM_CAPABILITIES, sizeof(deviceCaps), &deviceCaps, NULL);
2627         if (status != CL_SUCCESS)
2628         {
2629             CV_OPENCL_SVM_TRACE_ERROR_P("CL_DEVICE_SVM_CAPABILITIES via clGetDeviceInfo failed: %d\n", status);
2630             goto noSVM;
2631         }
2632         CV_OPENCL_SVM_TRACE_P("CL_DEVICE_SVM_CAPABILITIES returned: 0x%x\n", (int)deviceCaps);
2633         CV_Assert(((void)0, CL_DEVICE_SVM_COARSE_GRAIN_BUFFER == CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_AMD)); // Check assumption
2634         svmCapabilities.value_ =
2635                 ((deviceCaps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) ? svm::SVMCapabilities::SVM_COARSE_GRAIN_BUFFER : 0) |
2636                 ((deviceCaps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) ? svm::SVMCapabilities::SVM_FINE_GRAIN_BUFFER : 0) |
2637                 ((deviceCaps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) ? svm::SVMCapabilities::SVM_FINE_GRAIN_SYSTEM : 0) |
2638                 ((deviceCaps & CL_DEVICE_SVM_ATOMICS) ? svm::SVMCapabilities::SVM_ATOMICS : 0);
2639         svmCapabilities.value_ &= svm::getSVMCapabilitiesMask();
2640         if (svmCapabilities.value_ == 0)
2641         {
2642             CV_OPENCL_SVM_TRACE_ERROR_P("svmCapabilities is empty\n");
2643             goto noSVM;
2644         }
2645         try
2646         {
2647             // Try OpenCL 2.0
2648             CV_OPENCL_SVM_TRACE_P("Try SVM from OpenCL 2.0 ...\n");
2649             void* ptr = clSVMAlloc(handle, CL_MEM_READ_WRITE, 100, 0);
2650             if (!ptr)
2651             {
2652                 CV_OPENCL_SVM_TRACE_ERROR_P("clSVMAlloc returned NULL...\n");
2653                 CV_ErrorNoReturn(Error::StsBadArg, "clSVMAlloc returned NULL");
2654             }
2655             try
2656             {
2657                 bool error = false;
2658                 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
2659                 if (CL_SUCCESS != clEnqueueSVMMap(q, CL_TRUE, CL_MAP_WRITE, ptr, 100, 0, NULL, NULL))
2660                 {
2661                     CV_OPENCL_SVM_TRACE_ERROR_P("clEnqueueSVMMap failed...\n");
2662                     CV_ErrorNoReturn(Error::StsBadArg, "clEnqueueSVMMap FAILED");
2663                 }
2664                 clFinish(q);
2665                 try
2666                 {
2667                     ((int*)ptr)[0] = 100;
2668                 }
2669                 catch (...)
2670                 {
2671                     CV_OPENCL_SVM_TRACE_ERROR_P("SVM buffer access test FAILED\n");
2672                     error = true;
2673                 }
2674                 if (CL_SUCCESS != clEnqueueSVMUnmap(q, ptr, 0, NULL, NULL))
2675                 {
2676                     CV_OPENCL_SVM_TRACE_ERROR_P("clEnqueueSVMUnmap failed...\n");
2677                     CV_ErrorNoReturn(Error::StsBadArg, "clEnqueueSVMUnmap FAILED");
2678                 }
2679                 clFinish(q);
2680                 if (error)
2681                 {
2682                     CV_ErrorNoReturn(Error::StsBadArg, "OpenCL SVM buffer access test was FAILED");
2683                 }
2684             }
2685             catch (...)
2686             {
2687                 CV_OPENCL_SVM_TRACE_ERROR_P("OpenCL SVM buffer access test was FAILED\n");
2688                 clSVMFree(handle, ptr);
2689                 throw;
2690             }
2691             clSVMFree(handle, ptr);
2692             svmFunctions.fn_clSVMAlloc = clSVMAlloc;
2693             svmFunctions.fn_clSVMFree = clSVMFree;
2694             svmFunctions.fn_clSetKernelArgSVMPointer = clSetKernelArgSVMPointer;
2695             //svmFunctions.fn_clSetKernelExecInfo = clSetKernelExecInfo;
2696             //svmFunctions.fn_clEnqueueSVMFree = clEnqueueSVMFree;
2697             svmFunctions.fn_clEnqueueSVMMemcpy = clEnqueueSVMMemcpy;
2698             svmFunctions.fn_clEnqueueSVMMemFill = clEnqueueSVMMemFill;
2699             svmFunctions.fn_clEnqueueSVMMap = clEnqueueSVMMap;
2700             svmFunctions.fn_clEnqueueSVMUnmap = clEnqueueSVMUnmap;
2701         }
2702         catch (...)
2703         {
2704             CV_OPENCL_SVM_TRACE_P("clSVMAlloc failed, trying HSA extension...\n");
2705             try
2706             {
2707                 // Try HSA extension
2708                 String extensions = device.extensions();
2709                 if (extensions.find("cl_amd_svm") == String::npos)
2710                 {
2711                     CV_OPENCL_SVM_TRACE_P("Device extension doesn't have cl_amd_svm: %s\n", extensions.c_str());
2712                     goto noSVM;
2713                 }
2714                 cl_platform_id p = NULL;
2715                 status = clGetDeviceInfo((cl_device_id)device.ptr(), CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &p, NULL);
2716                 CV_Assert(status == CL_SUCCESS);
2717                 svmFunctions.fn_clSVMAlloc = (clSVMAllocAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSVMAllocAMD");
2718                 svmFunctions.fn_clSVMFree = (clSVMFreeAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSVMFreeAMD");
2719                 svmFunctions.fn_clSetKernelArgSVMPointer = (clSetKernelArgSVMPointerAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSetKernelArgSVMPointerAMD");
2720                 //svmFunctions.fn_clSetKernelExecInfo = (clSetKernelExecInfoAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSetKernelExecInfoAMD");
2721                 //svmFunctions.fn_clEnqueueSVMFree = (clEnqueueSVMFreeAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMFreeAMD");
2722                 svmFunctions.fn_clEnqueueSVMMemcpy = (clEnqueueSVMMemcpyAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMemcpyAMD");
2723                 svmFunctions.fn_clEnqueueSVMMemFill = (clEnqueueSVMMemFillAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMemFillAMD");
2724                 svmFunctions.fn_clEnqueueSVMMap = (clEnqueueSVMMapAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMapAMD");
2725                 svmFunctions.fn_clEnqueueSVMUnmap = (clEnqueueSVMUnmapAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMUnmapAMD");
2726                 CV_Assert(svmFunctions.isValid());
2727             }
2728             catch (...)
2729             {
2730                 CV_OPENCL_SVM_TRACE_P("Something is totally wrong\n");
2731                 goto noSVM;
2732             }
2733         }
2734 
2735         svmAvailable = true;
2736         svmEnabled = !svm::checkDisableSVM();
2737         svmInitialized = true;
2738         CV_OPENCL_SVM_TRACE_P("OpenCV OpenCL SVM support initialized\n");
2739         return;
2740     noSVM:
2741         CV_OPENCL_SVM_TRACE_P("OpenCL SVM is not detected\n");
2742         svmAvailable = false;
2743         svmEnabled = false;
2744         svmCapabilities.value_ = 0;
2745         svmInitialized = true;
2746         svmFunctions.fn_clSVMAlloc = NULL;
2747         return;
2748     }
2749 #endif
2750 };
2751 
2752 
Context()2753 Context::Context()
2754 {
2755     p = 0;
2756 }
2757 
Context(int dtype)2758 Context::Context(int dtype)
2759 {
2760     p = 0;
2761     create(dtype);
2762 }
2763 
create()2764 bool Context::create()
2765 {
2766     if( !haveOpenCL() )
2767         return false;
2768     if(p)
2769         p->release();
2770     p = new Impl();
2771     if(!p->handle)
2772     {
2773         delete p;
2774         p = 0;
2775     }
2776     return p != 0;
2777 }
2778 
create(int dtype0)2779 bool Context::create(int dtype0)
2780 {
2781     if( !haveOpenCL() )
2782         return false;
2783     if(p)
2784         p->release();
2785     p = new Impl(dtype0);
2786     if(!p->handle)
2787     {
2788         delete p;
2789         p = 0;
2790     }
2791     return p != 0;
2792 }
2793 
~Context()2794 Context::~Context()
2795 {
2796     if (p)
2797     {
2798         p->release();
2799         p = NULL;
2800     }
2801 }
2802 
Context(const Context & c)2803 Context::Context(const Context& c)
2804 {
2805     p = (Impl*)c.p;
2806     if(p)
2807         p->addref();
2808 }
2809 
operator =(const Context & c)2810 Context& Context::operator = (const Context& c)
2811 {
2812     Impl* newp = (Impl*)c.p;
2813     if(newp)
2814         newp->addref();
2815     if(p)
2816         p->release();
2817     p = newp;
2818     return *this;
2819 }
2820 
ptr() const2821 void* Context::ptr() const
2822 {
2823     return p == NULL ? NULL : p->handle;
2824 }
2825 
ndevices() const2826 size_t Context::ndevices() const
2827 {
2828     return p ? p->devices.size() : 0;
2829 }
2830 
device(size_t idx) const2831 const Device& Context::device(size_t idx) const
2832 {
2833     static Device dummy;
2834     return !p || idx >= p->devices.size() ? dummy : p->devices[idx];
2835 }
2836 
getDefault(bool initialize)2837 Context& Context::getDefault(bool initialize)
2838 {
2839     static Context* ctx = new Context();
2840     if(!ctx->p && haveOpenCL())
2841     {
2842         if (!ctx->p)
2843             ctx->p = new Impl();
2844         if (initialize)
2845         {
2846             // do not create new Context right away.
2847             // First, try to retrieve existing context of the same type.
2848             // In its turn, Platform::getContext() may call Context::create()
2849             // if there is no such context.
2850             if (ctx->p->handle == NULL)
2851                 ctx->p->setDefault();
2852         }
2853     }
2854 
2855     return *ctx;
2856 }
2857 
getProg(const ProgramSource & prog,const String & buildopts,String & errmsg)2858 Program Context::getProg(const ProgramSource& prog,
2859                          const String& buildopts, String& errmsg)
2860 {
2861     return p ? p->getProg(prog, buildopts, errmsg) : Program();
2862 }
2863 
2864 
2865 
2866 #ifdef HAVE_OPENCL_SVM
useSVM() const2867 bool Context::useSVM() const
2868 {
2869     Context::Impl* i = p;
2870     CV_Assert(i);
2871     if (!i->svmInitialized)
2872         i->svmInit();
2873     return i->svmEnabled;
2874 }
setUseSVM(bool enabled)2875 void Context::setUseSVM(bool enabled)
2876 {
2877     Context::Impl* i = p;
2878     CV_Assert(i);
2879     if (!i->svmInitialized)
2880         i->svmInit();
2881     if (enabled && !i->svmAvailable)
2882     {
2883         CV_ErrorNoReturn(Error::StsError, "OpenCL Shared Virtual Memory (SVM) is not supported by OpenCL device");
2884     }
2885     i->svmEnabled = enabled;
2886 }
2887 #else
useSVM() const2888 bool Context::useSVM() const { return false; }
setUseSVM(bool enabled)2889 void Context::setUseSVM(bool enabled) { CV_Assert(!enabled); }
2890 #endif
2891 
2892 #ifdef HAVE_OPENCL_SVM
2893 namespace svm {
2894 
getSVMCapabilitites(const ocl::Context & context)2895 const SVMCapabilities getSVMCapabilitites(const ocl::Context& context)
2896 {
2897     Context::Impl* i = context.p;
2898     CV_Assert(i);
2899     if (!i->svmInitialized)
2900         i->svmInit();
2901     return i->svmCapabilities;
2902 }
2903 
getSVMFunctions(const ocl::Context & context)2904 CV_EXPORTS const SVMFunctions* getSVMFunctions(const ocl::Context& context)
2905 {
2906     Context::Impl* i = context.p;
2907     CV_Assert(i);
2908     CV_Assert(i->svmInitialized); // getSVMCapabilitites() must be called first
2909     CV_Assert(i->svmFunctions.fn_clSVMAlloc != NULL);
2910     return &i->svmFunctions;
2911 }
2912 
useSVM(UMatUsageFlags usageFlags)2913 CV_EXPORTS bool useSVM(UMatUsageFlags usageFlags)
2914 {
2915     if (checkForceSVMUmatUsage())
2916         return true;
2917     if (checkDisableSVMUMatUsage())
2918         return false;
2919     if ((usageFlags & USAGE_ALLOCATE_SHARED_MEMORY) != 0)
2920         return true;
2921     return false; // don't use SVM by default
2922 }
2923 
2924 } // namespace cv::ocl::svm
2925 #endif // HAVE_OPENCL_SVM
2926 
2927 
2928 
initializeContextFromHandle(Context & ctx,void * platform,void * _context,void * _device)2929 void initializeContextFromHandle(Context& ctx, void* platform, void* _context, void* _device)
2930 {
2931     cl_context context = (cl_context)_context;
2932     cl_device_id device = (cl_device_id)_device;
2933 
2934     // cleanup old context
2935     Context::Impl * impl = ctx.p;
2936     if (impl->handle)
2937     {
2938         CV_OclDbgAssert(clReleaseContext(impl->handle) == CL_SUCCESS);
2939     }
2940     impl->devices.clear();
2941 
2942     impl->handle = context;
2943     impl->devices.resize(1);
2944     impl->devices[0].set(device);
2945 
2946     Platform& p = Platform::getDefault();
2947     Platform::Impl* pImpl = p.p;
2948     pImpl->handle = (cl_platform_id)platform;
2949 }
2950 
2951 /////////////////////////////////////////// Queue /////////////////////////////////////////////
2952 
2953 struct Queue::Impl
2954 {
Implcv::ocl::Queue::Impl2955     Impl(const Context& c, const Device& d)
2956     {
2957         refcount = 1;
2958         const Context* pc = &c;
2959         cl_context ch = (cl_context)pc->ptr();
2960         if( !ch )
2961         {
2962             pc = &Context::getDefault();
2963             ch = (cl_context)pc->ptr();
2964         }
2965         cl_device_id dh = (cl_device_id)d.ptr();
2966         if( !dh )
2967             dh = (cl_device_id)pc->device(0).ptr();
2968         cl_int retval = 0;
2969         handle = clCreateCommandQueue(ch, dh, 0, &retval);
2970         CV_OclDbgAssert(retval == CL_SUCCESS);
2971     }
2972 
~Implcv::ocl::Queue::Impl2973     ~Impl()
2974     {
2975 #ifdef _WIN32
2976         if (!cv::__termination)
2977 #endif
2978         {
2979             if(handle)
2980             {
2981                 clFinish(handle);
2982                 clReleaseCommandQueue(handle);
2983                 handle = NULL;
2984             }
2985         }
2986     }
2987 
2988     IMPLEMENT_REFCOUNTABLE();
2989 
2990     cl_command_queue handle;
2991 };
2992 
Queue()2993 Queue::Queue()
2994 {
2995     p = 0;
2996 }
2997 
Queue(const Context & c,const Device & d)2998 Queue::Queue(const Context& c, const Device& d)
2999 {
3000     p = 0;
3001     create(c, d);
3002 }
3003 
Queue(const Queue & q)3004 Queue::Queue(const Queue& q)
3005 {
3006     p = q.p;
3007     if(p)
3008         p->addref();
3009 }
3010 
operator =(const Queue & q)3011 Queue& Queue::operator = (const Queue& q)
3012 {
3013     Impl* newp = (Impl*)q.p;
3014     if(newp)
3015         newp->addref();
3016     if(p)
3017         p->release();
3018     p = newp;
3019     return *this;
3020 }
3021 
~Queue()3022 Queue::~Queue()
3023 {
3024     if(p)
3025         p->release();
3026 }
3027 
create(const Context & c,const Device & d)3028 bool Queue::create(const Context& c, const Device& d)
3029 {
3030     if(p)
3031         p->release();
3032     p = new Impl(c, d);
3033     return p->handle != 0;
3034 }
3035 
finish()3036 void Queue::finish()
3037 {
3038     if(p && p->handle)
3039     {
3040         CV_OclDbgAssert(clFinish(p->handle) == CL_SUCCESS);
3041     }
3042 }
3043 
ptr() const3044 void* Queue::ptr() const
3045 {
3046     return p ? p->handle : 0;
3047 }
3048 
getDefault()3049 Queue& Queue::getDefault()
3050 {
3051     Queue& q = getCoreTlsData().get()->oclQueue;
3052     if( !q.p && haveOpenCL() )
3053         q.create(Context::getDefault());
3054     return q;
3055 }
3056 
getQueue(const Queue & q)3057 static cl_command_queue getQueue(const Queue& q)
3058 {
3059     cl_command_queue qq = (cl_command_queue)q.ptr();
3060     if(!qq)
3061         qq = (cl_command_queue)Queue::getDefault().ptr();
3062     return qq;
3063 }
3064 
3065 /////////////////////////////////////////// KernelArg /////////////////////////////////////////////
3066 
KernelArg()3067 KernelArg::KernelArg()
3068     : flags(0), m(0), obj(0), sz(0), wscale(1), iwscale(1)
3069 {
3070 }
3071 
KernelArg(int _flags,UMat * _m,int _wscale,int _iwscale,const void * _obj,size_t _sz)3072 KernelArg::KernelArg(int _flags, UMat* _m, int _wscale, int _iwscale, const void* _obj, size_t _sz)
3073     : flags(_flags), m(_m), obj(_obj), sz(_sz), wscale(_wscale), iwscale(_iwscale)
3074 {
3075 }
3076 
Constant(const Mat & m)3077 KernelArg KernelArg::Constant(const Mat& m)
3078 {
3079     CV_Assert(m.isContinuous());
3080     return KernelArg(CONSTANT, 0, 0, 0, m.ptr(), m.total()*m.elemSize());
3081 }
3082 
3083 /////////////////////////////////////////// Kernel /////////////////////////////////////////////
3084 
3085 struct Kernel::Impl
3086 {
Implcv::ocl::Kernel::Impl3087     Impl(const char* kname, const Program& prog) :
3088         refcount(1), e(0), nu(0)
3089     {
3090         cl_program ph = (cl_program)prog.ptr();
3091         cl_int retval = 0;
3092         handle = ph != 0 ?
3093             clCreateKernel(ph, kname, &retval) : 0;
3094         CV_OclDbgAssert(retval == CL_SUCCESS);
3095         for( int i = 0; i < MAX_ARRS; i++ )
3096             u[i] = 0;
3097         haveTempDstUMats = false;
3098     }
3099 
cleanupUMatscv::ocl::Kernel::Impl3100     void cleanupUMats()
3101     {
3102         for( int i = 0; i < MAX_ARRS; i++ )
3103             if( u[i] )
3104             {
3105                 if( CV_XADD(&u[i]->urefcount, -1) == 1 )
3106                     u[i]->currAllocator->deallocate(u[i]);
3107                 u[i] = 0;
3108             }
3109         nu = 0;
3110         haveTempDstUMats = false;
3111     }
3112 
addUMatcv::ocl::Kernel::Impl3113     void addUMat(const UMat& m, bool dst)
3114     {
3115         CV_Assert(nu < MAX_ARRS && m.u && m.u->urefcount > 0);
3116         u[nu] = m.u;
3117         CV_XADD(&m.u->urefcount, 1);
3118         nu++;
3119         if(dst && m.u->tempUMat())
3120             haveTempDstUMats = true;
3121     }
3122 
addImagecv::ocl::Kernel::Impl3123     void addImage(const Image2D& image)
3124     {
3125         images.push_back(image);
3126     }
3127 
finitcv::ocl::Kernel::Impl3128     void finit()
3129     {
3130         cleanupUMats();
3131         images.clear();
3132         if(e) { clReleaseEvent(e); e = 0; }
3133         release();
3134     }
3135 
~Implcv::ocl::Kernel::Impl3136     ~Impl()
3137     {
3138         if(handle)
3139             clReleaseKernel(handle);
3140     }
3141 
3142     IMPLEMENT_REFCOUNTABLE();
3143 
3144     cl_kernel handle;
3145     cl_event e;
3146     enum { MAX_ARRS = 16 };
3147     UMatData* u[MAX_ARRS];
3148     int nu;
3149     std::list<Image2D> images;
3150     bool haveTempDstUMats;
3151 };
3152 
3153 }}
3154 
3155 extern "C"
3156 {
oclCleanupCallback(cl_event,cl_int,void * p)3157 static void CL_CALLBACK oclCleanupCallback(cl_event, cl_int, void *p)
3158 {
3159     ((cv::ocl::Kernel::Impl*)p)->finit();
3160 }
3161 
3162 }
3163 
3164 namespace cv { namespace ocl {
3165 
Kernel()3166 Kernel::Kernel()
3167 {
3168     p = 0;
3169 }
3170 
Kernel(const char * kname,const Program & prog)3171 Kernel::Kernel(const char* kname, const Program& prog)
3172 {
3173     p = 0;
3174     create(kname, prog);
3175 }
3176 
Kernel(const char * kname,const ProgramSource & src,const String & buildopts,String * errmsg)3177 Kernel::Kernel(const char* kname, const ProgramSource& src,
3178                const String& buildopts, String* errmsg)
3179 {
3180     p = 0;
3181     create(kname, src, buildopts, errmsg);
3182 }
3183 
Kernel(const Kernel & k)3184 Kernel::Kernel(const Kernel& k)
3185 {
3186     p = k.p;
3187     if(p)
3188         p->addref();
3189 }
3190 
operator =(const Kernel & k)3191 Kernel& Kernel::operator = (const Kernel& k)
3192 {
3193     Impl* newp = (Impl*)k.p;
3194     if(newp)
3195         newp->addref();
3196     if(p)
3197         p->release();
3198     p = newp;
3199     return *this;
3200 }
3201 
~Kernel()3202 Kernel::~Kernel()
3203 {
3204     if(p)
3205         p->release();
3206 }
3207 
create(const char * kname,const Program & prog)3208 bool Kernel::create(const char* kname, const Program& prog)
3209 {
3210     if(p)
3211         p->release();
3212     p = new Impl(kname, prog);
3213     if(p->handle == 0)
3214     {
3215         p->release();
3216         p = 0;
3217     }
3218 #ifdef CV_OPENCL_RUN_ASSERT // check kernel compilation fails
3219     CV_Assert(p);
3220 #endif
3221     return p != 0;
3222 }
3223 
create(const char * kname,const ProgramSource & src,const String & buildopts,String * errmsg)3224 bool Kernel::create(const char* kname, const ProgramSource& src,
3225                     const String& buildopts, String* errmsg)
3226 {
3227     if(p)
3228     {
3229         p->release();
3230         p = 0;
3231     }
3232     String tempmsg;
3233     if( !errmsg ) errmsg = &tempmsg;
3234     const Program& prog = Context::getDefault().getProg(src, buildopts, *errmsg);
3235     return create(kname, prog);
3236 }
3237 
ptr() const3238 void* Kernel::ptr() const
3239 {
3240     return p ? p->handle : 0;
3241 }
3242 
empty() const3243 bool Kernel::empty() const
3244 {
3245     return ptr() == 0;
3246 }
3247 
set(int i,const void * value,size_t sz)3248 int Kernel::set(int i, const void* value, size_t sz)
3249 {
3250     if (!p || !p->handle)
3251         return -1;
3252     if (i < 0)
3253         return i;
3254     if( i == 0 )
3255         p->cleanupUMats();
3256 
3257     cl_int retval = clSetKernelArg(p->handle, (cl_uint)i, sz, value);
3258     CV_OclDbgAssert(retval == CL_SUCCESS);
3259     if (retval != CL_SUCCESS)
3260         return -1;
3261     return i+1;
3262 }
3263 
set(int i,const Image2D & image2D)3264 int Kernel::set(int i, const Image2D& image2D)
3265 {
3266     p->addImage(image2D);
3267     cl_mem h = (cl_mem)image2D.ptr();
3268     return set(i, &h, sizeof(h));
3269 }
3270 
set(int i,const UMat & m)3271 int Kernel::set(int i, const UMat& m)
3272 {
3273     return set(i, KernelArg(KernelArg::READ_WRITE, (UMat*)&m, 0, 0));
3274 }
3275 
set(int i,const KernelArg & arg)3276 int Kernel::set(int i, const KernelArg& arg)
3277 {
3278     if( !p || !p->handle )
3279         return -1;
3280     if (i < 0)
3281         return i;
3282     if( i == 0 )
3283         p->cleanupUMats();
3284     if( arg.m )
3285     {
3286         int accessFlags = ((arg.flags & KernelArg::READ_ONLY) ? ACCESS_READ : 0) +
3287                           ((arg.flags & KernelArg::WRITE_ONLY) ? ACCESS_WRITE : 0);
3288         bool ptronly = (arg.flags & KernelArg::PTR_ONLY) != 0;
3289         cl_mem h = (cl_mem)arg.m->handle(accessFlags);
3290 
3291         if (!h)
3292         {
3293             p->release();
3294             p = 0;
3295             return -1;
3296         }
3297 
3298 #ifdef HAVE_OPENCL_SVM
3299         if ((arg.m->u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
3300         {
3301             const Context& ctx = Context::getDefault();
3302             const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
3303             uchar*& svmDataPtr = (uchar*&)arg.m->u->handle;
3304             CV_OPENCL_SVM_TRACE_P("clSetKernelArgSVMPointer: %p\n", svmDataPtr);
3305 #if 1 // TODO
3306             cl_int status = svmFns->fn_clSetKernelArgSVMPointer(p->handle, (cl_uint)i, svmDataPtr);
3307 #else
3308             cl_int status = svmFns->fn_clSetKernelArgSVMPointer(p->handle, (cl_uint)i, &svmDataPtr);
3309 #endif
3310             CV_Assert(status == CL_SUCCESS);
3311         }
3312         else
3313 #endif
3314         {
3315             CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(h), &h) == CL_SUCCESS);
3316         }
3317 
3318         if (ptronly)
3319         {
3320             i++;
3321         }
3322         else if( arg.m->dims <= 2 )
3323         {
3324             UMat2D u2d(*arg.m);
3325             CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u2d.step), &u2d.step) == CL_SUCCESS);
3326             CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u2d.offset), &u2d.offset) == CL_SUCCESS);
3327             i += 3;
3328 
3329             if( !(arg.flags & KernelArg::NO_SIZE) )
3330             {
3331                 int cols = u2d.cols*arg.wscale/arg.iwscale;
3332                 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(u2d.rows), &u2d.rows) == CL_SUCCESS);
3333                 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(cols), &cols) == CL_SUCCESS);
3334                 i += 2;
3335             }
3336         }
3337         else
3338         {
3339             UMat3D u3d(*arg.m);
3340             CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.slicestep), &u3d.slicestep) == CL_SUCCESS);
3341             CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.step), &u3d.step) == CL_SUCCESS);
3342             CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+3), sizeof(u3d.offset), &u3d.offset) == CL_SUCCESS);
3343             i += 4;
3344             if( !(arg.flags & KernelArg::NO_SIZE) )
3345             {
3346                 int cols = u3d.cols*arg.wscale/arg.iwscale;
3347                 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(u3d.slices), &u3d.rows) == CL_SUCCESS);
3348                 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.rows), &u3d.rows) == CL_SUCCESS);
3349                 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.cols), &cols) == CL_SUCCESS);
3350                 i += 3;
3351             }
3352         }
3353         p->addUMat(*arg.m, (accessFlags & ACCESS_WRITE) != 0);
3354         return i;
3355     }
3356     CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, arg.sz, arg.obj) == CL_SUCCESS);
3357     return i+1;
3358 }
3359 
3360 
run(int dims,size_t _globalsize[],size_t _localsize[],bool sync,const Queue & q)3361 bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
3362                  bool sync, const Queue& q)
3363 {
3364     if(!p || !p->handle || p->e != 0)
3365         return false;
3366 
3367     cl_command_queue qq = getQueue(q);
3368     size_t offset[CV_MAX_DIM] = {0}, globalsize[CV_MAX_DIM] = {1,1,1};
3369     size_t total = 1;
3370     CV_Assert(_globalsize != 0);
3371     for (int i = 0; i < dims; i++)
3372     {
3373         size_t val = _localsize ? _localsize[i] :
3374             dims == 1 ? 64 : dims == 2 ? (i == 0 ? 256 : 8) : dims == 3 ? (8>>(int)(i>0)) : 1;
3375         CV_Assert( val > 0 );
3376         total *= _globalsize[i];
3377         globalsize[i] = ((_globalsize[i] + val - 1)/val)*val;
3378     }
3379     if( total == 0 )
3380         return true;
3381     if( p->haveTempDstUMats )
3382         sync = true;
3383     cl_int retval = clEnqueueNDRangeKernel(qq, p->handle, (cl_uint)dims,
3384                                            offset, globalsize, _localsize, 0, 0,
3385                                            sync ? 0 : &p->e);
3386 #if CV_OPENCL_SHOW_RUN_ERRORS
3387     if (retval != CL_SUCCESS)
3388     {
3389         printf("OpenCL program returns error: %d\n", retval);
3390         fflush(stdout);
3391     }
3392 #endif
3393     if( sync || retval != CL_SUCCESS )
3394     {
3395         CV_OclDbgAssert(clFinish(qq) == CL_SUCCESS);
3396         p->cleanupUMats();
3397     }
3398     else
3399     {
3400         p->addref();
3401         CV_OclDbgAssert(clSetEventCallback(p->e, CL_COMPLETE, oclCleanupCallback, p) == CL_SUCCESS);
3402     }
3403     return retval == CL_SUCCESS;
3404 }
3405 
runTask(bool sync,const Queue & q)3406 bool Kernel::runTask(bool sync, const Queue& q)
3407 {
3408     if(!p || !p->handle || p->e != 0)
3409         return false;
3410 
3411     cl_command_queue qq = getQueue(q);
3412     cl_int retval = clEnqueueTask(qq, p->handle, 0, 0, sync ? 0 : &p->e);
3413     if( sync || retval != CL_SUCCESS )
3414     {
3415         CV_OclDbgAssert(clFinish(qq) == CL_SUCCESS);
3416         p->cleanupUMats();
3417     }
3418     else
3419     {
3420         p->addref();
3421         CV_OclDbgAssert(clSetEventCallback(p->e, CL_COMPLETE, oclCleanupCallback, p) == CL_SUCCESS);
3422     }
3423     return retval == CL_SUCCESS;
3424 }
3425 
3426 
workGroupSize() const3427 size_t Kernel::workGroupSize() const
3428 {
3429     if(!p || !p->handle)
3430         return 0;
3431     size_t val = 0, retsz = 0;
3432     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3433     return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_WORK_GROUP_SIZE,
3434                                     sizeof(val), &val, &retsz) == CL_SUCCESS ? val : 0;
3435 }
3436 
preferedWorkGroupSizeMultiple() const3437 size_t Kernel::preferedWorkGroupSizeMultiple() const
3438 {
3439     if(!p || !p->handle)
3440         return 0;
3441     size_t val = 0, retsz = 0;
3442     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3443     return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
3444                                     sizeof(val), &val, &retsz) == CL_SUCCESS ? val : 0;
3445 }
3446 
compileWorkGroupSize(size_t wsz[]) const3447 bool Kernel::compileWorkGroupSize(size_t wsz[]) const
3448 {
3449     if(!p || !p->handle || !wsz)
3450         return 0;
3451     size_t retsz = 0;
3452     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3453     return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_COMPILE_WORK_GROUP_SIZE,
3454                                     sizeof(wsz[0])*3, wsz, &retsz) == CL_SUCCESS;
3455 }
3456 
localMemSize() const3457 size_t Kernel::localMemSize() const
3458 {
3459     if(!p || !p->handle)
3460         return 0;
3461     size_t retsz = 0;
3462     cl_ulong val = 0;
3463     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3464     return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_LOCAL_MEM_SIZE,
3465                                     sizeof(val), &val, &retsz) == CL_SUCCESS ? (size_t)val : 0;
3466 }
3467 
3468 /////////////////////////////////////////// Program /////////////////////////////////////////////
3469 
3470 struct Program::Impl
3471 {
Implcv::ocl::Program::Impl3472     Impl(const ProgramSource& _src,
3473          const String& _buildflags, String& errmsg)
3474     {
3475         refcount = 1;
3476         const Context& ctx = Context::getDefault();
3477         src = _src;
3478         buildflags = _buildflags;
3479         const String& srcstr = src.source();
3480         const char* srcptr = srcstr.c_str();
3481         size_t srclen = srcstr.size();
3482         cl_int retval = 0;
3483 
3484         handle = clCreateProgramWithSource((cl_context)ctx.ptr(), 1, &srcptr, &srclen, &retval);
3485         if( handle && retval == CL_SUCCESS )
3486         {
3487             int i, n = (int)ctx.ndevices();
3488             AutoBuffer<void*> deviceListBuf(n+1);
3489             void** deviceList = deviceListBuf;
3490             for( i = 0; i < n; i++ )
3491                 deviceList[i] = ctx.device(i).ptr();
3492 
3493             Device device = Device::getDefault();
3494             if (device.isAMD())
3495                 buildflags += " -D AMD_DEVICE";
3496             else if (device.isIntel())
3497                 buildflags += " -D INTEL_DEVICE";
3498 
3499             retval = clBuildProgram(handle, n,
3500                                     (const cl_device_id*)deviceList,
3501                                     buildflags.c_str(), 0, 0);
3502 #if !CV_OPENCL_ALWAYS_SHOW_BUILD_LOG
3503             if( retval != CL_SUCCESS )
3504 #endif
3505             {
3506                 size_t retsz = 0;
3507                 cl_int buildInfo_retval = clGetProgramBuildInfo(handle, (cl_device_id)deviceList[0],
3508                                                CL_PROGRAM_BUILD_LOG, 0, 0, &retsz);
3509                 if (buildInfo_retval == CL_SUCCESS && retsz > 1)
3510                 {
3511                     AutoBuffer<char> bufbuf(retsz + 16);
3512                     char* buf = bufbuf;
3513                     buildInfo_retval = clGetProgramBuildInfo(handle, (cl_device_id)deviceList[0],
3514                                                    CL_PROGRAM_BUILD_LOG, retsz+1, buf, &retsz);
3515                     if (buildInfo_retval == CL_SUCCESS)
3516                     {
3517                         // TODO It is useful to see kernel name & program file name also
3518                         errmsg = String(buf);
3519                         printf("OpenCL program build log: %s\n%s\n", buildflags.c_str(), errmsg.c_str());
3520                         fflush(stdout);
3521                     }
3522                 }
3523                 if (retval != CL_SUCCESS && handle)
3524                 {
3525                     clReleaseProgram(handle);
3526                     handle = NULL;
3527                 }
3528             }
3529         }
3530     }
3531 
Implcv::ocl::Program::Impl3532     Impl(const String& _buf, const String& _buildflags)
3533     {
3534         refcount = 1;
3535         handle = 0;
3536         buildflags = _buildflags;
3537         if(_buf.empty())
3538             return;
3539         String prefix0 = Program::getPrefix(buildflags);
3540         const Context& ctx = Context::getDefault();
3541         const Device& dev = Device::getDefault();
3542         const char* pos0 = _buf.c_str();
3543         const char* pos1 = strchr(pos0, '\n');
3544         if(!pos1)
3545             return;
3546         const char* pos2 = strchr(pos1+1, '\n');
3547         if(!pos2)
3548             return;
3549         const char* pos3 = strchr(pos2+1, '\n');
3550         if(!pos3)
3551             return;
3552         size_t prefixlen = (pos3 - pos0)+1;
3553         String prefix(pos0, prefixlen);
3554         if( prefix != prefix0 )
3555             return;
3556         const uchar* bin = (uchar*)(pos3+1);
3557         void* devid = dev.ptr();
3558         size_t codelen = _buf.length() - prefixlen;
3559         cl_int binstatus = 0, retval = 0;
3560         handle = clCreateProgramWithBinary((cl_context)ctx.ptr(), 1, (cl_device_id*)&devid,
3561                                            &codelen, &bin, &binstatus, &retval);
3562         CV_OclDbgAssert(retval == CL_SUCCESS);
3563     }
3564 
storecv::ocl::Program::Impl3565     String store()
3566     {
3567         if(!handle)
3568             return String();
3569         size_t progsz = 0, retsz = 0;
3570         String prefix = Program::getPrefix(buildflags);
3571         size_t prefixlen = prefix.length();
3572         if(clGetProgramInfo(handle, CL_PROGRAM_BINARY_SIZES, sizeof(progsz), &progsz, &retsz) != CL_SUCCESS)
3573             return String();
3574         AutoBuffer<uchar> bufbuf(prefixlen + progsz + 16);
3575         uchar* buf = bufbuf;
3576         memcpy(buf, prefix.c_str(), prefixlen);
3577         buf += prefixlen;
3578         if(clGetProgramInfo(handle, CL_PROGRAM_BINARIES, sizeof(buf), &buf, &retsz) != CL_SUCCESS)
3579             return String();
3580         buf[progsz] = (uchar)'\0';
3581         return String((const char*)(uchar*)bufbuf, prefixlen + progsz);
3582     }
3583 
~Implcv::ocl::Program::Impl3584     ~Impl()
3585     {
3586         if( handle )
3587         {
3588 #ifdef _WIN32
3589             if (!cv::__termination)
3590 #endif
3591             {
3592                 clReleaseProgram(handle);
3593             }
3594             handle = NULL;
3595         }
3596     }
3597 
3598     IMPLEMENT_REFCOUNTABLE();
3599 
3600     ProgramSource src;
3601     String buildflags;
3602     cl_program handle;
3603 };
3604 
3605 
Program()3606 Program::Program() { p = 0; }
3607 
Program(const ProgramSource & src,const String & buildflags,String & errmsg)3608 Program::Program(const ProgramSource& src,
3609         const String& buildflags, String& errmsg)
3610 {
3611     p = 0;
3612     create(src, buildflags, errmsg);
3613 }
3614 
Program(const Program & prog)3615 Program::Program(const Program& prog)
3616 {
3617     p = prog.p;
3618     if(p)
3619         p->addref();
3620 }
3621 
operator =(const Program & prog)3622 Program& Program::operator = (const Program& prog)
3623 {
3624     Impl* newp = (Impl*)prog.p;
3625     if(newp)
3626         newp->addref();
3627     if(p)
3628         p->release();
3629     p = newp;
3630     return *this;
3631 }
3632 
~Program()3633 Program::~Program()
3634 {
3635     if(p)
3636         p->release();
3637 }
3638 
create(const ProgramSource & src,const String & buildflags,String & errmsg)3639 bool Program::create(const ProgramSource& src,
3640             const String& buildflags, String& errmsg)
3641 {
3642     if(p)
3643         p->release();
3644     p = new Impl(src, buildflags, errmsg);
3645     if(!p->handle)
3646     {
3647         p->release();
3648         p = 0;
3649     }
3650     return p != 0;
3651 }
3652 
source() const3653 const ProgramSource& Program::source() const
3654 {
3655     static ProgramSource dummy;
3656     return p ? p->src : dummy;
3657 }
3658 
ptr() const3659 void* Program::ptr() const
3660 {
3661     return p ? p->handle : 0;
3662 }
3663 
read(const String & bin,const String & buildflags)3664 bool Program::read(const String& bin, const String& buildflags)
3665 {
3666     if(p)
3667         p->release();
3668     p = new Impl(bin, buildflags);
3669     return p->handle != 0;
3670 }
3671 
write(String & bin) const3672 bool Program::write(String& bin) const
3673 {
3674     if(!p)
3675         return false;
3676     bin = p->store();
3677     return !bin.empty();
3678 }
3679 
getPrefix() const3680 String Program::getPrefix() const
3681 {
3682     if(!p)
3683         return String();
3684     return getPrefix(p->buildflags);
3685 }
3686 
getPrefix(const String & buildflags)3687 String Program::getPrefix(const String& buildflags)
3688 {
3689     const Context& ctx = Context::getDefault();
3690     const Device& dev = ctx.device(0);
3691     return format("name=%s\ndriver=%s\nbuildflags=%s\n",
3692                   dev.name().c_str(), dev.driverVersion().c_str(), buildflags.c_str());
3693 }
3694 
3695 ///////////////////////////////////////// ProgramSource ///////////////////////////////////////////////
3696 
3697 struct ProgramSource::Impl
3698 {
Implcv::ocl::ProgramSource::Impl3699     Impl(const char* _src)
3700     {
3701         init(String(_src));
3702     }
Implcv::ocl::ProgramSource::Impl3703     Impl(const String& _src)
3704     {
3705         init(_src);
3706     }
initcv::ocl::ProgramSource::Impl3707     void init(const String& _src)
3708     {
3709         refcount = 1;
3710         src = _src;
3711         h = crc64((uchar*)src.c_str(), src.size());
3712     }
3713 
3714     IMPLEMENT_REFCOUNTABLE();
3715     String src;
3716     ProgramSource::hash_t h;
3717 };
3718 
3719 
ProgramSource()3720 ProgramSource::ProgramSource()
3721 {
3722     p = 0;
3723 }
3724 
ProgramSource(const char * prog)3725 ProgramSource::ProgramSource(const char* prog)
3726 {
3727     p = new Impl(prog);
3728 }
3729 
ProgramSource(const String & prog)3730 ProgramSource::ProgramSource(const String& prog)
3731 {
3732     p = new Impl(prog);
3733 }
3734 
~ProgramSource()3735 ProgramSource::~ProgramSource()
3736 {
3737     if(p)
3738         p->release();
3739 }
3740 
ProgramSource(const ProgramSource & prog)3741 ProgramSource::ProgramSource(const ProgramSource& prog)
3742 {
3743     p = prog.p;
3744     if(p)
3745         p->addref();
3746 }
3747 
operator =(const ProgramSource & prog)3748 ProgramSource& ProgramSource::operator = (const ProgramSource& prog)
3749 {
3750     Impl* newp = (Impl*)prog.p;
3751     if(newp)
3752         newp->addref();
3753     if(p)
3754         p->release();
3755     p = newp;
3756     return *this;
3757 }
3758 
source() const3759 const String& ProgramSource::source() const
3760 {
3761     static String dummy;
3762     return p ? p->src : dummy;
3763 }
3764 
hash() const3765 ProgramSource::hash_t ProgramSource::hash() const
3766 {
3767     return p ? p->h : 0;
3768 }
3769 
3770 //////////////////////////////////////////// OpenCLAllocator //////////////////////////////////////////////////
3771 
3772 template<typename T>
3773 class OpenCLBufferPool
3774 {
3775 protected:
~OpenCLBufferPool()3776     ~OpenCLBufferPool() { }
3777 public:
3778     virtual T allocate(size_t size) = 0;
3779     virtual void release(T buffer) = 0;
3780 };
3781 
3782 template <typename Derived, typename BufferEntry, typename T>
3783 class OpenCLBufferPoolBaseImpl : public BufferPoolController, public OpenCLBufferPool<T>
3784 {
3785 private:
derived()3786     inline Derived& derived() { return *static_cast<Derived*>(this); }
3787 protected:
3788     Mutex mutex_;
3789 
3790     size_t currentReservedSize;
3791     size_t maxReservedSize;
3792 
3793     std::list<BufferEntry> allocatedEntries_; // Allocated and used entries
3794     std::list<BufferEntry> reservedEntries_; // LRU order. Allocated, but not used entries
3795 
3796     // synchronized
_findAndRemoveEntryFromAllocatedList(CV_OUT BufferEntry & entry,T buffer)3797     bool _findAndRemoveEntryFromAllocatedList(CV_OUT BufferEntry& entry, T buffer)
3798     {
3799         typename std::list<BufferEntry>::iterator i = allocatedEntries_.begin();
3800         for (; i != allocatedEntries_.end(); ++i)
3801         {
3802             BufferEntry& e = *i;
3803             if (e.clBuffer_ == buffer)
3804             {
3805                 entry = e;
3806                 allocatedEntries_.erase(i);
3807                 return true;
3808             }
3809         }
3810         return false;
3811     }
3812 
3813     // synchronized
_findAndRemoveEntryFromReservedList(CV_OUT BufferEntry & entry,const size_t size)3814     bool _findAndRemoveEntryFromReservedList(CV_OUT BufferEntry& entry, const size_t size)
3815     {
3816         if (reservedEntries_.empty())
3817             return false;
3818         typename std::list<BufferEntry>::iterator i = reservedEntries_.begin();
3819         typename std::list<BufferEntry>::iterator result_pos = reservedEntries_.end();
3820         BufferEntry result;
3821         size_t minDiff = (size_t)(-1);
3822         for (; i != reservedEntries_.end(); ++i)
3823         {
3824             BufferEntry& e = *i;
3825             if (e.capacity_ >= size)
3826             {
3827                 size_t diff = e.capacity_ - size;
3828                 if (diff < size / 8 && (result_pos == reservedEntries_.end() || diff < minDiff))
3829                 {
3830                     minDiff = diff;
3831                     result_pos = i;
3832                     result = e;
3833                     if (diff == 0)
3834                         break;
3835                 }
3836             }
3837         }
3838         if (result_pos != reservedEntries_.end())
3839         {
3840             //CV_DbgAssert(result == *result_pos);
3841             reservedEntries_.erase(result_pos);
3842             entry = result;
3843             currentReservedSize -= entry.capacity_;
3844             allocatedEntries_.push_back(entry);
3845             return true;
3846         }
3847         return false;
3848     }
3849 
3850     // synchronized
_checkSizeOfReservedEntries()3851     void _checkSizeOfReservedEntries()
3852     {
3853         while (currentReservedSize > maxReservedSize)
3854         {
3855             CV_DbgAssert(!reservedEntries_.empty());
3856             const BufferEntry& entry = reservedEntries_.back();
3857             CV_DbgAssert(currentReservedSize >= entry.capacity_);
3858             currentReservedSize -= entry.capacity_;
3859             derived()._releaseBufferEntry(entry);
3860             reservedEntries_.pop_back();
3861         }
3862     }
3863 
_allocationGranularity(size_t size)3864     inline size_t _allocationGranularity(size_t size)
3865     {
3866         // heuristic values
3867         if (size < 1024)
3868             return 16;
3869         else if (size < 64*1024)
3870             return 64;
3871         else if (size < 1024*1024)
3872             return 4096;
3873         else if (size < 16*1024*1024)
3874             return 64*1024;
3875         else
3876             return 1024*1024;
3877     }
3878 
3879 public:
OpenCLBufferPoolBaseImpl()3880     OpenCLBufferPoolBaseImpl()
3881         : currentReservedSize(0),
3882           maxReservedSize(0)
3883     {
3884         // nothing
3885     }
~OpenCLBufferPoolBaseImpl()3886     virtual ~OpenCLBufferPoolBaseImpl()
3887     {
3888         freeAllReservedBuffers();
3889         CV_Assert(reservedEntries_.empty());
3890     }
3891 public:
allocate(size_t size)3892     virtual T allocate(size_t size)
3893     {
3894         AutoLock locker(mutex_);
3895         BufferEntry entry;
3896         if (maxReservedSize > 0 && _findAndRemoveEntryFromReservedList(entry, size))
3897         {
3898             CV_DbgAssert(size <= entry.capacity_);
3899             LOG_BUFFER_POOL("Reuse reserved buffer: %p\n", entry.clBuffer_);
3900         }
3901         else
3902         {
3903             derived()._allocateBufferEntry(entry, size);
3904         }
3905         return entry.clBuffer_;
3906     }
release(T buffer)3907     virtual void release(T buffer)
3908     {
3909         AutoLock locker(mutex_);
3910         BufferEntry entry;
3911         CV_Assert(_findAndRemoveEntryFromAllocatedList(entry, buffer));
3912         if (maxReservedSize == 0 || entry.capacity_ > maxReservedSize / 8)
3913         {
3914             derived()._releaseBufferEntry(entry);
3915         }
3916         else
3917         {
3918             reservedEntries_.push_front(entry);
3919             currentReservedSize += entry.capacity_;
3920             _checkSizeOfReservedEntries();
3921         }
3922     }
3923 
getReservedSize() const3924     virtual size_t getReservedSize() const { return currentReservedSize; }
getMaxReservedSize() const3925     virtual size_t getMaxReservedSize() const { return maxReservedSize; }
setMaxReservedSize(size_t size)3926     virtual void setMaxReservedSize(size_t size)
3927     {
3928         AutoLock locker(mutex_);
3929         size_t oldMaxReservedSize = maxReservedSize;
3930         maxReservedSize = size;
3931         if (maxReservedSize < oldMaxReservedSize)
3932         {
3933             typename std::list<BufferEntry>::iterator i = reservedEntries_.begin();
3934             for (; i != reservedEntries_.end();)
3935             {
3936                 const BufferEntry& entry = *i;
3937                 if (entry.capacity_ > maxReservedSize / 8)
3938                 {
3939                     CV_DbgAssert(currentReservedSize >= entry.capacity_);
3940                     currentReservedSize -= entry.capacity_;
3941                     derived()._releaseBufferEntry(entry);
3942                     i = reservedEntries_.erase(i);
3943                     continue;
3944                 }
3945                 ++i;
3946             }
3947             _checkSizeOfReservedEntries();
3948         }
3949     }
freeAllReservedBuffers()3950     virtual void freeAllReservedBuffers()
3951     {
3952         AutoLock locker(mutex_);
3953         typename std::list<BufferEntry>::const_iterator i = reservedEntries_.begin();
3954         for (; i != reservedEntries_.end(); ++i)
3955         {
3956             const BufferEntry& entry = *i;
3957             derived()._releaseBufferEntry(entry);
3958         }
3959         reservedEntries_.clear();
3960         currentReservedSize = 0;
3961     }
3962 };
3963 
3964 struct CLBufferEntry
3965 {
3966     cl_mem clBuffer_;
3967     size_t capacity_;
CLBufferEntrycv::ocl::CLBufferEntry3968     CLBufferEntry() : clBuffer_((cl_mem)NULL), capacity_(0) { }
3969 };
3970 
3971 class OpenCLBufferPoolImpl : public OpenCLBufferPoolBaseImpl<OpenCLBufferPoolImpl, CLBufferEntry, cl_mem>
3972 {
3973 public:
3974     typedef struct CLBufferEntry BufferEntry;
3975 protected:
3976     int createFlags_;
3977 public:
OpenCLBufferPoolImpl(int createFlags=0)3978     OpenCLBufferPoolImpl(int createFlags = 0)
3979         : createFlags_(createFlags)
3980     {
3981     }
3982 
_allocateBufferEntry(BufferEntry & entry,size_t size)3983     void _allocateBufferEntry(BufferEntry& entry, size_t size)
3984     {
3985         CV_DbgAssert(entry.clBuffer_ == NULL);
3986         entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));
3987         Context& ctx = Context::getDefault();
3988         cl_int retval = CL_SUCCESS;
3989         entry.clBuffer_ = clCreateBuffer((cl_context)ctx.ptr(), CL_MEM_READ_WRITE|createFlags_, entry.capacity_, 0, &retval);
3990         CV_Assert(retval == CL_SUCCESS);
3991         CV_Assert(entry.clBuffer_ != NULL);
3992         if(retval == CL_SUCCESS)
3993         {
3994             CV_IMPL_ADD(CV_IMPL_OCL);
3995         }
3996         LOG_BUFFER_POOL("OpenCL allocate %lld (0x%llx) bytes: %p\n",
3997                 (long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_);
3998         allocatedEntries_.push_back(entry);
3999     }
4000 
_releaseBufferEntry(const BufferEntry & entry)4001     void _releaseBufferEntry(const BufferEntry& entry)
4002     {
4003         CV_Assert(entry.capacity_ != 0);
4004         CV_Assert(entry.clBuffer_ != NULL);
4005         LOG_BUFFER_POOL("OpenCL release buffer: %p, %lld (0x%llx) bytes\n",
4006                 entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_);
4007         clReleaseMemObject(entry.clBuffer_);
4008     }
4009 };
4010 
4011 #ifdef HAVE_OPENCL_SVM
4012 struct CLSVMBufferEntry
4013 {
4014     void* clBuffer_;
4015     size_t capacity_;
CLSVMBufferEntrycv::ocl::CLSVMBufferEntry4016     CLSVMBufferEntry() : clBuffer_(NULL), capacity_(0) { }
4017 };
4018 class OpenCLSVMBufferPoolImpl : public OpenCLBufferPoolBaseImpl<OpenCLSVMBufferPoolImpl, CLSVMBufferEntry, void*>
4019 {
4020 public:
4021     typedef struct CLSVMBufferEntry BufferEntry;
4022 public:
OpenCLSVMBufferPoolImpl()4023     OpenCLSVMBufferPoolImpl()
4024     {
4025     }
4026 
_allocateBufferEntry(BufferEntry & entry,size_t size)4027     void _allocateBufferEntry(BufferEntry& entry, size_t size)
4028     {
4029         CV_DbgAssert(entry.clBuffer_ == NULL);
4030         entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));
4031 
4032         Context& ctx = Context::getDefault();
4033         const svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
4034         bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
4035         cl_svm_mem_flags memFlags = CL_MEM_READ_WRITE |
4036                 (isFineGrainBuffer ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0);
4037 
4038         const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4039         CV_DbgAssert(svmFns->isValid());
4040 
4041         CV_OPENCL_SVM_TRACE_P("clSVMAlloc: %d\n", (int)entry.capacity_);
4042         void *buf = svmFns->fn_clSVMAlloc((cl_context)ctx.ptr(), memFlags, entry.capacity_, 0);
4043         CV_Assert(buf);
4044 
4045         entry.clBuffer_ = buf;
4046         {
4047             CV_IMPL_ADD(CV_IMPL_OCL);
4048         }
4049         LOG_BUFFER_POOL("OpenCL SVM allocate %lld (0x%llx) bytes: %p\n",
4050                 (long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_);
4051         allocatedEntries_.push_back(entry);
4052     }
4053 
_releaseBufferEntry(const BufferEntry & entry)4054     void _releaseBufferEntry(const BufferEntry& entry)
4055     {
4056         CV_Assert(entry.capacity_ != 0);
4057         CV_Assert(entry.clBuffer_ != NULL);
4058         LOG_BUFFER_POOL("OpenCL release SVM buffer: %p, %lld (0x%llx) bytes\n",
4059                 entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_);
4060         Context& ctx = Context::getDefault();
4061         const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4062         CV_DbgAssert(svmFns->isValid());
4063         CV_OPENCL_SVM_TRACE_P("clSVMFree: %p\n",  entry.clBuffer_);
4064         svmFns->fn_clSVMFree((cl_context)ctx.ptr(), entry.clBuffer_);
4065     }
4066 };
4067 #endif
4068 
4069 
4070 
4071 #if defined _MSC_VER
4072 #pragma warning(disable:4127) // conditional expression is constant
4073 #endif
4074 template <bool readAccess, bool writeAccess>
4075 class AlignedDataPtr
4076 {
4077 protected:
4078     const size_t size_;
4079     uchar* const originPtr_;
4080     const size_t alignment_;
4081     uchar* ptr_;
4082     uchar* allocatedPtr_;
4083 
4084 public:
AlignedDataPtr(uchar * ptr,size_t size,size_t alignment)4085     AlignedDataPtr(uchar* ptr, size_t size, size_t alignment)
4086         : size_(size), originPtr_(ptr), alignment_(alignment), ptr_(ptr), allocatedPtr_(NULL)
4087     {
4088         CV_DbgAssert((alignment & (alignment - 1)) == 0); // check for 2^n
4089         if (((size_t)ptr_ & (alignment - 1)) != 0)
4090         {
4091             allocatedPtr_ = new uchar[size_ + alignment - 1];
4092             ptr_ = (uchar*)(((uintptr_t)allocatedPtr_ + (alignment - 1)) & ~(alignment - 1));
4093             if (readAccess)
4094             {
4095                 memcpy(ptr_, originPtr_, size_);
4096             }
4097         }
4098     }
4099 
getAlignedPtr() const4100     uchar* getAlignedPtr() const
4101     {
4102         CV_DbgAssert(((size_t)ptr_ & (alignment_ - 1)) == 0);
4103         return ptr_;
4104     }
4105 
~AlignedDataPtr()4106     ~AlignedDataPtr()
4107     {
4108         if (allocatedPtr_)
4109         {
4110             if (writeAccess)
4111             {
4112                 memcpy(originPtr_, ptr_, size_);
4113             }
4114             delete[] allocatedPtr_;
4115             allocatedPtr_ = NULL;
4116         }
4117         ptr_ = NULL;
4118     }
4119 private:
4120     AlignedDataPtr(const AlignedDataPtr&); // disabled
4121     AlignedDataPtr& operator=(const AlignedDataPtr&); // disabled
4122 };
4123 #if defined _MSC_VER
4124 #pragma warning(default:4127) // conditional expression is constant
4125 #endif
4126 
4127 #ifndef CV_OPENCL_DATA_PTR_ALIGNMENT
4128 #define CV_OPENCL_DATA_PTR_ALIGNMENT 16
4129 #endif
4130 
4131 class OpenCLAllocator : public MatAllocator
4132 {
4133     mutable OpenCLBufferPoolImpl bufferPool;
4134     mutable OpenCLBufferPoolImpl bufferPoolHostPtr;
4135 #ifdef  HAVE_OPENCL_SVM
4136     mutable OpenCLSVMBufferPoolImpl bufferPoolSVM;
4137 #endif
4138 
4139     enum AllocatorFlags
4140     {
4141         ALLOCATOR_FLAGS_BUFFER_POOL_USED = 1 << 0,
4142         ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED = 1 << 1
4143 #ifdef HAVE_OPENCL_SVM
4144         ,ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED = 1 << 2
4145 #endif
4146     };
4147 public:
OpenCLAllocator()4148     OpenCLAllocator()
4149         : bufferPool(0),
4150           bufferPoolHostPtr(CL_MEM_ALLOC_HOST_PTR)
4151     {
4152         size_t defaultPoolSize, poolSize;
4153         defaultPoolSize = ocl::Device::getDefault().isIntel() ? 1 << 27 : 0;
4154         poolSize = getConfigurationParameterForSize("OPENCV_OPENCL_BUFFERPOOL_LIMIT", defaultPoolSize);
4155         bufferPool.setMaxReservedSize(poolSize);
4156         poolSize = getConfigurationParameterForSize("OPENCV_OPENCL_HOST_PTR_BUFFERPOOL_LIMIT", defaultPoolSize);
4157         bufferPoolHostPtr.setMaxReservedSize(poolSize);
4158 #ifdef HAVE_OPENCL_SVM
4159         poolSize = getConfigurationParameterForSize("OPENCV_OPENCL_SVM_BUFFERPOOL_LIMIT", defaultPoolSize);
4160         bufferPoolSVM.setMaxReservedSize(poolSize);
4161 #endif
4162 
4163         matStdAllocator = Mat::getStdAllocator();
4164     }
4165 
defaultAllocate(int dims,const int * sizes,int type,void * data,size_t * step,int flags,UMatUsageFlags usageFlags) const4166     UMatData* defaultAllocate(int dims, const int* sizes, int type, void* data, size_t* step,
4167             int flags, UMatUsageFlags usageFlags) const
4168     {
4169         UMatData* u = matStdAllocator->allocate(dims, sizes, type, data, step, flags, usageFlags);
4170         return u;
4171     }
4172 
getBestFlags(const Context & ctx,int,UMatUsageFlags usageFlags,int & createFlags,int & flags0) const4173     void getBestFlags(const Context& ctx, int /*flags*/, UMatUsageFlags usageFlags, int& createFlags, int& flags0) const
4174     {
4175         const Device& dev = ctx.device(0);
4176         createFlags = 0;
4177         if ((usageFlags & USAGE_ALLOCATE_HOST_MEMORY) != 0)
4178             createFlags |= CL_MEM_ALLOC_HOST_PTR;
4179 
4180         if( dev.hostUnifiedMemory() )
4181             flags0 = 0;
4182         else
4183             flags0 = UMatData::COPY_ON_MAP;
4184     }
4185 
allocate(int dims,const int * sizes,int type,void * data,size_t * step,int flags,UMatUsageFlags usageFlags) const4186     UMatData* allocate(int dims, const int* sizes, int type,
4187                        void* data, size_t* step, int flags, UMatUsageFlags usageFlags) const
4188     {
4189         if(!useOpenCL())
4190             return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
4191         CV_Assert(data == 0);
4192         size_t total = CV_ELEM_SIZE(type);
4193         for( int i = dims-1; i >= 0; i-- )
4194         {
4195             if( step )
4196                 step[i] = total;
4197             total *= sizes[i];
4198         }
4199 
4200         Context& ctx = Context::getDefault();
4201 
4202         int createFlags = 0, flags0 = 0;
4203         getBestFlags(ctx, flags, usageFlags, createFlags, flags0);
4204 
4205         void* handle = NULL;
4206         int allocatorFlags = 0;
4207 
4208 #ifdef HAVE_OPENCL_SVM
4209         const svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
4210         if (ctx.useSVM() && svm::useSVM(usageFlags) && !svmCaps.isNoSVMSupport())
4211         {
4212             allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED;
4213             handle = bufferPoolSVM.allocate(total);
4214 
4215             // this property is constant, so single buffer pool can be used here
4216             bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
4217             allocatorFlags |= isFineGrainBuffer ? svm::OPENCL_SVM_FINE_GRAIN_BUFFER : svm::OPENCL_SVM_COARSE_GRAIN_BUFFER;
4218         }
4219         else
4220 #endif
4221         if (createFlags == 0)
4222         {
4223             allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_USED;
4224             handle = bufferPool.allocate(total);
4225         }
4226         else if (createFlags == CL_MEM_ALLOC_HOST_PTR)
4227         {
4228             allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED;
4229             handle = bufferPoolHostPtr.allocate(total);
4230         }
4231         else
4232         {
4233             CV_Assert(handle != NULL); // Unsupported, throw
4234         }
4235 
4236         if (!handle)
4237             return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
4238 
4239         UMatData* u = new UMatData(this);
4240         u->data = 0;
4241         u->size = total;
4242         u->handle = handle;
4243         u->flags = flags0;
4244         u->allocatorFlags_ = allocatorFlags;
4245         CV_DbgAssert(!u->tempUMat()); // for bufferPool.release() consistency in deallocate()
4246         return u;
4247     }
4248 
allocate(UMatData * u,int accessFlags,UMatUsageFlags usageFlags) const4249     bool allocate(UMatData* u, int accessFlags, UMatUsageFlags usageFlags) const
4250     {
4251         if(!u)
4252             return false;
4253 
4254         UMatDataAutoLock lock(u);
4255 
4256         if(u->handle == 0)
4257         {
4258             CV_Assert(u->origdata != 0);
4259             Context& ctx = Context::getDefault();
4260             int createFlags = 0, flags0 = 0;
4261             getBestFlags(ctx, accessFlags, usageFlags, createFlags, flags0);
4262 
4263             cl_context ctx_handle = (cl_context)ctx.ptr();
4264             int allocatorFlags = 0;
4265             int tempUMatFlags = 0;
4266             void* handle = NULL;
4267             cl_int retval = CL_SUCCESS;
4268 
4269 #ifdef HAVE_OPENCL_SVM
4270             svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
4271             bool useSVM = ctx.useSVM() && svm::useSVM(usageFlags);
4272             if (useSVM && svmCaps.isSupportFineGrainSystem())
4273             {
4274                 allocatorFlags = svm::OPENCL_SVM_FINE_GRAIN_SYSTEM;
4275                 tempUMatFlags = UMatData::TEMP_UMAT;
4276                 handle = u->origdata;
4277                 CV_OPENCL_SVM_TRACE_P("Use fine grain system: %d (%p)\n", (int)u->size, handle);
4278             }
4279             else if (useSVM && (svmCaps.isSupportFineGrainBuffer() || svmCaps.isSupportCoarseGrainBuffer()))
4280             {
4281                 if (!(accessFlags & ACCESS_FAST)) // memcpy used
4282                 {
4283                     bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
4284 
4285                     cl_svm_mem_flags memFlags = createFlags |
4286                             (isFineGrainBuffer ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0);
4287 
4288                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4289                     CV_DbgAssert(svmFns->isValid());
4290 
4291                     CV_OPENCL_SVM_TRACE_P("clSVMAlloc + copy: %d\n", (int)u->size);
4292                     handle = svmFns->fn_clSVMAlloc((cl_context)ctx.ptr(), memFlags, u->size, 0);
4293                     CV_Assert(handle);
4294 
4295                     cl_command_queue q = NULL;
4296                     if (!isFineGrainBuffer)
4297                     {
4298                         q = (cl_command_queue)Queue::getDefault().ptr();
4299                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", handle, (int)u->size);
4300                         cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_TRUE, CL_MAP_WRITE,
4301                                 handle, u->size,
4302                                 0, NULL, NULL);
4303                         CV_Assert(status == CL_SUCCESS);
4304 
4305                     }
4306                     memcpy(handle, u->origdata, u->size);
4307                     if (!isFineGrainBuffer)
4308                     {
4309                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", handle);
4310                         cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, handle, 0, NULL, NULL);
4311                         CV_Assert(status == CL_SUCCESS);
4312                     }
4313 
4314                     tempUMatFlags = UMatData::TEMP_UMAT | UMatData::TEMP_COPIED_UMAT;
4315                     allocatorFlags |= isFineGrainBuffer ? svm::OPENCL_SVM_FINE_GRAIN_BUFFER
4316                                                 : svm::OPENCL_SVM_COARSE_GRAIN_BUFFER;
4317                 }
4318             }
4319             else
4320 #endif
4321             {
4322                 tempUMatFlags = UMatData::TEMP_UMAT;
4323                 handle = clCreateBuffer(ctx_handle, CL_MEM_USE_HOST_PTR|createFlags,
4324                                            u->size, u->origdata, &retval);
4325                 if((!handle || retval < 0) && !(accessFlags & ACCESS_FAST))
4326                 {
4327                     handle = clCreateBuffer(ctx_handle, CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags,
4328                                                u->size, u->origdata, &retval);
4329                     tempUMatFlags |= UMatData::TEMP_COPIED_UMAT;
4330                 }
4331             }
4332             if(!handle || retval != CL_SUCCESS)
4333                 return false;
4334             u->handle = handle;
4335             u->prevAllocator = u->currAllocator;
4336             u->currAllocator = this;
4337             u->flags |= tempUMatFlags;
4338             u->allocatorFlags_ = allocatorFlags;
4339         }
4340         if(accessFlags & ACCESS_WRITE)
4341             u->markHostCopyObsolete(true);
4342         return true;
4343     }
4344 
4345     /*void sync(UMatData* u) const
4346     {
4347         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
4348         UMatDataAutoLock lock(u);
4349 
4350         if( u->hostCopyObsolete() && u->handle && u->refcount > 0 && u->origdata)
4351         {
4352             if( u->tempCopiedUMat() )
4353             {
4354                 clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
4355                                     u->size, u->origdata, 0, 0, 0);
4356             }
4357             else
4358             {
4359                 cl_int retval = 0;
4360                 void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
4361                                                 (CL_MAP_READ | CL_MAP_WRITE),
4362                                                 0, u->size, 0, 0, 0, &retval);
4363                 clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0);
4364                 clFinish(q);
4365             }
4366             u->markHostCopyObsolete(false);
4367         }
4368         else if( u->copyOnMap() && u->deviceCopyObsolete() && u->data )
4369         {
4370             clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
4371                                  u->size, u->data, 0, 0, 0);
4372         }
4373     }*/
4374 
deallocate(UMatData * u) const4375     void deallocate(UMatData* u) const
4376     {
4377         if(!u)
4378             return;
4379 
4380         CV_Assert(u->urefcount >= 0);
4381         CV_Assert(u->refcount >= 0);
4382 
4383         CV_Assert(u->handle != 0 && u->urefcount == 0);
4384         if(u->tempUMat())
4385         {
4386 //            UMatDataAutoLock lock(u);
4387 
4388             if( u->hostCopyObsolete() && u->refcount > 0 )
4389             {
4390 #ifdef HAVE_OPENCL_SVM
4391                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
4392                 {
4393                     Context& ctx = Context::getDefault();
4394                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4395                     CV_DbgAssert(svmFns->isValid());
4396 
4397                     if( u->tempCopiedUMat() )
4398                     {
4399                         CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
4400                                 (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER);
4401                         bool isFineGrainBuffer = (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER;
4402                         cl_command_queue q = NULL;
4403                         if (!isFineGrainBuffer)
4404                         {
4405                             CV_DbgAssert(((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0));
4406                             q = (cl_command_queue)Queue::getDefault().ptr();
4407                             CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
4408                             cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ,
4409                                     u->handle, u->size,
4410                                     0, NULL, NULL);
4411                             CV_Assert(status == CL_SUCCESS);
4412                         }
4413                         clFinish(q);
4414                         memcpy(u->origdata, u->handle, u->size);
4415                         if (!isFineGrainBuffer)
4416                         {
4417                             CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
4418                             cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle, 0, NULL, NULL);
4419                             CV_Assert(status == CL_SUCCESS);
4420                         }
4421                     }
4422                     else
4423                     {
4424                         CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM);
4425                         // nothing
4426                     }
4427                 }
4428                 else
4429 #endif
4430                 {
4431                     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
4432                     if( u->tempCopiedUMat() )
4433                     {
4434                         AlignedDataPtr<false, true> alignedPtr(u->origdata, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
4435                         CV_OclDbgAssert(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
4436                                             u->size, alignedPtr.getAlignedPtr(), 0, 0, 0) == CL_SUCCESS);
4437                     }
4438                     else
4439                     {
4440                         // TODO Is it really needed for clCreateBuffer with CL_MEM_USE_HOST_PTR?
4441                         cl_int retval = 0;
4442                         void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
4443                                                         (CL_MAP_READ | CL_MAP_WRITE),
4444                                                         0, u->size, 0, 0, 0, &retval);
4445                         CV_OclDbgAssert(retval == CL_SUCCESS);
4446                         CV_OclDbgAssert(clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0) == CL_SUCCESS);
4447                         CV_OclDbgAssert(clFinish(q) == CL_SUCCESS);
4448                     }
4449                 }
4450                 u->markHostCopyObsolete(false);
4451             }
4452 #ifdef HAVE_OPENCL_SVM
4453             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
4454             {
4455                 if( u->tempCopiedUMat() )
4456                 {
4457                     Context& ctx = Context::getDefault();
4458                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4459                     CV_DbgAssert(svmFns->isValid());
4460 
4461                     CV_OPENCL_SVM_TRACE_P("clSVMFree: %p\n", u->handle);
4462                     svmFns->fn_clSVMFree((cl_context)ctx.ptr(), u->handle);
4463                 }
4464             }
4465             else
4466 #endif
4467             {
4468                 clReleaseMemObject((cl_mem)u->handle);
4469             }
4470             u->handle = 0;
4471             u->currAllocator = u->prevAllocator;
4472             if(u->data && u->copyOnMap() && !(u->flags & UMatData::USER_ALLOCATED))
4473                 fastFree(u->data);
4474             u->data = u->origdata;
4475             if(u->refcount == 0)
4476                 u->currAllocator->deallocate(u);
4477         }
4478         else
4479         {
4480             CV_Assert(u->refcount == 0);
4481             if(u->data && u->copyOnMap() && !(u->flags & UMatData::USER_ALLOCATED))
4482             {
4483                 fastFree(u->data);
4484                 u->data = 0;
4485             }
4486             if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_USED)
4487             {
4488                 bufferPool.release((cl_mem)u->handle);
4489             }
4490             else if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED)
4491             {
4492                 bufferPoolHostPtr.release((cl_mem)u->handle);
4493             }
4494 #ifdef HAVE_OPENCL_SVM
4495             else if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED)
4496             {
4497                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
4498                 {
4499                     //nothing
4500                 }
4501                 else if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
4502                         (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
4503                 {
4504                     Context& ctx = Context::getDefault();
4505                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4506                     CV_DbgAssert(svmFns->isValid());
4507                     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
4508 
4509                     if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) != 0)
4510                     {
4511                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
4512                         cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle, 0, NULL, NULL);
4513                         CV_Assert(status == CL_SUCCESS);
4514                     }
4515                 }
4516                 bufferPoolSVM.release((void*)u->handle);
4517             }
4518 #endif
4519             else
4520             {
4521                 clReleaseMemObject((cl_mem)u->handle);
4522             }
4523             u->handle = 0;
4524             delete u;
4525         }
4526     }
4527 
map(UMatData * u,int accessFlags) const4528     void map(UMatData* u, int accessFlags) const
4529     {
4530         if(!u)
4531             return;
4532 
4533         CV_Assert( u->handle != 0 );
4534 
4535         UMatDataAutoLock autolock(u);
4536 
4537         if(accessFlags & ACCESS_WRITE)
4538             u->markDeviceCopyObsolete(true);
4539 
4540         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
4541 
4542         // FIXIT Workaround for UMat synchronization issue
4543         // if( u->refcount == 0 )
4544         {
4545             if( !u->copyOnMap() )
4546             {
4547                 // TODO
4548                 // because there can be other map requests for the same UMat with different access flags,
4549                 // we use the universal (read-write) access mode.
4550 #ifdef HAVE_OPENCL_SVM
4551                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
4552                 {
4553                     if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
4554                     {
4555                         Context& ctx = Context::getDefault();
4556                         const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4557                         CV_DbgAssert(svmFns->isValid());
4558 
4559                         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0)
4560                         {
4561                             CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
4562                             cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ | CL_MAP_WRITE,
4563                                     u->handle, u->size,
4564                                     0, NULL, NULL);
4565                             CV_Assert(status == CL_SUCCESS);
4566                             u->allocatorFlags_ |= svm::OPENCL_SVM_BUFFER_MAP;
4567                         }
4568                     }
4569                     clFinish(q);
4570                     u->data = (uchar*)u->handle;
4571                     u->markHostCopyObsolete(false);
4572                     u->markDeviceMemMapped(true);
4573                     return;
4574                 }
4575 #endif
4576                 if (u->data) // FIXIT Workaround for UMat synchronization issue
4577                 {
4578                     //CV_Assert(u->hostCopyObsolete() == false);
4579                     return;
4580                 }
4581 
4582                 cl_int retval = 0;
4583                 u->data = (uchar*)clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
4584                                                      (CL_MAP_READ | CL_MAP_WRITE),
4585                                                      0, u->size, 0, 0, 0, &retval);
4586                 if(u->data && retval == CL_SUCCESS)
4587                 {
4588                     u->markHostCopyObsolete(false);
4589                     u->markDeviceMemMapped(true);
4590                     return;
4591                 }
4592 
4593                 // TODO Is it really a good idea and was it tested well?
4594                 // if map failed, switch to copy-on-map mode for the particular buffer
4595                 u->flags |= UMatData::COPY_ON_MAP;
4596             }
4597 
4598             if(!u->data)
4599             {
4600                 u->data = (uchar*)fastMalloc(u->size);
4601                 u->markHostCopyObsolete(true);
4602             }
4603         }
4604 
4605         if( (accessFlags & ACCESS_READ) != 0 && u->hostCopyObsolete() )
4606         {
4607             AlignedDataPtr<false, true> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
4608 #ifdef HAVE_OPENCL_SVM
4609             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == 0);
4610 #endif
4611             CV_Assert( clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
4612                                            u->size, alignedPtr.getAlignedPtr(), 0, 0, 0) == CL_SUCCESS );
4613             u->markHostCopyObsolete(false);
4614         }
4615     }
4616 
unmap(UMatData * u) const4617     void unmap(UMatData* u) const
4618     {
4619         if(!u)
4620             return;
4621 
4622 
4623         CV_Assert(u->handle != 0);
4624 
4625         UMatDataAutoLock autolock(u);
4626 
4627         // FIXIT Workaround for UMat synchronization issue
4628         if(u->refcount > 0)
4629             return;
4630 
4631         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
4632         cl_int retval = 0;
4633         if( !u->copyOnMap() && u->deviceMemMapped() )
4634         {
4635             CV_Assert(u->data != NULL);
4636             u->markDeviceMemMapped(false);
4637 #ifdef HAVE_OPENCL_SVM
4638             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
4639             {
4640                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
4641                 {
4642                     Context& ctx = Context::getDefault();
4643                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4644                     CV_DbgAssert(svmFns->isValid());
4645 
4646                     CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) != 0);
4647                     {
4648                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
4649                         cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
4650                                 0, NULL, NULL);
4651                         CV_Assert(status == CL_SUCCESS);
4652                         clFinish(q);
4653                         u->allocatorFlags_ &= ~svm::OPENCL_SVM_BUFFER_MAP;
4654                     }
4655                 }
4656                 u->data = 0;
4657                 u->markDeviceCopyObsolete(false);
4658                 u->markHostCopyObsolete(false);
4659                 return;
4660             }
4661 #endif
4662             CV_Assert( (retval = clEnqueueUnmapMemObject(q,
4663                                 (cl_mem)u->handle, u->data, 0, 0, 0)) == CL_SUCCESS );
4664             if (Device::getDefault().isAMD())
4665             {
4666                 // required for multithreaded applications (see stitching test)
4667                 CV_OclDbgAssert(clFinish(q) == CL_SUCCESS);
4668             }
4669             u->data = 0;
4670         }
4671         else if( u->copyOnMap() && u->deviceCopyObsolete() )
4672         {
4673             AlignedDataPtr<true, false> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
4674 #ifdef HAVE_OPENCL_SVM
4675             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == 0);
4676 #endif
4677             CV_Assert( (retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
4678                                 u->size, alignedPtr.getAlignedPtr(), 0, 0, 0)) == CL_SUCCESS );
4679         }
4680         u->markDeviceCopyObsolete(false);
4681         u->markHostCopyObsolete(false);
4682     }
4683 
checkContinuous(int dims,const size_t sz[],const size_t srcofs[],const size_t srcstep[],const size_t dstofs[],const size_t dststep[],size_t & total,size_t new_sz[],size_t & srcrawofs,size_t new_srcofs[],size_t new_srcstep[],size_t & dstrawofs,size_t new_dstofs[],size_t new_dststep[]) const4684     bool checkContinuous(int dims, const size_t sz[],
4685                          const size_t srcofs[], const size_t srcstep[],
4686                          const size_t dstofs[], const size_t dststep[],
4687                          size_t& total, size_t new_sz[],
4688                          size_t& srcrawofs, size_t new_srcofs[], size_t new_srcstep[],
4689                          size_t& dstrawofs, size_t new_dstofs[], size_t new_dststep[]) const
4690     {
4691         bool iscontinuous = true;
4692         srcrawofs = srcofs ? srcofs[dims-1] : 0;
4693         dstrawofs = dstofs ? dstofs[dims-1] : 0;
4694         total = sz[dims-1];
4695         for( int i = dims-2; i >= 0; i-- )
4696         {
4697             if( i >= 0 && (total != srcstep[i] || total != dststep[i]) )
4698                 iscontinuous = false;
4699             total *= sz[i];
4700             if( srcofs )
4701                 srcrawofs += srcofs[i]*srcstep[i];
4702             if( dstofs )
4703                 dstrawofs += dstofs[i]*dststep[i];
4704         }
4705 
4706         if( !iscontinuous )
4707         {
4708             // OpenCL uses {x, y, z} order while OpenCV uses {z, y, x} order.
4709             if( dims == 2 )
4710             {
4711                 new_sz[0] = sz[1]; new_sz[1] = sz[0]; new_sz[2] = 1;
4712                 // we assume that new_... arrays are initialized by caller
4713                 // with 0's, so there is no else branch
4714                 if( srcofs )
4715                 {
4716                     new_srcofs[0] = srcofs[1];
4717                     new_srcofs[1] = srcofs[0];
4718                     new_srcofs[2] = 0;
4719                 }
4720 
4721                 if( dstofs )
4722                 {
4723                     new_dstofs[0] = dstofs[1];
4724                     new_dstofs[1] = dstofs[0];
4725                     new_dstofs[2] = 0;
4726                 }
4727 
4728                 new_srcstep[0] = srcstep[0]; new_srcstep[1] = 0;
4729                 new_dststep[0] = dststep[0]; new_dststep[1] = 0;
4730             }
4731             else
4732             {
4733                 // we could check for dims == 3 here,
4734                 // but from user perspective this one is more informative
4735                 CV_Assert(dims <= 3);
4736                 new_sz[0] = sz[2]; new_sz[1] = sz[1]; new_sz[2] = sz[0];
4737                 if( srcofs )
4738                 {
4739                     new_srcofs[0] = srcofs[2];
4740                     new_srcofs[1] = srcofs[1];
4741                     new_srcofs[2] = srcofs[0];
4742                 }
4743 
4744                 if( dstofs )
4745                 {
4746                     new_dstofs[0] = dstofs[2];
4747                     new_dstofs[1] = dstofs[1];
4748                     new_dstofs[2] = dstofs[0];
4749                 }
4750 
4751                 new_srcstep[0] = srcstep[1]; new_srcstep[1] = srcstep[0];
4752                 new_dststep[0] = dststep[1]; new_dststep[1] = dststep[0];
4753             }
4754         }
4755         return iscontinuous;
4756     }
4757 
download(UMatData * u,void * dstptr,int dims,const size_t sz[],const size_t srcofs[],const size_t srcstep[],const size_t dststep[]) const4758     void download(UMatData* u, void* dstptr, int dims, const size_t sz[],
4759                   const size_t srcofs[], const size_t srcstep[],
4760                   const size_t dststep[]) const
4761     {
4762         if(!u)
4763             return;
4764         UMatDataAutoLock autolock(u);
4765 
4766         if( u->data && !u->hostCopyObsolete() )
4767         {
4768             Mat::getStdAllocator()->download(u, dstptr, dims, sz, srcofs, srcstep, dststep);
4769             return;
4770         }
4771         CV_Assert( u->handle != 0 );
4772 
4773         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
4774 
4775         size_t total = 0, new_sz[] = {0, 0, 0};
4776         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
4777         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
4778 
4779         bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, 0, dststep,
4780                                             total, new_sz,
4781                                             srcrawofs, new_srcofs, new_srcstep,
4782                                             dstrawofs, new_dstofs, new_dststep);
4783 
4784 #ifdef HAVE_OPENCL_SVM
4785         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
4786         {
4787             CV_DbgAssert(u->data == NULL || u->data == u->handle);
4788             Context& ctx = Context::getDefault();
4789             const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4790             CV_DbgAssert(svmFns->isValid());
4791 
4792             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0);
4793             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
4794             {
4795                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
4796                 cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ,
4797                         u->handle, u->size,
4798                         0, NULL, NULL);
4799                 CV_Assert(status == CL_SUCCESS);
4800             }
4801             clFinish(q);
4802             if( iscontinuous )
4803             {
4804                 memcpy(dstptr, (uchar*)u->handle + srcrawofs, total);
4805             }
4806             else
4807             {
4808                 // This code is from MatAllocator::download()
4809                 int isz[CV_MAX_DIM];
4810                 uchar* srcptr = (uchar*)u->handle;
4811                 for( int i = 0; i < dims; i++ )
4812                 {
4813                     CV_Assert( sz[i] <= (size_t)INT_MAX );
4814                     if( sz[i] == 0 )
4815                     return;
4816                     if( srcofs )
4817                     srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1);
4818                     isz[i] = (int)sz[i];
4819                 }
4820 
4821                 Mat src(dims, isz, CV_8U, srcptr, srcstep);
4822                 Mat dst(dims, isz, CV_8U, dstptr, dststep);
4823 
4824                 const Mat* arrays[] = { &src, &dst };
4825                 uchar* ptrs[2];
4826                 NAryMatIterator it(arrays, ptrs, 2);
4827                 size_t j, planesz = it.size;
4828 
4829                 for( j = 0; j < it.nplanes; j++, ++it )
4830                     memcpy(ptrs[1], ptrs[0], planesz);
4831             }
4832             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
4833             {
4834                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
4835                 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
4836                         0, NULL, NULL);
4837                 CV_Assert(status == CL_SUCCESS);
4838                 clFinish(q);
4839             }
4840         }
4841         else
4842 #endif
4843         {
4844             AlignedDataPtr<false, true> alignedPtr((uchar*)dstptr, sz[0] * dststep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
4845             if( iscontinuous )
4846             {
4847                 CV_Assert( clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
4848                                                srcrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0) >= 0 );
4849             }
4850             else
4851             {
4852                 CV_Assert( clEnqueueReadBufferRect(q, (cl_mem)u->handle, CL_TRUE,
4853                                 new_srcofs, new_dstofs, new_sz, new_srcstep[0], new_srcstep[1],
4854                                 new_dststep[0], new_dststep[1], alignedPtr.getAlignedPtr(), 0, 0, 0) >= 0 );
4855             }
4856         }
4857     }
4858 
upload(UMatData * u,const void * srcptr,int dims,const size_t sz[],const size_t dstofs[],const size_t dststep[],const size_t srcstep[]) const4859     void upload(UMatData* u, const void* srcptr, int dims, const size_t sz[],
4860                 const size_t dstofs[], const size_t dststep[],
4861                 const size_t srcstep[]) const
4862     {
4863         if(!u)
4864             return;
4865 
4866         // there should be no user-visible CPU copies of the UMat which we are going to copy to
4867         CV_Assert(u->refcount == 0 || u->tempUMat());
4868 
4869         size_t total = 0, new_sz[] = {0, 0, 0};
4870         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
4871         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
4872 
4873         bool iscontinuous = checkContinuous(dims, sz, 0, srcstep, dstofs, dststep,
4874                                             total, new_sz,
4875                                             srcrawofs, new_srcofs, new_srcstep,
4876                                             dstrawofs, new_dstofs, new_dststep);
4877 
4878         UMatDataAutoLock autolock(u);
4879 
4880         // if there is cached CPU copy of the GPU matrix,
4881         // we could use it as a destination.
4882         // we can do it in 2 cases:
4883         //    1. we overwrite the whole content
4884         //    2. we overwrite part of the matrix, but the GPU copy is out-of-date
4885         if( u->data && (u->hostCopyObsolete() < u->deviceCopyObsolete() || total == u->size))
4886         {
4887             Mat::getStdAllocator()->upload(u, srcptr, dims, sz, dstofs, dststep, srcstep);
4888             u->markHostCopyObsolete(false);
4889             u->markDeviceCopyObsolete(true);
4890             return;
4891         }
4892 
4893         CV_Assert( u->handle != 0 );
4894         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
4895 
4896 #ifdef HAVE_OPENCL_SVM
4897         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
4898         {
4899             CV_DbgAssert(u->data == NULL || u->data == u->handle);
4900             Context& ctx = Context::getDefault();
4901             const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4902             CV_DbgAssert(svmFns->isValid());
4903 
4904             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0);
4905             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
4906             {
4907                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
4908                 cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_WRITE,
4909                         u->handle, u->size,
4910                         0, NULL, NULL);
4911                 CV_Assert(status == CL_SUCCESS);
4912             }
4913             clFinish(q);
4914             if( iscontinuous )
4915             {
4916                 memcpy((uchar*)u->handle + dstrawofs, srcptr, total);
4917             }
4918             else
4919             {
4920                 // This code is from MatAllocator::upload()
4921                 int isz[CV_MAX_DIM];
4922                 uchar* dstptr = (uchar*)u->handle;
4923                 for( int i = 0; i < dims; i++ )
4924                 {
4925                     CV_Assert( sz[i] <= (size_t)INT_MAX );
4926                     if( sz[i] == 0 )
4927                     return;
4928                     if( dstofs )
4929                     dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1);
4930                     isz[i] = (int)sz[i];
4931                 }
4932 
4933                 Mat src(dims, isz, CV_8U, (void*)srcptr, srcstep);
4934                 Mat dst(dims, isz, CV_8U, dstptr, dststep);
4935 
4936                 const Mat* arrays[] = { &src, &dst };
4937                 uchar* ptrs[2];
4938                 NAryMatIterator it(arrays, ptrs, 2);
4939                 size_t j, planesz = it.size;
4940 
4941                 for( j = 0; j < it.nplanes; j++, ++it )
4942                     memcpy(ptrs[1], ptrs[0], planesz);
4943             }
4944             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
4945             {
4946                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
4947                 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
4948                         0, NULL, NULL);
4949                 CV_Assert(status == CL_SUCCESS);
4950                 clFinish(q);
4951             }
4952         }
4953         else
4954 #endif
4955         {
4956             AlignedDataPtr<true, false> alignedPtr((uchar*)srcptr, sz[0] * srcstep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
4957             if( iscontinuous )
4958             {
4959                 CV_Assert( clEnqueueWriteBuffer(q, (cl_mem)u->handle,
4960                     CL_TRUE, dstrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0) >= 0 );
4961             }
4962             else
4963             {
4964                 CV_Assert( clEnqueueWriteBufferRect(q, (cl_mem)u->handle, CL_TRUE,
4965                     new_dstofs, new_srcofs, new_sz, new_dststep[0], new_dststep[1],
4966                     new_srcstep[0], new_srcstep[1], alignedPtr.getAlignedPtr(), 0, 0, 0) >= 0 );
4967             }
4968         }
4969         u->markHostCopyObsolete(true);
4970 #ifdef HAVE_OPENCL_SVM
4971         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
4972                 (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
4973         {
4974             // nothing
4975         }
4976         else
4977 #endif
4978         {
4979             u->markHostCopyObsolete(true);
4980         }
4981         u->markDeviceCopyObsolete(false);
4982     }
4983 
copy(UMatData * src,UMatData * dst,int dims,const size_t sz[],const size_t srcofs[],const size_t srcstep[],const size_t dstofs[],const size_t dststep[],bool _sync) const4984     void copy(UMatData* src, UMatData* dst, int dims, const size_t sz[],
4985               const size_t srcofs[], const size_t srcstep[],
4986               const size_t dstofs[], const size_t dststep[], bool _sync) const
4987     {
4988         if(!src || !dst)
4989             return;
4990 
4991         size_t total = 0, new_sz[] = {0, 0, 0};
4992         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
4993         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
4994 
4995         bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, dstofs, dststep,
4996                                             total, new_sz,
4997                                             srcrawofs, new_srcofs, new_srcstep,
4998                                             dstrawofs, new_dstofs, new_dststep);
4999 
5000         UMatDataAutoLock src_autolock(src);
5001         UMatDataAutoLock dst_autolock(dst);
5002 
5003         if( !src->handle || (src->data && src->hostCopyObsolete() < src->deviceCopyObsolete()) )
5004         {
5005             upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
5006             return;
5007         }
5008         if( !dst->handle || (dst->data && dst->hostCopyObsolete() < dst->deviceCopyObsolete()) )
5009         {
5010             download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
5011             dst->markHostCopyObsolete(false);
5012 #ifdef HAVE_OPENCL_SVM
5013             if ((dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
5014                     (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
5015             {
5016                 // nothing
5017             }
5018             else
5019 #endif
5020             {
5021                 dst->markDeviceCopyObsolete(true);
5022             }
5023             return;
5024         }
5025 
5026         // there should be no user-visible CPU copies of the UMat which we are going to copy to
5027         CV_Assert(dst->refcount == 0);
5028         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5029 
5030         cl_int retval = CL_SUCCESS;
5031 #ifdef HAVE_OPENCL_SVM
5032         if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0 ||
5033                 (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5034         {
5035             if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0 &&
5036                             (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5037             {
5038                 Context& ctx = Context::getDefault();
5039                 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5040                 CV_DbgAssert(svmFns->isValid());
5041 
5042                 if( iscontinuous )
5043                 {
5044                     CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMemcpy: %p <-- %p (%d)\n",
5045                             (uchar*)dst->handle + dstrawofs, (uchar*)src->handle + srcrawofs, (int)total);
5046                     cl_int status = svmFns->fn_clEnqueueSVMMemcpy(q, CL_TRUE,
5047                             (uchar*)dst->handle + dstrawofs, (uchar*)src->handle + srcrawofs,
5048                             total, 0, NULL, NULL);
5049                     CV_Assert(status == CL_SUCCESS);
5050                 }
5051                 else
5052                 {
5053                     clFinish(q);
5054                     // This code is from MatAllocator::download()/upload()
5055                     int isz[CV_MAX_DIM];
5056                     uchar* srcptr = (uchar*)src->handle;
5057                     for( int i = 0; i < dims; i++ )
5058                     {
5059                         CV_Assert( sz[i] <= (size_t)INT_MAX );
5060                         if( sz[i] == 0 )
5061                         return;
5062                         if( srcofs )
5063                         srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1);
5064                         isz[i] = (int)sz[i];
5065                     }
5066                     Mat m_src(dims, isz, CV_8U, srcptr, srcstep);
5067 
5068                     uchar* dstptr = (uchar*)dst->handle;
5069                     for( int i = 0; i < dims; i++ )
5070                     {
5071                         if( dstofs )
5072                         dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1);
5073                     }
5074                     Mat m_dst(dims, isz, CV_8U, dstptr, dststep);
5075 
5076                     const Mat* arrays[] = { &m_src, &m_dst };
5077                     uchar* ptrs[2];
5078                     NAryMatIterator it(arrays, ptrs, 2);
5079                     size_t j, planesz = it.size;
5080 
5081                     for( j = 0; j < it.nplanes; j++, ++it )
5082                         memcpy(ptrs[1], ptrs[0], planesz);
5083                 }
5084             }
5085             else
5086             {
5087                 if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5088                 {
5089                     map(src, ACCESS_READ);
5090                     upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
5091                     unmap(src);
5092                 }
5093                 else
5094                 {
5095                     map(dst, ACCESS_WRITE);
5096                     download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
5097                     unmap(dst);
5098                 }
5099             }
5100         }
5101         else
5102 #endif
5103         {
5104             if( iscontinuous )
5105             {
5106                 CV_Assert( (retval = clEnqueueCopyBuffer(q, (cl_mem)src->handle, (cl_mem)dst->handle,
5107                                                srcrawofs, dstrawofs, total, 0, 0, 0)) == CL_SUCCESS );
5108             }
5109             else
5110             {
5111                 CV_Assert( (retval = clEnqueueCopyBufferRect(q, (cl_mem)src->handle, (cl_mem)dst->handle,
5112                                                    new_srcofs, new_dstofs, new_sz,
5113                                                    new_srcstep[0], new_srcstep[1],
5114                                                    new_dststep[0], new_dststep[1],
5115                                                    0, 0, 0)) == CL_SUCCESS );
5116             }
5117         }
5118         if (retval == CL_SUCCESS)
5119         {
5120             CV_IMPL_ADD(CV_IMPL_OCL)
5121         }
5122 
5123 #ifdef HAVE_OPENCL_SVM
5124         if ((dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
5125                 (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
5126         {
5127             // nothing
5128         }
5129         else
5130 #endif
5131         {
5132             dst->markHostCopyObsolete(true);
5133         }
5134         dst->markDeviceCopyObsolete(false);
5135 
5136         if( _sync )
5137         {
5138             CV_OclDbgAssert(clFinish(q) == CL_SUCCESS);
5139         }
5140     }
5141 
getBufferPoolController(const char * id) const5142     BufferPoolController* getBufferPoolController(const char* id) const {
5143 #ifdef HAVE_OPENCL_SVM
5144         if ((svm::checkForceSVMUmatUsage() && (id == NULL || strcmp(id, "OCL") == 0)) || (id != NULL && strcmp(id, "SVM") == 0))
5145         {
5146             return &bufferPoolSVM;
5147         }
5148 #endif
5149         if (id != NULL && strcmp(id, "HOST_ALLOC") == 0)
5150         {
5151             return &bufferPoolHostPtr;
5152         }
5153         if (id != NULL && strcmp(id, "OCL") != 0)
5154         {
5155             CV_ErrorNoReturn(cv::Error::StsBadArg, "getBufferPoolController(): unknown BufferPool ID\n");
5156         }
5157         return &bufferPool;
5158     }
5159 
5160     MatAllocator* matStdAllocator;
5161 };
5162 
getOpenCLAllocator()5163 MatAllocator* getOpenCLAllocator()
5164 {
5165     static MatAllocator * allocator = new OpenCLAllocator();
5166     return allocator;
5167 }
5168 
5169 ///////////////////////////////////////////// Utility functions /////////////////////////////////////////////////
5170 
getDevices(std::vector<cl_device_id> & devices,cl_platform_id platform)5171 static void getDevices(std::vector<cl_device_id>& devices, cl_platform_id platform)
5172 {
5173     cl_uint numDevices = 0;
5174     CV_OclDbgAssert(clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL,
5175                                 0, NULL, &numDevices) == CL_SUCCESS);
5176 
5177     if (numDevices == 0)
5178     {
5179         devices.clear();
5180         return;
5181     }
5182 
5183     devices.resize((size_t)numDevices);
5184     CV_OclDbgAssert(clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL,
5185                                 numDevices, &devices[0], &numDevices) == CL_SUCCESS);
5186 }
5187 
5188 struct PlatformInfo::Impl
5189 {
Implcv::ocl::PlatformInfo::Impl5190     Impl(void* id)
5191     {
5192         refcount = 1;
5193         handle = *(cl_platform_id*)id;
5194         getDevices(devices, handle);
5195     }
5196 
getStrPropcv::ocl::PlatformInfo::Impl5197     String getStrProp(cl_device_info prop) const
5198     {
5199         char buf[1024];
5200         size_t sz=0;
5201         return clGetPlatformInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
5202             sz < sizeof(buf) ? String(buf) : String();
5203     }
5204 
5205     IMPLEMENT_REFCOUNTABLE();
5206     std::vector<cl_device_id> devices;
5207     cl_platform_id handle;
5208 };
5209 
PlatformInfo()5210 PlatformInfo::PlatformInfo()
5211 {
5212     p = 0;
5213 }
5214 
PlatformInfo(void * platform_id)5215 PlatformInfo::PlatformInfo(void* platform_id)
5216 {
5217     p = new Impl(platform_id);
5218 }
5219 
~PlatformInfo()5220 PlatformInfo::~PlatformInfo()
5221 {
5222     if(p)
5223         p->release();
5224 }
5225 
PlatformInfo(const PlatformInfo & i)5226 PlatformInfo::PlatformInfo(const PlatformInfo& i)
5227 {
5228     if (i.p)
5229         i.p->addref();
5230     p = i.p;
5231 }
5232 
operator =(const PlatformInfo & i)5233 PlatformInfo& PlatformInfo::operator =(const PlatformInfo& i)
5234 {
5235     if (i.p != p)
5236     {
5237         if (i.p)
5238             i.p->addref();
5239         if (p)
5240             p->release();
5241         p = i.p;
5242     }
5243     return *this;
5244 }
5245 
deviceNumber() const5246 int PlatformInfo::deviceNumber() const
5247 {
5248     return p ? (int)p->devices.size() : 0;
5249 }
5250 
getDevice(Device & device,int d) const5251 void PlatformInfo::getDevice(Device& device, int d) const
5252 {
5253     CV_Assert(p && d < (int)p->devices.size() );
5254     if(p)
5255         device.set(p->devices[d]);
5256 }
5257 
name() const5258 String PlatformInfo::name() const
5259 {
5260     return p ? p->getStrProp(CL_PLATFORM_NAME) : String();
5261 }
5262 
vendor() const5263 String PlatformInfo::vendor() const
5264 {
5265     return p ? p->getStrProp(CL_PLATFORM_VENDOR) : String();
5266 }
5267 
version() const5268 String PlatformInfo::version() const
5269 {
5270     return p ? p->getStrProp(CL_PLATFORM_VERSION) : String();
5271 }
5272 
getPlatforms(std::vector<cl_platform_id> & platforms)5273 static void getPlatforms(std::vector<cl_platform_id>& platforms)
5274 {
5275     cl_uint numPlatforms = 0;
5276     CV_OclDbgAssert(clGetPlatformIDs(0, NULL, &numPlatforms) == CL_SUCCESS);
5277 
5278     if (numPlatforms == 0)
5279     {
5280         platforms.clear();
5281         return;
5282     }
5283 
5284     platforms.resize((size_t)numPlatforms);
5285     CV_OclDbgAssert(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms) == CL_SUCCESS);
5286 }
5287 
getPlatfomsInfo(std::vector<PlatformInfo> & platformsInfo)5288 void getPlatfomsInfo(std::vector<PlatformInfo>& platformsInfo)
5289 {
5290     std::vector<cl_platform_id> platforms;
5291     getPlatforms(platforms);
5292 
5293     for (size_t i = 0; i < platforms.size(); i++)
5294         platformsInfo.push_back( PlatformInfo((void*)&platforms[i]) );
5295 }
5296 
typeToStr(int type)5297 const char* typeToStr(int type)
5298 {
5299     static const char* tab[]=
5300     {
5301         "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
5302         "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
5303         "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
5304         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
5305         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
5306         "float", "float2", "float3", "float4", 0, 0, 0, "float8", 0, 0, 0, 0, 0, 0, 0, "float16",
5307         "double", "double2", "double3", "double4", 0, 0, 0, "double8", 0, 0, 0, 0, 0, 0, 0, "double16",
5308         "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
5309     };
5310     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
5311     return cn > 16 ? "?" : tab[depth*16 + cn-1];
5312 }
5313 
memopTypeToStr(int type)5314 const char* memopTypeToStr(int type)
5315 {
5316     static const char* tab[] =
5317     {
5318         "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
5319         "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
5320         "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
5321         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
5322         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
5323         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
5324         "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
5325         "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
5326     };
5327     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
5328     return cn > 16 ? "?" : tab[depth*16 + cn-1];
5329 }
5330 
vecopTypeToStr(int type)5331 const char* vecopTypeToStr(int type)
5332 {
5333     static const char* tab[] =
5334     {
5335         "uchar", "short", "uchar3", "int", 0, 0, 0, "int2", 0, 0, 0, 0, 0, 0, 0, "int4",
5336         "char", "short", "char3", "int", 0, 0, 0, "int2", 0, 0, 0, 0, 0, 0, 0, "int4",
5337         "ushort", "int", "ushort3", "int2",0, 0, 0, "int4", 0, 0, 0, 0, 0, 0, 0, "int8",
5338         "short", "int", "short3", "int2", 0, 0, 0, "int4", 0, 0, 0, 0, 0, 0, 0, "int8",
5339         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
5340         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
5341         "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
5342         "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
5343     };
5344     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
5345     return cn > 16 ? "?" : tab[depth*16 + cn-1];
5346 }
5347 
convertTypeStr(int sdepth,int ddepth,int cn,char * buf)5348 const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf)
5349 {
5350     if( sdepth == ddepth )
5351         return "noconvert";
5352     const char *typestr = typeToStr(CV_MAKETYPE(ddepth, cn));
5353     if( ddepth >= CV_32F ||
5354         (ddepth == CV_32S && sdepth < CV_32S) ||
5355         (ddepth == CV_16S && sdepth <= CV_8S) ||
5356         (ddepth == CV_16U && sdepth == CV_8U))
5357     {
5358         sprintf(buf, "convert_%s", typestr);
5359     }
5360     else if( sdepth >= CV_32F )
5361         sprintf(buf, "convert_%s%s_rte", typestr, (ddepth < CV_32S ? "_sat" : ""));
5362     else
5363         sprintf(buf, "convert_%s_sat", typestr);
5364 
5365     return buf;
5366 }
5367 
5368 template <typename T>
kerToStr(const Mat & k)5369 static std::string kerToStr(const Mat & k)
5370 {
5371     int width = k.cols - 1, depth = k.depth();
5372     const T * const data = k.ptr<T>();
5373 
5374     std::ostringstream stream;
5375     stream.precision(10);
5376 
5377     if (depth <= CV_8S)
5378     {
5379         for (int i = 0; i < width; ++i)
5380             stream << "DIG(" << (int)data[i] << ")";
5381         stream << "DIG(" << (int)data[width] << ")";
5382     }
5383     else if (depth == CV_32F)
5384     {
5385         stream.setf(std::ios_base::showpoint);
5386         for (int i = 0; i < width; ++i)
5387             stream << "DIG(" << data[i] << "f)";
5388         stream << "DIG(" << data[width] << "f)";
5389     }
5390     else
5391     {
5392         for (int i = 0; i < width; ++i)
5393             stream << "DIG(" << data[i] << ")";
5394         stream << "DIG(" << data[width] << ")";
5395     }
5396 
5397     return stream.str();
5398 }
5399 
kernelToStr(InputArray _kernel,int ddepth,const char * name)5400 String kernelToStr(InputArray _kernel, int ddepth, const char * name)
5401 {
5402     Mat kernel = _kernel.getMat().reshape(1, 1);
5403 
5404     int depth = kernel.depth();
5405     if (ddepth < 0)
5406         ddepth = depth;
5407 
5408     if (ddepth != depth)
5409         kernel.convertTo(kernel, ddepth);
5410 
5411     typedef std::string (* func_t)(const Mat &);
5412     static const func_t funcs[] = { kerToStr<uchar>, kerToStr<char>, kerToStr<ushort>, kerToStr<short>,
5413                                     kerToStr<int>, kerToStr<float>, kerToStr<double>, 0 };
5414     const func_t func = funcs[ddepth];
5415     CV_Assert(func != 0);
5416 
5417     return cv::format(" -D %s=%s", name ? name : "COEFF", func(kernel).c_str());
5418 }
5419 
5420 #define PROCESS_SRC(src) \
5421     do \
5422     { \
5423         if (!src.empty()) \
5424         { \
5425             CV_Assert(src.isMat() || src.isUMat()); \
5426             Size csize = src.size(); \
5427             int ctype = src.type(), ccn = CV_MAT_CN(ctype), cdepth = CV_MAT_DEPTH(ctype), \
5428                 ckercn = vectorWidths[cdepth], cwidth = ccn * csize.width; \
5429             if (cwidth < ckercn || ckercn <= 0) \
5430                 return 1; \
5431             cols.push_back(cwidth); \
5432             if (strat == OCL_VECTOR_OWN && ctype != ref_type) \
5433                 return 1; \
5434             offsets.push_back(src.offset()); \
5435             steps.push_back(src.step()); \
5436             dividers.push_back(ckercn * CV_ELEM_SIZE1(ctype)); \
5437             kercns.push_back(ckercn); \
5438         } \
5439     } \
5440     while ((void)0, 0)
5441 
predictOptimalVectorWidth(InputArray src1,InputArray src2,InputArray src3,InputArray src4,InputArray src5,InputArray src6,InputArray src7,InputArray src8,InputArray src9,OclVectorStrategy strat)5442 int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,
5443                               InputArray src4, InputArray src5, InputArray src6,
5444                               InputArray src7, InputArray src8, InputArray src9,
5445                               OclVectorStrategy strat)
5446 {
5447     const ocl::Device & d = ocl::Device::getDefault();
5448 
5449     int vectorWidths[] = { d.preferredVectorWidthChar(), d.preferredVectorWidthChar(),
5450         d.preferredVectorWidthShort(), d.preferredVectorWidthShort(),
5451         d.preferredVectorWidthInt(), d.preferredVectorWidthFloat(),
5452         d.preferredVectorWidthDouble(), -1 };
5453 
5454     // if the device says don't use vectors
5455     if (vectorWidths[0] == 1)
5456     {
5457         // it's heuristic
5458         vectorWidths[CV_8U] = vectorWidths[CV_8S] = 4;
5459         vectorWidths[CV_16U] = vectorWidths[CV_16S] = 2;
5460         vectorWidths[CV_32S] = vectorWidths[CV_32F] = vectorWidths[CV_64F] = 1;
5461     }
5462 
5463     return checkOptimalVectorWidth(vectorWidths, src1, src2, src3, src4, src5, src6, src7, src8, src9, strat);
5464 }
5465 
checkOptimalVectorWidth(const int * vectorWidths,InputArray src1,InputArray src2,InputArray src3,InputArray src4,InputArray src5,InputArray src6,InputArray src7,InputArray src8,InputArray src9,OclVectorStrategy strat)5466 int checkOptimalVectorWidth(const int *vectorWidths,
5467                             InputArray src1, InputArray src2, InputArray src3,
5468                             InputArray src4, InputArray src5, InputArray src6,
5469                             InputArray src7, InputArray src8, InputArray src9,
5470                             OclVectorStrategy strat)
5471 {
5472     CV_Assert(vectorWidths);
5473 
5474     int ref_type = src1.type();
5475 
5476     std::vector<size_t> offsets, steps, cols;
5477     std::vector<int> dividers, kercns;
5478     PROCESS_SRC(src1);
5479     PROCESS_SRC(src2);
5480     PROCESS_SRC(src3);
5481     PROCESS_SRC(src4);
5482     PROCESS_SRC(src5);
5483     PROCESS_SRC(src6);
5484     PROCESS_SRC(src7);
5485     PROCESS_SRC(src8);
5486     PROCESS_SRC(src9);
5487 
5488     size_t size = offsets.size();
5489 
5490     for (size_t i = 0; i < size; ++i)
5491         while (offsets[i] % dividers[i] != 0 || steps[i] % dividers[i] != 0 || cols[i] % kercns[i] != 0)
5492             dividers[i] >>= 1, kercns[i] >>= 1;
5493 
5494     // default strategy
5495     int kercn = *std::min_element(kercns.begin(), kercns.end());
5496 
5497     return kercn;
5498 }
5499 
predictOptimalVectorWidthMax(InputArray src1,InputArray src2,InputArray src3,InputArray src4,InputArray src5,InputArray src6,InputArray src7,InputArray src8,InputArray src9)5500 int predictOptimalVectorWidthMax(InputArray src1, InputArray src2, InputArray src3,
5501                                  InputArray src4, InputArray src5, InputArray src6,
5502                                  InputArray src7, InputArray src8, InputArray src9)
5503 {
5504     return predictOptimalVectorWidth(src1, src2, src3, src4, src5, src6, src7, src8, src9, OCL_VECTOR_MAX);
5505 }
5506 
5507 #undef PROCESS_SRC
5508 
5509 
5510 // TODO Make this as a method of OpenCL "BuildOptions" class
buildOptionsAddMatrixDescription(String & buildOptions,const String & name,InputArray _m)5511 void buildOptionsAddMatrixDescription(String& buildOptions, const String& name, InputArray _m)
5512 {
5513     if (!buildOptions.empty())
5514         buildOptions += " ";
5515     int type = _m.type(), depth = CV_MAT_DEPTH(type);
5516     buildOptions += format(
5517             "-D %s_T=%s -D %s_T1=%s -D %s_CN=%d -D %s_TSIZE=%d -D %s_T1SIZE=%d -D %s_DEPTH=%d",
5518             name.c_str(), ocl::typeToStr(type),
5519             name.c_str(), ocl::typeToStr(CV_MAKE_TYPE(depth, 1)),
5520             name.c_str(), (int)CV_MAT_CN(type),
5521             name.c_str(), (int)CV_ELEM_SIZE(type),
5522             name.c_str(), (int)CV_ELEM_SIZE1(type),
5523             name.c_str(), (int)depth
5524             );
5525 }
5526 
5527 
5528 struct Image2D::Impl
5529 {
Implcv::ocl::Image2D::Impl5530     Impl(const UMat &src, bool norm, bool alias)
5531     {
5532         handle = 0;
5533         refcount = 1;
5534         init(src, norm, alias);
5535     }
5536 
~Implcv::ocl::Image2D::Impl5537     ~Impl()
5538     {
5539         if (handle)
5540             clReleaseMemObject(handle);
5541     }
5542 
getImageFormatcv::ocl::Image2D::Impl5543     static cl_image_format getImageFormat(int depth, int cn, bool norm)
5544     {
5545         cl_image_format format;
5546         static const int channelTypes[] = { CL_UNSIGNED_INT8, CL_SIGNED_INT8, CL_UNSIGNED_INT16,
5547                                        CL_SIGNED_INT16, CL_SIGNED_INT32, CL_FLOAT, -1, -1 };
5548         static const int channelTypesNorm[] = { CL_UNORM_INT8, CL_SNORM_INT8, CL_UNORM_INT16,
5549                                                 CL_SNORM_INT16, -1, -1, -1, -1 };
5550         static const int channelOrders[] = { -1, CL_R, CL_RG, -1, CL_RGBA };
5551 
5552         int channelType = norm ? channelTypesNorm[depth] : channelTypes[depth];
5553         int channelOrder = channelOrders[cn];
5554         format.image_channel_data_type = (cl_channel_type)channelType;
5555         format.image_channel_order = (cl_channel_order)channelOrder;
5556         return format;
5557     }
5558 
isFormatSupportedcv::ocl::Image2D::Impl5559     static bool isFormatSupported(cl_image_format format)
5560     {
5561         if (!haveOpenCL())
5562             CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!");
5563 
5564         cl_context context = (cl_context)Context::getDefault().ptr();
5565         // Figure out how many formats are supported by this context.
5566         cl_uint numFormats = 0;
5567         cl_int err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
5568                                                 CL_MEM_OBJECT_IMAGE2D, numFormats,
5569                                                 NULL, &numFormats);
5570         AutoBuffer<cl_image_format> formats(numFormats);
5571         err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
5572                                          CL_MEM_OBJECT_IMAGE2D, numFormats,
5573                                          formats, NULL);
5574         CV_OclDbgAssert(err == CL_SUCCESS);
5575         for (cl_uint i = 0; i < numFormats; ++i)
5576         {
5577             if (!memcmp(&formats[i], &format, sizeof(format)))
5578             {
5579                 return true;
5580             }
5581         }
5582         return false;
5583     }
5584 
initcv::ocl::Image2D::Impl5585     void init(const UMat &src, bool norm, bool alias)
5586     {
5587         if (!haveOpenCL())
5588             CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!");
5589 
5590         CV_Assert(!src.empty());
5591         CV_Assert(ocl::Device::getDefault().imageSupport());
5592 
5593         int err, depth = src.depth(), cn = src.channels();
5594         CV_Assert(cn <= 4);
5595         cl_image_format format = getImageFormat(depth, cn, norm);
5596 
5597         if (!isFormatSupported(format))
5598             CV_Error(Error::OpenCLApiCallError, "Image format is not supported");
5599 
5600         if (alias && !src.handle(ACCESS_RW))
5601             CV_Error(Error::OpenCLApiCallError, "Incorrect UMat, handle is null");
5602 
5603         cl_context context = (cl_context)Context::getDefault().ptr();
5604         cl_command_queue queue = (cl_command_queue)Queue::getDefault().ptr();
5605 
5606 #ifdef CL_VERSION_1_2
5607         // this enables backwards portability to
5608         // run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
5609         const Device & d = ocl::Device::getDefault();
5610         int minor = d.deviceVersionMinor(), major = d.deviceVersionMajor();
5611         CV_Assert(!alias || canCreateAlias(src));
5612         if (1 < major || (1 == major && 2 <= minor))
5613         {
5614             cl_image_desc desc;
5615             desc.image_type       = CL_MEM_OBJECT_IMAGE2D;
5616             desc.image_width      = src.cols;
5617             desc.image_height     = src.rows;
5618             desc.image_depth      = 0;
5619             desc.image_array_size = 1;
5620             desc.image_row_pitch  = alias ? src.step[0] : 0;
5621             desc.image_slice_pitch = 0;
5622             desc.buffer           = alias ? (cl_mem)src.handle(ACCESS_RW) : 0;
5623             desc.num_mip_levels   = 0;
5624             desc.num_samples      = 0;
5625             handle = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
5626         }
5627         else
5628 #endif
5629         {
5630             CV_SUPPRESS_DEPRECATED_START
5631             CV_Assert(!alias);  // This is an OpenCL 1.2 extension
5632             handle = clCreateImage2D(context, CL_MEM_READ_WRITE, &format, src.cols, src.rows, 0, NULL, &err);
5633             CV_SUPPRESS_DEPRECATED_END
5634         }
5635         CV_OclDbgAssert(err == CL_SUCCESS);
5636 
5637         size_t origin[] = { 0, 0, 0 };
5638         size_t region[] = { static_cast<size_t>(src.cols), static_cast<size_t>(src.rows), 1 };
5639 
5640         cl_mem devData;
5641         if (!alias && !src.isContinuous())
5642         {
5643             devData = clCreateBuffer(context, CL_MEM_READ_ONLY, src.cols * src.rows * src.elemSize(), NULL, &err);
5644             CV_OclDbgAssert(err == CL_SUCCESS);
5645 
5646             const size_t roi[3] = {static_cast<size_t>(src.cols) * src.elemSize(), static_cast<size_t>(src.rows), 1};
5647             CV_Assert(clEnqueueCopyBufferRect(queue, (cl_mem)src.handle(ACCESS_READ), devData, origin, origin,
5648                 roi, src.step, 0, src.cols * src.elemSize(), 0, 0, NULL, NULL) == CL_SUCCESS);
5649             CV_OclDbgAssert(clFlush(queue) == CL_SUCCESS);
5650         }
5651         else
5652         {
5653             devData = (cl_mem)src.handle(ACCESS_READ);
5654         }
5655         CV_Assert(devData != NULL);
5656 
5657         if (!alias)
5658         {
5659             CV_OclDbgAssert(clEnqueueCopyBufferToImage(queue, devData, handle, 0, origin, region, 0, NULL, 0) == CL_SUCCESS);
5660             if (!src.isContinuous())
5661             {
5662                 CV_OclDbgAssert(clFlush(queue) == CL_SUCCESS);
5663                 CV_OclDbgAssert(clReleaseMemObject(devData) == CL_SUCCESS);
5664             }
5665         }
5666     }
5667 
5668     IMPLEMENT_REFCOUNTABLE();
5669 
5670     cl_mem handle;
5671 };
5672 
Image2D()5673 Image2D::Image2D()
5674 {
5675     p = NULL;
5676 }
5677 
Image2D(const UMat & src,bool norm,bool alias)5678 Image2D::Image2D(const UMat &src, bool norm, bool alias)
5679 {
5680     p = new Impl(src, norm, alias);
5681 }
5682 
canCreateAlias(const UMat & m)5683 bool Image2D::canCreateAlias(const UMat &m)
5684 {
5685     bool ret = false;
5686     const Device & d = ocl::Device::getDefault();
5687     if (d.imageFromBufferSupport() && !m.empty())
5688     {
5689         // This is the required pitch alignment in pixels
5690         uint pitchAlign = d.imagePitchAlignment();
5691         if (pitchAlign && !(m.step % (pitchAlign * m.elemSize())))
5692         {
5693             // We don't currently handle the case where the buffer was created
5694             // with CL_MEM_USE_HOST_PTR
5695             if (!m.u->tempUMat())
5696             {
5697                 ret = true;
5698             }
5699         }
5700     }
5701     return ret;
5702 }
5703 
isFormatSupported(int depth,int cn,bool norm)5704 bool Image2D::isFormatSupported(int depth, int cn, bool norm)
5705 {
5706     cl_image_format format = Impl::getImageFormat(depth, cn, norm);
5707 
5708     return Impl::isFormatSupported(format);
5709 }
5710 
Image2D(const Image2D & i)5711 Image2D::Image2D(const Image2D & i)
5712 {
5713     p = i.p;
5714     if (p)
5715         p->addref();
5716 }
5717 
operator =(const Image2D & i)5718 Image2D & Image2D::operator = (const Image2D & i)
5719 {
5720     if (i.p != p)
5721     {
5722         if (i.p)
5723             i.p->addref();
5724         if (p)
5725             p->release();
5726         p = i.p;
5727     }
5728     return *this;
5729 }
5730 
~Image2D()5731 Image2D::~Image2D()
5732 {
5733     if (p)
5734         p->release();
5735 }
5736 
ptr() const5737 void* Image2D::ptr() const
5738 {
5739     return p ? p->handle : 0;
5740 }
5741 
isPerformanceCheckBypassed()5742 bool internal::isPerformanceCheckBypassed()
5743 {
5744     static bool initialized = false;
5745     static bool value = false;
5746     if (!initialized)
5747     {
5748         value = getBoolParameter("OPENCV_OPENCL_PERF_CHECK_BYPASS", false);
5749         initialized = true;
5750     }
5751     return value;
5752 }
5753 
isCLBuffer(UMat & u)5754 bool internal::isCLBuffer(UMat& u)
5755 {
5756     void* h = u.handle(ACCESS_RW);
5757     if (!h)
5758         return true;
5759     CV_DbgAssert(u.u->currAllocator == getOpenCLAllocator());
5760 #if 1
5761     if ((u.u->allocatorFlags_ & 0xffff0000) != 0) // OpenCL SVM flags are stored here
5762         return false;
5763 #else
5764     cl_mem_object_type type = 0;
5765     cl_int ret = clGetMemObjectInfo((cl_mem)h, CL_MEM_TYPE, sizeof(type), &type, NULL);
5766     if (ret != CL_SUCCESS || type != CL_MEM_OBJECT_BUFFER)
5767         return false;
5768 #endif
5769     return true;
5770 }
5771 
5772 }}
5773