1/*M///////////////////////////////////////////////////////////////////////////////////////
2//
3//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4//
5//  By downloading, copying, installing or using the software you agree to this license.
6//  If you do not agree to this license, do not download, install,
7//  copy or use the software.
8//
9//
10//                           License Agreement
11//                For Open Source Computer Vision Library
12//
13// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14// Third party copyrights are property of their respective owners.
15//
16// Redistribution and use in source and binary forms, with or without modification,
17// are permitted provided that the following conditions are met:
18//
19//   * Redistribution's of source code must retain the above copyright notice,
20//     this list of conditions and the following disclaimer.
21//
22//   * Redistribution's in binary form must reproduce the above copyright notice,
23//     this list of conditions and the following disclaimer in the documentation
24//     and/or other materials provided with the distribution.
25//
26//   * The name of the copyright holders may not be used to endorse or promote products
27//     derived from this software without specific prior written permission.
28//
29// This software is provided by the copyright holders and contributors "as is" and
30// any express or implied warranties, including, but not limited to, the implied
31// warranties of merchantability and fitness for a particular purpose are disclaimed.
32// In no event shall the OpenCV Foundation or contributors be liable for any direct,
33// indirect, incidental, special, exemplary, or consequential damages
34// (including, but not limited to, procurement of substitute goods or services;
35// loss of use, data, or profits; or business interruption) however caused
36// and on any theory of liability, whether in contract, strict liability,
37// or tort (including negligence or otherwise) arising in any way out of
38// the use of this software, even if advised of the possibility of such damage.
39//
40//M*/
41
42#include "precomp.hpp"
43#include <list>
44#include <map>
45#include <string>
46#include <sstream>
47#include <iostream> // std::cerr
48
49#define CV_OPENCL_ALWAYS_SHOW_BUILD_LOG 0
50#define CV_OPENCL_SHOW_RUN_ERRORS       0
51#define CV_OPENCL_SHOW_SVM_ERROR_LOG    1
52#define CV_OPENCL_SHOW_SVM_LOG          0
53
54#include "opencv2/core/bufferpool.hpp"
55#ifndef LOG_BUFFER_POOL
56# if 0
57#   define LOG_BUFFER_POOL printf
58# else
59#   define LOG_BUFFER_POOL(...)
60# endif
61#endif
62
63
64// TODO Move to some common place
65static bool getBoolParameter(const char* name, bool defaultValue)
66{
67/*
68 * If your system doesn't support getenv(), define NO_GETENV to disable
69 * this feature.
70 */
71#ifdef NO_GETENV
72    const char* envValue = NULL;
73#else
74    const char* envValue = getenv(name);
75#endif
76    if (envValue == NULL)
77    {
78        return defaultValue;
79    }
80    cv::String value = envValue;
81    if (value == "1" || value == "True" || value == "true" || value == "TRUE")
82    {
83        return true;
84    }
85    if (value == "0" || value == "False" || value == "false" || value == "FALSE")
86    {
87        return false;
88    }
89    CV_ErrorNoReturn(cv::Error::StsBadArg, cv::format("Invalid value for %s parameter: %s", name, value.c_str()));
90}
91
92
93// TODO Move to some common place
94static size_t getConfigurationParameterForSize(const char* name, size_t defaultValue)
95{
96#ifdef NO_GETENV
97    const char* envValue = NULL;
98#else
99    const char* envValue = getenv(name);
100#endif
101    if (envValue == NULL)
102    {
103        return defaultValue;
104    }
105    cv::String value = envValue;
106    size_t pos = 0;
107    for (; pos < value.size(); pos++)
108    {
109        if (!isdigit(value[pos]))
110            break;
111    }
112    cv::String valueStr = value.substr(0, pos);
113    cv::String suffixStr = value.substr(pos, value.length() - pos);
114    int v = atoi(valueStr.c_str());
115    if (suffixStr.length() == 0)
116        return v;
117    else if (suffixStr == "MB" || suffixStr == "Mb" || suffixStr == "mb")
118        return v * 1024 * 1024;
119    else if (suffixStr == "KB" || suffixStr == "Kb" || suffixStr == "kb")
120        return v * 1024;
121    CV_ErrorNoReturn(cv::Error::StsBadArg, cv::format("Invalid value for %s parameter: %s", name, value.c_str()));
122}
123
124#if CV_OPENCL_SHOW_SVM_LOG
125// TODO add timestamp logging
126#define CV_OPENCL_SVM_TRACE_P printf("line %d (ocl.cpp): ", __LINE__); printf
127#else
128#define CV_OPENCL_SVM_TRACE_P(...)
129#endif
130
131#if CV_OPENCL_SHOW_SVM_ERROR_LOG
132// TODO add timestamp logging
133#define CV_OPENCL_SVM_TRACE_ERROR_P printf("Error on line %d (ocl.cpp): ", __LINE__); printf
134#else
135#define CV_OPENCL_SVM_TRACE_ERROR_P(...)
136#endif
137
138#include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp"
139#include "opencv2/core/opencl/runtime/opencl_clamdfft.hpp"
140
141#ifdef HAVE_OPENCL
142#include "opencv2/core/opencl/runtime/opencl_core.hpp"
143#else
144// TODO FIXIT: This file can't be build without OPENCL
145
146/*
147  Part of the file is an extract from the standard OpenCL headers from Khronos site.
148  Below is the original copyright.
149*/
150
151/*******************************************************************************
152 * Copyright (c) 2008 - 2012 The Khronos Group Inc.
153 *
154 * Permission is hereby granted, free of charge, to any person obtaining a
155 * copy of this software and/or associated documentation files (the
156 * "Materials"), to deal in the Materials without restriction, including
157 * without limitation the rights to use, copy, modify, merge, publish,
158 * distribute, sublicense, and/or sell copies of the Materials, and to
159 * permit persons to whom the Materials are furnished to do so, subject to
160 * the following conditions:
161 *
162 * The above copyright notice and this permission notice shall be included
163 * in all copies or substantial portions of the Materials.
164 *
165 * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
166 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
167 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
168 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
169 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
170 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
171 * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
172 ******************************************************************************/
173
174#if 0 //defined __APPLE__
175#define HAVE_OPENCL 1
176#else
177#undef HAVE_OPENCL
178#endif
179
180#define OPENCV_CL_NOT_IMPLEMENTED -1000
181
182#ifdef HAVE_OPENCL
183
184#if defined __APPLE__
185#include <OpenCL/opencl.h>
186#else
187#include <CL/opencl.h>
188#endif
189
190static const bool g_haveOpenCL = true;
191
192#else
193
194extern "C" {
195
196struct _cl_platform_id { int dummy; };
197struct _cl_device_id { int dummy; };
198struct _cl_context { int dummy; };
199struct _cl_command_queue { int dummy; };
200struct _cl_mem { int dummy; };
201struct _cl_program { int dummy; };
202struct _cl_kernel { int dummy; };
203struct _cl_event { int dummy; };
204struct _cl_sampler { int dummy; };
205
206typedef struct _cl_platform_id *    cl_platform_id;
207typedef struct _cl_device_id *      cl_device_id;
208typedef struct _cl_context *        cl_context;
209typedef struct _cl_command_queue *  cl_command_queue;
210typedef struct _cl_mem *            cl_mem;
211typedef struct _cl_program *        cl_program;
212typedef struct _cl_kernel *         cl_kernel;
213typedef struct _cl_event *          cl_event;
214typedef struct _cl_sampler *        cl_sampler;
215
216typedef int cl_int;
217typedef unsigned cl_uint;
218#if defined (_WIN32) && defined(_MSC_VER)
219    typedef __int64 cl_long;
220    typedef unsigned __int64 cl_ulong;
221#else
222    typedef long cl_long;
223    typedef unsigned long cl_ulong;
224#endif
225
226typedef cl_uint             cl_bool; /* WARNING!  Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. */
227typedef cl_ulong            cl_bitfield;
228typedef cl_bitfield         cl_device_type;
229typedef cl_uint             cl_platform_info;
230typedef cl_uint             cl_device_info;
231typedef cl_bitfield         cl_device_fp_config;
232typedef cl_uint             cl_device_mem_cache_type;
233typedef cl_uint             cl_device_local_mem_type;
234typedef cl_bitfield         cl_device_exec_capabilities;
235typedef cl_bitfield         cl_command_queue_properties;
236typedef intptr_t            cl_device_partition_property;
237typedef cl_bitfield         cl_device_affinity_domain;
238
239typedef intptr_t            cl_context_properties;
240typedef cl_uint             cl_context_info;
241typedef cl_uint             cl_command_queue_info;
242typedef cl_uint             cl_channel_order;
243typedef cl_uint             cl_channel_type;
244typedef cl_bitfield         cl_mem_flags;
245typedef cl_uint             cl_mem_object_type;
246typedef cl_uint             cl_mem_info;
247typedef cl_bitfield         cl_mem_migration_flags;
248typedef cl_uint             cl_image_info;
249typedef cl_uint             cl_buffer_create_type;
250typedef cl_uint             cl_addressing_mode;
251typedef cl_uint             cl_filter_mode;
252typedef cl_uint             cl_sampler_info;
253typedef cl_bitfield         cl_map_flags;
254typedef cl_uint             cl_program_info;
255typedef cl_uint             cl_program_build_info;
256typedef cl_uint             cl_program_binary_type;
257typedef cl_int              cl_build_status;
258typedef cl_uint             cl_kernel_info;
259typedef cl_uint             cl_kernel_arg_info;
260typedef cl_uint             cl_kernel_arg_address_qualifier;
261typedef cl_uint             cl_kernel_arg_access_qualifier;
262typedef cl_bitfield         cl_kernel_arg_type_qualifier;
263typedef cl_uint             cl_kernel_work_group_info;
264typedef cl_uint             cl_event_info;
265typedef cl_uint             cl_command_type;
266typedef cl_uint             cl_profiling_info;
267
268
269typedef struct _cl_image_format {
270    cl_channel_order        image_channel_order;
271    cl_channel_type         image_channel_data_type;
272} cl_image_format;
273
274typedef struct _cl_image_desc {
275    cl_mem_object_type      image_type;
276    size_t                  image_width;
277    size_t                  image_height;
278    size_t                  image_depth;
279    size_t                  image_array_size;
280    size_t                  image_row_pitch;
281    size_t                  image_slice_pitch;
282    cl_uint                 num_mip_levels;
283    cl_uint                 num_samples;
284    cl_mem                  buffer;
285} cl_image_desc;
286
287typedef struct _cl_buffer_region {
288    size_t                  origin;
289    size_t                  size;
290} cl_buffer_region;
291
292
293//////////////////////////////////////////////////////////
294
295#define CL_SUCCESS                                  0
296#define CL_DEVICE_NOT_FOUND                         -1
297#define CL_DEVICE_NOT_AVAILABLE                     -2
298#define CL_COMPILER_NOT_AVAILABLE                   -3
299#define CL_MEM_OBJECT_ALLOCATION_FAILURE            -4
300#define CL_OUT_OF_RESOURCES                         -5
301#define CL_OUT_OF_HOST_MEMORY                       -6
302#define CL_PROFILING_INFO_NOT_AVAILABLE             -7
303#define CL_MEM_COPY_OVERLAP                         -8
304#define CL_IMAGE_FORMAT_MISMATCH                    -9
305#define CL_IMAGE_FORMAT_NOT_SUPPORTED               -10
306#define CL_BUILD_PROGRAM_FAILURE                    -11
307#define CL_MAP_FAILURE                              -12
308#define CL_MISALIGNED_SUB_BUFFER_OFFSET             -13
309#define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14
310#define CL_COMPILE_PROGRAM_FAILURE                  -15
311#define CL_LINKER_NOT_AVAILABLE                     -16
312#define CL_LINK_PROGRAM_FAILURE                     -17
313#define CL_DEVICE_PARTITION_FAILED                  -18
314#define CL_KERNEL_ARG_INFO_NOT_AVAILABLE            -19
315
316#define CL_INVALID_VALUE                            -30
317#define CL_INVALID_DEVICE_TYPE                      -31
318#define CL_INVALID_PLATFORM                         -32
319#define CL_INVALID_DEVICE                           -33
320#define CL_INVALID_CONTEXT                          -34
321#define CL_INVALID_QUEUE_PROPERTIES                 -35
322#define CL_INVALID_COMMAND_QUEUE                    -36
323#define CL_INVALID_HOST_PTR                         -37
324#define CL_INVALID_MEM_OBJECT                       -38
325#define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR          -39
326#define CL_INVALID_IMAGE_SIZE                       -40
327#define CL_INVALID_SAMPLER                          -41
328#define CL_INVALID_BINARY                           -42
329#define CL_INVALID_BUILD_OPTIONS                    -43
330#define CL_INVALID_PROGRAM                          -44
331#define CL_INVALID_PROGRAM_EXECUTABLE               -45
332#define CL_INVALID_KERNEL_NAME                      -46
333#define CL_INVALID_KERNEL_DEFINITION                -47
334#define CL_INVALID_KERNEL                           -48
335#define CL_INVALID_ARG_INDEX                        -49
336#define CL_INVALID_ARG_VALUE                        -50
337#define CL_INVALID_ARG_SIZE                         -51
338#define CL_INVALID_KERNEL_ARGS                      -52
339#define CL_INVALID_WORK_DIMENSION                   -53
340#define CL_INVALID_WORK_GROUP_SIZE                  -54
341#define CL_INVALID_WORK_ITEM_SIZE                   -55
342#define CL_INVALID_GLOBAL_OFFSET                    -56
343#define CL_INVALID_EVENT_WAIT_LIST                  -57
344#define CL_INVALID_EVENT                            -58
345#define CL_INVALID_OPERATION                        -59
346#define CL_INVALID_GL_OBJECT                        -60
347#define CL_INVALID_BUFFER_SIZE                      -61
348#define CL_INVALID_MIP_LEVEL                        -62
349#define CL_INVALID_GLOBAL_WORK_SIZE                 -63
350#define CL_INVALID_PROPERTY                         -64
351#define CL_INVALID_IMAGE_DESCRIPTOR                 -65
352#define CL_INVALID_COMPILER_OPTIONS                 -66
353#define CL_INVALID_LINKER_OPTIONS                   -67
354#define CL_INVALID_DEVICE_PARTITION_COUNT           -68
355
356/*#define CL_VERSION_1_0                              1
357#define CL_VERSION_1_1                              1
358#define CL_VERSION_1_2                              1*/
359
360#define CL_FALSE                                    0
361#define CL_TRUE                                     1
362#define CL_BLOCKING                                 CL_TRUE
363#define CL_NON_BLOCKING                             CL_FALSE
364
365#define CL_PLATFORM_PROFILE                         0x0900
366#define CL_PLATFORM_VERSION                         0x0901
367#define CL_PLATFORM_NAME                            0x0902
368#define CL_PLATFORM_VENDOR                          0x0903
369#define CL_PLATFORM_EXTENSIONS                      0x0904
370
371#define CL_DEVICE_TYPE_DEFAULT                      (1 << 0)
372#define CL_DEVICE_TYPE_CPU                          (1 << 1)
373#define CL_DEVICE_TYPE_GPU                          (1 << 2)
374#define CL_DEVICE_TYPE_ACCELERATOR                  (1 << 3)
375#define CL_DEVICE_TYPE_CUSTOM                       (1 << 4)
376#define CL_DEVICE_TYPE_ALL                          0xFFFFFFFF
377#define CL_DEVICE_TYPE                              0x1000
378#define CL_DEVICE_VENDOR_ID                         0x1001
379#define CL_DEVICE_MAX_COMPUTE_UNITS                 0x1002
380#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS          0x1003
381#define CL_DEVICE_MAX_WORK_GROUP_SIZE               0x1004
382#define CL_DEVICE_MAX_WORK_ITEM_SIZES               0x1005
383#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR       0x1006
384#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT      0x1007
385#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT        0x1008
386#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG       0x1009
387#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT      0x100A
388#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE     0x100B
389#define CL_DEVICE_MAX_CLOCK_FREQUENCY               0x100C
390#define CL_DEVICE_ADDRESS_BITS                      0x100D
391#define CL_DEVICE_MAX_READ_IMAGE_ARGS               0x100E
392#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS              0x100F
393#define CL_DEVICE_MAX_MEM_ALLOC_SIZE                0x1010
394#define CL_DEVICE_IMAGE2D_MAX_WIDTH                 0x1011
395#define CL_DEVICE_IMAGE2D_MAX_HEIGHT                0x1012
396#define CL_DEVICE_IMAGE3D_MAX_WIDTH                 0x1013
397#define CL_DEVICE_IMAGE3D_MAX_HEIGHT                0x1014
398#define CL_DEVICE_IMAGE3D_MAX_DEPTH                 0x1015
399#define CL_DEVICE_IMAGE_SUPPORT                     0x1016
400#define CL_DEVICE_MAX_PARAMETER_SIZE                0x1017
401#define CL_DEVICE_MAX_SAMPLERS                      0x1018
402#define CL_DEVICE_MEM_BASE_ADDR_ALIGN               0x1019
403#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE          0x101A
404#define CL_DEVICE_SINGLE_FP_CONFIG                  0x101B
405#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE             0x101C
406#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE         0x101D
407#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE             0x101E
408#define CL_DEVICE_GLOBAL_MEM_SIZE                   0x101F
409#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE          0x1020
410#define CL_DEVICE_MAX_CONSTANT_ARGS                 0x1021
411#define CL_DEVICE_LOCAL_MEM_TYPE                    0x1022
412#define CL_DEVICE_LOCAL_MEM_SIZE                    0x1023
413#define CL_DEVICE_ERROR_CORRECTION_SUPPORT          0x1024
414#define CL_DEVICE_PROFILING_TIMER_RESOLUTION        0x1025
415#define CL_DEVICE_ENDIAN_LITTLE                     0x1026
416#define CL_DEVICE_AVAILABLE                         0x1027
417#define CL_DEVICE_COMPILER_AVAILABLE                0x1028
418#define CL_DEVICE_EXECUTION_CAPABILITIES            0x1029
419#define CL_DEVICE_QUEUE_PROPERTIES                  0x102A
420#define CL_DEVICE_NAME                              0x102B
421#define CL_DEVICE_VENDOR                            0x102C
422#define CL_DRIVER_VERSION                           0x102D
423#define CL_DEVICE_PROFILE                           0x102E
424#define CL_DEVICE_VERSION                           0x102F
425#define CL_DEVICE_EXTENSIONS                        0x1030
426#define CL_DEVICE_PLATFORM                          0x1031
427#define CL_DEVICE_DOUBLE_FP_CONFIG                  0x1032
428#define CL_DEVICE_HALF_FP_CONFIG                    0x1033
429#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF       0x1034
430#define CL_DEVICE_HOST_UNIFIED_MEMORY               0x1035
431#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR          0x1036
432#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT         0x1037
433#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT           0x1038
434#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG          0x1039
435#define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT         0x103A
436#define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE        0x103B
437#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF          0x103C
438#define CL_DEVICE_OPENCL_C_VERSION                  0x103D
439#define CL_DEVICE_LINKER_AVAILABLE                  0x103E
440#define CL_DEVICE_BUILT_IN_KERNELS                  0x103F
441#define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE             0x1040
442#define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE              0x1041
443#define CL_DEVICE_PARENT_DEVICE                     0x1042
444#define CL_DEVICE_PARTITION_MAX_SUB_DEVICES         0x1043
445#define CL_DEVICE_PARTITION_PROPERTIES              0x1044
446#define CL_DEVICE_PARTITION_AFFINITY_DOMAIN         0x1045
447#define CL_DEVICE_PARTITION_TYPE                    0x1046
448#define CL_DEVICE_REFERENCE_COUNT                   0x1047
449#define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC       0x1048
450#define CL_DEVICE_PRINTF_BUFFER_SIZE                0x1049
451#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT             0x104A
452#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT      0x104B
453
454#define CL_FP_DENORM                                (1 << 0)
455#define CL_FP_INF_NAN                               (1 << 1)
456#define CL_FP_ROUND_TO_NEAREST                      (1 << 2)
457#define CL_FP_ROUND_TO_ZERO                         (1 << 3)
458#define CL_FP_ROUND_TO_INF                          (1 << 4)
459#define CL_FP_FMA                                   (1 << 5)
460#define CL_FP_SOFT_FLOAT                            (1 << 6)
461#define CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT         (1 << 7)
462
463#define CL_NONE                                     0x0
464#define CL_READ_ONLY_CACHE                          0x1
465#define CL_READ_WRITE_CACHE                         0x2
466#define CL_LOCAL                                    0x1
467#define CL_GLOBAL                                   0x2
468#define CL_EXEC_KERNEL                              (1 << 0)
469#define CL_EXEC_NATIVE_KERNEL                       (1 << 1)
470#define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE      (1 << 0)
471#define CL_QUEUE_PROFILING_ENABLE                   (1 << 1)
472
473#define CL_CONTEXT_REFERENCE_COUNT                  0x1080
474#define CL_CONTEXT_DEVICES                          0x1081
475#define CL_CONTEXT_PROPERTIES                       0x1082
476#define CL_CONTEXT_NUM_DEVICES                      0x1083
477#define CL_CONTEXT_PLATFORM                         0x1084
478#define CL_CONTEXT_INTEROP_USER_SYNC                0x1085
479
480#define CL_DEVICE_PARTITION_EQUALLY                 0x1086
481#define CL_DEVICE_PARTITION_BY_COUNTS               0x1087
482#define CL_DEVICE_PARTITION_BY_COUNTS_LIST_END      0x0
483#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN      0x1088
484#define CL_DEVICE_AFFINITY_DOMAIN_NUMA                     (1 << 0)
485#define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE                 (1 << 1)
486#define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE                 (1 << 2)
487#define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE                 (1 << 3)
488#define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE                 (1 << 4)
489#define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE       (1 << 5)
490#define CL_QUEUE_CONTEXT                            0x1090
491#define CL_QUEUE_DEVICE                             0x1091
492#define CL_QUEUE_REFERENCE_COUNT                    0x1092
493#define CL_QUEUE_PROPERTIES                         0x1093
494#define CL_MEM_READ_WRITE                           (1 << 0)
495#define CL_MEM_WRITE_ONLY                           (1 << 1)
496#define CL_MEM_READ_ONLY                            (1 << 2)
497#define CL_MEM_USE_HOST_PTR                         (1 << 3)
498#define CL_MEM_ALLOC_HOST_PTR                       (1 << 4)
499#define CL_MEM_COPY_HOST_PTR                        (1 << 5)
500// reserved                                         (1 << 6)
501#define CL_MEM_HOST_WRITE_ONLY                      (1 << 7)
502#define CL_MEM_HOST_READ_ONLY                       (1 << 8)
503#define CL_MEM_HOST_NO_ACCESS                       (1 << 9)
504#define CL_MIGRATE_MEM_OBJECT_HOST                  (1 << 0)
505#define CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED     (1 << 1)
506
507#define CL_R                                        0x10B0
508#define CL_A                                        0x10B1
509#define CL_RG                                       0x10B2
510#define CL_RA                                       0x10B3
511#define CL_RGB                                      0x10B4
512#define CL_RGBA                                     0x10B5
513#define CL_BGRA                                     0x10B6
514#define CL_ARGB                                     0x10B7
515#define CL_INTENSITY                                0x10B8
516#define CL_LUMINANCE                                0x10B9
517#define CL_Rx                                       0x10BA
518#define CL_RGx                                      0x10BB
519#define CL_RGBx                                     0x10BC
520#define CL_DEPTH                                    0x10BD
521#define CL_DEPTH_STENCIL                            0x10BE
522
523#define CL_SNORM_INT8                               0x10D0
524#define CL_SNORM_INT16                              0x10D1
525#define CL_UNORM_INT8                               0x10D2
526#define CL_UNORM_INT16                              0x10D3
527#define CL_UNORM_SHORT_565                          0x10D4
528#define CL_UNORM_SHORT_555                          0x10D5
529#define CL_UNORM_INT_101010                         0x10D6
530#define CL_SIGNED_INT8                              0x10D7
531#define CL_SIGNED_INT16                             0x10D8
532#define CL_SIGNED_INT32                             0x10D9
533#define CL_UNSIGNED_INT8                            0x10DA
534#define CL_UNSIGNED_INT16                           0x10DB
535#define CL_UNSIGNED_INT32                           0x10DC
536#define CL_HALF_FLOAT                               0x10DD
537#define CL_FLOAT                                    0x10DE
538#define CL_UNORM_INT24                              0x10DF
539
540#define CL_MEM_OBJECT_BUFFER                        0x10F0
541#define CL_MEM_OBJECT_IMAGE2D                       0x10F1
542#define CL_MEM_OBJECT_IMAGE3D                       0x10F2
543#define CL_MEM_OBJECT_IMAGE2D_ARRAY                 0x10F3
544#define CL_MEM_OBJECT_IMAGE1D                       0x10F4
545#define CL_MEM_OBJECT_IMAGE1D_ARRAY                 0x10F5
546#define CL_MEM_OBJECT_IMAGE1D_BUFFER                0x10F6
547
548#define CL_MEM_TYPE                                 0x1100
549#define CL_MEM_FLAGS                                0x1101
550#define CL_MEM_SIZE                                 0x1102
551#define CL_MEM_HOST_PTR                             0x1103
552#define CL_MEM_MAP_COUNT                            0x1104
553#define CL_MEM_REFERENCE_COUNT                      0x1105
554#define CL_MEM_CONTEXT                              0x1106
555#define CL_MEM_ASSOCIATED_MEMOBJECT                 0x1107
556#define CL_MEM_OFFSET                               0x1108
557
558#define CL_IMAGE_FORMAT                             0x1110
559#define CL_IMAGE_ELEMENT_SIZE                       0x1111
560#define CL_IMAGE_ROW_PITCH                          0x1112
561#define CL_IMAGE_SLICE_PITCH                        0x1113
562#define CL_IMAGE_WIDTH                              0x1114
563#define CL_IMAGE_HEIGHT                             0x1115
564#define CL_IMAGE_DEPTH                              0x1116
565#define CL_IMAGE_ARRAY_SIZE                         0x1117
566#define CL_IMAGE_BUFFER                             0x1118
567#define CL_IMAGE_NUM_MIP_LEVELS                     0x1119
568#define CL_IMAGE_NUM_SAMPLES                        0x111A
569
570#define CL_ADDRESS_NONE                             0x1130
571#define CL_ADDRESS_CLAMP_TO_EDGE                    0x1131
572#define CL_ADDRESS_CLAMP                            0x1132
573#define CL_ADDRESS_REPEAT                           0x1133
574#define CL_ADDRESS_MIRRORED_REPEAT                  0x1134
575
576#define CL_FILTER_NEAREST                           0x1140
577#define CL_FILTER_LINEAR                            0x1141
578
579#define CL_SAMPLER_REFERENCE_COUNT                  0x1150
580#define CL_SAMPLER_CONTEXT                          0x1151
581#define CL_SAMPLER_NORMALIZED_COORDS                0x1152
582#define CL_SAMPLER_ADDRESSING_MODE                  0x1153
583#define CL_SAMPLER_FILTER_MODE                      0x1154
584
585#define CL_MAP_READ                                 (1 << 0)
586#define CL_MAP_WRITE                                (1 << 1)
587#define CL_MAP_WRITE_INVALIDATE_REGION              (1 << 2)
588
589#define CL_PROGRAM_REFERENCE_COUNT                  0x1160
590#define CL_PROGRAM_CONTEXT                          0x1161
591#define CL_PROGRAM_NUM_DEVICES                      0x1162
592#define CL_PROGRAM_DEVICES                          0x1163
593#define CL_PROGRAM_SOURCE                           0x1164
594#define CL_PROGRAM_BINARY_SIZES                     0x1165
595#define CL_PROGRAM_BINARIES                         0x1166
596#define CL_PROGRAM_NUM_KERNELS                      0x1167
597#define CL_PROGRAM_KERNEL_NAMES                     0x1168
598#define CL_PROGRAM_BUILD_STATUS                     0x1181
599#define CL_PROGRAM_BUILD_OPTIONS                    0x1182
600#define CL_PROGRAM_BUILD_LOG                        0x1183
601#define CL_PROGRAM_BINARY_TYPE                      0x1184
602#define CL_PROGRAM_BINARY_TYPE_NONE                 0x0
603#define CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT      0x1
604#define CL_PROGRAM_BINARY_TYPE_LIBRARY              0x2
605#define CL_PROGRAM_BINARY_TYPE_EXECUTABLE           0x4
606
607#define CL_BUILD_SUCCESS                            0
608#define CL_BUILD_NONE                               -1
609#define CL_BUILD_ERROR                              -2
610#define CL_BUILD_IN_PROGRESS                        -3
611
612#define CL_KERNEL_FUNCTION_NAME                     0x1190
613#define CL_KERNEL_NUM_ARGS                          0x1191
614#define CL_KERNEL_REFERENCE_COUNT                   0x1192
615#define CL_KERNEL_CONTEXT                           0x1193
616#define CL_KERNEL_PROGRAM                           0x1194
617#define CL_KERNEL_ATTRIBUTES                        0x1195
618#define CL_KERNEL_ARG_ADDRESS_QUALIFIER             0x1196
619#define CL_KERNEL_ARG_ACCESS_QUALIFIER              0x1197
620#define CL_KERNEL_ARG_TYPE_NAME                     0x1198
621#define CL_KERNEL_ARG_TYPE_QUALIFIER                0x1199
622#define CL_KERNEL_ARG_NAME                          0x119A
623#define CL_KERNEL_ARG_ADDRESS_GLOBAL                0x119B
624#define CL_KERNEL_ARG_ADDRESS_LOCAL                 0x119C
625#define CL_KERNEL_ARG_ADDRESS_CONSTANT              0x119D
626#define CL_KERNEL_ARG_ADDRESS_PRIVATE               0x119E
627#define CL_KERNEL_ARG_ACCESS_READ_ONLY              0x11A0
628#define CL_KERNEL_ARG_ACCESS_WRITE_ONLY             0x11A1
629#define CL_KERNEL_ARG_ACCESS_READ_WRITE             0x11A2
630#define CL_KERNEL_ARG_ACCESS_NONE                   0x11A3
631#define CL_KERNEL_ARG_TYPE_NONE                     0
632#define CL_KERNEL_ARG_TYPE_CONST                    (1 << 0)
633#define CL_KERNEL_ARG_TYPE_RESTRICT                 (1 << 1)
634#define CL_KERNEL_ARG_TYPE_VOLATILE                 (1 << 2)
635#define CL_KERNEL_WORK_GROUP_SIZE                   0x11B0
636#define CL_KERNEL_COMPILE_WORK_GROUP_SIZE           0x11B1
637#define CL_KERNEL_LOCAL_MEM_SIZE                    0x11B2
638#define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3
639#define CL_KERNEL_PRIVATE_MEM_SIZE                  0x11B4
640#define CL_KERNEL_GLOBAL_WORK_SIZE                  0x11B5
641
642#define CL_EVENT_COMMAND_QUEUE                      0x11D0
643#define CL_EVENT_COMMAND_TYPE                       0x11D1
644#define CL_EVENT_REFERENCE_COUNT                    0x11D2
645#define CL_EVENT_COMMAND_EXECUTION_STATUS           0x11D3
646#define CL_EVENT_CONTEXT                            0x11D4
647
648#define CL_COMMAND_NDRANGE_KERNEL                   0x11F0
649#define CL_COMMAND_TASK                             0x11F1
650#define CL_COMMAND_NATIVE_KERNEL                    0x11F2
651#define CL_COMMAND_READ_BUFFER                      0x11F3
652#define CL_COMMAND_WRITE_BUFFER                     0x11F4
653#define CL_COMMAND_COPY_BUFFER                      0x11F5
654#define CL_COMMAND_READ_IMAGE                       0x11F6
655#define CL_COMMAND_WRITE_IMAGE                      0x11F7
656#define CL_COMMAND_COPY_IMAGE                       0x11F8
657#define CL_COMMAND_COPY_IMAGE_TO_BUFFER             0x11F9
658#define CL_COMMAND_COPY_BUFFER_TO_IMAGE             0x11FA
659#define CL_COMMAND_MAP_BUFFER                       0x11FB
660#define CL_COMMAND_MAP_IMAGE                        0x11FC
661#define CL_COMMAND_UNMAP_MEM_OBJECT                 0x11FD
662#define CL_COMMAND_MARKER                           0x11FE
663#define CL_COMMAND_ACQUIRE_GL_OBJECTS               0x11FF
664#define CL_COMMAND_RELEASE_GL_OBJECTS               0x1200
665#define CL_COMMAND_READ_BUFFER_RECT                 0x1201
666#define CL_COMMAND_WRITE_BUFFER_RECT                0x1202
667#define CL_COMMAND_COPY_BUFFER_RECT                 0x1203
668#define CL_COMMAND_USER                             0x1204
669#define CL_COMMAND_BARRIER                          0x1205
670#define CL_COMMAND_MIGRATE_MEM_OBJECTS              0x1206
671#define CL_COMMAND_FILL_BUFFER                      0x1207
672#define CL_COMMAND_FILL_IMAGE                       0x1208
673
674#define CL_COMPLETE                                 0x0
675#define CL_RUNNING                                  0x1
676#define CL_SUBMITTED                                0x2
677#define CL_QUEUED                                   0x3
678#define CL_BUFFER_CREATE_TYPE_REGION                0x1220
679
680#define CL_PROFILING_COMMAND_QUEUED                 0x1280
681#define CL_PROFILING_COMMAND_SUBMIT                 0x1281
682#define CL_PROFILING_COMMAND_START                  0x1282
683#define CL_PROFILING_COMMAND_END                    0x1283
684
685#define CL_CALLBACK CV_STDCALL
686
687static volatile bool g_haveOpenCL = false;
688static const char* oclFuncToCheck = "clEnqueueReadBufferRect";
689
690#if defined(__APPLE__)
691#include <dlfcn.h>
692
693static void* initOpenCLAndLoad(const char* funcname)
694{
695    static bool initialized = false;
696    static void* handle = 0;
697    if (!handle)
698    {
699        if(!initialized)
700        {
701            const char* oclpath = getenv("OPENCV_OPENCL_RUNTIME");
702            oclpath = oclpath && strlen(oclpath) > 0 ? oclpath :
703                "/System/Library/Frameworks/OpenCL.framework/Versions/Current/OpenCL";
704            handle = dlopen(oclpath, RTLD_LAZY);
705            initialized = true;
706            g_haveOpenCL = handle != 0 && dlsym(handle, oclFuncToCheck) != 0;
707            if( g_haveOpenCL )
708                fprintf(stderr, "Successfully loaded OpenCL v1.1+ runtime from %s\n", oclpath);
709            else
710                fprintf(stderr, "Failed to load OpenCL runtime\n");
711        }
712        if(!handle)
713            return 0;
714    }
715
716    return funcname && handle ? dlsym(handle, funcname) : 0;
717}
718
719#elif defined WIN32 || defined _WIN32
720
721#ifndef _WIN32_WINNT           // This is needed for the declaration of TryEnterCriticalSection in winbase.h with Visual Studio 2005 (and older?)
722  #define _WIN32_WINNT 0x0400  // http://msdn.microsoft.com/en-us/library/ms686857(VS.85).aspx
723#endif
724#include <windows.h>
725#if (_WIN32_WINNT >= 0x0602)
726  #include <synchapi.h>
727#endif
728#undef small
729#undef min
730#undef max
731#undef abs
732
733static void* initOpenCLAndLoad(const char* funcname)
734{
735    static bool initialized = false;
736    static HMODULE handle = 0;
737    if (!handle)
738    {
739#ifndef WINRT
740        if(!initialized)
741        {
742            handle = LoadLibraryA("OpenCL.dll");
743            initialized = true;
744            g_haveOpenCL = handle != 0 && GetProcAddress(handle, oclFuncToCheck) != 0;
745        }
746#endif
747        if(!handle)
748            return 0;
749    }
750
751    return funcname ? (void*)GetProcAddress(handle, funcname) : 0;
752}
753
754#elif defined(__linux)
755
756#include <dlfcn.h>
757#include <stdio.h>
758
759static void* initOpenCLAndLoad(const char* funcname)
760{
761    static bool initialized = false;
762    static void* handle = 0;
763    if (!handle)
764    {
765        if(!initialized)
766        {
767            handle = dlopen("libOpenCL.so", RTLD_LAZY);
768            if(!handle)
769                handle = dlopen("libCL.so", RTLD_LAZY);
770            initialized = true;
771            g_haveOpenCL = handle != 0 && dlsym(handle, oclFuncToCheck) != 0;
772        }
773        if(!handle)
774            return 0;
775    }
776
777    return funcname ? (void*)dlsym(handle, funcname) : 0;
778}
779
780#else
781
782static void* initOpenCLAndLoad(const char*)
783{
784    return 0;
785}
786
787#endif
788
789
790#define OCL_FUNC(rettype, funcname, argsdecl, args) \
791    typedef rettype (CV_STDCALL * funcname##_t) argsdecl; \
792    static rettype funcname argsdecl \
793    { \
794        static funcname##_t funcname##_p = 0; \
795        if( !funcname##_p ) \
796        { \
797            funcname##_p = (funcname##_t)initOpenCLAndLoad(#funcname); \
798            if( !funcname##_p ) \
799                return OPENCV_CL_NOT_IMPLEMENTED; \
800        } \
801        return funcname##_p args; \
802    }
803
804
805#define OCL_FUNC_P(rettype, funcname, argsdecl, args) \
806    typedef rettype (CV_STDCALL * funcname##_t) argsdecl; \
807    static rettype funcname argsdecl \
808    { \
809        static funcname##_t funcname##_p = 0; \
810        if( !funcname##_p ) \
811        { \
812            funcname##_p = (funcname##_t)initOpenCLAndLoad(#funcname); \
813            if( !funcname##_p ) \
814            { \
815                if( errcode_ret ) \
816                    *errcode_ret = OPENCV_CL_NOT_IMPLEMENTED; \
817                return 0; \
818            } \
819        } \
820        return funcname##_p args; \
821    }
822
823OCL_FUNC(cl_int, clGetPlatformIDs,
824    (cl_uint num_entries, cl_platform_id* platforms, cl_uint* num_platforms),
825    (num_entries, platforms, num_platforms))
826
827OCL_FUNC(cl_int, clGetPlatformInfo,
828    (cl_platform_id platform, cl_platform_info param_name,
829    size_t param_value_size, void * param_value,
830    size_t * param_value_size_ret),
831    (platform, param_name, param_value_size, param_value, param_value_size_ret))
832
833OCL_FUNC(cl_int, clGetDeviceInfo,
834         (cl_device_id device,
835          cl_device_info param_name,
836          size_t param_value_size,
837          void * param_value,
838          size_t * param_value_size_ret),
839         (device, param_name, param_value_size, param_value, param_value_size_ret))
840
841
842OCL_FUNC(cl_int, clGetDeviceIDs,
843    (cl_platform_id platform,
844    cl_device_type device_type,
845    cl_uint num_entries,
846    cl_device_id * devices,
847    cl_uint * num_devices),
848    (platform, device_type, num_entries, devices, num_devices))
849
850OCL_FUNC_P(cl_context, clCreateContext,
851    (const cl_context_properties * properties,
852    cl_uint num_devices,
853    const cl_device_id * devices,
854    void (CL_CALLBACK * pfn_notify)(const char *, const void *, size_t, void *),
855    void * user_data,
856    cl_int * errcode_ret),
857    (properties, num_devices, devices, pfn_notify, user_data, errcode_ret))
858
859OCL_FUNC(cl_int, clReleaseContext, (cl_context context), (context))
860
861/*
862OCL_FUNC(cl_int, clRetainContext, (cl_context context), (context))
863
864OCL_FUNC_P(cl_context, clCreateContextFromType,
865    (const cl_context_properties * properties,
866    cl_device_type device_type,
867    void (CL_CALLBACK * pfn_notify)(const char *, const void *, size_t, void *),
868    void * user_data,
869    cl_int * errcode_ret),
870    (properties, device_type, pfn_notify, user_data, errcode_ret))
871
872OCL_FUNC(cl_int, clGetContextInfo,
873    (cl_context context,
874    cl_context_info param_name,
875    size_t param_value_size,
876    void * param_value,
877    size_t * param_value_size_ret),
878    (context, param_name, param_value_size,
879    param_value, param_value_size_ret))
880*/
881OCL_FUNC_P(cl_command_queue, clCreateCommandQueue,
882    (cl_context context,
883    cl_device_id device,
884    cl_command_queue_properties properties,
885    cl_int * errcode_ret),
886    (context, device, properties, errcode_ret))
887
888OCL_FUNC(cl_int, clReleaseCommandQueue, (cl_command_queue command_queue), (command_queue))
889
890OCL_FUNC_P(cl_mem, clCreateBuffer,
891    (cl_context context,
892    cl_mem_flags flags,
893    size_t size,
894    void * host_ptr,
895    cl_int * errcode_ret),
896    (context, flags, size, host_ptr, errcode_ret))
897
898/*
899OCL_FUNC(cl_int, clRetainCommandQueue, (cl_command_queue command_queue), (command_queue))
900
901OCL_FUNC(cl_int, clGetCommandQueueInfo,
902 (cl_command_queue command_queue,
903 cl_command_queue_info param_name,
904 size_t param_value_size,
905 void * param_value,
906 size_t * param_value_size_ret),
907 (command_queue, param_name, param_value_size, param_value, param_value_size_ret))
908
909OCL_FUNC_P(cl_mem, clCreateSubBuffer,
910    (cl_mem buffer,
911    cl_mem_flags flags,
912    cl_buffer_create_type buffer_create_type,
913    const void * buffer_create_info,
914    cl_int * errcode_ret),
915    (buffer, flags, buffer_create_type, buffer_create_info, errcode_ret))
916*/
917
918OCL_FUNC_P(cl_mem, clCreateImage,
919    (cl_context context,
920    cl_mem_flags flags,
921    const cl_image_format * image_format,
922    const cl_image_desc * image_desc,
923    void * host_ptr,
924    cl_int * errcode_ret),
925    (context, flags, image_format, image_desc, host_ptr, errcode_ret))
926
927OCL_FUNC_P(cl_mem, clCreateImage2D,
928    (cl_context context,
929    cl_mem_flags flags,
930    const cl_image_format * image_format,
931    size_t image_width,
932    size_t image_height,
933    size_t image_row_pitch,
934    void * host_ptr,
935    cl_int *errcode_ret),
936    (context, flags, image_format, image_width, image_height, image_row_pitch, host_ptr, errcode_ret))
937
938OCL_FUNC(cl_int, clGetSupportedImageFormats,
939 (cl_context context,
940 cl_mem_flags flags,
941 cl_mem_object_type image_type,
942 cl_uint num_entries,
943 cl_image_format * image_formats,
944 cl_uint * num_image_formats),
945 (context, flags, image_type, num_entries, image_formats, num_image_formats))
946
947
948/*
949OCL_FUNC(cl_int, clGetMemObjectInfo,
950 (cl_mem memobj,
951 cl_mem_info param_name,
952 size_t param_value_size,
953 void * param_value,
954 size_t * param_value_size_ret),
955 (memobj, param_name, param_value_size, param_value, param_value_size_ret))
956
957OCL_FUNC(cl_int, clGetImageInfo,
958 (cl_mem image,
959 cl_image_info param_name,
960 size_t param_value_size,
961 void * param_value,
962 size_t * param_value_size_ret),
963 (image, param_name, param_value_size, param_value, param_value_size_ret))
964
965OCL_FUNC(cl_int, clCreateKernelsInProgram,
966 (cl_program program,
967 cl_uint num_kernels,
968 cl_kernel * kernels,
969 cl_uint * num_kernels_ret),
970 (program, num_kernels, kernels, num_kernels_ret))
971
972OCL_FUNC(cl_int, clRetainKernel, (cl_kernel kernel), (kernel))
973
974OCL_FUNC(cl_int, clGetKernelArgInfo,
975 (cl_kernel kernel,
976 cl_uint arg_indx,
977 cl_kernel_arg_info param_name,
978 size_t param_value_size,
979 void * param_value,
980 size_t * param_value_size_ret),
981 (kernel, arg_indx, param_name, param_value_size, param_value, param_value_size_ret))
982
983OCL_FUNC(cl_int, clEnqueueReadImage,
984 (cl_command_queue command_queue,
985 cl_mem image,
986 cl_bool blocking_read,
987 const size_t * origin[3],
988 const size_t * region[3],
989 size_t row_pitch,
990 size_t slice_pitch,
991 void * ptr,
992 cl_uint num_events_in_wait_list,
993 const cl_event * event_wait_list,
994 cl_event * event),
995 (command_queue, image, blocking_read, origin, region,
996 row_pitch, slice_pitch,
997 ptr,
998 num_events_in_wait_list,
999 event_wait_list,
1000 event))
1001
1002OCL_FUNC(cl_int, clEnqueueWriteImage,
1003 (cl_command_queue command_queue,
1004 cl_mem image,
1005 cl_bool blocking_write,
1006 const size_t * origin[3],
1007 const size_t * region[3],
1008 size_t input_row_pitch,
1009 size_t input_slice_pitch,
1010 const void * ptr,
1011 cl_uint num_events_in_wait_list,
1012 const cl_event * event_wait_list,
1013 cl_event * event),
1014 (command_queue, image, blocking_write, origin, region, input_row_pitch,
1015 input_slice_pitch, ptr, num_events_in_wait_list, event_wait_list, event))
1016
1017OCL_FUNC(cl_int, clEnqueueFillImage,
1018 (cl_command_queue command_queue,
1019 cl_mem image,
1020 const void * fill_color,
1021 const size_t * origin[3],
1022 const size_t * region[3],
1023 cl_uint num_events_in_wait_list,
1024 const cl_event * event_wait_list,
1025 cl_event * event),
1026 (command_queue, image, fill_color, origin, region,
1027 num_events_in_wait_list, event_wait_list, event))
1028
1029OCL_FUNC(cl_int, clEnqueueCopyImage,
1030 (cl_command_queue command_queue,
1031 cl_mem src_image,
1032 cl_mem dst_image,
1033 const size_t * src_origin[3],
1034 const size_t * dst_origin[3],
1035 const size_t * region[3],
1036 cl_uint num_events_in_wait_list,
1037 const cl_event * event_wait_list,
1038 cl_event * event),
1039 (command_queue, src_image, dst_image, src_origin, dst_origin,
1040 region, num_events_in_wait_list, event_wait_list, event))
1041
1042OCL_FUNC(cl_int, clEnqueueCopyImageToBuffer,
1043 (cl_command_queue command_queue,
1044 cl_mem src_image,
1045 cl_mem dst_buffer,
1046 const size_t * src_origin[3],
1047 const size_t * region[3],
1048 size_t dst_offset,
1049 cl_uint num_events_in_wait_list,
1050 const cl_event * event_wait_list,
1051 cl_event * event),
1052 (command_queue, src_image, dst_buffer, src_origin, region, dst_offset,
1053 num_events_in_wait_list, event_wait_list, event))
1054*/
1055
1056OCL_FUNC(cl_int, clEnqueueCopyBufferToImage,
1057 (cl_command_queue command_queue,
1058 cl_mem src_buffer,
1059 cl_mem dst_image,
1060 size_t src_offset,
1061 const size_t dst_origin[3],
1062 const size_t region[3],
1063 cl_uint num_events_in_wait_list,
1064 const cl_event * event_wait_list,
1065 cl_event * event),
1066 (command_queue, src_buffer, dst_image, src_offset, dst_origin,
1067 region, num_events_in_wait_list, event_wait_list, event))
1068
1069 OCL_FUNC(cl_int, clFlush,
1070 (cl_command_queue command_queue),
1071 (command_queue))
1072
1073/*
1074OCL_FUNC_P(void*, clEnqueueMapImage,
1075 (cl_command_queue command_queue,
1076 cl_mem image,
1077 cl_bool blocking_map,
1078 cl_map_flags map_flags,
1079 const size_t * origin[3],
1080 const size_t * region[3],
1081 size_t * image_row_pitch,
1082 size_t * image_slice_pitch,
1083 cl_uint num_events_in_wait_list,
1084 const cl_event * event_wait_list,
1085 cl_event * event,
1086 cl_int * errcode_ret),
1087 (command_queue, image, blocking_map, map_flags, origin, region,
1088 image_row_pitch, image_slice_pitch, num_events_in_wait_list,
1089 event_wait_list, event, errcode_ret))
1090*/
1091
1092/*
1093OCL_FUNC(cl_int, clRetainProgram, (cl_program program), (program))
1094
1095OCL_FUNC(cl_int, clGetKernelInfo,
1096 (cl_kernel kernel,
1097 cl_kernel_info param_name,
1098 size_t param_value_size,
1099 void * param_value,
1100 size_t * param_value_size_ret),
1101 (kernel, param_name, param_value_size, param_value, param_value_size_ret))
1102
1103OCL_FUNC(cl_int, clRetainMemObject, (cl_mem memobj), (memobj))
1104
1105*/
1106
1107OCL_FUNC(cl_int, clReleaseMemObject, (cl_mem memobj), (memobj))
1108
1109
1110OCL_FUNC_P(cl_program, clCreateProgramWithSource,
1111    (cl_context context,
1112    cl_uint count,
1113    const char ** strings,
1114    const size_t * lengths,
1115    cl_int * errcode_ret),
1116    (context, count, strings, lengths, errcode_ret))
1117
1118OCL_FUNC_P(cl_program, clCreateProgramWithBinary,
1119    (cl_context context,
1120    cl_uint num_devices,
1121    const cl_device_id * device_list,
1122    const size_t * lengths,
1123    const unsigned char ** binaries,
1124    cl_int * binary_status,
1125    cl_int * errcode_ret),
1126    (context, num_devices, device_list, lengths, binaries, binary_status, errcode_ret))
1127
1128OCL_FUNC(cl_int, clReleaseProgram, (cl_program program), (program))
1129
1130OCL_FUNC(cl_int, clBuildProgram,
1131    (cl_program program,
1132    cl_uint num_devices,
1133    const cl_device_id * device_list,
1134    const char * options,
1135    void (CL_CALLBACK * pfn_notify)(cl_program, void *),
1136    void * user_data),
1137    (program, num_devices, device_list, options, pfn_notify, user_data))
1138
1139OCL_FUNC(cl_int, clGetProgramInfo,
1140    (cl_program program,
1141    cl_program_info param_name,
1142    size_t param_value_size,
1143    void * param_value,
1144    size_t * param_value_size_ret),
1145    (program, param_name, param_value_size, param_value, param_value_size_ret))
1146
1147OCL_FUNC(cl_int, clGetProgramBuildInfo,
1148    (cl_program program,
1149    cl_device_id device,
1150    cl_program_build_info param_name,
1151    size_t param_value_size,
1152    void * param_value,
1153    size_t * param_value_size_ret),
1154    (program, device, param_name, param_value_size, param_value, param_value_size_ret))
1155
1156OCL_FUNC_P(cl_kernel, clCreateKernel,
1157    (cl_program program,
1158    const char * kernel_name,
1159    cl_int * errcode_ret),
1160    (program, kernel_name, errcode_ret))
1161
1162OCL_FUNC(cl_int, clReleaseKernel, (cl_kernel kernel), (kernel))
1163
1164OCL_FUNC(cl_int, clSetKernelArg,
1165    (cl_kernel kernel,
1166    cl_uint arg_index,
1167    size_t arg_size,
1168    const void * arg_value),
1169    (kernel, arg_index, arg_size, arg_value))
1170
1171OCL_FUNC(cl_int, clGetKernelWorkGroupInfo,
1172    (cl_kernel kernel,
1173    cl_device_id device,
1174    cl_kernel_work_group_info param_name,
1175    size_t param_value_size,
1176    void * param_value,
1177    size_t * param_value_size_ret),
1178    (kernel, device, param_name, param_value_size, param_value, param_value_size_ret))
1179
1180OCL_FUNC(cl_int, clFinish, (cl_command_queue command_queue), (command_queue))
1181
1182OCL_FUNC(cl_int, clEnqueueReadBuffer,
1183    (cl_command_queue command_queue,
1184    cl_mem buffer,
1185    cl_bool blocking_read,
1186    size_t offset,
1187    size_t size,
1188    void * ptr,
1189    cl_uint num_events_in_wait_list,
1190    const cl_event * event_wait_list,
1191    cl_event * event),
1192    (command_queue, buffer, blocking_read, offset, size, ptr,
1193    num_events_in_wait_list, event_wait_list, event))
1194
1195OCL_FUNC(cl_int, clEnqueueReadBufferRect,
1196    (cl_command_queue command_queue,
1197    cl_mem buffer,
1198    cl_bool blocking_read,
1199    const size_t * buffer_offset,
1200    const size_t * host_offset,
1201    const size_t * region,
1202    size_t buffer_row_pitch,
1203    size_t buffer_slice_pitch,
1204    size_t host_row_pitch,
1205    size_t host_slice_pitch,
1206    void * ptr,
1207    cl_uint num_events_in_wait_list,
1208    const cl_event * event_wait_list,
1209    cl_event * event),
1210    (command_queue, buffer, blocking_read, buffer_offset, host_offset, region, buffer_row_pitch,
1211    buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, num_events_in_wait_list,
1212    event_wait_list, event))
1213
1214OCL_FUNC(cl_int, clEnqueueWriteBuffer,
1215    (cl_command_queue command_queue,
1216    cl_mem buffer,
1217    cl_bool blocking_write,
1218    size_t offset,
1219    size_t size,
1220    const void * ptr,
1221    cl_uint num_events_in_wait_list,
1222    const cl_event * event_wait_list,
1223    cl_event * event),
1224    (command_queue, buffer, blocking_write, offset, size, ptr,
1225    num_events_in_wait_list, event_wait_list, event))
1226
1227OCL_FUNC(cl_int, clEnqueueWriteBufferRect,
1228    (cl_command_queue command_queue,
1229    cl_mem buffer,
1230    cl_bool blocking_write,
1231    const size_t * buffer_offset,
1232    const size_t * host_offset,
1233    const size_t * region,
1234    size_t buffer_row_pitch,
1235    size_t buffer_slice_pitch,
1236    size_t host_row_pitch,
1237    size_t host_slice_pitch,
1238    const void * ptr,
1239    cl_uint num_events_in_wait_list,
1240    const cl_event * event_wait_list,
1241    cl_event * event),
1242    (command_queue, buffer, blocking_write, buffer_offset, host_offset,
1243    region, buffer_row_pitch, buffer_slice_pitch, host_row_pitch,
1244    host_slice_pitch, ptr, num_events_in_wait_list, event_wait_list, event))
1245
1246/*OCL_FUNC(cl_int, clEnqueueFillBuffer,
1247    (cl_command_queue command_queue,
1248    cl_mem buffer,
1249    const void * pattern,
1250    size_t pattern_size,
1251    size_t offset,
1252    size_t size,
1253    cl_uint num_events_in_wait_list,
1254    const cl_event * event_wait_list,
1255    cl_event * event),
1256    (command_queue, buffer, pattern, pattern_size, offset, size,
1257    num_events_in_wait_list, event_wait_list, event))*/
1258
1259OCL_FUNC(cl_int, clEnqueueCopyBuffer,
1260    (cl_command_queue command_queue,
1261    cl_mem src_buffer,
1262    cl_mem dst_buffer,
1263    size_t src_offset,
1264    size_t dst_offset,
1265    size_t size,
1266    cl_uint num_events_in_wait_list,
1267    const cl_event * event_wait_list,
1268    cl_event * event),
1269    (command_queue, src_buffer, dst_buffer, src_offset, dst_offset,
1270    size, num_events_in_wait_list, event_wait_list, event))
1271
1272OCL_FUNC(cl_int, clEnqueueCopyBufferRect,
1273    (cl_command_queue command_queue,
1274    cl_mem src_buffer,
1275    cl_mem dst_buffer,
1276    const size_t * src_origin,
1277    const size_t * dst_origin,
1278    const size_t * region,
1279    size_t src_row_pitch,
1280    size_t src_slice_pitch,
1281    size_t dst_row_pitch,
1282    size_t dst_slice_pitch,
1283    cl_uint num_events_in_wait_list,
1284    const cl_event * event_wait_list,
1285    cl_event * event),
1286    (command_queue, src_buffer, dst_buffer, src_origin, dst_origin,
1287    region, src_row_pitch, src_slice_pitch, dst_row_pitch, dst_slice_pitch,
1288    num_events_in_wait_list, event_wait_list, event))
1289
1290OCL_FUNC_P(void*, clEnqueueMapBuffer,
1291    (cl_command_queue command_queue,
1292    cl_mem buffer,
1293    cl_bool blocking_map,
1294    cl_map_flags map_flags,
1295    size_t offset,
1296    size_t size,
1297    cl_uint num_events_in_wait_list,
1298    const cl_event * event_wait_list,
1299    cl_event * event,
1300    cl_int * errcode_ret),
1301    (command_queue, buffer, blocking_map, map_flags, offset, size,
1302    num_events_in_wait_list, event_wait_list, event, errcode_ret))
1303
1304OCL_FUNC(cl_int, clEnqueueUnmapMemObject,
1305    (cl_command_queue command_queue,
1306    cl_mem memobj,
1307    void * mapped_ptr,
1308    cl_uint num_events_in_wait_list,
1309    const cl_event * event_wait_list,
1310    cl_event * event),
1311    (command_queue, memobj, mapped_ptr, num_events_in_wait_list, event_wait_list, event))
1312
1313OCL_FUNC(cl_int, clEnqueueNDRangeKernel,
1314    (cl_command_queue command_queue,
1315    cl_kernel kernel,
1316    cl_uint work_dim,
1317    const size_t * global_work_offset,
1318    const size_t * global_work_size,
1319    const size_t * local_work_size,
1320    cl_uint num_events_in_wait_list,
1321    const cl_event * event_wait_list,
1322    cl_event * event),
1323    (command_queue, kernel, work_dim, global_work_offset, global_work_size,
1324    local_work_size, num_events_in_wait_list, event_wait_list, event))
1325
1326OCL_FUNC(cl_int, clEnqueueTask,
1327    (cl_command_queue command_queue,
1328    cl_kernel kernel,
1329    cl_uint num_events_in_wait_list,
1330    const cl_event * event_wait_list,
1331    cl_event * event),
1332    (command_queue, kernel, num_events_in_wait_list, event_wait_list, event))
1333
1334OCL_FUNC(cl_int, clSetEventCallback,
1335    (cl_event event,
1336    cl_int command_exec_callback_type ,
1337    void (CL_CALLBACK  *pfn_event_notify) (cl_event event, cl_int event_command_exec_status, void *user_data),
1338    void *user_data),
1339    (event, command_exec_callback_type, pfn_event_notify, user_data))
1340
1341OCL_FUNC(cl_int, clReleaseEvent, (cl_event event), (event))
1342
1343}
1344
1345#endif
1346
1347#ifndef CL_VERSION_1_2
1348#define CL_VERSION_1_2
1349#endif
1350
1351#endif
1352
1353#ifdef _DEBUG
1354#define CV_OclDbgAssert CV_DbgAssert
1355#else
1356static bool isRaiseError()
1357{
1358    static bool initialized = false;
1359    static bool value = false;
1360    if (!initialized)
1361    {
1362        value = getBoolParameter("OPENCV_OPENCL_RAISE_ERROR", false);
1363        initialized = true;
1364    }
1365    return value;
1366}
1367#define CV_OclDbgAssert(expr) do { if (isRaiseError()) { CV_Assert(expr); } else { (void)(expr); } } while ((void)0, 0)
1368#endif
1369
1370#ifdef HAVE_OPENCL_SVM
1371#include "opencv2/core/opencl/runtime/opencl_svm_20.hpp"
1372#include "opencv2/core/opencl/runtime/opencl_svm_hsa_extension.hpp"
1373#include "opencv2/core/opencl/opencl_svm.hpp"
1374#endif
1375
1376namespace cv { namespace ocl {
1377
1378struct UMat2D
1379{
1380    UMat2D(const UMat& m)
1381    {
1382        offset = (int)m.offset;
1383        step = (int)m.step;
1384        rows = m.rows;
1385        cols = m.cols;
1386    }
1387    int offset;
1388    int step;
1389    int rows;
1390    int cols;
1391};
1392
1393struct UMat3D
1394{
1395    UMat3D(const UMat& m)
1396    {
1397        offset = (int)m.offset;
1398        step = (int)m.step.p[1];
1399        slicestep = (int)m.step.p[0];
1400        slices = (int)m.size.p[0];
1401        rows = m.size.p[1];
1402        cols = m.size.p[2];
1403    }
1404    int offset;
1405    int slicestep;
1406    int step;
1407    int slices;
1408    int rows;
1409    int cols;
1410};
1411
1412// Computes 64-bit "cyclic redundancy check" sum, as specified in ECMA-182
1413static uint64 crc64( const uchar* data, size_t size, uint64 crc0=0 )
1414{
1415    static uint64 table[256];
1416    static bool initialized = false;
1417
1418    if( !initialized )
1419    {
1420        for( int i = 0; i < 256; i++ )
1421        {
1422            uint64 c = i;
1423            for( int j = 0; j < 8; j++ )
1424                c = ((c & 1) ? CV_BIG_UINT(0xc96c5795d7870f42) : 0) ^ (c >> 1);
1425            table[i] = c;
1426        }
1427        initialized = true;
1428    }
1429
1430    uint64 crc = ~crc0;
1431    for( size_t idx = 0; idx < size; idx++ )
1432        crc = table[(uchar)crc ^ data[idx]] ^ (crc >> 8);
1433
1434    return ~crc;
1435}
1436
1437struct HashKey
1438{
1439    typedef uint64 part;
1440    HashKey(part _a, part _b) : a(_a), b(_b) {}
1441    part a, b;
1442};
1443
1444inline bool operator == (const HashKey& h1, const HashKey& h2)
1445{
1446    return h1.a == h2.a && h1.b == h2.b;
1447}
1448
1449inline bool operator < (const HashKey& h1, const HashKey& h2)
1450{
1451    return h1.a < h2.a || (h1.a == h2.a && h1.b < h2.b);
1452}
1453
1454
1455bool haveOpenCL()
1456{
1457#ifdef HAVE_OPENCL
1458    static bool g_isOpenCLInitialized = false;
1459    static bool g_isOpenCLAvailable = false;
1460
1461    if (!g_isOpenCLInitialized)
1462    {
1463        try
1464        {
1465            cl_uint n = 0;
1466            g_isOpenCLAvailable = ::clGetPlatformIDs(0, NULL, &n) == CL_SUCCESS;
1467        }
1468        catch (...)
1469        {
1470            g_isOpenCLAvailable = false;
1471        }
1472        g_isOpenCLInitialized = true;
1473    }
1474    return g_isOpenCLAvailable;
1475#else
1476    return false;
1477#endif
1478}
1479
1480bool useOpenCL()
1481{
1482    CoreTLSData* data = getCoreTlsData().get();
1483    if( data->useOpenCL < 0 )
1484    {
1485        try
1486        {
1487            data->useOpenCL = (int)haveOpenCL() && Device::getDefault().ptr() && Device::getDefault().available();
1488        }
1489        catch (...)
1490        {
1491            data->useOpenCL = 0;
1492        }
1493    }
1494    return data->useOpenCL > 0;
1495}
1496
1497void setUseOpenCL(bool flag)
1498{
1499    if( haveOpenCL() )
1500    {
1501        CoreTLSData* data = getCoreTlsData().get();
1502        data->useOpenCL = (flag && Device::getDefault().ptr() != NULL) ? 1 : 0;
1503    }
1504}
1505
1506#ifdef HAVE_CLAMDBLAS
1507
1508class AmdBlasHelper
1509{
1510public:
1511    static AmdBlasHelper & getInstance()
1512    {
1513        static AmdBlasHelper amdBlas;
1514        return amdBlas;
1515    }
1516
1517    bool isAvailable() const
1518    {
1519        return g_isAmdBlasAvailable;
1520    }
1521
1522    ~AmdBlasHelper()
1523    {
1524        try
1525        {
1526            clAmdBlasTeardown();
1527        }
1528        catch (...) { }
1529    }
1530
1531protected:
1532    AmdBlasHelper()
1533    {
1534        if (!g_isAmdBlasInitialized)
1535        {
1536            AutoLock lock(m);
1537
1538            if (!g_isAmdBlasInitialized && haveOpenCL())
1539            {
1540                try
1541                {
1542                    g_isAmdBlasAvailable = clAmdBlasSetup() == clAmdBlasSuccess;
1543                }
1544                catch (...)
1545                {
1546                    g_isAmdBlasAvailable = false;
1547                }
1548            }
1549            else
1550                g_isAmdBlasAvailable = false;
1551
1552            g_isAmdBlasInitialized = true;
1553        }
1554    }
1555
1556private:
1557    static Mutex m;
1558    static bool g_isAmdBlasInitialized;
1559    static bool g_isAmdBlasAvailable;
1560};
1561
1562bool AmdBlasHelper::g_isAmdBlasAvailable = false;
1563bool AmdBlasHelper::g_isAmdBlasInitialized = false;
1564Mutex AmdBlasHelper::m;
1565
1566bool haveAmdBlas()
1567{
1568    return AmdBlasHelper::getInstance().isAvailable();
1569}
1570
1571#else
1572
1573bool haveAmdBlas()
1574{
1575    return false;
1576}
1577
1578#endif
1579
1580#ifdef HAVE_CLAMDFFT
1581
1582class AmdFftHelper
1583{
1584public:
1585    static AmdFftHelper & getInstance()
1586    {
1587        static AmdFftHelper amdFft;
1588        return amdFft;
1589    }
1590
1591    bool isAvailable() const
1592    {
1593        return g_isAmdFftAvailable;
1594    }
1595
1596    ~AmdFftHelper()
1597    {
1598        try
1599        {
1600//            clAmdFftTeardown();
1601        }
1602        catch (...) { }
1603    }
1604
1605protected:
1606    AmdFftHelper()
1607    {
1608        if (!g_isAmdFftInitialized)
1609        {
1610            AutoLock lock(m);
1611
1612            if (!g_isAmdFftInitialized && haveOpenCL())
1613            {
1614                try
1615                {
1616                    cl_uint major, minor, patch;
1617                    CV_Assert(clAmdFftInitSetupData(&setupData) == CLFFT_SUCCESS);
1618
1619                    // it throws exception in case AmdFft binaries are not found
1620                    CV_Assert(clAmdFftGetVersion(&major, &minor, &patch) == CLFFT_SUCCESS);
1621                    g_isAmdFftAvailable = true;
1622                }
1623                catch (const Exception &)
1624                {
1625                    g_isAmdFftAvailable = false;
1626                }
1627            }
1628            else
1629                g_isAmdFftAvailable = false;
1630
1631            g_isAmdFftInitialized = true;
1632        }
1633    }
1634
1635private:
1636    static clAmdFftSetupData setupData;
1637    static Mutex m;
1638    static bool g_isAmdFftInitialized;
1639    static bool g_isAmdFftAvailable;
1640};
1641
1642clAmdFftSetupData AmdFftHelper::setupData;
1643bool AmdFftHelper::g_isAmdFftAvailable = false;
1644bool AmdFftHelper::g_isAmdFftInitialized = false;
1645Mutex AmdFftHelper::m;
1646
1647bool haveAmdFft()
1648{
1649    return AmdFftHelper::getInstance().isAvailable();
1650}
1651
1652#else
1653
1654bool haveAmdFft()
1655{
1656    return false;
1657}
1658
1659#endif
1660
1661bool haveSVM()
1662{
1663#ifdef HAVE_OPENCL_SVM
1664    return true;
1665#else
1666    return false;
1667#endif
1668}
1669
1670void finish()
1671{
1672    Queue::getDefault().finish();
1673}
1674
1675#define IMPLEMENT_REFCOUNTABLE() \
1676    void addref() { CV_XADD(&refcount, 1); } \
1677    void release() { if( CV_XADD(&refcount, -1) == 1 && !cv::__termination) delete this; } \
1678    int refcount
1679
1680/////////////////////////////////////////// Platform /////////////////////////////////////////////
1681
1682struct Platform::Impl
1683{
1684    Impl()
1685    {
1686        refcount = 1;
1687        handle = 0;
1688        initialized = false;
1689    }
1690
1691    ~Impl() {}
1692
1693    void init()
1694    {
1695        if( !initialized )
1696        {
1697            //cl_uint num_entries
1698            cl_uint n = 0;
1699            if( clGetPlatformIDs(1, &handle, &n) != CL_SUCCESS || n == 0 )
1700                handle = 0;
1701            if( handle != 0 )
1702            {
1703                char buf[1000];
1704                size_t len = 0;
1705                CV_OclDbgAssert(clGetPlatformInfo(handle, CL_PLATFORM_VENDOR, sizeof(buf), buf, &len) == CL_SUCCESS);
1706                buf[len] = '\0';
1707                vendor = String(buf);
1708            }
1709
1710            initialized = true;
1711        }
1712    }
1713
1714    IMPLEMENT_REFCOUNTABLE();
1715
1716    cl_platform_id handle;
1717    String vendor;
1718    bool initialized;
1719};
1720
1721Platform::Platform()
1722{
1723    p = 0;
1724}
1725
1726Platform::~Platform()
1727{
1728    if(p)
1729        p->release();
1730}
1731
1732Platform::Platform(const Platform& pl)
1733{
1734    p = (Impl*)pl.p;
1735    if(p)
1736        p->addref();
1737}
1738
1739Platform& Platform::operator = (const Platform& pl)
1740{
1741    Impl* newp = (Impl*)pl.p;
1742    if(newp)
1743        newp->addref();
1744    if(p)
1745        p->release();
1746    p = newp;
1747    return *this;
1748}
1749
1750void* Platform::ptr() const
1751{
1752    return p ? p->handle : 0;
1753}
1754
1755Platform& Platform::getDefault()
1756{
1757    static Platform p;
1758    if( !p.p )
1759    {
1760        p.p = new Impl;
1761        p.p->init();
1762    }
1763    return p;
1764}
1765
1766/////////////////////////////////////// Device ////////////////////////////////////////////
1767
1768// deviceVersion has format
1769//   OpenCL<space><major_version.minor_version><space><vendor-specific information>
1770// by specification
1771//   http://www.khronos.org/registry/cl/sdk/1.1/docs/man/xhtml/clGetDeviceInfo.html
1772//   http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clGetDeviceInfo.html
1773static void parseDeviceVersion(const String &deviceVersion, int &major, int &minor)
1774{
1775    major = minor = 0;
1776    if (10 >= deviceVersion.length())
1777        return;
1778    const char *pstr = deviceVersion.c_str();
1779    if (0 != strncmp(pstr, "OpenCL ", 7))
1780        return;
1781    size_t ppos = deviceVersion.find('.', 7);
1782    if (String::npos == ppos)
1783        return;
1784    String temp = deviceVersion.substr(7, ppos - 7);
1785    major = atoi(temp.c_str());
1786    temp = deviceVersion.substr(ppos + 1);
1787    minor = atoi(temp.c_str());
1788}
1789
1790struct Device::Impl
1791{
1792    Impl(void* d)
1793    {
1794        handle = (cl_device_id)d;
1795        refcount = 1;
1796
1797        name_ = getStrProp(CL_DEVICE_NAME);
1798        version_ = getStrProp(CL_DEVICE_VERSION);
1799        doubleFPConfig_ = getProp<cl_device_fp_config, int>(CL_DEVICE_DOUBLE_FP_CONFIG);
1800        hostUnifiedMemory_ = getBoolProp(CL_DEVICE_HOST_UNIFIED_MEMORY);
1801        maxComputeUnits_ = getProp<cl_uint, int>(CL_DEVICE_MAX_COMPUTE_UNITS);
1802        maxWorkGroupSize_ = getProp<size_t, size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE);
1803        type_ = getProp<cl_device_type, int>(CL_DEVICE_TYPE);
1804        driverVersion_ = getStrProp(CL_DRIVER_VERSION);
1805
1806        String deviceVersion_ = getStrProp(CL_DEVICE_VERSION);
1807        parseDeviceVersion(deviceVersion_, deviceVersionMajor_, deviceVersionMinor_);
1808
1809        vendorName_ = getStrProp(CL_DEVICE_VENDOR);
1810        if (vendorName_ == "Advanced Micro Devices, Inc." ||
1811            vendorName_ == "AMD")
1812            vendorID_ = VENDOR_AMD;
1813        else if (vendorName_ == "Intel(R) Corporation" || vendorName_ == "Intel" || strstr(name_.c_str(), "Iris") != 0)
1814            vendorID_ = VENDOR_INTEL;
1815        else if (vendorName_ == "NVIDIA Corporation")
1816            vendorID_ = VENDOR_NVIDIA;
1817        else
1818            vendorID_ = UNKNOWN_VENDOR;
1819    }
1820
1821    template<typename _TpCL, typename _TpOut>
1822    _TpOut getProp(cl_device_info prop) const
1823    {
1824        _TpCL temp=_TpCL();
1825        size_t sz = 0;
1826
1827        return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
1828            sz == sizeof(temp) ? _TpOut(temp) : _TpOut();
1829    }
1830
1831    bool getBoolProp(cl_device_info prop) const
1832    {
1833        cl_bool temp = CL_FALSE;
1834        size_t sz = 0;
1835
1836        return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
1837            sz == sizeof(temp) ? temp != 0 : false;
1838    }
1839
1840    String getStrProp(cl_device_info prop) const
1841    {
1842        char buf[1024];
1843        size_t sz=0;
1844        return clGetDeviceInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
1845            sz < sizeof(buf) ? String(buf) : String();
1846    }
1847
1848    IMPLEMENT_REFCOUNTABLE();
1849    cl_device_id handle;
1850
1851    String name_;
1852    String version_;
1853    int doubleFPConfig_;
1854    bool hostUnifiedMemory_;
1855    int maxComputeUnits_;
1856    size_t maxWorkGroupSize_;
1857    int type_;
1858    int deviceVersionMajor_;
1859    int deviceVersionMinor_;
1860    String driverVersion_;
1861    String vendorName_;
1862    int vendorID_;
1863};
1864
1865
1866Device::Device()
1867{
1868    p = 0;
1869}
1870
1871Device::Device(void* d)
1872{
1873    p = 0;
1874    set(d);
1875}
1876
1877Device::Device(const Device& d)
1878{
1879    p = d.p;
1880    if(p)
1881        p->addref();
1882}
1883
1884Device& Device::operator = (const Device& d)
1885{
1886    Impl* newp = (Impl*)d.p;
1887    if(newp)
1888        newp->addref();
1889    if(p)
1890        p->release();
1891    p = newp;
1892    return *this;
1893}
1894
1895Device::~Device()
1896{
1897    if(p)
1898        p->release();
1899}
1900
1901void Device::set(void* d)
1902{
1903    if(p)
1904        p->release();
1905    p = new Impl(d);
1906}
1907
1908void* Device::ptr() const
1909{
1910    return p ? p->handle : 0;
1911}
1912
1913String Device::name() const
1914{ return p ? p->name_ : String(); }
1915
1916String Device::extensions() const
1917{ return p ? p->getStrProp(CL_DEVICE_EXTENSIONS) : String(); }
1918
1919String Device::version() const
1920{ return p ? p->version_ : String(); }
1921
1922String Device::vendorName() const
1923{ return p ? p->vendorName_ : String(); }
1924
1925int Device::vendorID() const
1926{ return p ? p->vendorID_ : 0; }
1927
1928String Device::OpenCL_C_Version() const
1929{ return p ? p->getStrProp(CL_DEVICE_OPENCL_C_VERSION) : String(); }
1930
1931String Device::OpenCLVersion() const
1932{ return p ? p->getStrProp(CL_DEVICE_EXTENSIONS) : String(); }
1933
1934int Device::deviceVersionMajor() const
1935{ return p ? p->deviceVersionMajor_ : 0; }
1936
1937int Device::deviceVersionMinor() const
1938{ return p ? p->deviceVersionMinor_ : 0; }
1939
1940String Device::driverVersion() const
1941{ return p ? p->driverVersion_ : String(); }
1942
1943int Device::type() const
1944{ return p ? p->type_ : 0; }
1945
1946int Device::addressBits() const
1947{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_ADDRESS_BITS) : 0; }
1948
1949bool Device::available() const
1950{ return p ? p->getBoolProp(CL_DEVICE_AVAILABLE) : false; }
1951
1952bool Device::compilerAvailable() const
1953{ return p ? p->getBoolProp(CL_DEVICE_COMPILER_AVAILABLE) : false; }
1954
1955bool Device::linkerAvailable() const
1956#ifdef CL_VERSION_1_2
1957{ return p ? p->getBoolProp(CL_DEVICE_LINKER_AVAILABLE) : false; }
1958#else
1959{ CV_REQUIRE_OPENCL_1_2_ERROR; }
1960#endif
1961
1962int Device::doubleFPConfig() const
1963{ return p ? p->doubleFPConfig_ : 0; }
1964
1965int Device::singleFPConfig() const
1966{ return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_SINGLE_FP_CONFIG) : 0; }
1967
1968int Device::halfFPConfig() const
1969#ifdef CL_VERSION_1_2
1970{ return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_HALF_FP_CONFIG) : 0; }
1971#else
1972{ CV_REQUIRE_OPENCL_1_2_ERROR; }
1973#endif
1974
1975bool Device::endianLittle() const
1976{ return p ? p->getBoolProp(CL_DEVICE_ENDIAN_LITTLE) : false; }
1977
1978bool Device::errorCorrectionSupport() const
1979{ return p ? p->getBoolProp(CL_DEVICE_ERROR_CORRECTION_SUPPORT) : false; }
1980
1981int Device::executionCapabilities() const
1982{ return p ? p->getProp<cl_device_exec_capabilities, int>(CL_DEVICE_EXECUTION_CAPABILITIES) : 0; }
1983
1984size_t Device::globalMemCacheSize() const
1985{ return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE) : 0; }
1986
1987int Device::globalMemCacheType() const
1988{ return p ? p->getProp<cl_device_mem_cache_type, int>(CL_DEVICE_GLOBAL_MEM_CACHE_TYPE) : 0; }
1989
1990int Device::globalMemCacheLineSize() const
1991{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE) : 0; }
1992
1993size_t Device::globalMemSize() const
1994{ return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_SIZE) : 0; }
1995
1996size_t Device::localMemSize() const
1997{ return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_LOCAL_MEM_SIZE) : 0; }
1998
1999int Device::localMemType() const
2000{ return p ? p->getProp<cl_device_local_mem_type, int>(CL_DEVICE_LOCAL_MEM_TYPE) : 0; }
2001
2002bool Device::hostUnifiedMemory() const
2003{ return p ? p->hostUnifiedMemory_ : false; }
2004
2005bool Device::imageSupport() const
2006{ return p ? p->getBoolProp(CL_DEVICE_IMAGE_SUPPORT) : false; }
2007
2008bool Device::imageFromBufferSupport() const
2009{
2010    bool ret = false;
2011    if (p)
2012    {
2013        size_t pos = p->getStrProp(CL_DEVICE_EXTENSIONS).find("cl_khr_image2d_from_buffer");
2014        if (pos != String::npos)
2015        {
2016            ret = true;
2017        }
2018    }
2019    return ret;
2020}
2021
2022uint Device::imagePitchAlignment() const
2023{
2024#ifdef CL_DEVICE_IMAGE_PITCH_ALIGNMENT
2025    return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_PITCH_ALIGNMENT) : 0;
2026#else
2027    return 0;
2028#endif
2029}
2030
2031uint Device::imageBaseAddressAlignment() const
2032{
2033#ifdef CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT
2034    return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT) : 0;
2035#else
2036    return 0;
2037#endif
2038}
2039
2040size_t Device::image2DMaxWidth() const
2041{ return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_WIDTH) : 0; }
2042
2043size_t Device::image2DMaxHeight() const
2044{ return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_HEIGHT) : 0; }
2045
2046size_t Device::image3DMaxWidth() const
2047{ return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_WIDTH) : 0; }
2048
2049size_t Device::image3DMaxHeight() const
2050{ return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_HEIGHT) : 0; }
2051
2052size_t Device::image3DMaxDepth() const
2053{ return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_DEPTH) : 0; }
2054
2055size_t Device::imageMaxBufferSize() const
2056#ifdef CL_VERSION_1_2
2057{ return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_BUFFER_SIZE) : 0; }
2058#else
2059{ CV_REQUIRE_OPENCL_1_2_ERROR; }
2060#endif
2061
2062size_t Device::imageMaxArraySize() const
2063#ifdef CL_VERSION_1_2
2064{ return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_ARRAY_SIZE) : 0; }
2065#else
2066{ CV_REQUIRE_OPENCL_1_2_ERROR; }
2067#endif
2068
2069int Device::maxClockFrequency() const
2070{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CLOCK_FREQUENCY) : 0; }
2071
2072int Device::maxComputeUnits() const
2073{ return p ? p->maxComputeUnits_ : 0; }
2074
2075int Device::maxConstantArgs() const
2076{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CONSTANT_ARGS) : 0; }
2077
2078size_t Device::maxConstantBufferSize() const
2079{ return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) : 0; }
2080
2081size_t Device::maxMemAllocSize() const
2082{ return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_MEM_ALLOC_SIZE) : 0; }
2083
2084size_t Device::maxParameterSize() const
2085{ return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_PARAMETER_SIZE) : 0; }
2086
2087int Device::maxReadImageArgs() const
2088{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_READ_IMAGE_ARGS) : 0; }
2089
2090int Device::maxWriteImageArgs() const
2091{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WRITE_IMAGE_ARGS) : 0; }
2092
2093int Device::maxSamplers() const
2094{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_SAMPLERS) : 0; }
2095
2096size_t Device::maxWorkGroupSize() const
2097{ return p ? p->maxWorkGroupSize_ : 0; }
2098
2099int Device::maxWorkItemDims() const
2100{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS) : 0; }
2101
2102void Device::maxWorkItemSizes(size_t* sizes) const
2103{
2104    if(p)
2105    {
2106        const int MAX_DIMS = 32;
2107        size_t retsz = 0;
2108        CV_OclDbgAssert(clGetDeviceInfo(p->handle, CL_DEVICE_MAX_WORK_ITEM_SIZES,
2109                MAX_DIMS*sizeof(sizes[0]), &sizes[0], &retsz) == CL_SUCCESS);
2110    }
2111}
2112
2113int Device::memBaseAddrAlign() const
2114{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_MEM_BASE_ADDR_ALIGN) : 0; }
2115
2116int Device::nativeVectorWidthChar() const
2117{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR) : 0; }
2118
2119int Device::nativeVectorWidthShort() const
2120{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT) : 0; }
2121
2122int Device::nativeVectorWidthInt() const
2123{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_INT) : 0; }
2124
2125int Device::nativeVectorWidthLong() const
2126{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG) : 0; }
2127
2128int Device::nativeVectorWidthFloat() const
2129{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT) : 0; }
2130
2131int Device::nativeVectorWidthDouble() const
2132{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE) : 0; }
2133
2134int Device::nativeVectorWidthHalf() const
2135{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF) : 0; }
2136
2137int Device::preferredVectorWidthChar() const
2138{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR) : 0; }
2139
2140int Device::preferredVectorWidthShort() const
2141{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT) : 0; }
2142
2143int Device::preferredVectorWidthInt() const
2144{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT) : 0; }
2145
2146int Device::preferredVectorWidthLong() const
2147{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG) : 0; }
2148
2149int Device::preferredVectorWidthFloat() const
2150{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT) : 0; }
2151
2152int Device::preferredVectorWidthDouble() const
2153{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE) : 0; }
2154
2155int Device::preferredVectorWidthHalf() const
2156{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF) : 0; }
2157
2158size_t Device::printfBufferSize() const
2159#ifdef CL_VERSION_1_2
2160{ return p ? p->getProp<size_t, size_t>(CL_DEVICE_PRINTF_BUFFER_SIZE) : 0; }
2161#else
2162{ CV_REQUIRE_OPENCL_1_2_ERROR; }
2163#endif
2164
2165
2166size_t Device::profilingTimerResolution() const
2167{ return p ? p->getProp<size_t, size_t>(CL_DEVICE_PROFILING_TIMER_RESOLUTION) : 0; }
2168
2169const Device& Device::getDefault()
2170{
2171    const Context& ctx = Context::getDefault();
2172    int idx = getCoreTlsData().get()->device;
2173    const Device& device = ctx.device(idx);
2174    return device;
2175}
2176
2177////////////////////////////////////// Context ///////////////////////////////////////////////////
2178
2179template <typename Functor, typename ObjectType>
2180inline cl_int getStringInfo(Functor f, ObjectType obj, cl_uint name, std::string& param)
2181{
2182    ::size_t required;
2183    cl_int err = f(obj, name, 0, NULL, &required);
2184    if (err != CL_SUCCESS)
2185        return err;
2186
2187    param.clear();
2188    if (required > 0)
2189    {
2190        AutoBuffer<char> buf(required + 1);
2191        char* ptr = (char*)buf; // cleanup is not needed
2192        err = f(obj, name, required, ptr, NULL);
2193        if (err != CL_SUCCESS)
2194            return err;
2195        param = ptr;
2196    }
2197
2198    return CL_SUCCESS;
2199}
2200
2201static void split(const std::string &s, char delim, std::vector<std::string> &elems)
2202{
2203    elems.clear();
2204    if (s.size() == 0)
2205        return;
2206    std::istringstream ss(s);
2207    std::string item;
2208    while (!ss.eof())
2209    {
2210        std::getline(ss, item, delim);
2211        elems.push_back(item);
2212    }
2213}
2214
2215// Layout: <Platform>:<CPU|GPU|ACCELERATOR|nothing=GPU/CPU>:<deviceName>
2216// Sample: AMD:GPU:
2217// Sample: AMD:GPU:Tahiti
2218// Sample: :GPU|CPU: = '' = ':' = '::'
2219static bool parseOpenCLDeviceConfiguration(const std::string& configurationStr,
2220        std::string& platform, std::vector<std::string>& deviceTypes, std::string& deviceNameOrID)
2221{
2222    std::vector<std::string> parts;
2223    split(configurationStr, ':', parts);
2224    if (parts.size() > 3)
2225    {
2226        std::cerr << "ERROR: Invalid configuration string for OpenCL device" << std::endl;
2227        return false;
2228    }
2229    if (parts.size() > 2)
2230        deviceNameOrID = parts[2];
2231    if (parts.size() > 1)
2232    {
2233        split(parts[1], '|', deviceTypes);
2234    }
2235    if (parts.size() > 0)
2236    {
2237        platform = parts[0];
2238    }
2239    return true;
2240}
2241
2242#ifdef WINRT
2243static cl_device_id selectOpenCLDevice()
2244{
2245    return NULL;
2246}
2247#else
2248static cl_device_id selectOpenCLDevice()
2249{
2250    std::string platform, deviceName;
2251    std::vector<std::string> deviceTypes;
2252
2253    const char* configuration = getenv("OPENCV_OPENCL_DEVICE");
2254    if (configuration &&
2255            (strcmp(configuration, "disabled") == 0 ||
2256             !parseOpenCLDeviceConfiguration(std::string(configuration), platform, deviceTypes, deviceName)
2257            ))
2258        return NULL;
2259
2260    bool isID = false;
2261    int deviceID = -1;
2262    if (deviceName.length() == 1)
2263    // We limit ID range to 0..9, because we want to write:
2264    // - '2500' to mean i5-2500
2265    // - '8350' to mean AMD FX-8350
2266    // - '650' to mean GeForce 650
2267    // To extend ID range change condition to '> 0'
2268    {
2269        isID = true;
2270        for (size_t i = 0; i < deviceName.length(); i++)
2271        {
2272            if (!isdigit(deviceName[i]))
2273            {
2274                isID = false;
2275                break;
2276            }
2277        }
2278        if (isID)
2279        {
2280            deviceID = atoi(deviceName.c_str());
2281            if (deviceID < 0)
2282                return NULL;
2283        }
2284    }
2285
2286    std::vector<cl_platform_id> platforms;
2287    {
2288        cl_uint numPlatforms = 0;
2289        CV_OclDbgAssert(clGetPlatformIDs(0, NULL, &numPlatforms) == CL_SUCCESS);
2290
2291        if (numPlatforms == 0)
2292            return NULL;
2293        platforms.resize((size_t)numPlatforms);
2294        CV_OclDbgAssert(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms) == CL_SUCCESS);
2295        platforms.resize(numPlatforms);
2296    }
2297
2298    int selectedPlatform = -1;
2299    if (platform.length() > 0)
2300    {
2301        for (size_t i = 0; i < platforms.size(); i++)
2302        {
2303            std::string name;
2304            CV_OclDbgAssert(getStringInfo(clGetPlatformInfo, platforms[i], CL_PLATFORM_NAME, name) == CL_SUCCESS);
2305            if (name.find(platform) != std::string::npos)
2306            {
2307                selectedPlatform = (int)i;
2308                break;
2309            }
2310        }
2311        if (selectedPlatform == -1)
2312        {
2313            std::cerr << "ERROR: Can't find OpenCL platform by name: " << platform << std::endl;
2314            goto not_found;
2315        }
2316    }
2317    if (deviceTypes.size() == 0)
2318    {
2319        if (!isID)
2320        {
2321            deviceTypes.push_back("GPU");
2322            if (configuration)
2323                deviceTypes.push_back("CPU");
2324        }
2325        else
2326            deviceTypes.push_back("ALL");
2327    }
2328    for (size_t t = 0; t < deviceTypes.size(); t++)
2329    {
2330        int deviceType = 0;
2331        std::string tempStrDeviceType = deviceTypes[t];
2332        std::transform( tempStrDeviceType.begin(), tempStrDeviceType.end(), tempStrDeviceType.begin(), tolower );
2333
2334        if (tempStrDeviceType == "gpu" || tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
2335            deviceType = Device::TYPE_GPU;
2336        else if (tempStrDeviceType == "cpu")
2337            deviceType = Device::TYPE_CPU;
2338        else if (tempStrDeviceType == "accelerator")
2339            deviceType = Device::TYPE_ACCELERATOR;
2340        else if (tempStrDeviceType == "all")
2341            deviceType = Device::TYPE_ALL;
2342        else
2343        {
2344            std::cerr << "ERROR: Unsupported device type for OpenCL device (GPU, CPU, ACCELERATOR): " << deviceTypes[t] << std::endl;
2345            goto not_found;
2346        }
2347
2348        std::vector<cl_device_id> devices; // TODO Use clReleaseDevice to cleanup
2349        for (int i = selectedPlatform >= 0 ? selectedPlatform : 0;
2350                (selectedPlatform >= 0 ? i == selectedPlatform : true) && (i < (int)platforms.size());
2351                i++)
2352        {
2353            cl_uint count = 0;
2354            cl_int status = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &count);
2355            CV_OclDbgAssert(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND);
2356            if (count == 0)
2357                continue;
2358            size_t base = devices.size();
2359            devices.resize(base + count);
2360            status = clGetDeviceIDs(platforms[i], deviceType, count, &devices[base], &count);
2361            CV_OclDbgAssert(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND);
2362        }
2363
2364        for (size_t i = (isID ? deviceID : 0);
2365             (isID ? (i == (size_t)deviceID) : true) && (i < devices.size());
2366             i++)
2367        {
2368            std::string name;
2369            CV_OclDbgAssert(getStringInfo(clGetDeviceInfo, devices[i], CL_DEVICE_NAME, name) == CL_SUCCESS);
2370            cl_bool useGPU = true;
2371            if(tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
2372            {
2373                cl_bool isIGPU = CL_FALSE;
2374                clGetDeviceInfo(devices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(isIGPU), &isIGPU, NULL);
2375                useGPU = tempStrDeviceType == "dgpu" ? !isIGPU : isIGPU;
2376            }
2377            if ( (isID || name.find(deviceName) != std::string::npos) && useGPU)
2378            {
2379                // TODO check for OpenCL 1.1
2380                return devices[i];
2381            }
2382        }
2383    }
2384
2385not_found:
2386    if (!configuration)
2387        return NULL; // suppress messages on stderr
2388
2389    std::cerr << "ERROR: Requested OpenCL device not found, check configuration: " << (configuration == NULL ? "" : configuration) << std::endl
2390            << "    Platform: " << (platform.length() == 0 ? "any" : platform) << std::endl
2391            << "    Device types: ";
2392    for (size_t t = 0; t < deviceTypes.size(); t++)
2393        std::cerr << deviceTypes[t] << " ";
2394
2395    std::cerr << std::endl << "    Device name: " << (deviceName.length() == 0 ? "any" : deviceName) << std::endl;
2396    return NULL;
2397}
2398#endif
2399
2400#ifdef HAVE_OPENCL_SVM
2401namespace svm {
2402
2403enum AllocatorFlags { // don't use first 16 bits
2404        OPENCL_SVM_COARSE_GRAIN_BUFFER = 1 << 16, // clSVMAlloc + SVM map/unmap
2405        OPENCL_SVM_FINE_GRAIN_BUFFER = 2 << 16, // clSVMAlloc
2406        OPENCL_SVM_FINE_GRAIN_SYSTEM = 3 << 16, // direct access
2407        OPENCL_SVM_BUFFER_MASK = 3 << 16,
2408        OPENCL_SVM_BUFFER_MAP = 4 << 16
2409};
2410
2411static bool checkForceSVMUmatUsage()
2412{
2413    static bool initialized = false;
2414    static bool force = false;
2415    if (!initialized)
2416    {
2417        force = getBoolParameter("OPENCV_OPENCL_SVM_FORCE_UMAT_USAGE", false);
2418        initialized = true;
2419    }
2420    return force;
2421}
2422static bool checkDisableSVMUMatUsage()
2423{
2424    static bool initialized = false;
2425    static bool force = false;
2426    if (!initialized)
2427    {
2428        force = getBoolParameter("OPENCV_OPENCL_SVM_DISABLE_UMAT_USAGE", false);
2429        initialized = true;
2430    }
2431    return force;
2432}
2433static bool checkDisableSVM()
2434{
2435    static bool initialized = false;
2436    static bool force = false;
2437    if (!initialized)
2438    {
2439        force = getBoolParameter("OPENCV_OPENCL_SVM_DISABLE", false);
2440        initialized = true;
2441    }
2442    return force;
2443}
2444// see SVMCapabilities
2445static unsigned int getSVMCapabilitiesMask()
2446{
2447    static bool initialized = false;
2448    static unsigned int mask = 0;
2449    if (!initialized)
2450    {
2451        const char* envValue = getenv("OPENCV_OPENCL_SVM_CAPABILITIES_MASK");
2452        if (envValue == NULL)
2453        {
2454            return ~0U; // all bits 1
2455        }
2456        mask = atoi(envValue);
2457        initialized = true;
2458    }
2459    return mask;
2460}
2461} // namespace
2462#endif
2463
2464struct Context::Impl
2465{
2466    static Context::Impl* get(Context& context) { return context.p; }
2467
2468    void __init()
2469    {
2470        refcount = 1;
2471        handle = 0;
2472#ifdef HAVE_OPENCL_SVM
2473        svmInitialized = false;
2474#endif
2475    }
2476
2477    Impl()
2478    {
2479        __init();
2480    }
2481
2482    void setDefault()
2483    {
2484        CV_Assert(handle == NULL);
2485
2486        cl_device_id d = selectOpenCLDevice();
2487
2488        if (d == NULL)
2489            return;
2490
2491        cl_platform_id pl = NULL;
2492        CV_OclDbgAssert(clGetDeviceInfo(d, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &pl, NULL) == CL_SUCCESS);
2493
2494        cl_context_properties prop[] =
2495        {
2496            CL_CONTEXT_PLATFORM, (cl_context_properties)pl,
2497            0
2498        };
2499
2500        // !!! in the current implementation force the number of devices to 1 !!!
2501        cl_uint nd = 1;
2502        cl_int status;
2503
2504        handle = clCreateContext(prop, nd, &d, 0, 0, &status);
2505
2506        bool ok = handle != 0 && status == CL_SUCCESS;
2507        if( ok )
2508        {
2509            devices.resize(nd);
2510            devices[0].set(d);
2511        }
2512        else
2513            handle = NULL;
2514    }
2515
2516    Impl(int dtype0)
2517    {
2518        __init();
2519
2520        cl_int retval = 0;
2521        cl_platform_id pl = (cl_platform_id)Platform::getDefault().ptr();
2522        cl_context_properties prop[] =
2523        {
2524            CL_CONTEXT_PLATFORM, (cl_context_properties)pl,
2525            0
2526        };
2527
2528        cl_uint i, nd0 = 0, nd = 0;
2529        int dtype = dtype0 & 15;
2530        CV_OclDbgAssert(clGetDeviceIDs( pl, dtype, 0, 0, &nd0 ) == CL_SUCCESS);
2531
2532        AutoBuffer<void*> dlistbuf(nd0*2+1);
2533        cl_device_id* dlist = (cl_device_id*)(void**)dlistbuf;
2534        cl_device_id* dlist_new = dlist + nd0;
2535        CV_OclDbgAssert(clGetDeviceIDs( pl, dtype, nd0, dlist, &nd0 ) == CL_SUCCESS);
2536        String name0;
2537
2538        for(i = 0; i < nd0; i++)
2539        {
2540            Device d(dlist[i]);
2541            if( !d.available() || !d.compilerAvailable() )
2542                continue;
2543            if( dtype0 == Device::TYPE_DGPU && d.hostUnifiedMemory() )
2544                continue;
2545            if( dtype0 == Device::TYPE_IGPU && !d.hostUnifiedMemory() )
2546                continue;
2547            String name = d.name();
2548            if( nd != 0 && name != name0 )
2549                continue;
2550            name0 = name;
2551            dlist_new[nd++] = dlist[i];
2552        }
2553
2554        if(nd == 0)
2555            return;
2556
2557        // !!! in the current implementation force the number of devices to 1 !!!
2558        nd = 1;
2559
2560        handle = clCreateContext(prop, nd, dlist_new, 0, 0, &retval);
2561        bool ok = handle != 0 && retval == CL_SUCCESS;
2562        if( ok )
2563        {
2564            devices.resize(nd);
2565            for( i = 0; i < nd; i++ )
2566                devices[i].set(dlist_new[i]);
2567        }
2568    }
2569
2570    ~Impl()
2571    {
2572        if(handle)
2573        {
2574            clReleaseContext(handle);
2575            handle = NULL;
2576        }
2577        devices.clear();
2578    }
2579
2580    Program getProg(const ProgramSource& src,
2581                    const String& buildflags, String& errmsg)
2582    {
2583        String prefix = Program::getPrefix(buildflags);
2584        HashKey k(src.hash(), crc64((const uchar*)prefix.c_str(), prefix.size()));
2585        phash_t::iterator it = phash.find(k);
2586        if( it != phash.end() )
2587            return it->second;
2588        //String filename = format("%08x%08x_%08x%08x.clb2",
2589        Program prog(src, buildflags, errmsg);
2590        if(prog.ptr())
2591            phash.insert(std::pair<HashKey,Program>(k, prog));
2592        return prog;
2593    }
2594
2595    IMPLEMENT_REFCOUNTABLE();
2596
2597    cl_context handle;
2598    std::vector<Device> devices;
2599
2600    typedef ProgramSource::hash_t hash_t;
2601
2602    struct HashKey
2603    {
2604        HashKey(hash_t _a, hash_t _b) : a(_a), b(_b) {}
2605        bool operator < (const HashKey& k) const { return a < k.a || (a == k.a && b < k.b); }
2606        bool operator == (const HashKey& k) const { return a == k.a && b == k.b; }
2607        bool operator != (const HashKey& k) const { return a != k.a || b != k.b; }
2608        hash_t a, b;
2609    };
2610    typedef std::map<HashKey, Program> phash_t;
2611    phash_t phash;
2612
2613#ifdef HAVE_OPENCL_SVM
2614    bool svmInitialized;
2615    bool svmAvailable;
2616    bool svmEnabled;
2617    svm::SVMCapabilities svmCapabilities;
2618    svm::SVMFunctions svmFunctions;
2619
2620    void svmInit()
2621    {
2622        CV_Assert(handle != NULL);
2623        const Device& device = devices[0];
2624        cl_device_svm_capabilities deviceCaps = 0;
2625        CV_Assert(((void)0, CL_DEVICE_SVM_CAPABILITIES == CL_DEVICE_SVM_CAPABILITIES_AMD)); // Check assumption
2626        cl_int status = clGetDeviceInfo((cl_device_id)device.ptr(), CL_DEVICE_SVM_CAPABILITIES, sizeof(deviceCaps), &deviceCaps, NULL);
2627        if (status != CL_SUCCESS)
2628        {
2629            CV_OPENCL_SVM_TRACE_ERROR_P("CL_DEVICE_SVM_CAPABILITIES via clGetDeviceInfo failed: %d\n", status);
2630            goto noSVM;
2631        }
2632        CV_OPENCL_SVM_TRACE_P("CL_DEVICE_SVM_CAPABILITIES returned: 0x%x\n", (int)deviceCaps);
2633        CV_Assert(((void)0, CL_DEVICE_SVM_COARSE_GRAIN_BUFFER == CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_AMD)); // Check assumption
2634        svmCapabilities.value_ =
2635                ((deviceCaps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) ? svm::SVMCapabilities::SVM_COARSE_GRAIN_BUFFER : 0) |
2636                ((deviceCaps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) ? svm::SVMCapabilities::SVM_FINE_GRAIN_BUFFER : 0) |
2637                ((deviceCaps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) ? svm::SVMCapabilities::SVM_FINE_GRAIN_SYSTEM : 0) |
2638                ((deviceCaps & CL_DEVICE_SVM_ATOMICS) ? svm::SVMCapabilities::SVM_ATOMICS : 0);
2639        svmCapabilities.value_ &= svm::getSVMCapabilitiesMask();
2640        if (svmCapabilities.value_ == 0)
2641        {
2642            CV_OPENCL_SVM_TRACE_ERROR_P("svmCapabilities is empty\n");
2643            goto noSVM;
2644        }
2645        try
2646        {
2647            // Try OpenCL 2.0
2648            CV_OPENCL_SVM_TRACE_P("Try SVM from OpenCL 2.0 ...\n");
2649            void* ptr = clSVMAlloc(handle, CL_MEM_READ_WRITE, 100, 0);
2650            if (!ptr)
2651            {
2652                CV_OPENCL_SVM_TRACE_ERROR_P("clSVMAlloc returned NULL...\n");
2653                CV_ErrorNoReturn(Error::StsBadArg, "clSVMAlloc returned NULL");
2654            }
2655            try
2656            {
2657                bool error = false;
2658                cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
2659                if (CL_SUCCESS != clEnqueueSVMMap(q, CL_TRUE, CL_MAP_WRITE, ptr, 100, 0, NULL, NULL))
2660                {
2661                    CV_OPENCL_SVM_TRACE_ERROR_P("clEnqueueSVMMap failed...\n");
2662                    CV_ErrorNoReturn(Error::StsBadArg, "clEnqueueSVMMap FAILED");
2663                }
2664                clFinish(q);
2665                try
2666                {
2667                    ((int*)ptr)[0] = 100;
2668                }
2669                catch (...)
2670                {
2671                    CV_OPENCL_SVM_TRACE_ERROR_P("SVM buffer access test FAILED\n");
2672                    error = true;
2673                }
2674                if (CL_SUCCESS != clEnqueueSVMUnmap(q, ptr, 0, NULL, NULL))
2675                {
2676                    CV_OPENCL_SVM_TRACE_ERROR_P("clEnqueueSVMUnmap failed...\n");
2677                    CV_ErrorNoReturn(Error::StsBadArg, "clEnqueueSVMUnmap FAILED");
2678                }
2679                clFinish(q);
2680                if (error)
2681                {
2682                    CV_ErrorNoReturn(Error::StsBadArg, "OpenCL SVM buffer access test was FAILED");
2683                }
2684            }
2685            catch (...)
2686            {
2687                CV_OPENCL_SVM_TRACE_ERROR_P("OpenCL SVM buffer access test was FAILED\n");
2688                clSVMFree(handle, ptr);
2689                throw;
2690            }
2691            clSVMFree(handle, ptr);
2692            svmFunctions.fn_clSVMAlloc = clSVMAlloc;
2693            svmFunctions.fn_clSVMFree = clSVMFree;
2694            svmFunctions.fn_clSetKernelArgSVMPointer = clSetKernelArgSVMPointer;
2695            //svmFunctions.fn_clSetKernelExecInfo = clSetKernelExecInfo;
2696            //svmFunctions.fn_clEnqueueSVMFree = clEnqueueSVMFree;
2697            svmFunctions.fn_clEnqueueSVMMemcpy = clEnqueueSVMMemcpy;
2698            svmFunctions.fn_clEnqueueSVMMemFill = clEnqueueSVMMemFill;
2699            svmFunctions.fn_clEnqueueSVMMap = clEnqueueSVMMap;
2700            svmFunctions.fn_clEnqueueSVMUnmap = clEnqueueSVMUnmap;
2701        }
2702        catch (...)
2703        {
2704            CV_OPENCL_SVM_TRACE_P("clSVMAlloc failed, trying HSA extension...\n");
2705            try
2706            {
2707                // Try HSA extension
2708                String extensions = device.extensions();
2709                if (extensions.find("cl_amd_svm") == String::npos)
2710                {
2711                    CV_OPENCL_SVM_TRACE_P("Device extension doesn't have cl_amd_svm: %s\n", extensions.c_str());
2712                    goto noSVM;
2713                }
2714                cl_platform_id p = NULL;
2715                status = clGetDeviceInfo((cl_device_id)device.ptr(), CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &p, NULL);
2716                CV_Assert(status == CL_SUCCESS);
2717                svmFunctions.fn_clSVMAlloc = (clSVMAllocAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSVMAllocAMD");
2718                svmFunctions.fn_clSVMFree = (clSVMFreeAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSVMFreeAMD");
2719                svmFunctions.fn_clSetKernelArgSVMPointer = (clSetKernelArgSVMPointerAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSetKernelArgSVMPointerAMD");
2720                //svmFunctions.fn_clSetKernelExecInfo = (clSetKernelExecInfoAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSetKernelExecInfoAMD");
2721                //svmFunctions.fn_clEnqueueSVMFree = (clEnqueueSVMFreeAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMFreeAMD");
2722                svmFunctions.fn_clEnqueueSVMMemcpy = (clEnqueueSVMMemcpyAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMemcpyAMD");
2723                svmFunctions.fn_clEnqueueSVMMemFill = (clEnqueueSVMMemFillAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMemFillAMD");
2724                svmFunctions.fn_clEnqueueSVMMap = (clEnqueueSVMMapAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMapAMD");
2725                svmFunctions.fn_clEnqueueSVMUnmap = (clEnqueueSVMUnmapAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMUnmapAMD");
2726                CV_Assert(svmFunctions.isValid());
2727            }
2728            catch (...)
2729            {
2730                CV_OPENCL_SVM_TRACE_P("Something is totally wrong\n");
2731                goto noSVM;
2732            }
2733        }
2734
2735        svmAvailable = true;
2736        svmEnabled = !svm::checkDisableSVM();
2737        svmInitialized = true;
2738        CV_OPENCL_SVM_TRACE_P("OpenCV OpenCL SVM support initialized\n");
2739        return;
2740    noSVM:
2741        CV_OPENCL_SVM_TRACE_P("OpenCL SVM is not detected\n");
2742        svmAvailable = false;
2743        svmEnabled = false;
2744        svmCapabilities.value_ = 0;
2745        svmInitialized = true;
2746        svmFunctions.fn_clSVMAlloc = NULL;
2747        return;
2748    }
2749#endif
2750};
2751
2752
2753Context::Context()
2754{
2755    p = 0;
2756}
2757
2758Context::Context(int dtype)
2759{
2760    p = 0;
2761    create(dtype);
2762}
2763
2764bool Context::create()
2765{
2766    if( !haveOpenCL() )
2767        return false;
2768    if(p)
2769        p->release();
2770    p = new Impl();
2771    if(!p->handle)
2772    {
2773        delete p;
2774        p = 0;
2775    }
2776    return p != 0;
2777}
2778
2779bool Context::create(int dtype0)
2780{
2781    if( !haveOpenCL() )
2782        return false;
2783    if(p)
2784        p->release();
2785    p = new Impl(dtype0);
2786    if(!p->handle)
2787    {
2788        delete p;
2789        p = 0;
2790    }
2791    return p != 0;
2792}
2793
2794Context::~Context()
2795{
2796    if (p)
2797    {
2798        p->release();
2799        p = NULL;
2800    }
2801}
2802
2803Context::Context(const Context& c)
2804{
2805    p = (Impl*)c.p;
2806    if(p)
2807        p->addref();
2808}
2809
2810Context& Context::operator = (const Context& c)
2811{
2812    Impl* newp = (Impl*)c.p;
2813    if(newp)
2814        newp->addref();
2815    if(p)
2816        p->release();
2817    p = newp;
2818    return *this;
2819}
2820
2821void* Context::ptr() const
2822{
2823    return p == NULL ? NULL : p->handle;
2824}
2825
2826size_t Context::ndevices() const
2827{
2828    return p ? p->devices.size() : 0;
2829}
2830
2831const Device& Context::device(size_t idx) const
2832{
2833    static Device dummy;
2834    return !p || idx >= p->devices.size() ? dummy : p->devices[idx];
2835}
2836
2837Context& Context::getDefault(bool initialize)
2838{
2839    static Context* ctx = new Context();
2840    if(!ctx->p && haveOpenCL())
2841    {
2842        if (!ctx->p)
2843            ctx->p = new Impl();
2844        if (initialize)
2845        {
2846            // do not create new Context right away.
2847            // First, try to retrieve existing context of the same type.
2848            // In its turn, Platform::getContext() may call Context::create()
2849            // if there is no such context.
2850            if (ctx->p->handle == NULL)
2851                ctx->p->setDefault();
2852        }
2853    }
2854
2855    return *ctx;
2856}
2857
2858Program Context::getProg(const ProgramSource& prog,
2859                         const String& buildopts, String& errmsg)
2860{
2861    return p ? p->getProg(prog, buildopts, errmsg) : Program();
2862}
2863
2864
2865
2866#ifdef HAVE_OPENCL_SVM
2867bool Context::useSVM() const
2868{
2869    Context::Impl* i = p;
2870    CV_Assert(i);
2871    if (!i->svmInitialized)
2872        i->svmInit();
2873    return i->svmEnabled;
2874}
2875void Context::setUseSVM(bool enabled)
2876{
2877    Context::Impl* i = p;
2878    CV_Assert(i);
2879    if (!i->svmInitialized)
2880        i->svmInit();
2881    if (enabled && !i->svmAvailable)
2882    {
2883        CV_ErrorNoReturn(Error::StsError, "OpenCL Shared Virtual Memory (SVM) is not supported by OpenCL device");
2884    }
2885    i->svmEnabled = enabled;
2886}
2887#else
2888bool Context::useSVM() const { return false; }
2889void Context::setUseSVM(bool enabled) { CV_Assert(!enabled); }
2890#endif
2891
2892#ifdef HAVE_OPENCL_SVM
2893namespace svm {
2894
2895const SVMCapabilities getSVMCapabilitites(const ocl::Context& context)
2896{
2897    Context::Impl* i = context.p;
2898    CV_Assert(i);
2899    if (!i->svmInitialized)
2900        i->svmInit();
2901    return i->svmCapabilities;
2902}
2903
2904CV_EXPORTS const SVMFunctions* getSVMFunctions(const ocl::Context& context)
2905{
2906    Context::Impl* i = context.p;
2907    CV_Assert(i);
2908    CV_Assert(i->svmInitialized); // getSVMCapabilitites() must be called first
2909    CV_Assert(i->svmFunctions.fn_clSVMAlloc != NULL);
2910    return &i->svmFunctions;
2911}
2912
2913CV_EXPORTS bool useSVM(UMatUsageFlags usageFlags)
2914{
2915    if (checkForceSVMUmatUsage())
2916        return true;
2917    if (checkDisableSVMUMatUsage())
2918        return false;
2919    if ((usageFlags & USAGE_ALLOCATE_SHARED_MEMORY) != 0)
2920        return true;
2921    return false; // don't use SVM by default
2922}
2923
2924} // namespace cv::ocl::svm
2925#endif // HAVE_OPENCL_SVM
2926
2927
2928
2929void initializeContextFromHandle(Context& ctx, void* platform, void* _context, void* _device)
2930{
2931    cl_context context = (cl_context)_context;
2932    cl_device_id device = (cl_device_id)_device;
2933
2934    // cleanup old context
2935    Context::Impl * impl = ctx.p;
2936    if (impl->handle)
2937    {
2938        CV_OclDbgAssert(clReleaseContext(impl->handle) == CL_SUCCESS);
2939    }
2940    impl->devices.clear();
2941
2942    impl->handle = context;
2943    impl->devices.resize(1);
2944    impl->devices[0].set(device);
2945
2946    Platform& p = Platform::getDefault();
2947    Platform::Impl* pImpl = p.p;
2948    pImpl->handle = (cl_platform_id)platform;
2949}
2950
2951/////////////////////////////////////////// Queue /////////////////////////////////////////////
2952
2953struct Queue::Impl
2954{
2955    Impl(const Context& c, const Device& d)
2956    {
2957        refcount = 1;
2958        const Context* pc = &c;
2959        cl_context ch = (cl_context)pc->ptr();
2960        if( !ch )
2961        {
2962            pc = &Context::getDefault();
2963            ch = (cl_context)pc->ptr();
2964        }
2965        cl_device_id dh = (cl_device_id)d.ptr();
2966        if( !dh )
2967            dh = (cl_device_id)pc->device(0).ptr();
2968        cl_int retval = 0;
2969        handle = clCreateCommandQueue(ch, dh, 0, &retval);
2970        CV_OclDbgAssert(retval == CL_SUCCESS);
2971    }
2972
2973    ~Impl()
2974    {
2975#ifdef _WIN32
2976        if (!cv::__termination)
2977#endif
2978        {
2979            if(handle)
2980            {
2981                clFinish(handle);
2982                clReleaseCommandQueue(handle);
2983                handle = NULL;
2984            }
2985        }
2986    }
2987
2988    IMPLEMENT_REFCOUNTABLE();
2989
2990    cl_command_queue handle;
2991};
2992
2993Queue::Queue()
2994{
2995    p = 0;
2996}
2997
2998Queue::Queue(const Context& c, const Device& d)
2999{
3000    p = 0;
3001    create(c, d);
3002}
3003
3004Queue::Queue(const Queue& q)
3005{
3006    p = q.p;
3007    if(p)
3008        p->addref();
3009}
3010
3011Queue& Queue::operator = (const Queue& q)
3012{
3013    Impl* newp = (Impl*)q.p;
3014    if(newp)
3015        newp->addref();
3016    if(p)
3017        p->release();
3018    p = newp;
3019    return *this;
3020}
3021
3022Queue::~Queue()
3023{
3024    if(p)
3025        p->release();
3026}
3027
3028bool Queue::create(const Context& c, const Device& d)
3029{
3030    if(p)
3031        p->release();
3032    p = new Impl(c, d);
3033    return p->handle != 0;
3034}
3035
3036void Queue::finish()
3037{
3038    if(p && p->handle)
3039    {
3040        CV_OclDbgAssert(clFinish(p->handle) == CL_SUCCESS);
3041    }
3042}
3043
3044void* Queue::ptr() const
3045{
3046    return p ? p->handle : 0;
3047}
3048
3049Queue& Queue::getDefault()
3050{
3051    Queue& q = getCoreTlsData().get()->oclQueue;
3052    if( !q.p && haveOpenCL() )
3053        q.create(Context::getDefault());
3054    return q;
3055}
3056
3057static cl_command_queue getQueue(const Queue& q)
3058{
3059    cl_command_queue qq = (cl_command_queue)q.ptr();
3060    if(!qq)
3061        qq = (cl_command_queue)Queue::getDefault().ptr();
3062    return qq;
3063}
3064
3065/////////////////////////////////////////// KernelArg /////////////////////////////////////////////
3066
3067KernelArg::KernelArg()
3068    : flags(0), m(0), obj(0), sz(0), wscale(1), iwscale(1)
3069{
3070}
3071
3072KernelArg::KernelArg(int _flags, UMat* _m, int _wscale, int _iwscale, const void* _obj, size_t _sz)
3073    : flags(_flags), m(_m), obj(_obj), sz(_sz), wscale(_wscale), iwscale(_iwscale)
3074{
3075}
3076
3077KernelArg KernelArg::Constant(const Mat& m)
3078{
3079    CV_Assert(m.isContinuous());
3080    return KernelArg(CONSTANT, 0, 0, 0, m.ptr(), m.total()*m.elemSize());
3081}
3082
3083/////////////////////////////////////////// Kernel /////////////////////////////////////////////
3084
3085struct Kernel::Impl
3086{
3087    Impl(const char* kname, const Program& prog) :
3088        refcount(1), e(0), nu(0)
3089    {
3090        cl_program ph = (cl_program)prog.ptr();
3091        cl_int retval = 0;
3092        handle = ph != 0 ?
3093            clCreateKernel(ph, kname, &retval) : 0;
3094        CV_OclDbgAssert(retval == CL_SUCCESS);
3095        for( int i = 0; i < MAX_ARRS; i++ )
3096            u[i] = 0;
3097        haveTempDstUMats = false;
3098    }
3099
3100    void cleanupUMats()
3101    {
3102        for( int i = 0; i < MAX_ARRS; i++ )
3103            if( u[i] )
3104            {
3105                if( CV_XADD(&u[i]->urefcount, -1) == 1 )
3106                    u[i]->currAllocator->deallocate(u[i]);
3107                u[i] = 0;
3108            }
3109        nu = 0;
3110        haveTempDstUMats = false;
3111    }
3112
3113    void addUMat(const UMat& m, bool dst)
3114    {
3115        CV_Assert(nu < MAX_ARRS && m.u && m.u->urefcount > 0);
3116        u[nu] = m.u;
3117        CV_XADD(&m.u->urefcount, 1);
3118        nu++;
3119        if(dst && m.u->tempUMat())
3120            haveTempDstUMats = true;
3121    }
3122
3123    void addImage(const Image2D& image)
3124    {
3125        images.push_back(image);
3126    }
3127
3128    void finit()
3129    {
3130        cleanupUMats();
3131        images.clear();
3132        if(e) { clReleaseEvent(e); e = 0; }
3133        release();
3134    }
3135
3136    ~Impl()
3137    {
3138        if(handle)
3139            clReleaseKernel(handle);
3140    }
3141
3142    IMPLEMENT_REFCOUNTABLE();
3143
3144    cl_kernel handle;
3145    cl_event e;
3146    enum { MAX_ARRS = 16 };
3147    UMatData* u[MAX_ARRS];
3148    int nu;
3149    std::list<Image2D> images;
3150    bool haveTempDstUMats;
3151};
3152
3153}}
3154
3155extern "C"
3156{
3157static void CL_CALLBACK oclCleanupCallback(cl_event, cl_int, void *p)
3158{
3159    ((cv::ocl::Kernel::Impl*)p)->finit();
3160}
3161
3162}
3163
3164namespace cv { namespace ocl {
3165
3166Kernel::Kernel()
3167{
3168    p = 0;
3169}
3170
3171Kernel::Kernel(const char* kname, const Program& prog)
3172{
3173    p = 0;
3174    create(kname, prog);
3175}
3176
3177Kernel::Kernel(const char* kname, const ProgramSource& src,
3178               const String& buildopts, String* errmsg)
3179{
3180    p = 0;
3181    create(kname, src, buildopts, errmsg);
3182}
3183
3184Kernel::Kernel(const Kernel& k)
3185{
3186    p = k.p;
3187    if(p)
3188        p->addref();
3189}
3190
3191Kernel& Kernel::operator = (const Kernel& k)
3192{
3193    Impl* newp = (Impl*)k.p;
3194    if(newp)
3195        newp->addref();
3196    if(p)
3197        p->release();
3198    p = newp;
3199    return *this;
3200}
3201
3202Kernel::~Kernel()
3203{
3204    if(p)
3205        p->release();
3206}
3207
3208bool Kernel::create(const char* kname, const Program& prog)
3209{
3210    if(p)
3211        p->release();
3212    p = new Impl(kname, prog);
3213    if(p->handle == 0)
3214    {
3215        p->release();
3216        p = 0;
3217    }
3218#ifdef CV_OPENCL_RUN_ASSERT // check kernel compilation fails
3219    CV_Assert(p);
3220#endif
3221    return p != 0;
3222}
3223
3224bool Kernel::create(const char* kname, const ProgramSource& src,
3225                    const String& buildopts, String* errmsg)
3226{
3227    if(p)
3228    {
3229        p->release();
3230        p = 0;
3231    }
3232    String tempmsg;
3233    if( !errmsg ) errmsg = &tempmsg;
3234    const Program& prog = Context::getDefault().getProg(src, buildopts, *errmsg);
3235    return create(kname, prog);
3236}
3237
3238void* Kernel::ptr() const
3239{
3240    return p ? p->handle : 0;
3241}
3242
3243bool Kernel::empty() const
3244{
3245    return ptr() == 0;
3246}
3247
3248int Kernel::set(int i, const void* value, size_t sz)
3249{
3250    if (!p || !p->handle)
3251        return -1;
3252    if (i < 0)
3253        return i;
3254    if( i == 0 )
3255        p->cleanupUMats();
3256
3257    cl_int retval = clSetKernelArg(p->handle, (cl_uint)i, sz, value);
3258    CV_OclDbgAssert(retval == CL_SUCCESS);
3259    if (retval != CL_SUCCESS)
3260        return -1;
3261    return i+1;
3262}
3263
3264int Kernel::set(int i, const Image2D& image2D)
3265{
3266    p->addImage(image2D);
3267    cl_mem h = (cl_mem)image2D.ptr();
3268    return set(i, &h, sizeof(h));
3269}
3270
3271int Kernel::set(int i, const UMat& m)
3272{
3273    return set(i, KernelArg(KernelArg::READ_WRITE, (UMat*)&m, 0, 0));
3274}
3275
3276int Kernel::set(int i, const KernelArg& arg)
3277{
3278    if( !p || !p->handle )
3279        return -1;
3280    if (i < 0)
3281        return i;
3282    if( i == 0 )
3283        p->cleanupUMats();
3284    if( arg.m )
3285    {
3286        int accessFlags = ((arg.flags & KernelArg::READ_ONLY) ? ACCESS_READ : 0) +
3287                          ((arg.flags & KernelArg::WRITE_ONLY) ? ACCESS_WRITE : 0);
3288        bool ptronly = (arg.flags & KernelArg::PTR_ONLY) != 0;
3289        cl_mem h = (cl_mem)arg.m->handle(accessFlags);
3290
3291        if (!h)
3292        {
3293            p->release();
3294            p = 0;
3295            return -1;
3296        }
3297
3298#ifdef HAVE_OPENCL_SVM
3299        if ((arg.m->u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
3300        {
3301            const Context& ctx = Context::getDefault();
3302            const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
3303            uchar*& svmDataPtr = (uchar*&)arg.m->u->handle;
3304            CV_OPENCL_SVM_TRACE_P("clSetKernelArgSVMPointer: %p\n", svmDataPtr);
3305#if 1 // TODO
3306            cl_int status = svmFns->fn_clSetKernelArgSVMPointer(p->handle, (cl_uint)i, svmDataPtr);
3307#else
3308            cl_int status = svmFns->fn_clSetKernelArgSVMPointer(p->handle, (cl_uint)i, &svmDataPtr);
3309#endif
3310            CV_Assert(status == CL_SUCCESS);
3311        }
3312        else
3313#endif
3314        {
3315            CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(h), &h) == CL_SUCCESS);
3316        }
3317
3318        if (ptronly)
3319        {
3320            i++;
3321        }
3322        else if( arg.m->dims <= 2 )
3323        {
3324            UMat2D u2d(*arg.m);
3325            CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u2d.step), &u2d.step) == CL_SUCCESS);
3326            CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u2d.offset), &u2d.offset) == CL_SUCCESS);
3327            i += 3;
3328
3329            if( !(arg.flags & KernelArg::NO_SIZE) )
3330            {
3331                int cols = u2d.cols*arg.wscale/arg.iwscale;
3332                CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(u2d.rows), &u2d.rows) == CL_SUCCESS);
3333                CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(cols), &cols) == CL_SUCCESS);
3334                i += 2;
3335            }
3336        }
3337        else
3338        {
3339            UMat3D u3d(*arg.m);
3340            CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.slicestep), &u3d.slicestep) == CL_SUCCESS);
3341            CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.step), &u3d.step) == CL_SUCCESS);
3342            CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+3), sizeof(u3d.offset), &u3d.offset) == CL_SUCCESS);
3343            i += 4;
3344            if( !(arg.flags & KernelArg::NO_SIZE) )
3345            {
3346                int cols = u3d.cols*arg.wscale/arg.iwscale;
3347                CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(u3d.slices), &u3d.rows) == CL_SUCCESS);
3348                CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.rows), &u3d.rows) == CL_SUCCESS);
3349                CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.cols), &cols) == CL_SUCCESS);
3350                i += 3;
3351            }
3352        }
3353        p->addUMat(*arg.m, (accessFlags & ACCESS_WRITE) != 0);
3354        return i;
3355    }
3356    CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, arg.sz, arg.obj) == CL_SUCCESS);
3357    return i+1;
3358}
3359
3360
3361bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
3362                 bool sync, const Queue& q)
3363{
3364    if(!p || !p->handle || p->e != 0)
3365        return false;
3366
3367    cl_command_queue qq = getQueue(q);
3368    size_t offset[CV_MAX_DIM] = {0}, globalsize[CV_MAX_DIM] = {1,1,1};
3369    size_t total = 1;
3370    CV_Assert(_globalsize != 0);
3371    for (int i = 0; i < dims; i++)
3372    {
3373        size_t val = _localsize ? _localsize[i] :
3374            dims == 1 ? 64 : dims == 2 ? (i == 0 ? 256 : 8) : dims == 3 ? (8>>(int)(i>0)) : 1;
3375        CV_Assert( val > 0 );
3376        total *= _globalsize[i];
3377        globalsize[i] = ((_globalsize[i] + val - 1)/val)*val;
3378    }
3379    if( total == 0 )
3380        return true;
3381    if( p->haveTempDstUMats )
3382        sync = true;
3383    cl_int retval = clEnqueueNDRangeKernel(qq, p->handle, (cl_uint)dims,
3384                                           offset, globalsize, _localsize, 0, 0,
3385                                           sync ? 0 : &p->e);
3386#if CV_OPENCL_SHOW_RUN_ERRORS
3387    if (retval != CL_SUCCESS)
3388    {
3389        printf("OpenCL program returns error: %d\n", retval);
3390        fflush(stdout);
3391    }
3392#endif
3393    if( sync || retval != CL_SUCCESS )
3394    {
3395        CV_OclDbgAssert(clFinish(qq) == CL_SUCCESS);
3396        p->cleanupUMats();
3397    }
3398    else
3399    {
3400        p->addref();
3401        CV_OclDbgAssert(clSetEventCallback(p->e, CL_COMPLETE, oclCleanupCallback, p) == CL_SUCCESS);
3402    }
3403    return retval == CL_SUCCESS;
3404}
3405
3406bool Kernel::runTask(bool sync, const Queue& q)
3407{
3408    if(!p || !p->handle || p->e != 0)
3409        return false;
3410
3411    cl_command_queue qq = getQueue(q);
3412    cl_int retval = clEnqueueTask(qq, p->handle, 0, 0, sync ? 0 : &p->e);
3413    if( sync || retval != CL_SUCCESS )
3414    {
3415        CV_OclDbgAssert(clFinish(qq) == CL_SUCCESS);
3416        p->cleanupUMats();
3417    }
3418    else
3419    {
3420        p->addref();
3421        CV_OclDbgAssert(clSetEventCallback(p->e, CL_COMPLETE, oclCleanupCallback, p) == CL_SUCCESS);
3422    }
3423    return retval == CL_SUCCESS;
3424}
3425
3426
3427size_t Kernel::workGroupSize() const
3428{
3429    if(!p || !p->handle)
3430        return 0;
3431    size_t val = 0, retsz = 0;
3432    cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3433    return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_WORK_GROUP_SIZE,
3434                                    sizeof(val), &val, &retsz) == CL_SUCCESS ? val : 0;
3435}
3436
3437size_t Kernel::preferedWorkGroupSizeMultiple() const
3438{
3439    if(!p || !p->handle)
3440        return 0;
3441    size_t val = 0, retsz = 0;
3442    cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3443    return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
3444                                    sizeof(val), &val, &retsz) == CL_SUCCESS ? val : 0;
3445}
3446
3447bool Kernel::compileWorkGroupSize(size_t wsz[]) const
3448{
3449    if(!p || !p->handle || !wsz)
3450        return 0;
3451    size_t retsz = 0;
3452    cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3453    return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_COMPILE_WORK_GROUP_SIZE,
3454                                    sizeof(wsz[0])*3, wsz, &retsz) == CL_SUCCESS;
3455}
3456
3457size_t Kernel::localMemSize() const
3458{
3459    if(!p || !p->handle)
3460        return 0;
3461    size_t retsz = 0;
3462    cl_ulong val = 0;
3463    cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3464    return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_LOCAL_MEM_SIZE,
3465                                    sizeof(val), &val, &retsz) == CL_SUCCESS ? (size_t)val : 0;
3466}
3467
3468/////////////////////////////////////////// Program /////////////////////////////////////////////
3469
3470struct Program::Impl
3471{
3472    Impl(const ProgramSource& _src,
3473         const String& _buildflags, String& errmsg)
3474    {
3475        refcount = 1;
3476        const Context& ctx = Context::getDefault();
3477        src = _src;
3478        buildflags = _buildflags;
3479        const String& srcstr = src.source();
3480        const char* srcptr = srcstr.c_str();
3481        size_t srclen = srcstr.size();
3482        cl_int retval = 0;
3483
3484        handle = clCreateProgramWithSource((cl_context)ctx.ptr(), 1, &srcptr, &srclen, &retval);
3485        if( handle && retval == CL_SUCCESS )
3486        {
3487            int i, n = (int)ctx.ndevices();
3488            AutoBuffer<void*> deviceListBuf(n+1);
3489            void** deviceList = deviceListBuf;
3490            for( i = 0; i < n; i++ )
3491                deviceList[i] = ctx.device(i).ptr();
3492
3493            Device device = Device::getDefault();
3494            if (device.isAMD())
3495                buildflags += " -D AMD_DEVICE";
3496            else if (device.isIntel())
3497                buildflags += " -D INTEL_DEVICE";
3498
3499            retval = clBuildProgram(handle, n,
3500                                    (const cl_device_id*)deviceList,
3501                                    buildflags.c_str(), 0, 0);
3502#if !CV_OPENCL_ALWAYS_SHOW_BUILD_LOG
3503            if( retval != CL_SUCCESS )
3504#endif
3505            {
3506                size_t retsz = 0;
3507                cl_int buildInfo_retval = clGetProgramBuildInfo(handle, (cl_device_id)deviceList[0],
3508                                               CL_PROGRAM_BUILD_LOG, 0, 0, &retsz);
3509                if (buildInfo_retval == CL_SUCCESS && retsz > 1)
3510                {
3511                    AutoBuffer<char> bufbuf(retsz + 16);
3512                    char* buf = bufbuf;
3513                    buildInfo_retval = clGetProgramBuildInfo(handle, (cl_device_id)deviceList[0],
3514                                                   CL_PROGRAM_BUILD_LOG, retsz+1, buf, &retsz);
3515                    if (buildInfo_retval == CL_SUCCESS)
3516                    {
3517                        // TODO It is useful to see kernel name & program file name also
3518                        errmsg = String(buf);
3519                        printf("OpenCL program build log: %s\n%s\n", buildflags.c_str(), errmsg.c_str());
3520                        fflush(stdout);
3521                    }
3522                }
3523                if (retval != CL_SUCCESS && handle)
3524                {
3525                    clReleaseProgram(handle);
3526                    handle = NULL;
3527                }
3528            }
3529        }
3530    }
3531
3532    Impl(const String& _buf, const String& _buildflags)
3533    {
3534        refcount = 1;
3535        handle = 0;
3536        buildflags = _buildflags;
3537        if(_buf.empty())
3538            return;
3539        String prefix0 = Program::getPrefix(buildflags);
3540        const Context& ctx = Context::getDefault();
3541        const Device& dev = Device::getDefault();
3542        const char* pos0 = _buf.c_str();
3543        const char* pos1 = strchr(pos0, '\n');
3544        if(!pos1)
3545            return;
3546        const char* pos2 = strchr(pos1+1, '\n');
3547        if(!pos2)
3548            return;
3549        const char* pos3 = strchr(pos2+1, '\n');
3550        if(!pos3)
3551            return;
3552        size_t prefixlen = (pos3 - pos0)+1;
3553        String prefix(pos0, prefixlen);
3554        if( prefix != prefix0 )
3555            return;
3556        const uchar* bin = (uchar*)(pos3+1);
3557        void* devid = dev.ptr();
3558        size_t codelen = _buf.length() - prefixlen;
3559        cl_int binstatus = 0, retval = 0;
3560        handle = clCreateProgramWithBinary((cl_context)ctx.ptr(), 1, (cl_device_id*)&devid,
3561                                           &codelen, &bin, &binstatus, &retval);
3562        CV_OclDbgAssert(retval == CL_SUCCESS);
3563    }
3564
3565    String store()
3566    {
3567        if(!handle)
3568            return String();
3569        size_t progsz = 0, retsz = 0;
3570        String prefix = Program::getPrefix(buildflags);
3571        size_t prefixlen = prefix.length();
3572        if(clGetProgramInfo(handle, CL_PROGRAM_BINARY_SIZES, sizeof(progsz), &progsz, &retsz) != CL_SUCCESS)
3573            return String();
3574        AutoBuffer<uchar> bufbuf(prefixlen + progsz + 16);
3575        uchar* buf = bufbuf;
3576        memcpy(buf, prefix.c_str(), prefixlen);
3577        buf += prefixlen;
3578        if(clGetProgramInfo(handle, CL_PROGRAM_BINARIES, sizeof(buf), &buf, &retsz) != CL_SUCCESS)
3579            return String();
3580        buf[progsz] = (uchar)'\0';
3581        return String((const char*)(uchar*)bufbuf, prefixlen + progsz);
3582    }
3583
3584    ~Impl()
3585    {
3586        if( handle )
3587        {
3588#ifdef _WIN32
3589            if (!cv::__termination)
3590#endif
3591            {
3592                clReleaseProgram(handle);
3593            }
3594            handle = NULL;
3595        }
3596    }
3597
3598    IMPLEMENT_REFCOUNTABLE();
3599
3600    ProgramSource src;
3601    String buildflags;
3602    cl_program handle;
3603};
3604
3605
3606Program::Program() { p = 0; }
3607
3608Program::Program(const ProgramSource& src,
3609        const String& buildflags, String& errmsg)
3610{
3611    p = 0;
3612    create(src, buildflags, errmsg);
3613}
3614
3615Program::Program(const Program& prog)
3616{
3617    p = prog.p;
3618    if(p)
3619        p->addref();
3620}
3621
3622Program& Program::operator = (const Program& prog)
3623{
3624    Impl* newp = (Impl*)prog.p;
3625    if(newp)
3626        newp->addref();
3627    if(p)
3628        p->release();
3629    p = newp;
3630    return *this;
3631}
3632
3633Program::~Program()
3634{
3635    if(p)
3636        p->release();
3637}
3638
3639bool Program::create(const ProgramSource& src,
3640            const String& buildflags, String& errmsg)
3641{
3642    if(p)
3643        p->release();
3644    p = new Impl(src, buildflags, errmsg);
3645    if(!p->handle)
3646    {
3647        p->release();
3648        p = 0;
3649    }
3650    return p != 0;
3651}
3652
3653const ProgramSource& Program::source() const
3654{
3655    static ProgramSource dummy;
3656    return p ? p->src : dummy;
3657}
3658
3659void* Program::ptr() const
3660{
3661    return p ? p->handle : 0;
3662}
3663
3664bool Program::read(const String& bin, const String& buildflags)
3665{
3666    if(p)
3667        p->release();
3668    p = new Impl(bin, buildflags);
3669    return p->handle != 0;
3670}
3671
3672bool Program::write(String& bin) const
3673{
3674    if(!p)
3675        return false;
3676    bin = p->store();
3677    return !bin.empty();
3678}
3679
3680String Program::getPrefix() const
3681{
3682    if(!p)
3683        return String();
3684    return getPrefix(p->buildflags);
3685}
3686
3687String Program::getPrefix(const String& buildflags)
3688{
3689    const Context& ctx = Context::getDefault();
3690    const Device& dev = ctx.device(0);
3691    return format("name=%s\ndriver=%s\nbuildflags=%s\n",
3692                  dev.name().c_str(), dev.driverVersion().c_str(), buildflags.c_str());
3693}
3694
3695///////////////////////////////////////// ProgramSource ///////////////////////////////////////////////
3696
3697struct ProgramSource::Impl
3698{
3699    Impl(const char* _src)
3700    {
3701        init(String(_src));
3702    }
3703    Impl(const String& _src)
3704    {
3705        init(_src);
3706    }
3707    void init(const String& _src)
3708    {
3709        refcount = 1;
3710        src = _src;
3711        h = crc64((uchar*)src.c_str(), src.size());
3712    }
3713
3714    IMPLEMENT_REFCOUNTABLE();
3715    String src;
3716    ProgramSource::hash_t h;
3717};
3718
3719
3720ProgramSource::ProgramSource()
3721{
3722    p = 0;
3723}
3724
3725ProgramSource::ProgramSource(const char* prog)
3726{
3727    p = new Impl(prog);
3728}
3729
3730ProgramSource::ProgramSource(const String& prog)
3731{
3732    p = new Impl(prog);
3733}
3734
3735ProgramSource::~ProgramSource()
3736{
3737    if(p)
3738        p->release();
3739}
3740
3741ProgramSource::ProgramSource(const ProgramSource& prog)
3742{
3743    p = prog.p;
3744    if(p)
3745        p->addref();
3746}
3747
3748ProgramSource& ProgramSource::operator = (const ProgramSource& prog)
3749{
3750    Impl* newp = (Impl*)prog.p;
3751    if(newp)
3752        newp->addref();
3753    if(p)
3754        p->release();
3755    p = newp;
3756    return *this;
3757}
3758
3759const String& ProgramSource::source() const
3760{
3761    static String dummy;
3762    return p ? p->src : dummy;
3763}
3764
3765ProgramSource::hash_t ProgramSource::hash() const
3766{
3767    return p ? p->h : 0;
3768}
3769
3770//////////////////////////////////////////// OpenCLAllocator //////////////////////////////////////////////////
3771
3772template<typename T>
3773class OpenCLBufferPool
3774{
3775protected:
3776    ~OpenCLBufferPool() { }
3777public:
3778    virtual T allocate(size_t size) = 0;
3779    virtual void release(T buffer) = 0;
3780};
3781
3782template <typename Derived, typename BufferEntry, typename T>
3783class OpenCLBufferPoolBaseImpl : public BufferPoolController, public OpenCLBufferPool<T>
3784{
3785private:
3786    inline Derived& derived() { return *static_cast<Derived*>(this); }
3787protected:
3788    Mutex mutex_;
3789
3790    size_t currentReservedSize;
3791    size_t maxReservedSize;
3792
3793    std::list<BufferEntry> allocatedEntries_; // Allocated and used entries
3794    std::list<BufferEntry> reservedEntries_; // LRU order. Allocated, but not used entries
3795
3796    // synchronized
3797    bool _findAndRemoveEntryFromAllocatedList(CV_OUT BufferEntry& entry, T buffer)
3798    {
3799        typename std::list<BufferEntry>::iterator i = allocatedEntries_.begin();
3800        for (; i != allocatedEntries_.end(); ++i)
3801        {
3802            BufferEntry& e = *i;
3803            if (e.clBuffer_ == buffer)
3804            {
3805                entry = e;
3806                allocatedEntries_.erase(i);
3807                return true;
3808            }
3809        }
3810        return false;
3811    }
3812
3813    // synchronized
3814    bool _findAndRemoveEntryFromReservedList(CV_OUT BufferEntry& entry, const size_t size)
3815    {
3816        if (reservedEntries_.empty())
3817            return false;
3818        typename std::list<BufferEntry>::iterator i = reservedEntries_.begin();
3819        typename std::list<BufferEntry>::iterator result_pos = reservedEntries_.end();
3820        BufferEntry result;
3821        size_t minDiff = (size_t)(-1);
3822        for (; i != reservedEntries_.end(); ++i)
3823        {
3824            BufferEntry& e = *i;
3825            if (e.capacity_ >= size)
3826            {
3827                size_t diff = e.capacity_ - size;
3828                if (diff < size / 8 && (result_pos == reservedEntries_.end() || diff < minDiff))
3829                {
3830                    minDiff = diff;
3831                    result_pos = i;
3832                    result = e;
3833                    if (diff == 0)
3834                        break;
3835                }
3836            }
3837        }
3838        if (result_pos != reservedEntries_.end())
3839        {
3840            //CV_DbgAssert(result == *result_pos);
3841            reservedEntries_.erase(result_pos);
3842            entry = result;
3843            currentReservedSize -= entry.capacity_;
3844            allocatedEntries_.push_back(entry);
3845            return true;
3846        }
3847        return false;
3848    }
3849
3850    // synchronized
3851    void _checkSizeOfReservedEntries()
3852    {
3853        while (currentReservedSize > maxReservedSize)
3854        {
3855            CV_DbgAssert(!reservedEntries_.empty());
3856            const BufferEntry& entry = reservedEntries_.back();
3857            CV_DbgAssert(currentReservedSize >= entry.capacity_);
3858            currentReservedSize -= entry.capacity_;
3859            derived()._releaseBufferEntry(entry);
3860            reservedEntries_.pop_back();
3861        }
3862    }
3863
3864    inline size_t _allocationGranularity(size_t size)
3865    {
3866        // heuristic values
3867        if (size < 1024)
3868            return 16;
3869        else if (size < 64*1024)
3870            return 64;
3871        else if (size < 1024*1024)
3872            return 4096;
3873        else if (size < 16*1024*1024)
3874            return 64*1024;
3875        else
3876            return 1024*1024;
3877    }
3878
3879public:
3880    OpenCLBufferPoolBaseImpl()
3881        : currentReservedSize(0),
3882          maxReservedSize(0)
3883    {
3884        // nothing
3885    }
3886    virtual ~OpenCLBufferPoolBaseImpl()
3887    {
3888        freeAllReservedBuffers();
3889        CV_Assert(reservedEntries_.empty());
3890    }
3891public:
3892    virtual T allocate(size_t size)
3893    {
3894        AutoLock locker(mutex_);
3895        BufferEntry entry;
3896        if (maxReservedSize > 0 && _findAndRemoveEntryFromReservedList(entry, size))
3897        {
3898            CV_DbgAssert(size <= entry.capacity_);
3899            LOG_BUFFER_POOL("Reuse reserved buffer: %p\n", entry.clBuffer_);
3900        }
3901        else
3902        {
3903            derived()._allocateBufferEntry(entry, size);
3904        }
3905        return entry.clBuffer_;
3906    }
3907    virtual void release(T buffer)
3908    {
3909        AutoLock locker(mutex_);
3910        BufferEntry entry;
3911        CV_Assert(_findAndRemoveEntryFromAllocatedList(entry, buffer));
3912        if (maxReservedSize == 0 || entry.capacity_ > maxReservedSize / 8)
3913        {
3914            derived()._releaseBufferEntry(entry);
3915        }
3916        else
3917        {
3918            reservedEntries_.push_front(entry);
3919            currentReservedSize += entry.capacity_;
3920            _checkSizeOfReservedEntries();
3921        }
3922    }
3923
3924    virtual size_t getReservedSize() const { return currentReservedSize; }
3925    virtual size_t getMaxReservedSize() const { return maxReservedSize; }
3926    virtual void setMaxReservedSize(size_t size)
3927    {
3928        AutoLock locker(mutex_);
3929        size_t oldMaxReservedSize = maxReservedSize;
3930        maxReservedSize = size;
3931        if (maxReservedSize < oldMaxReservedSize)
3932        {
3933            typename std::list<BufferEntry>::iterator i = reservedEntries_.begin();
3934            for (; i != reservedEntries_.end();)
3935            {
3936                const BufferEntry& entry = *i;
3937                if (entry.capacity_ > maxReservedSize / 8)
3938                {
3939                    CV_DbgAssert(currentReservedSize >= entry.capacity_);
3940                    currentReservedSize -= entry.capacity_;
3941                    derived()._releaseBufferEntry(entry);
3942                    i = reservedEntries_.erase(i);
3943                    continue;
3944                }
3945                ++i;
3946            }
3947            _checkSizeOfReservedEntries();
3948        }
3949    }
3950    virtual void freeAllReservedBuffers()
3951    {
3952        AutoLock locker(mutex_);
3953        typename std::list<BufferEntry>::const_iterator i = reservedEntries_.begin();
3954        for (; i != reservedEntries_.end(); ++i)
3955        {
3956            const BufferEntry& entry = *i;
3957            derived()._releaseBufferEntry(entry);
3958        }
3959        reservedEntries_.clear();
3960        currentReservedSize = 0;
3961    }
3962};
3963
3964struct CLBufferEntry
3965{
3966    cl_mem clBuffer_;
3967    size_t capacity_;
3968    CLBufferEntry() : clBuffer_((cl_mem)NULL), capacity_(0) { }
3969};
3970
3971class OpenCLBufferPoolImpl : public OpenCLBufferPoolBaseImpl<OpenCLBufferPoolImpl, CLBufferEntry, cl_mem>
3972{
3973public:
3974    typedef struct CLBufferEntry BufferEntry;
3975protected:
3976    int createFlags_;
3977public:
3978    OpenCLBufferPoolImpl(int createFlags = 0)
3979        : createFlags_(createFlags)
3980    {
3981    }
3982
3983    void _allocateBufferEntry(BufferEntry& entry, size_t size)
3984    {
3985        CV_DbgAssert(entry.clBuffer_ == NULL);
3986        entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));
3987        Context& ctx = Context::getDefault();
3988        cl_int retval = CL_SUCCESS;
3989        entry.clBuffer_ = clCreateBuffer((cl_context)ctx.ptr(), CL_MEM_READ_WRITE|createFlags_, entry.capacity_, 0, &retval);
3990        CV_Assert(retval == CL_SUCCESS);
3991        CV_Assert(entry.clBuffer_ != NULL);
3992        if(retval == CL_SUCCESS)
3993        {
3994            CV_IMPL_ADD(CV_IMPL_OCL);
3995        }
3996        LOG_BUFFER_POOL("OpenCL allocate %lld (0x%llx) bytes: %p\n",
3997                (long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_);
3998        allocatedEntries_.push_back(entry);
3999    }
4000
4001    void _releaseBufferEntry(const BufferEntry& entry)
4002    {
4003        CV_Assert(entry.capacity_ != 0);
4004        CV_Assert(entry.clBuffer_ != NULL);
4005        LOG_BUFFER_POOL("OpenCL release buffer: %p, %lld (0x%llx) bytes\n",
4006                entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_);
4007        clReleaseMemObject(entry.clBuffer_);
4008    }
4009};
4010
4011#ifdef HAVE_OPENCL_SVM
4012struct CLSVMBufferEntry
4013{
4014    void* clBuffer_;
4015    size_t capacity_;
4016    CLSVMBufferEntry() : clBuffer_(NULL), capacity_(0) { }
4017};
4018class OpenCLSVMBufferPoolImpl : public OpenCLBufferPoolBaseImpl<OpenCLSVMBufferPoolImpl, CLSVMBufferEntry, void*>
4019{
4020public:
4021    typedef struct CLSVMBufferEntry BufferEntry;
4022public:
4023    OpenCLSVMBufferPoolImpl()
4024    {
4025    }
4026
4027    void _allocateBufferEntry(BufferEntry& entry, size_t size)
4028    {
4029        CV_DbgAssert(entry.clBuffer_ == NULL);
4030        entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));
4031
4032        Context& ctx = Context::getDefault();
4033        const svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
4034        bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
4035        cl_svm_mem_flags memFlags = CL_MEM_READ_WRITE |
4036                (isFineGrainBuffer ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0);
4037
4038        const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4039        CV_DbgAssert(svmFns->isValid());
4040
4041        CV_OPENCL_SVM_TRACE_P("clSVMAlloc: %d\n", (int)entry.capacity_);
4042        void *buf = svmFns->fn_clSVMAlloc((cl_context)ctx.ptr(), memFlags, entry.capacity_, 0);
4043        CV_Assert(buf);
4044
4045        entry.clBuffer_ = buf;
4046        {
4047            CV_IMPL_ADD(CV_IMPL_OCL);
4048        }
4049        LOG_BUFFER_POOL("OpenCL SVM allocate %lld (0x%llx) bytes: %p\n",
4050                (long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_);
4051        allocatedEntries_.push_back(entry);
4052    }
4053
4054    void _releaseBufferEntry(const BufferEntry& entry)
4055    {
4056        CV_Assert(entry.capacity_ != 0);
4057        CV_Assert(entry.clBuffer_ != NULL);
4058        LOG_BUFFER_POOL("OpenCL release SVM buffer: %p, %lld (0x%llx) bytes\n",
4059                entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_);
4060        Context& ctx = Context::getDefault();
4061        const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4062        CV_DbgAssert(svmFns->isValid());
4063        CV_OPENCL_SVM_TRACE_P("clSVMFree: %p\n",  entry.clBuffer_);
4064        svmFns->fn_clSVMFree((cl_context)ctx.ptr(), entry.clBuffer_);
4065    }
4066};
4067#endif
4068
4069
4070
4071#if defined _MSC_VER
4072#pragma warning(disable:4127) // conditional expression is constant
4073#endif
4074template <bool readAccess, bool writeAccess>
4075class AlignedDataPtr
4076{
4077protected:
4078    const size_t size_;
4079    uchar* const originPtr_;
4080    const size_t alignment_;
4081    uchar* ptr_;
4082    uchar* allocatedPtr_;
4083
4084public:
4085    AlignedDataPtr(uchar* ptr, size_t size, size_t alignment)
4086        : size_(size), originPtr_(ptr), alignment_(alignment), ptr_(ptr), allocatedPtr_(NULL)
4087    {
4088        CV_DbgAssert((alignment & (alignment - 1)) == 0); // check for 2^n
4089        if (((size_t)ptr_ & (alignment - 1)) != 0)
4090        {
4091            allocatedPtr_ = new uchar[size_ + alignment - 1];
4092            ptr_ = (uchar*)(((uintptr_t)allocatedPtr_ + (alignment - 1)) & ~(alignment - 1));
4093            if (readAccess)
4094            {
4095                memcpy(ptr_, originPtr_, size_);
4096            }
4097        }
4098    }
4099
4100    uchar* getAlignedPtr() const
4101    {
4102        CV_DbgAssert(((size_t)ptr_ & (alignment_ - 1)) == 0);
4103        return ptr_;
4104    }
4105
4106    ~AlignedDataPtr()
4107    {
4108        if (allocatedPtr_)
4109        {
4110            if (writeAccess)
4111            {
4112                memcpy(originPtr_, ptr_, size_);
4113            }
4114            delete[] allocatedPtr_;
4115            allocatedPtr_ = NULL;
4116        }
4117        ptr_ = NULL;
4118    }
4119private:
4120    AlignedDataPtr(const AlignedDataPtr&); // disabled
4121    AlignedDataPtr& operator=(const AlignedDataPtr&); // disabled
4122};
4123#if defined _MSC_VER
4124#pragma warning(default:4127) // conditional expression is constant
4125#endif
4126
4127#ifndef CV_OPENCL_DATA_PTR_ALIGNMENT
4128#define CV_OPENCL_DATA_PTR_ALIGNMENT 16
4129#endif
4130
4131class OpenCLAllocator : public MatAllocator
4132{
4133    mutable OpenCLBufferPoolImpl bufferPool;
4134    mutable OpenCLBufferPoolImpl bufferPoolHostPtr;
4135#ifdef  HAVE_OPENCL_SVM
4136    mutable OpenCLSVMBufferPoolImpl bufferPoolSVM;
4137#endif
4138
4139    enum AllocatorFlags
4140    {
4141        ALLOCATOR_FLAGS_BUFFER_POOL_USED = 1 << 0,
4142        ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED = 1 << 1
4143#ifdef HAVE_OPENCL_SVM
4144        ,ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED = 1 << 2
4145#endif
4146    };
4147public:
4148    OpenCLAllocator()
4149        : bufferPool(0),
4150          bufferPoolHostPtr(CL_MEM_ALLOC_HOST_PTR)
4151    {
4152        size_t defaultPoolSize, poolSize;
4153        defaultPoolSize = ocl::Device::getDefault().isIntel() ? 1 << 27 : 0;
4154        poolSize = getConfigurationParameterForSize("OPENCV_OPENCL_BUFFERPOOL_LIMIT", defaultPoolSize);
4155        bufferPool.setMaxReservedSize(poolSize);
4156        poolSize = getConfigurationParameterForSize("OPENCV_OPENCL_HOST_PTR_BUFFERPOOL_LIMIT", defaultPoolSize);
4157        bufferPoolHostPtr.setMaxReservedSize(poolSize);
4158#ifdef HAVE_OPENCL_SVM
4159        poolSize = getConfigurationParameterForSize("OPENCV_OPENCL_SVM_BUFFERPOOL_LIMIT", defaultPoolSize);
4160        bufferPoolSVM.setMaxReservedSize(poolSize);
4161#endif
4162
4163        matStdAllocator = Mat::getStdAllocator();
4164    }
4165
4166    UMatData* defaultAllocate(int dims, const int* sizes, int type, void* data, size_t* step,
4167            int flags, UMatUsageFlags usageFlags) const
4168    {
4169        UMatData* u = matStdAllocator->allocate(dims, sizes, type, data, step, flags, usageFlags);
4170        return u;
4171    }
4172
4173    void getBestFlags(const Context& ctx, int /*flags*/, UMatUsageFlags usageFlags, int& createFlags, int& flags0) const
4174    {
4175        const Device& dev = ctx.device(0);
4176        createFlags = 0;
4177        if ((usageFlags & USAGE_ALLOCATE_HOST_MEMORY) != 0)
4178            createFlags |= CL_MEM_ALLOC_HOST_PTR;
4179
4180        if( dev.hostUnifiedMemory() )
4181            flags0 = 0;
4182        else
4183            flags0 = UMatData::COPY_ON_MAP;
4184    }
4185
4186    UMatData* allocate(int dims, const int* sizes, int type,
4187                       void* data, size_t* step, int flags, UMatUsageFlags usageFlags) const
4188    {
4189        if(!useOpenCL())
4190            return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
4191        CV_Assert(data == 0);
4192        size_t total = CV_ELEM_SIZE(type);
4193        for( int i = dims-1; i >= 0; i-- )
4194        {
4195            if( step )
4196                step[i] = total;
4197            total *= sizes[i];
4198        }
4199
4200        Context& ctx = Context::getDefault();
4201
4202        int createFlags = 0, flags0 = 0;
4203        getBestFlags(ctx, flags, usageFlags, createFlags, flags0);
4204
4205        void* handle = NULL;
4206        int allocatorFlags = 0;
4207
4208#ifdef HAVE_OPENCL_SVM
4209        const svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
4210        if (ctx.useSVM() && svm::useSVM(usageFlags) && !svmCaps.isNoSVMSupport())
4211        {
4212            allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED;
4213            handle = bufferPoolSVM.allocate(total);
4214
4215            // this property is constant, so single buffer pool can be used here
4216            bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
4217            allocatorFlags |= isFineGrainBuffer ? svm::OPENCL_SVM_FINE_GRAIN_BUFFER : svm::OPENCL_SVM_COARSE_GRAIN_BUFFER;
4218        }
4219        else
4220#endif
4221        if (createFlags == 0)
4222        {
4223            allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_USED;
4224            handle = bufferPool.allocate(total);
4225        }
4226        else if (createFlags == CL_MEM_ALLOC_HOST_PTR)
4227        {
4228            allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED;
4229            handle = bufferPoolHostPtr.allocate(total);
4230        }
4231        else
4232        {
4233            CV_Assert(handle != NULL); // Unsupported, throw
4234        }
4235
4236        if (!handle)
4237            return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
4238
4239        UMatData* u = new UMatData(this);
4240        u->data = 0;
4241        u->size = total;
4242        u->handle = handle;
4243        u->flags = flags0;
4244        u->allocatorFlags_ = allocatorFlags;
4245        CV_DbgAssert(!u->tempUMat()); // for bufferPool.release() consistency in deallocate()
4246        return u;
4247    }
4248
4249    bool allocate(UMatData* u, int accessFlags, UMatUsageFlags usageFlags) const
4250    {
4251        if(!u)
4252            return false;
4253
4254        UMatDataAutoLock lock(u);
4255
4256        if(u->handle == 0)
4257        {
4258            CV_Assert(u->origdata != 0);
4259            Context& ctx = Context::getDefault();
4260            int createFlags = 0, flags0 = 0;
4261            getBestFlags(ctx, accessFlags, usageFlags, createFlags, flags0);
4262
4263            cl_context ctx_handle = (cl_context)ctx.ptr();
4264            int allocatorFlags = 0;
4265            int tempUMatFlags = 0;
4266            void* handle = NULL;
4267            cl_int retval = CL_SUCCESS;
4268
4269#ifdef HAVE_OPENCL_SVM
4270            svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
4271            bool useSVM = ctx.useSVM() && svm::useSVM(usageFlags);
4272            if (useSVM && svmCaps.isSupportFineGrainSystem())
4273            {
4274                allocatorFlags = svm::OPENCL_SVM_FINE_GRAIN_SYSTEM;
4275                tempUMatFlags = UMatData::TEMP_UMAT;
4276                handle = u->origdata;
4277                CV_OPENCL_SVM_TRACE_P("Use fine grain system: %d (%p)\n", (int)u->size, handle);
4278            }
4279            else if (useSVM && (svmCaps.isSupportFineGrainBuffer() || svmCaps.isSupportCoarseGrainBuffer()))
4280            {
4281                if (!(accessFlags & ACCESS_FAST)) // memcpy used
4282                {
4283                    bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
4284
4285                    cl_svm_mem_flags memFlags = createFlags |
4286                            (isFineGrainBuffer ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0);
4287
4288                    const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4289                    CV_DbgAssert(svmFns->isValid());
4290
4291                    CV_OPENCL_SVM_TRACE_P("clSVMAlloc + copy: %d\n", (int)u->size);
4292                    handle = svmFns->fn_clSVMAlloc((cl_context)ctx.ptr(), memFlags, u->size, 0);
4293                    CV_Assert(handle);
4294
4295                    cl_command_queue q = NULL;
4296                    if (!isFineGrainBuffer)
4297                    {
4298                        q = (cl_command_queue)Queue::getDefault().ptr();
4299                        CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", handle, (int)u->size);
4300                        cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_TRUE, CL_MAP_WRITE,
4301                                handle, u->size,
4302                                0, NULL, NULL);
4303                        CV_Assert(status == CL_SUCCESS);
4304
4305                    }
4306                    memcpy(handle, u->origdata, u->size);
4307                    if (!isFineGrainBuffer)
4308                    {
4309                        CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", handle);
4310                        cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, handle, 0, NULL, NULL);
4311                        CV_Assert(status == CL_SUCCESS);
4312                    }
4313
4314                    tempUMatFlags = UMatData::TEMP_UMAT | UMatData::TEMP_COPIED_UMAT;
4315                    allocatorFlags |= isFineGrainBuffer ? svm::OPENCL_SVM_FINE_GRAIN_BUFFER
4316                                                : svm::OPENCL_SVM_COARSE_GRAIN_BUFFER;
4317                }
4318            }
4319            else
4320#endif
4321            {
4322                tempUMatFlags = UMatData::TEMP_UMAT;
4323                handle = clCreateBuffer(ctx_handle, CL_MEM_USE_HOST_PTR|createFlags,
4324                                           u->size, u->origdata, &retval);
4325                if((!handle || retval < 0) && !(accessFlags & ACCESS_FAST))
4326                {
4327                    handle = clCreateBuffer(ctx_handle, CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags,
4328                                               u->size, u->origdata, &retval);
4329                    tempUMatFlags |= UMatData::TEMP_COPIED_UMAT;
4330                }
4331            }
4332            if(!handle || retval != CL_SUCCESS)
4333                return false;
4334            u->handle = handle;
4335            u->prevAllocator = u->currAllocator;
4336            u->currAllocator = this;
4337            u->flags |= tempUMatFlags;
4338            u->allocatorFlags_ = allocatorFlags;
4339        }
4340        if(accessFlags & ACCESS_WRITE)
4341            u->markHostCopyObsolete(true);
4342        return true;
4343    }
4344
4345    /*void sync(UMatData* u) const
4346    {
4347        cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
4348        UMatDataAutoLock lock(u);
4349
4350        if( u->hostCopyObsolete() && u->handle && u->refcount > 0 && u->origdata)
4351        {
4352            if( u->tempCopiedUMat() )
4353            {
4354                clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
4355                                    u->size, u->origdata, 0, 0, 0);
4356            }
4357            else
4358            {
4359                cl_int retval = 0;
4360                void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
4361                                                (CL_MAP_READ | CL_MAP_WRITE),
4362                                                0, u->size, 0, 0, 0, &retval);
4363                clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0);
4364                clFinish(q);
4365            }
4366            u->markHostCopyObsolete(false);
4367        }
4368        else if( u->copyOnMap() && u->deviceCopyObsolete() && u->data )
4369        {
4370            clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
4371                                 u->size, u->data, 0, 0, 0);
4372        }
4373    }*/
4374
4375    void deallocate(UMatData* u) const
4376    {
4377        if(!u)
4378            return;
4379
4380        CV_Assert(u->urefcount >= 0);
4381        CV_Assert(u->refcount >= 0);
4382
4383        CV_Assert(u->handle != 0 && u->urefcount == 0);
4384        if(u->tempUMat())
4385        {
4386//            UMatDataAutoLock lock(u);
4387
4388            if( u->hostCopyObsolete() && u->refcount > 0 )
4389            {
4390#ifdef HAVE_OPENCL_SVM
4391                if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
4392                {
4393                    Context& ctx = Context::getDefault();
4394                    const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4395                    CV_DbgAssert(svmFns->isValid());
4396
4397                    if( u->tempCopiedUMat() )
4398                    {
4399                        CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
4400                                (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER);
4401                        bool isFineGrainBuffer = (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER;
4402                        cl_command_queue q = NULL;
4403                        if (!isFineGrainBuffer)
4404                        {
4405                            CV_DbgAssert(((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0));
4406                            q = (cl_command_queue)Queue::getDefault().ptr();
4407                            CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
4408                            cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ,
4409                                    u->handle, u->size,
4410                                    0, NULL, NULL);
4411                            CV_Assert(status == CL_SUCCESS);
4412                        }
4413                        clFinish(q);
4414                        memcpy(u->origdata, u->handle, u->size);
4415                        if (!isFineGrainBuffer)
4416                        {
4417                            CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
4418                            cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle, 0, NULL, NULL);
4419                            CV_Assert(status == CL_SUCCESS);
4420                        }
4421                    }
4422                    else
4423                    {
4424                        CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM);
4425                        // nothing
4426                    }
4427                }
4428                else
4429#endif
4430                {
4431                    cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
4432                    if( u->tempCopiedUMat() )
4433                    {
4434                        AlignedDataPtr<false, true> alignedPtr(u->origdata, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
4435                        CV_OclDbgAssert(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
4436                                            u->size, alignedPtr.getAlignedPtr(), 0, 0, 0) == CL_SUCCESS);
4437                    }
4438                    else
4439                    {
4440                        // TODO Is it really needed for clCreateBuffer with CL_MEM_USE_HOST_PTR?
4441                        cl_int retval = 0;
4442                        void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
4443                                                        (CL_MAP_READ | CL_MAP_WRITE),
4444                                                        0, u->size, 0, 0, 0, &retval);
4445                        CV_OclDbgAssert(retval == CL_SUCCESS);
4446                        CV_OclDbgAssert(clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0) == CL_SUCCESS);
4447                        CV_OclDbgAssert(clFinish(q) == CL_SUCCESS);
4448                    }
4449                }
4450                u->markHostCopyObsolete(false);
4451            }
4452#ifdef HAVE_OPENCL_SVM
4453            if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
4454            {
4455                if( u->tempCopiedUMat() )
4456                {
4457                    Context& ctx = Context::getDefault();
4458                    const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4459                    CV_DbgAssert(svmFns->isValid());
4460
4461                    CV_OPENCL_SVM_TRACE_P("clSVMFree: %p\n", u->handle);
4462                    svmFns->fn_clSVMFree((cl_context)ctx.ptr(), u->handle);
4463                }
4464            }
4465            else
4466#endif
4467            {
4468                clReleaseMemObject((cl_mem)u->handle);
4469            }
4470            u->handle = 0;
4471            u->currAllocator = u->prevAllocator;
4472            if(u->data && u->copyOnMap() && !(u->flags & UMatData::USER_ALLOCATED))
4473                fastFree(u->data);
4474            u->data = u->origdata;
4475            if(u->refcount == 0)
4476                u->currAllocator->deallocate(u);
4477        }
4478        else
4479        {
4480            CV_Assert(u->refcount == 0);
4481            if(u->data && u->copyOnMap() && !(u->flags & UMatData::USER_ALLOCATED))
4482            {
4483                fastFree(u->data);
4484                u->data = 0;
4485            }
4486            if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_USED)
4487            {
4488                bufferPool.release((cl_mem)u->handle);
4489            }
4490            else if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED)
4491            {
4492                bufferPoolHostPtr.release((cl_mem)u->handle);
4493            }
4494#ifdef HAVE_OPENCL_SVM
4495            else if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED)
4496            {
4497                if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
4498                {
4499                    //nothing
4500                }
4501                else if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
4502                        (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
4503                {
4504                    Context& ctx = Context::getDefault();
4505                    const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4506                    CV_DbgAssert(svmFns->isValid());
4507                    cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
4508
4509                    if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) != 0)
4510                    {
4511                        CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
4512                        cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle, 0, NULL, NULL);
4513                        CV_Assert(status == CL_SUCCESS);
4514                    }
4515                }
4516                bufferPoolSVM.release((void*)u->handle);
4517            }
4518#endif
4519            else
4520            {
4521                clReleaseMemObject((cl_mem)u->handle);
4522            }
4523            u->handle = 0;
4524            delete u;
4525        }
4526    }
4527
4528    void map(UMatData* u, int accessFlags) const
4529    {
4530        if(!u)
4531            return;
4532
4533        CV_Assert( u->handle != 0 );
4534
4535        UMatDataAutoLock autolock(u);
4536
4537        if(accessFlags & ACCESS_WRITE)
4538            u->markDeviceCopyObsolete(true);
4539
4540        cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
4541
4542        // FIXIT Workaround for UMat synchronization issue
4543        // if( u->refcount == 0 )
4544        {
4545            if( !u->copyOnMap() )
4546            {
4547                // TODO
4548                // because there can be other map requests for the same UMat with different access flags,
4549                // we use the universal (read-write) access mode.
4550#ifdef HAVE_OPENCL_SVM
4551                if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
4552                {
4553                    if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
4554                    {
4555                        Context& ctx = Context::getDefault();
4556                        const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4557                        CV_DbgAssert(svmFns->isValid());
4558
4559                        if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0)
4560                        {
4561                            CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
4562                            cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ | CL_MAP_WRITE,
4563                                    u->handle, u->size,
4564                                    0, NULL, NULL);
4565                            CV_Assert(status == CL_SUCCESS);
4566                            u->allocatorFlags_ |= svm::OPENCL_SVM_BUFFER_MAP;
4567                        }
4568                    }
4569                    clFinish(q);
4570                    u->data = (uchar*)u->handle;
4571                    u->markHostCopyObsolete(false);
4572                    u->markDeviceMemMapped(true);
4573                    return;
4574                }
4575#endif
4576                if (u->data) // FIXIT Workaround for UMat synchronization issue
4577                {
4578                    //CV_Assert(u->hostCopyObsolete() == false);
4579                    return;
4580                }
4581
4582                cl_int retval = 0;
4583                u->data = (uchar*)clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
4584                                                     (CL_MAP_READ | CL_MAP_WRITE),
4585                                                     0, u->size, 0, 0, 0, &retval);
4586                if(u->data && retval == CL_SUCCESS)
4587                {
4588                    u->markHostCopyObsolete(false);
4589                    u->markDeviceMemMapped(true);
4590                    return;
4591                }
4592
4593                // TODO Is it really a good idea and was it tested well?
4594                // if map failed, switch to copy-on-map mode for the particular buffer
4595                u->flags |= UMatData::COPY_ON_MAP;
4596            }
4597
4598            if(!u->data)
4599            {
4600                u->data = (uchar*)fastMalloc(u->size);
4601                u->markHostCopyObsolete(true);
4602            }
4603        }
4604
4605        if( (accessFlags & ACCESS_READ) != 0 && u->hostCopyObsolete() )
4606        {
4607            AlignedDataPtr<false, true> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
4608#ifdef HAVE_OPENCL_SVM
4609            CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == 0);
4610#endif
4611            CV_Assert( clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
4612                                           u->size, alignedPtr.getAlignedPtr(), 0, 0, 0) == CL_SUCCESS );
4613            u->markHostCopyObsolete(false);
4614        }
4615    }
4616
4617    void unmap(UMatData* u) const
4618    {
4619        if(!u)
4620            return;
4621
4622
4623        CV_Assert(u->handle != 0);
4624
4625        UMatDataAutoLock autolock(u);
4626
4627        // FIXIT Workaround for UMat synchronization issue
4628        if(u->refcount > 0)
4629            return;
4630
4631        cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
4632        cl_int retval = 0;
4633        if( !u->copyOnMap() && u->deviceMemMapped() )
4634        {
4635            CV_Assert(u->data != NULL);
4636            u->markDeviceMemMapped(false);
4637#ifdef HAVE_OPENCL_SVM
4638            if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
4639            {
4640                if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
4641                {
4642                    Context& ctx = Context::getDefault();
4643                    const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4644                    CV_DbgAssert(svmFns->isValid());
4645
4646                    CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) != 0);
4647                    {
4648                        CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
4649                        cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
4650                                0, NULL, NULL);
4651                        CV_Assert(status == CL_SUCCESS);
4652                        clFinish(q);
4653                        u->allocatorFlags_ &= ~svm::OPENCL_SVM_BUFFER_MAP;
4654                    }
4655                }
4656                u->data = 0;
4657                u->markDeviceCopyObsolete(false);
4658                u->markHostCopyObsolete(false);
4659                return;
4660            }
4661#endif
4662            CV_Assert( (retval = clEnqueueUnmapMemObject(q,
4663                                (cl_mem)u->handle, u->data, 0, 0, 0)) == CL_SUCCESS );
4664            if (Device::getDefault().isAMD())
4665            {
4666                // required for multithreaded applications (see stitching test)
4667                CV_OclDbgAssert(clFinish(q) == CL_SUCCESS);
4668            }
4669            u->data = 0;
4670        }
4671        else if( u->copyOnMap() && u->deviceCopyObsolete() )
4672        {
4673            AlignedDataPtr<true, false> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
4674#ifdef HAVE_OPENCL_SVM
4675            CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == 0);
4676#endif
4677            CV_Assert( (retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
4678                                u->size, alignedPtr.getAlignedPtr(), 0, 0, 0)) == CL_SUCCESS );
4679        }
4680        u->markDeviceCopyObsolete(false);
4681        u->markHostCopyObsolete(false);
4682    }
4683
4684    bool checkContinuous(int dims, const size_t sz[],
4685                         const size_t srcofs[], const size_t srcstep[],
4686                         const size_t dstofs[], const size_t dststep[],
4687                         size_t& total, size_t new_sz[],
4688                         size_t& srcrawofs, size_t new_srcofs[], size_t new_srcstep[],
4689                         size_t& dstrawofs, size_t new_dstofs[], size_t new_dststep[]) const
4690    {
4691        bool iscontinuous = true;
4692        srcrawofs = srcofs ? srcofs[dims-1] : 0;
4693        dstrawofs = dstofs ? dstofs[dims-1] : 0;
4694        total = sz[dims-1];
4695        for( int i = dims-2; i >= 0; i-- )
4696        {
4697            if( i >= 0 && (total != srcstep[i] || total != dststep[i]) )
4698                iscontinuous = false;
4699            total *= sz[i];
4700            if( srcofs )
4701                srcrawofs += srcofs[i]*srcstep[i];
4702            if( dstofs )
4703                dstrawofs += dstofs[i]*dststep[i];
4704        }
4705
4706        if( !iscontinuous )
4707        {
4708            // OpenCL uses {x, y, z} order while OpenCV uses {z, y, x} order.
4709            if( dims == 2 )
4710            {
4711                new_sz[0] = sz[1]; new_sz[1] = sz[0]; new_sz[2] = 1;
4712                // we assume that new_... arrays are initialized by caller
4713                // with 0's, so there is no else branch
4714                if( srcofs )
4715                {
4716                    new_srcofs[0] = srcofs[1];
4717                    new_srcofs[1] = srcofs[0];
4718                    new_srcofs[2] = 0;
4719                }
4720
4721                if( dstofs )
4722                {
4723                    new_dstofs[0] = dstofs[1];
4724                    new_dstofs[1] = dstofs[0];
4725                    new_dstofs[2] = 0;
4726                }
4727
4728                new_srcstep[0] = srcstep[0]; new_srcstep[1] = 0;
4729                new_dststep[0] = dststep[0]; new_dststep[1] = 0;
4730            }
4731            else
4732            {
4733                // we could check for dims == 3 here,
4734                // but from user perspective this one is more informative
4735                CV_Assert(dims <= 3);
4736                new_sz[0] = sz[2]; new_sz[1] = sz[1]; new_sz[2] = sz[0];
4737                if( srcofs )
4738                {
4739                    new_srcofs[0] = srcofs[2];
4740                    new_srcofs[1] = srcofs[1];
4741                    new_srcofs[2] = srcofs[0];
4742                }
4743
4744                if( dstofs )
4745                {
4746                    new_dstofs[0] = dstofs[2];
4747                    new_dstofs[1] = dstofs[1];
4748                    new_dstofs[2] = dstofs[0];
4749                }
4750
4751                new_srcstep[0] = srcstep[1]; new_srcstep[1] = srcstep[0];
4752                new_dststep[0] = dststep[1]; new_dststep[1] = dststep[0];
4753            }
4754        }
4755        return iscontinuous;
4756    }
4757
4758    void download(UMatData* u, void* dstptr, int dims, const size_t sz[],
4759                  const size_t srcofs[], const size_t srcstep[],
4760                  const size_t dststep[]) const
4761    {
4762        if(!u)
4763            return;
4764        UMatDataAutoLock autolock(u);
4765
4766        if( u->data && !u->hostCopyObsolete() )
4767        {
4768            Mat::getStdAllocator()->download(u, dstptr, dims, sz, srcofs, srcstep, dststep);
4769            return;
4770        }
4771        CV_Assert( u->handle != 0 );
4772
4773        cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
4774
4775        size_t total = 0, new_sz[] = {0, 0, 0};
4776        size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
4777        size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
4778
4779        bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, 0, dststep,
4780                                            total, new_sz,
4781                                            srcrawofs, new_srcofs, new_srcstep,
4782                                            dstrawofs, new_dstofs, new_dststep);
4783
4784#ifdef HAVE_OPENCL_SVM
4785        if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
4786        {
4787            CV_DbgAssert(u->data == NULL || u->data == u->handle);
4788            Context& ctx = Context::getDefault();
4789            const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4790            CV_DbgAssert(svmFns->isValid());
4791
4792            CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0);
4793            if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
4794            {
4795                CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
4796                cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ,
4797                        u->handle, u->size,
4798                        0, NULL, NULL);
4799                CV_Assert(status == CL_SUCCESS);
4800            }
4801            clFinish(q);
4802            if( iscontinuous )
4803            {
4804                memcpy(dstptr, (uchar*)u->handle + srcrawofs, total);
4805            }
4806            else
4807            {
4808                // This code is from MatAllocator::download()
4809                int isz[CV_MAX_DIM];
4810                uchar* srcptr = (uchar*)u->handle;
4811                for( int i = 0; i < dims; i++ )
4812                {
4813                    CV_Assert( sz[i] <= (size_t)INT_MAX );
4814                    if( sz[i] == 0 )
4815                    return;
4816                    if( srcofs )
4817                    srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1);
4818                    isz[i] = (int)sz[i];
4819                }
4820
4821                Mat src(dims, isz, CV_8U, srcptr, srcstep);
4822                Mat dst(dims, isz, CV_8U, dstptr, dststep);
4823
4824                const Mat* arrays[] = { &src, &dst };
4825                uchar* ptrs[2];
4826                NAryMatIterator it(arrays, ptrs, 2);
4827                size_t j, planesz = it.size;
4828
4829                for( j = 0; j < it.nplanes; j++, ++it )
4830                    memcpy(ptrs[1], ptrs[0], planesz);
4831            }
4832            if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
4833            {
4834                CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
4835                cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
4836                        0, NULL, NULL);
4837                CV_Assert(status == CL_SUCCESS);
4838                clFinish(q);
4839            }
4840        }
4841        else
4842#endif
4843        {
4844            AlignedDataPtr<false, true> alignedPtr((uchar*)dstptr, sz[0] * dststep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
4845            if( iscontinuous )
4846            {
4847                CV_Assert( clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
4848                                               srcrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0) >= 0 );
4849            }
4850            else
4851            {
4852                CV_Assert( clEnqueueReadBufferRect(q, (cl_mem)u->handle, CL_TRUE,
4853                                new_srcofs, new_dstofs, new_sz, new_srcstep[0], new_srcstep[1],
4854                                new_dststep[0], new_dststep[1], alignedPtr.getAlignedPtr(), 0, 0, 0) >= 0 );
4855            }
4856        }
4857    }
4858
4859    void upload(UMatData* u, const void* srcptr, int dims, const size_t sz[],
4860                const size_t dstofs[], const size_t dststep[],
4861                const size_t srcstep[]) const
4862    {
4863        if(!u)
4864            return;
4865
4866        // there should be no user-visible CPU copies of the UMat which we are going to copy to
4867        CV_Assert(u->refcount == 0 || u->tempUMat());
4868
4869        size_t total = 0, new_sz[] = {0, 0, 0};
4870        size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
4871        size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
4872
4873        bool iscontinuous = checkContinuous(dims, sz, 0, srcstep, dstofs, dststep,
4874                                            total, new_sz,
4875                                            srcrawofs, new_srcofs, new_srcstep,
4876                                            dstrawofs, new_dstofs, new_dststep);
4877
4878        UMatDataAutoLock autolock(u);
4879
4880        // if there is cached CPU copy of the GPU matrix,
4881        // we could use it as a destination.
4882        // we can do it in 2 cases:
4883        //    1. we overwrite the whole content
4884        //    2. we overwrite part of the matrix, but the GPU copy is out-of-date
4885        if( u->data && (u->hostCopyObsolete() < u->deviceCopyObsolete() || total == u->size))
4886        {
4887            Mat::getStdAllocator()->upload(u, srcptr, dims, sz, dstofs, dststep, srcstep);
4888            u->markHostCopyObsolete(false);
4889            u->markDeviceCopyObsolete(true);
4890            return;
4891        }
4892
4893        CV_Assert( u->handle != 0 );
4894        cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
4895
4896#ifdef HAVE_OPENCL_SVM
4897        if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
4898        {
4899            CV_DbgAssert(u->data == NULL || u->data == u->handle);
4900            Context& ctx = Context::getDefault();
4901            const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4902            CV_DbgAssert(svmFns->isValid());
4903
4904            CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0);
4905            if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
4906            {
4907                CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
4908                cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_WRITE,
4909                        u->handle, u->size,
4910                        0, NULL, NULL);
4911                CV_Assert(status == CL_SUCCESS);
4912            }
4913            clFinish(q);
4914            if( iscontinuous )
4915            {
4916                memcpy((uchar*)u->handle + dstrawofs, srcptr, total);
4917            }
4918            else
4919            {
4920                // This code is from MatAllocator::upload()
4921                int isz[CV_MAX_DIM];
4922                uchar* dstptr = (uchar*)u->handle;
4923                for( int i = 0; i < dims; i++ )
4924                {
4925                    CV_Assert( sz[i] <= (size_t)INT_MAX );
4926                    if( sz[i] == 0 )
4927                    return;
4928                    if( dstofs )
4929                    dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1);
4930                    isz[i] = (int)sz[i];
4931                }
4932
4933                Mat src(dims, isz, CV_8U, (void*)srcptr, srcstep);
4934                Mat dst(dims, isz, CV_8U, dstptr, dststep);
4935
4936                const Mat* arrays[] = { &src, &dst };
4937                uchar* ptrs[2];
4938                NAryMatIterator it(arrays, ptrs, 2);
4939                size_t j, planesz = it.size;
4940
4941                for( j = 0; j < it.nplanes; j++, ++it )
4942                    memcpy(ptrs[1], ptrs[0], planesz);
4943            }
4944            if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
4945            {
4946                CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
4947                cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
4948                        0, NULL, NULL);
4949                CV_Assert(status == CL_SUCCESS);
4950                clFinish(q);
4951            }
4952        }
4953        else
4954#endif
4955        {
4956            AlignedDataPtr<true, false> alignedPtr((uchar*)srcptr, sz[0] * srcstep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
4957            if( iscontinuous )
4958            {
4959                CV_Assert( clEnqueueWriteBuffer(q, (cl_mem)u->handle,
4960                    CL_TRUE, dstrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0) >= 0 );
4961            }
4962            else
4963            {
4964                CV_Assert( clEnqueueWriteBufferRect(q, (cl_mem)u->handle, CL_TRUE,
4965                    new_dstofs, new_srcofs, new_sz, new_dststep[0], new_dststep[1],
4966                    new_srcstep[0], new_srcstep[1], alignedPtr.getAlignedPtr(), 0, 0, 0) >= 0 );
4967            }
4968        }
4969        u->markHostCopyObsolete(true);
4970#ifdef HAVE_OPENCL_SVM
4971        if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
4972                (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
4973        {
4974            // nothing
4975        }
4976        else
4977#endif
4978        {
4979            u->markHostCopyObsolete(true);
4980        }
4981        u->markDeviceCopyObsolete(false);
4982    }
4983
4984    void copy(UMatData* src, UMatData* dst, int dims, const size_t sz[],
4985              const size_t srcofs[], const size_t srcstep[],
4986              const size_t dstofs[], const size_t dststep[], bool _sync) const
4987    {
4988        if(!src || !dst)
4989            return;
4990
4991        size_t total = 0, new_sz[] = {0, 0, 0};
4992        size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
4993        size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
4994
4995        bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, dstofs, dststep,
4996                                            total, new_sz,
4997                                            srcrawofs, new_srcofs, new_srcstep,
4998                                            dstrawofs, new_dstofs, new_dststep);
4999
5000        UMatDataAutoLock src_autolock(src);
5001        UMatDataAutoLock dst_autolock(dst);
5002
5003        if( !src->handle || (src->data && src->hostCopyObsolete() < src->deviceCopyObsolete()) )
5004        {
5005            upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
5006            return;
5007        }
5008        if( !dst->handle || (dst->data && dst->hostCopyObsolete() < dst->deviceCopyObsolete()) )
5009        {
5010            download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
5011            dst->markHostCopyObsolete(false);
5012#ifdef HAVE_OPENCL_SVM
5013            if ((dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
5014                    (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
5015            {
5016                // nothing
5017            }
5018            else
5019#endif
5020            {
5021                dst->markDeviceCopyObsolete(true);
5022            }
5023            return;
5024        }
5025
5026        // there should be no user-visible CPU copies of the UMat which we are going to copy to
5027        CV_Assert(dst->refcount == 0);
5028        cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5029
5030        cl_int retval = CL_SUCCESS;
5031#ifdef HAVE_OPENCL_SVM
5032        if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0 ||
5033                (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5034        {
5035            if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0 &&
5036                            (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5037            {
5038                Context& ctx = Context::getDefault();
5039                const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5040                CV_DbgAssert(svmFns->isValid());
5041
5042                if( iscontinuous )
5043                {
5044                    CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMemcpy: %p <-- %p (%d)\n",
5045                            (uchar*)dst->handle + dstrawofs, (uchar*)src->handle + srcrawofs, (int)total);
5046                    cl_int status = svmFns->fn_clEnqueueSVMMemcpy(q, CL_TRUE,
5047                            (uchar*)dst->handle + dstrawofs, (uchar*)src->handle + srcrawofs,
5048                            total, 0, NULL, NULL);
5049                    CV_Assert(status == CL_SUCCESS);
5050                }
5051                else
5052                {
5053                    clFinish(q);
5054                    // This code is from MatAllocator::download()/upload()
5055                    int isz[CV_MAX_DIM];
5056                    uchar* srcptr = (uchar*)src->handle;
5057                    for( int i = 0; i < dims; i++ )
5058                    {
5059                        CV_Assert( sz[i] <= (size_t)INT_MAX );
5060                        if( sz[i] == 0 )
5061                        return;
5062                        if( srcofs )
5063                        srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1);
5064                        isz[i] = (int)sz[i];
5065                    }
5066                    Mat m_src(dims, isz, CV_8U, srcptr, srcstep);
5067
5068                    uchar* dstptr = (uchar*)dst->handle;
5069                    for( int i = 0; i < dims; i++ )
5070                    {
5071                        if( dstofs )
5072                        dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1);
5073                    }
5074                    Mat m_dst(dims, isz, CV_8U, dstptr, dststep);
5075
5076                    const Mat* arrays[] = { &m_src, &m_dst };
5077                    uchar* ptrs[2];
5078                    NAryMatIterator it(arrays, ptrs, 2);
5079                    size_t j, planesz = it.size;
5080
5081                    for( j = 0; j < it.nplanes; j++, ++it )
5082                        memcpy(ptrs[1], ptrs[0], planesz);
5083                }
5084            }
5085            else
5086            {
5087                if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5088                {
5089                    map(src, ACCESS_READ);
5090                    upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
5091                    unmap(src);
5092                }
5093                else
5094                {
5095                    map(dst, ACCESS_WRITE);
5096                    download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
5097                    unmap(dst);
5098                }
5099            }
5100        }
5101        else
5102#endif
5103        {
5104            if( iscontinuous )
5105            {
5106                CV_Assert( (retval = clEnqueueCopyBuffer(q, (cl_mem)src->handle, (cl_mem)dst->handle,
5107                                               srcrawofs, dstrawofs, total, 0, 0, 0)) == CL_SUCCESS );
5108            }
5109            else
5110            {
5111                CV_Assert( (retval = clEnqueueCopyBufferRect(q, (cl_mem)src->handle, (cl_mem)dst->handle,
5112                                                   new_srcofs, new_dstofs, new_sz,
5113                                                   new_srcstep[0], new_srcstep[1],
5114                                                   new_dststep[0], new_dststep[1],
5115                                                   0, 0, 0)) == CL_SUCCESS );
5116            }
5117        }
5118        if (retval == CL_SUCCESS)
5119        {
5120            CV_IMPL_ADD(CV_IMPL_OCL)
5121        }
5122
5123#ifdef HAVE_OPENCL_SVM
5124        if ((dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
5125                (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
5126        {
5127            // nothing
5128        }
5129        else
5130#endif
5131        {
5132            dst->markHostCopyObsolete(true);
5133        }
5134        dst->markDeviceCopyObsolete(false);
5135
5136        if( _sync )
5137        {
5138            CV_OclDbgAssert(clFinish(q) == CL_SUCCESS);
5139        }
5140    }
5141
5142    BufferPoolController* getBufferPoolController(const char* id) const {
5143#ifdef HAVE_OPENCL_SVM
5144        if ((svm::checkForceSVMUmatUsage() && (id == NULL || strcmp(id, "OCL") == 0)) || (id != NULL && strcmp(id, "SVM") == 0))
5145        {
5146            return &bufferPoolSVM;
5147        }
5148#endif
5149        if (id != NULL && strcmp(id, "HOST_ALLOC") == 0)
5150        {
5151            return &bufferPoolHostPtr;
5152        }
5153        if (id != NULL && strcmp(id, "OCL") != 0)
5154        {
5155            CV_ErrorNoReturn(cv::Error::StsBadArg, "getBufferPoolController(): unknown BufferPool ID\n");
5156        }
5157        return &bufferPool;
5158    }
5159
5160    MatAllocator* matStdAllocator;
5161};
5162
5163MatAllocator* getOpenCLAllocator()
5164{
5165    static MatAllocator * allocator = new OpenCLAllocator();
5166    return allocator;
5167}
5168
5169///////////////////////////////////////////// Utility functions /////////////////////////////////////////////////
5170
5171static void getDevices(std::vector<cl_device_id>& devices, cl_platform_id platform)
5172{
5173    cl_uint numDevices = 0;
5174    CV_OclDbgAssert(clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL,
5175                                0, NULL, &numDevices) == CL_SUCCESS);
5176
5177    if (numDevices == 0)
5178    {
5179        devices.clear();
5180        return;
5181    }
5182
5183    devices.resize((size_t)numDevices);
5184    CV_OclDbgAssert(clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL,
5185                                numDevices, &devices[0], &numDevices) == CL_SUCCESS);
5186}
5187
5188struct PlatformInfo::Impl
5189{
5190    Impl(void* id)
5191    {
5192        refcount = 1;
5193        handle = *(cl_platform_id*)id;
5194        getDevices(devices, handle);
5195    }
5196
5197    String getStrProp(cl_device_info prop) const
5198    {
5199        char buf[1024];
5200        size_t sz=0;
5201        return clGetPlatformInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
5202            sz < sizeof(buf) ? String(buf) : String();
5203    }
5204
5205    IMPLEMENT_REFCOUNTABLE();
5206    std::vector<cl_device_id> devices;
5207    cl_platform_id handle;
5208};
5209
5210PlatformInfo::PlatformInfo()
5211{
5212    p = 0;
5213}
5214
5215PlatformInfo::PlatformInfo(void* platform_id)
5216{
5217    p = new Impl(platform_id);
5218}
5219
5220PlatformInfo::~PlatformInfo()
5221{
5222    if(p)
5223        p->release();
5224}
5225
5226PlatformInfo::PlatformInfo(const PlatformInfo& i)
5227{
5228    if (i.p)
5229        i.p->addref();
5230    p = i.p;
5231}
5232
5233PlatformInfo& PlatformInfo::operator =(const PlatformInfo& i)
5234{
5235    if (i.p != p)
5236    {
5237        if (i.p)
5238            i.p->addref();
5239        if (p)
5240            p->release();
5241        p = i.p;
5242    }
5243    return *this;
5244}
5245
5246int PlatformInfo::deviceNumber() const
5247{
5248    return p ? (int)p->devices.size() : 0;
5249}
5250
5251void PlatformInfo::getDevice(Device& device, int d) const
5252{
5253    CV_Assert(p && d < (int)p->devices.size() );
5254    if(p)
5255        device.set(p->devices[d]);
5256}
5257
5258String PlatformInfo::name() const
5259{
5260    return p ? p->getStrProp(CL_PLATFORM_NAME) : String();
5261}
5262
5263String PlatformInfo::vendor() const
5264{
5265    return p ? p->getStrProp(CL_PLATFORM_VENDOR) : String();
5266}
5267
5268String PlatformInfo::version() const
5269{
5270    return p ? p->getStrProp(CL_PLATFORM_VERSION) : String();
5271}
5272
5273static void getPlatforms(std::vector<cl_platform_id>& platforms)
5274{
5275    cl_uint numPlatforms = 0;
5276    CV_OclDbgAssert(clGetPlatformIDs(0, NULL, &numPlatforms) == CL_SUCCESS);
5277
5278    if (numPlatforms == 0)
5279    {
5280        platforms.clear();
5281        return;
5282    }
5283
5284    platforms.resize((size_t)numPlatforms);
5285    CV_OclDbgAssert(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms) == CL_SUCCESS);
5286}
5287
5288void getPlatfomsInfo(std::vector<PlatformInfo>& platformsInfo)
5289{
5290    std::vector<cl_platform_id> platforms;
5291    getPlatforms(platforms);
5292
5293    for (size_t i = 0; i < platforms.size(); i++)
5294        platformsInfo.push_back( PlatformInfo((void*)&platforms[i]) );
5295}
5296
5297const char* typeToStr(int type)
5298{
5299    static const char* tab[]=
5300    {
5301        "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
5302        "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
5303        "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
5304        "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
5305        "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
5306        "float", "float2", "float3", "float4", 0, 0, 0, "float8", 0, 0, 0, 0, 0, 0, 0, "float16",
5307        "double", "double2", "double3", "double4", 0, 0, 0, "double8", 0, 0, 0, 0, 0, 0, 0, "double16",
5308        "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
5309    };
5310    int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
5311    return cn > 16 ? "?" : tab[depth*16 + cn-1];
5312}
5313
5314const char* memopTypeToStr(int type)
5315{
5316    static const char* tab[] =
5317    {
5318        "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
5319        "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
5320        "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
5321        "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
5322        "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
5323        "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
5324        "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
5325        "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
5326    };
5327    int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
5328    return cn > 16 ? "?" : tab[depth*16 + cn-1];
5329}
5330
5331const char* vecopTypeToStr(int type)
5332{
5333    static const char* tab[] =
5334    {
5335        "uchar", "short", "uchar3", "int", 0, 0, 0, "int2", 0, 0, 0, 0, 0, 0, 0, "int4",
5336        "char", "short", "char3", "int", 0, 0, 0, "int2", 0, 0, 0, 0, 0, 0, 0, "int4",
5337        "ushort", "int", "ushort3", "int2",0, 0, 0, "int4", 0, 0, 0, 0, 0, 0, 0, "int8",
5338        "short", "int", "short3", "int2", 0, 0, 0, "int4", 0, 0, 0, 0, 0, 0, 0, "int8",
5339        "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
5340        "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
5341        "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
5342        "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
5343    };
5344    int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
5345    return cn > 16 ? "?" : tab[depth*16 + cn-1];
5346}
5347
5348const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf)
5349{
5350    if( sdepth == ddepth )
5351        return "noconvert";
5352    const char *typestr = typeToStr(CV_MAKETYPE(ddepth, cn));
5353    if( ddepth >= CV_32F ||
5354        (ddepth == CV_32S && sdepth < CV_32S) ||
5355        (ddepth == CV_16S && sdepth <= CV_8S) ||
5356        (ddepth == CV_16U && sdepth == CV_8U))
5357    {
5358        sprintf(buf, "convert_%s", typestr);
5359    }
5360    else if( sdepth >= CV_32F )
5361        sprintf(buf, "convert_%s%s_rte", typestr, (ddepth < CV_32S ? "_sat" : ""));
5362    else
5363        sprintf(buf, "convert_%s_sat", typestr);
5364
5365    return buf;
5366}
5367
5368template <typename T>
5369static std::string kerToStr(const Mat & k)
5370{
5371    int width = k.cols - 1, depth = k.depth();
5372    const T * const data = k.ptr<T>();
5373
5374    std::ostringstream stream;
5375    stream.precision(10);
5376
5377    if (depth <= CV_8S)
5378    {
5379        for (int i = 0; i < width; ++i)
5380            stream << "DIG(" << (int)data[i] << ")";
5381        stream << "DIG(" << (int)data[width] << ")";
5382    }
5383    else if (depth == CV_32F)
5384    {
5385        stream.setf(std::ios_base::showpoint);
5386        for (int i = 0; i < width; ++i)
5387            stream << "DIG(" << data[i] << "f)";
5388        stream << "DIG(" << data[width] << "f)";
5389    }
5390    else
5391    {
5392        for (int i = 0; i < width; ++i)
5393            stream << "DIG(" << data[i] << ")";
5394        stream << "DIG(" << data[width] << ")";
5395    }
5396
5397    return stream.str();
5398}
5399
5400String kernelToStr(InputArray _kernel, int ddepth, const char * name)
5401{
5402    Mat kernel = _kernel.getMat().reshape(1, 1);
5403
5404    int depth = kernel.depth();
5405    if (ddepth < 0)
5406        ddepth = depth;
5407
5408    if (ddepth != depth)
5409        kernel.convertTo(kernel, ddepth);
5410
5411    typedef std::string (* func_t)(const Mat &);
5412    static const func_t funcs[] = { kerToStr<uchar>, kerToStr<char>, kerToStr<ushort>, kerToStr<short>,
5413                                    kerToStr<int>, kerToStr<float>, kerToStr<double>, 0 };
5414    const func_t func = funcs[ddepth];
5415    CV_Assert(func != 0);
5416
5417    return cv::format(" -D %s=%s", name ? name : "COEFF", func(kernel).c_str());
5418}
5419
5420#define PROCESS_SRC(src) \
5421    do \
5422    { \
5423        if (!src.empty()) \
5424        { \
5425            CV_Assert(src.isMat() || src.isUMat()); \
5426            Size csize = src.size(); \
5427            int ctype = src.type(), ccn = CV_MAT_CN(ctype), cdepth = CV_MAT_DEPTH(ctype), \
5428                ckercn = vectorWidths[cdepth], cwidth = ccn * csize.width; \
5429            if (cwidth < ckercn || ckercn <= 0) \
5430                return 1; \
5431            cols.push_back(cwidth); \
5432            if (strat == OCL_VECTOR_OWN && ctype != ref_type) \
5433                return 1; \
5434            offsets.push_back(src.offset()); \
5435            steps.push_back(src.step()); \
5436            dividers.push_back(ckercn * CV_ELEM_SIZE1(ctype)); \
5437            kercns.push_back(ckercn); \
5438        } \
5439    } \
5440    while ((void)0, 0)
5441
5442int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,
5443                              InputArray src4, InputArray src5, InputArray src6,
5444                              InputArray src7, InputArray src8, InputArray src9,
5445                              OclVectorStrategy strat)
5446{
5447    const ocl::Device & d = ocl::Device::getDefault();
5448
5449    int vectorWidths[] = { d.preferredVectorWidthChar(), d.preferredVectorWidthChar(),
5450        d.preferredVectorWidthShort(), d.preferredVectorWidthShort(),
5451        d.preferredVectorWidthInt(), d.preferredVectorWidthFloat(),
5452        d.preferredVectorWidthDouble(), -1 };
5453
5454    // if the device says don't use vectors
5455    if (vectorWidths[0] == 1)
5456    {
5457        // it's heuristic
5458        vectorWidths[CV_8U] = vectorWidths[CV_8S] = 4;
5459        vectorWidths[CV_16U] = vectorWidths[CV_16S] = 2;
5460        vectorWidths[CV_32S] = vectorWidths[CV_32F] = vectorWidths[CV_64F] = 1;
5461    }
5462
5463    return checkOptimalVectorWidth(vectorWidths, src1, src2, src3, src4, src5, src6, src7, src8, src9, strat);
5464}
5465
5466int checkOptimalVectorWidth(const int *vectorWidths,
5467                            InputArray src1, InputArray src2, InputArray src3,
5468                            InputArray src4, InputArray src5, InputArray src6,
5469                            InputArray src7, InputArray src8, InputArray src9,
5470                            OclVectorStrategy strat)
5471{
5472    CV_Assert(vectorWidths);
5473
5474    int ref_type = src1.type();
5475
5476    std::vector<size_t> offsets, steps, cols;
5477    std::vector<int> dividers, kercns;
5478    PROCESS_SRC(src1);
5479    PROCESS_SRC(src2);
5480    PROCESS_SRC(src3);
5481    PROCESS_SRC(src4);
5482    PROCESS_SRC(src5);
5483    PROCESS_SRC(src6);
5484    PROCESS_SRC(src7);
5485    PROCESS_SRC(src8);
5486    PROCESS_SRC(src9);
5487
5488    size_t size = offsets.size();
5489
5490    for (size_t i = 0; i < size; ++i)
5491        while (offsets[i] % dividers[i] != 0 || steps[i] % dividers[i] != 0 || cols[i] % kercns[i] != 0)
5492            dividers[i] >>= 1, kercns[i] >>= 1;
5493
5494    // default strategy
5495    int kercn = *std::min_element(kercns.begin(), kercns.end());
5496
5497    return kercn;
5498}
5499
5500int predictOptimalVectorWidthMax(InputArray src1, InputArray src2, InputArray src3,
5501                                 InputArray src4, InputArray src5, InputArray src6,
5502                                 InputArray src7, InputArray src8, InputArray src9)
5503{
5504    return predictOptimalVectorWidth(src1, src2, src3, src4, src5, src6, src7, src8, src9, OCL_VECTOR_MAX);
5505}
5506
5507#undef PROCESS_SRC
5508
5509
5510// TODO Make this as a method of OpenCL "BuildOptions" class
5511void buildOptionsAddMatrixDescription(String& buildOptions, const String& name, InputArray _m)
5512{
5513    if (!buildOptions.empty())
5514        buildOptions += " ";
5515    int type = _m.type(), depth = CV_MAT_DEPTH(type);
5516    buildOptions += format(
5517            "-D %s_T=%s -D %s_T1=%s -D %s_CN=%d -D %s_TSIZE=%d -D %s_T1SIZE=%d -D %s_DEPTH=%d",
5518            name.c_str(), ocl::typeToStr(type),
5519            name.c_str(), ocl::typeToStr(CV_MAKE_TYPE(depth, 1)),
5520            name.c_str(), (int)CV_MAT_CN(type),
5521            name.c_str(), (int)CV_ELEM_SIZE(type),
5522            name.c_str(), (int)CV_ELEM_SIZE1(type),
5523            name.c_str(), (int)depth
5524            );
5525}
5526
5527
5528struct Image2D::Impl
5529{
5530    Impl(const UMat &src, bool norm, bool alias)
5531    {
5532        handle = 0;
5533        refcount = 1;
5534        init(src, norm, alias);
5535    }
5536
5537    ~Impl()
5538    {
5539        if (handle)
5540            clReleaseMemObject(handle);
5541    }
5542
5543    static cl_image_format getImageFormat(int depth, int cn, bool norm)
5544    {
5545        cl_image_format format;
5546        static const int channelTypes[] = { CL_UNSIGNED_INT8, CL_SIGNED_INT8, CL_UNSIGNED_INT16,
5547                                       CL_SIGNED_INT16, CL_SIGNED_INT32, CL_FLOAT, -1, -1 };
5548        static const int channelTypesNorm[] = { CL_UNORM_INT8, CL_SNORM_INT8, CL_UNORM_INT16,
5549                                                CL_SNORM_INT16, -1, -1, -1, -1 };
5550        static const int channelOrders[] = { -1, CL_R, CL_RG, -1, CL_RGBA };
5551
5552        int channelType = norm ? channelTypesNorm[depth] : channelTypes[depth];
5553        int channelOrder = channelOrders[cn];
5554        format.image_channel_data_type = (cl_channel_type)channelType;
5555        format.image_channel_order = (cl_channel_order)channelOrder;
5556        return format;
5557    }
5558
5559    static bool isFormatSupported(cl_image_format format)
5560    {
5561        if (!haveOpenCL())
5562            CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!");
5563
5564        cl_context context = (cl_context)Context::getDefault().ptr();
5565        // Figure out how many formats are supported by this context.
5566        cl_uint numFormats = 0;
5567        cl_int err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
5568                                                CL_MEM_OBJECT_IMAGE2D, numFormats,
5569                                                NULL, &numFormats);
5570        AutoBuffer<cl_image_format> formats(numFormats);
5571        err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
5572                                         CL_MEM_OBJECT_IMAGE2D, numFormats,
5573                                         formats, NULL);
5574        CV_OclDbgAssert(err == CL_SUCCESS);
5575        for (cl_uint i = 0; i < numFormats; ++i)
5576        {
5577            if (!memcmp(&formats[i], &format, sizeof(format)))
5578            {
5579                return true;
5580            }
5581        }
5582        return false;
5583    }
5584
5585    void init(const UMat &src, bool norm, bool alias)
5586    {
5587        if (!haveOpenCL())
5588            CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!");
5589
5590        CV_Assert(!src.empty());
5591        CV_Assert(ocl::Device::getDefault().imageSupport());
5592
5593        int err, depth = src.depth(), cn = src.channels();
5594        CV_Assert(cn <= 4);
5595        cl_image_format format = getImageFormat(depth, cn, norm);
5596
5597        if (!isFormatSupported(format))
5598            CV_Error(Error::OpenCLApiCallError, "Image format is not supported");
5599
5600        if (alias && !src.handle(ACCESS_RW))
5601            CV_Error(Error::OpenCLApiCallError, "Incorrect UMat, handle is null");
5602
5603        cl_context context = (cl_context)Context::getDefault().ptr();
5604        cl_command_queue queue = (cl_command_queue)Queue::getDefault().ptr();
5605
5606#ifdef CL_VERSION_1_2
5607        // this enables backwards portability to
5608        // run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
5609        const Device & d = ocl::Device::getDefault();
5610        int minor = d.deviceVersionMinor(), major = d.deviceVersionMajor();
5611        CV_Assert(!alias || canCreateAlias(src));
5612        if (1 < major || (1 == major && 2 <= minor))
5613        {
5614            cl_image_desc desc;
5615            desc.image_type       = CL_MEM_OBJECT_IMAGE2D;
5616            desc.image_width      = src.cols;
5617            desc.image_height     = src.rows;
5618            desc.image_depth      = 0;
5619            desc.image_array_size = 1;
5620            desc.image_row_pitch  = alias ? src.step[0] : 0;
5621            desc.image_slice_pitch = 0;
5622            desc.buffer           = alias ? (cl_mem)src.handle(ACCESS_RW) : 0;
5623            desc.num_mip_levels   = 0;
5624            desc.num_samples      = 0;
5625            handle = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
5626        }
5627        else
5628#endif
5629        {
5630            CV_SUPPRESS_DEPRECATED_START
5631            CV_Assert(!alias);  // This is an OpenCL 1.2 extension
5632            handle = clCreateImage2D(context, CL_MEM_READ_WRITE, &format, src.cols, src.rows, 0, NULL, &err);
5633            CV_SUPPRESS_DEPRECATED_END
5634        }
5635        CV_OclDbgAssert(err == CL_SUCCESS);
5636
5637        size_t origin[] = { 0, 0, 0 };
5638        size_t region[] = { static_cast<size_t>(src.cols), static_cast<size_t>(src.rows), 1 };
5639
5640        cl_mem devData;
5641        if (!alias && !src.isContinuous())
5642        {
5643            devData = clCreateBuffer(context, CL_MEM_READ_ONLY, src.cols * src.rows * src.elemSize(), NULL, &err);
5644            CV_OclDbgAssert(err == CL_SUCCESS);
5645
5646            const size_t roi[3] = {static_cast<size_t>(src.cols) * src.elemSize(), static_cast<size_t>(src.rows), 1};
5647            CV_Assert(clEnqueueCopyBufferRect(queue, (cl_mem)src.handle(ACCESS_READ), devData, origin, origin,
5648                roi, src.step, 0, src.cols * src.elemSize(), 0, 0, NULL, NULL) == CL_SUCCESS);
5649            CV_OclDbgAssert(clFlush(queue) == CL_SUCCESS);
5650        }
5651        else
5652        {
5653            devData = (cl_mem)src.handle(ACCESS_READ);
5654        }
5655        CV_Assert(devData != NULL);
5656
5657        if (!alias)
5658        {
5659            CV_OclDbgAssert(clEnqueueCopyBufferToImage(queue, devData, handle, 0, origin, region, 0, NULL, 0) == CL_SUCCESS);
5660            if (!src.isContinuous())
5661            {
5662                CV_OclDbgAssert(clFlush(queue) == CL_SUCCESS);
5663                CV_OclDbgAssert(clReleaseMemObject(devData) == CL_SUCCESS);
5664            }
5665        }
5666    }
5667
5668    IMPLEMENT_REFCOUNTABLE();
5669
5670    cl_mem handle;
5671};
5672
5673Image2D::Image2D()
5674{
5675    p = NULL;
5676}
5677
5678Image2D::Image2D(const UMat &src, bool norm, bool alias)
5679{
5680    p = new Impl(src, norm, alias);
5681}
5682
5683bool Image2D::canCreateAlias(const UMat &m)
5684{
5685    bool ret = false;
5686    const Device & d = ocl::Device::getDefault();
5687    if (d.imageFromBufferSupport() && !m.empty())
5688    {
5689        // This is the required pitch alignment in pixels
5690        uint pitchAlign = d.imagePitchAlignment();
5691        if (pitchAlign && !(m.step % (pitchAlign * m.elemSize())))
5692        {
5693            // We don't currently handle the case where the buffer was created
5694            // with CL_MEM_USE_HOST_PTR
5695            if (!m.u->tempUMat())
5696            {
5697                ret = true;
5698            }
5699        }
5700    }
5701    return ret;
5702}
5703
5704bool Image2D::isFormatSupported(int depth, int cn, bool norm)
5705{
5706    cl_image_format format = Impl::getImageFormat(depth, cn, norm);
5707
5708    return Impl::isFormatSupported(format);
5709}
5710
5711Image2D::Image2D(const Image2D & i)
5712{
5713    p = i.p;
5714    if (p)
5715        p->addref();
5716}
5717
5718Image2D & Image2D::operator = (const Image2D & i)
5719{
5720    if (i.p != p)
5721    {
5722        if (i.p)
5723            i.p->addref();
5724        if (p)
5725            p->release();
5726        p = i.p;
5727    }
5728    return *this;
5729}
5730
5731Image2D::~Image2D()
5732{
5733    if (p)
5734        p->release();
5735}
5736
5737void* Image2D::ptr() const
5738{
5739    return p ? p->handle : 0;
5740}
5741
5742bool internal::isPerformanceCheckBypassed()
5743{
5744    static bool initialized = false;
5745    static bool value = false;
5746    if (!initialized)
5747    {
5748        value = getBoolParameter("OPENCV_OPENCL_PERF_CHECK_BYPASS", false);
5749        initialized = true;
5750    }
5751    return value;
5752}
5753
5754bool internal::isCLBuffer(UMat& u)
5755{
5756    void* h = u.handle(ACCESS_RW);
5757    if (!h)
5758        return true;
5759    CV_DbgAssert(u.u->currAllocator == getOpenCLAllocator());
5760#if 1
5761    if ((u.u->allocatorFlags_ & 0xffff0000) != 0) // OpenCL SVM flags are stored here
5762        return false;
5763#else
5764    cl_mem_object_type type = 0;
5765    cl_int ret = clGetMemObjectInfo((cl_mem)h, CL_MEM_TYPE, sizeof(type), &type, NULL);
5766    if (ret != CL_SUCCESS || type != CL_MEM_OBJECT_BUFFER)
5767        return false;
5768#endif
5769    return true;
5770}
5771
5772}}
5773