1// This file is auto-generated. Do not edit!
2
3#include "precomp.hpp"
4#include "opencl_kernels_video.hpp"
5
6namespace cv
7{
8namespace ocl
9{
10namespace video
11{
12
13const struct ProgramEntry bgfg_mog2={"bgfg_mog2",
14"#if CN==1\n"
15"#define T_MEAN float\n"
16"#define F_ZERO (0.0f)\n"
17"#define cnMode 1\n"
18"#define frameToMean(a, b) (b) = *(a);\n"
19"#define meanToFrame(a, b) *b = convert_uchar_sat(a);\n"
20"inline float sum(float val)\n"
21"{\n"
22"return val;\n"
23"}\n"
24"#else\n"
25"#define T_MEAN float4\n"
26"#define F_ZERO (0.0f, 0.0f, 0.0f, 0.0f)\n"
27"#define cnMode 4\n"
28"#define meanToFrame(a, b)\\\n"
29"b[0] = convert_uchar_sat(a.x); \\\n"
30"b[1] = convert_uchar_sat(a.y); \\\n"
31"b[2] = convert_uchar_sat(a.z);\n"
32"#define frameToMean(a, b)\\\n"
33"b.x = a[0]; \\\n"
34"b.y = a[1]; \\\n"
35"b.z = a[2]; \\\n"
36"b.w = 0.0f;\n"
37"inline float sum(const float4 val)\n"
38"{\n"
39"return (val.x + val.y + val.z);\n"
40"}\n"
41"#endif\n"
42"__kernel void mog2_kernel(__global const uchar* frame, int frame_step, int frame_offset, int frame_row, int frame_col,\n"
43"__global uchar* modesUsed,\n"
44"__global uchar* weight,\n"
45"__global uchar* mean,\n"
46"__global uchar* variance,\n"
47"__global uchar* fgmask, int fgmask_step, int fgmask_offset,\n"
48"float alphaT, float alpha1, float prune,\n"
49"float c_Tb, float c_TB, float c_Tg, float c_varMin,\n"
50"float c_varMax, float c_varInit, float c_tau\n"
51"#ifdef SHADOW_DETECT\n"
52", uchar c_shadowVal\n"
53"#endif\n"
54")\n"
55"{\n"
56"int x = get_global_id(0);\n"
57"int y = get_global_id(1);\n"
58"if( x < frame_col && y < frame_row)\n"
59"{\n"
60"__global const uchar* _frame = (frame + mad24(y, frame_step, mad24(x, CN, frame_offset)));\n"
61"T_MEAN pix;\n"
62"frameToMean(_frame, pix);\n"
63"uchar foreground = 255;\n"
64"bool fitsPDF = false;\n"
65"int pt_idx =  mad24(y, frame_col, x);\n"
66"int idx_step = frame_row * frame_col;\n"
67"__global uchar* _modesUsed = modesUsed + pt_idx;\n"
68"uchar nmodes = _modesUsed[0];\n"
69"float totalWeight = 0.0f;\n"
70"__global float* _weight = (__global float*)(weight);\n"
71"__global float* _variance = (__global float*)(variance);\n"
72"__global T_MEAN* _mean = (__global T_MEAN*)(mean);\n"
73"uchar mode = 0;\n"
74"for (; mode < nmodes; ++mode)\n"
75"{\n"
76"int mode_idx = mad24(mode, idx_step, pt_idx);\n"
77"float c_weight = mad(alpha1, _weight[mode_idx], prune);\n"
78"float c_var = _variance[mode_idx];\n"
79"T_MEAN c_mean = _mean[mode_idx];\n"
80"T_MEAN diff = c_mean - pix;\n"
81"float dist2 = dot(diff, diff);\n"
82"if (totalWeight < c_TB && dist2 < c_Tb * c_var)\n"
83"foreground = 0;\n"
84"if (dist2 < c_Tg * c_var)\n"
85"{\n"
86"fitsPDF = true;\n"
87"c_weight += alphaT;\n"
88"float k = alphaT / c_weight;\n"
89"T_MEAN mean_new = mad((T_MEAN)-k, diff, c_mean);\n"
90"float variance_new  = clamp(mad(k, (dist2 - c_var), c_var), c_varMin, c_varMax);\n"
91"for (int i = mode; i > 0; --i)\n"
92"{\n"
93"int prev_idx = mode_idx - idx_step;\n"
94"if (c_weight < _weight[prev_idx])\n"
95"break;\n"
96"_weight[mode_idx]   = _weight[prev_idx];\n"
97"_variance[mode_idx] = _variance[prev_idx];\n"
98"_mean[mode_idx]     = _mean[prev_idx];\n"
99"mode_idx = prev_idx;\n"
100"}\n"
101"_mean[mode_idx]     = mean_new;\n"
102"_variance[mode_idx] = variance_new;\n"
103"_weight[mode_idx]   = c_weight;\n"
104"totalWeight += c_weight;\n"
105"mode ++;\n"
106"break;\n"
107"}\n"
108"if (c_weight < -prune)\n"
109"c_weight = 0.0f;\n"
110"_weight[mode_idx] = c_weight;\n"
111"totalWeight += c_weight;\n"
112"}\n"
113"for (; mode < nmodes; ++mode)\n"
114"{\n"
115"int mode_idx = mad24(mode, idx_step, pt_idx);\n"
116"float c_weight = mad(alpha1, _weight[mode_idx], prune);\n"
117"if (c_weight < -prune)\n"
118"{\n"
119"c_weight = 0.0f;\n"
120"nmodes = mode;\n"
121"break;\n"
122"}\n"
123"_weight[mode_idx] = c_weight;\n"
124"totalWeight += c_weight;\n"
125"}\n"
126"if (0.f < totalWeight)\n"
127"{\n"
128"totalWeight = 1.f / totalWeight;\n"
129"for (int mode = 0; mode < nmodes; ++mode)\n"
130"_weight[mad24(mode, idx_step, pt_idx)] *= totalWeight;\n"
131"}\n"
132"if (!fitsPDF)\n"
133"{\n"
134"uchar mode = nmodes == (NMIXTURES) ? (NMIXTURES) - 1 : nmodes++;\n"
135"int mode_idx = mad24(mode, idx_step, pt_idx);\n"
136"if (nmodes == 1)\n"
137"_weight[mode_idx] = 1.f;\n"
138"else\n"
139"{\n"
140"_weight[mode_idx] = alphaT;\n"
141"for (int i = pt_idx; i < mode_idx; i += idx_step)\n"
142"_weight[i] *= alpha1;\n"
143"}\n"
144"for (int i = nmodes - 1; i > 0; --i)\n"
145"{\n"
146"int prev_idx = mode_idx - idx_step;\n"
147"if (alphaT < _weight[prev_idx])\n"
148"break;\n"
149"_weight[mode_idx]   = _weight[prev_idx];\n"
150"_variance[mode_idx] = _variance[prev_idx];\n"
151"_mean[mode_idx]     = _mean[prev_idx];\n"
152"mode_idx = prev_idx;\n"
153"}\n"
154"_mean[mode_idx] = pix;\n"
155"_variance[mode_idx] = c_varInit;\n"
156"}\n"
157"_modesUsed[0] = nmodes;\n"
158"#ifdef SHADOW_DETECT\n"
159"if (foreground)\n"
160"{\n"
161"float tWeight = 0.0f;\n"
162"for (uchar mode = 0; mode < nmodes; ++mode)\n"
163"{\n"
164"int mode_idx = mad24(mode, idx_step, pt_idx);\n"
165"T_MEAN c_mean = _mean[mode_idx];\n"
166"T_MEAN pix_mean = pix * c_mean;\n"
167"float numerator = sum(pix_mean);\n"
168"float denominator = dot(c_mean, c_mean);\n"
169"if (denominator == 0)\n"
170"break;\n"
171"if (numerator <= denominator && numerator >= c_tau * denominator)\n"
172"{\n"
173"float a = numerator / denominator;\n"
174"T_MEAN dD = mad(a, c_mean, -pix);\n"
175"if (dot(dD, dD) < c_Tb * _variance[mode_idx] * a * a)\n"
176"{\n"
177"foreground = c_shadowVal;\n"
178"break;\n"
179"}\n"
180"}\n"
181"tWeight += _weight[mode_idx];\n"
182"if (tWeight > c_TB)\n"
183"break;\n"
184"}\n"
185"}\n"
186"#endif\n"
187"__global uchar* _fgmask = fgmask + mad24(y, fgmask_step, x + fgmask_offset);\n"
188"*_fgmask = (uchar)foreground;\n"
189"}\n"
190"}\n"
191"__kernel void getBackgroundImage2_kernel(__global const uchar* modesUsed,\n"
192"__global const uchar* weight,\n"
193"__global const uchar* mean,\n"
194"__global uchar* dst, int dst_step, int dst_offset, int dst_row, int dst_col,\n"
195"float c_TB)\n"
196"{\n"
197"int x = get_global_id(0);\n"
198"int y = get_global_id(1);\n"
199"if(x < dst_col && y < dst_row)\n"
200"{\n"
201"int pt_idx =  mad24(y, dst_col, x);\n"
202"__global const uchar* _modesUsed = modesUsed + pt_idx;\n"
203"uchar nmodes = _modesUsed[0];\n"
204"T_MEAN meanVal = (T_MEAN)F_ZERO;\n"
205"float totalWeight = 0.0f;\n"
206"__global const float* _weight = (__global const float*)weight;\n"
207"__global const T_MEAN* _mean = (__global const T_MEAN*)(mean);\n"
208"int idx_step = dst_row * dst_col;\n"
209"for (uchar mode = 0; mode < nmodes; ++mode)\n"
210"{\n"
211"int mode_idx = mad24(mode, idx_step, pt_idx);\n"
212"float c_weight = _weight[mode_idx];\n"
213"T_MEAN c_mean = _mean[mode_idx];\n"
214"meanVal = mad(c_weight, c_mean, meanVal);\n"
215"totalWeight += c_weight;\n"
216"if (totalWeight > c_TB)\n"
217"break;\n"
218"}\n"
219"if (0.f < totalWeight)\n"
220"meanVal = meanVal / totalWeight;\n"
221"else\n"
222"meanVal = (T_MEAN)(0.f);\n"
223"__global uchar* _dst = dst + mad24(y, dst_step, mad24(x, CN, dst_offset));\n"
224"meanToFrame(meanVal, _dst);\n"
225"}\n"
226"}\n"
227, "b6e3850899862b7f0ab67cb32f1d52e9"};
228ProgramSource bgfg_mog2_oclsrc(bgfg_mog2.programStr);
229const struct ProgramEntry optical_flow_farneback={"optical_flow_farneback",
230"#define tx  (int)get_local_id(0)\n"
231"#define ty  get_local_id(1)\n"
232"#define bx  get_group_id(0)\n"
233"#define bdx (int)get_local_size(0)\n"
234"#define BORDER_SIZE 5\n"
235"#define MAX_KSIZE_HALF 100\n"
236"#ifndef polyN\n"
237"#define polyN 5\n"
238"#endif\n"
239"#if USE_DOUBLE\n"
240"#ifdef cl_amd_fp64\n"
241"#pragma OPENCL EXTENSION cl_amd_fp64:enable\n"
242"#elif defined (cl_khr_fp64)\n"
243"#pragma OPENCL EXTENSION cl_khr_fp64:enable\n"
244"#endif\n"
245"#define TYPE double\n"
246"#define VECTYPE double4\n"
247"#else\n"
248"#define TYPE float\n"
249"#define VECTYPE float4\n"
250"#endif\n"
251"__kernel void polynomialExpansion(__global __const float * src, int srcStep,\n"
252"__global float * dst, int dstStep,\n"
253"const int rows, const  int cols,\n"
254"__global __const float * c_g,\n"
255"__global __const float * c_xg,\n"
256"__global __const float * c_xxg,\n"
257"__local float * smem,\n"
258"const VECTYPE ig)\n"
259"{\n"
260"const int y = get_global_id(1);\n"
261"const int x = bx * (bdx - 2*polyN) + tx - polyN;\n"
262"int xWarped;\n"
263"__local float *row = smem + tx;\n"
264"if (y < rows && y >= 0)\n"
265"{\n"
266"xWarped = min(max(x, 0), cols - 1);\n"
267"row[0] = src[mad24(y, srcStep, xWarped)] * c_g[0];\n"
268"row[bdx] = 0.f;\n"
269"row[2*bdx] = 0.f;\n"
270"#pragma unroll\n"
271"for (int k = 1; k <= polyN; ++k)\n"
272"{\n"
273"float t0 = src[mad24(max(y - k, 0), srcStep, xWarped)];\n"
274"float t1 = src[mad24(min(y + k, rows - 1), srcStep, xWarped)];\n"
275"row[0] += c_g[k] * (t0 + t1);\n"
276"row[bdx] += c_xg[k] * (t1 - t0);\n"
277"row[2*bdx] += c_xxg[k] * (t0 + t1);\n"
278"}\n"
279"}\n"
280"barrier(CLK_LOCAL_MEM_FENCE);\n"
281"if (y < rows && y >= 0 && tx >= polyN && tx + polyN < bdx && x < cols)\n"
282"{\n"
283"TYPE b1 = c_g[0] * row[0];\n"
284"TYPE b3 = c_g[0] * row[bdx];\n"
285"TYPE b5 = c_g[0] * row[2*bdx];\n"
286"TYPE b2 = 0, b4 = 0, b6 = 0;\n"
287"#pragma unroll\n"
288"for (int k = 1; k <= polyN; ++k)\n"
289"{\n"
290"b1 += (row[k] + row[-k]) * c_g[k];\n"
291"b4 += (row[k] + row[-k]) * c_xxg[k];\n"
292"b2 += (row[k] - row[-k]) * c_xg[k];\n"
293"b3 += (row[k + bdx] + row[-k + bdx]) * c_g[k];\n"
294"b6 += (row[k + bdx] - row[-k + bdx]) * c_xg[k];\n"
295"b5 += (row[k + 2*bdx] + row[-k + 2*bdx]) * c_g[k];\n"
296"}\n"
297"dst[mad24(y, dstStep, xWarped)] = (float)(b3*ig.s0);\n"
298"dst[mad24(rows + y, dstStep, xWarped)] = (float)(b2*ig.s0);\n"
299"dst[mad24(2*rows + y, dstStep, xWarped)] = (float)(b1*ig.s1 + b5*ig.s2);\n"
300"dst[mad24(3*rows + y, dstStep, xWarped)] = (float)(b1*ig.s1 + b4*ig.s2);\n"
301"dst[mad24(4*rows + y, dstStep, xWarped)] = (float)(b6*ig.s3);\n"
302"}\n"
303"}\n"
304"inline int idx_row_low(const int y, const int last_row)\n"
305"{\n"
306"return abs(y) % (last_row + 1);\n"
307"}\n"
308"inline int idx_row_high(const int y, const int last_row)\n"
309"{\n"
310"return abs(last_row - abs(last_row - y)) % (last_row + 1);\n"
311"}\n"
312"inline int idx_col_low(const int x, const int last_col)\n"
313"{\n"
314"return abs(x) % (last_col + 1);\n"
315"}\n"
316"inline int idx_col_high(const int x, const int last_col)\n"
317"{\n"
318"return abs(last_col - abs(last_col - x)) % (last_col + 1);\n"
319"}\n"
320"inline int idx_col(const int x, const int last_col)\n"
321"{\n"
322"return idx_col_low(idx_col_high(x, last_col), last_col);\n"
323"}\n"
324"__kernel void gaussianBlur(__global const float * src, int srcStep,\n"
325"__global float * dst, int dstStep, const int rows, const  int cols,\n"
326"__global const float * c_gKer, const int ksizeHalf,\n"
327"__local float * smem)\n"
328"{\n"
329"const int y = get_global_id(1);\n"
330"const int x = get_global_id(0);\n"
331"__local float *row = smem + ty * (bdx + 2*ksizeHalf);\n"
332"if (y < rows)\n"
333"{\n"
334"for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)\n"
335"{\n"
336"int xExt = (int)(bx * bdx) + i - ksizeHalf;\n"
337"xExt = idx_col(xExt, cols - 1);\n"
338"row[i] = src[mad24(y, srcStep, xExt)] * c_gKer[0];\n"
339"for (int j = 1; j <= ksizeHalf; ++j)\n"
340"row[i] += (src[mad24(idx_row_low(y - j, rows - 1), srcStep, xExt)]\n"
341"+ src[mad24(idx_row_high(y + j, rows - 1), srcStep, xExt)]) * c_gKer[j];\n"
342"}\n"
343"}\n"
344"barrier(CLK_LOCAL_MEM_FENCE);\n"
345"if (y < rows && y >= 0 && x < cols && x >= 0)\n"
346"{\n"
347"row += tx + ksizeHalf;\n"
348"float res = row[0] * c_gKer[0];\n"
349"for (int i = 1; i <= ksizeHalf; ++i)\n"
350"res += (row[-i] + row[i]) * c_gKer[i];\n"
351"dst[mad24(y, dstStep, x)] = res;\n"
352"}\n"
353"}\n"
354"__kernel void gaussianBlur5(__global const float * src, int srcStep,\n"
355"__global float * dst, int dstStep,\n"
356"const int rows, const  int cols,\n"
357"__global const float * c_gKer, const int ksizeHalf,\n"
358"__local float * smem)\n"
359"{\n"
360"const int y = get_global_id(1);\n"
361"const int x = get_global_id(0);\n"
362"const int smw = bdx + 2*ksizeHalf;\n"
363"__local volatile float *row = smem + 5 * ty * smw;\n"
364"if (y < rows)\n"
365"{\n"
366"for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)\n"
367"{\n"
368"int xExt = (int)(bx * bdx) + i - ksizeHalf;\n"
369"xExt = idx_col(xExt, cols - 1);\n"
370"#pragma unroll\n"
371"for (int k = 0; k < 5; ++k)\n"
372"row[k*smw + i] = src[mad24(k*rows + y, srcStep, xExt)] * c_gKer[0];\n"
373"for (int j = 1; j <= ksizeHalf; ++j)\n"
374"#pragma unroll\n"
375"for (int k = 0; k < 5; ++k)\n"
376"row[k*smw + i] +=\n"
377"(src[mad24(k*rows + idx_row_low(y - j, rows - 1), srcStep, xExt)] +\n"
378"src[mad24(k*rows + idx_row_high(y + j, rows - 1), srcStep, xExt)]) * c_gKer[j];\n"
379"}\n"
380"}\n"
381"barrier(CLK_LOCAL_MEM_FENCE);\n"
382"if (y < rows && y >= 0 && x < cols && x >= 0)\n"
383"{\n"
384"row += tx + ksizeHalf;\n"
385"float res[5];\n"
386"#pragma unroll\n"
387"for (int k = 0; k < 5; ++k)\n"
388"res[k] = row[k*smw] * c_gKer[0];\n"
389"for (int i = 1; i <= ksizeHalf; ++i)\n"
390"#pragma unroll\n"
391"for (int k = 0; k < 5; ++k)\n"
392"res[k] += (row[k*smw - i] + row[k*smw + i]) * c_gKer[i];\n"
393"#pragma unroll\n"
394"for (int k = 0; k < 5; ++k)\n"
395"dst[mad24(k*rows + y, dstStep, x)] = res[k];\n"
396"}\n"
397"}\n"
398"__constant float c_border[BORDER_SIZE + 1] = { 0.14f, 0.14f, 0.4472f, 0.4472f, 0.4472f, 1.f };\n"
399"__kernel void updateMatrices(__global const float * flowx, int xStep,\n"
400"__global const float * flowy, int yStep,\n"
401"const int rows, const int cols,\n"
402"__global const float * R0, int R0Step,\n"
403"__global const float * R1, int R1Step,\n"
404"__global float * M, int mStep)\n"
405"{\n"
406"const int y = get_global_id(1);\n"
407"const int x = get_global_id(0);\n"
408"if (y < rows && y >= 0 && x < cols && x >= 0)\n"
409"{\n"
410"float dx = flowx[mad24(y, xStep, x)];\n"
411"float dy = flowy[mad24(y, yStep, x)];\n"
412"float fx = x + dx;\n"
413"float fy = y + dy;\n"
414"int x1 = convert_int(floor(fx));\n"
415"int y1 = convert_int(floor(fy));\n"
416"fx -= x1;\n"
417"fy -= y1;\n"
418"float r2, r3, r4, r5, r6;\n"
419"if (x1 >= 0 && y1 >= 0 && x1 < cols - 1 && y1 < rows - 1)\n"
420"{\n"
421"float a00 = (1.f - fx) * (1.f - fy);\n"
422"float a01 = fx * (1.f - fy);\n"
423"float a10 = (1.f - fx) * fy;\n"
424"float a11 = fx * fy;\n"
425"r2 = a00 * R1[mad24(y1, R1Step, x1)] +\n"
426"a01 * R1[mad24(y1, R1Step, x1 + 1)] +\n"
427"a10 * R1[mad24(y1 + 1, R1Step, x1)] +\n"
428"a11 * R1[mad24(y1 + 1, R1Step, x1 + 1)];\n"
429"r3 = a00 * R1[mad24(rows + y1, R1Step, x1)] +\n"
430"a01 * R1[mad24(rows + y1, R1Step, x1 + 1)] +\n"
431"a10 * R1[mad24(rows + y1 + 1, R1Step, x1)] +\n"
432"a11 * R1[mad24(rows + y1 + 1, R1Step, x1 + 1)];\n"
433"r4 = a00 * R1[mad24(2*rows + y1, R1Step, x1)] +\n"
434"a01 * R1[mad24(2*rows + y1, R1Step, x1 + 1)] +\n"
435"a10 * R1[mad24(2*rows + y1 + 1, R1Step, x1)] +\n"
436"a11 * R1[mad24(2*rows + y1 + 1, R1Step, x1 + 1)];\n"
437"r5 = a00 * R1[mad24(3*rows + y1, R1Step, x1)] +\n"
438"a01 * R1[mad24(3*rows + y1, R1Step, x1 + 1)] +\n"
439"a10 * R1[mad24(3*rows + y1 + 1, R1Step, x1)] +\n"
440"a11 * R1[mad24(3*rows + y1 + 1, R1Step, x1 + 1)];\n"
441"r6 = a00 * R1[mad24(4*rows + y1, R1Step, x1)] +\n"
442"a01 * R1[mad24(4*rows + y1, R1Step, x1 + 1)] +\n"
443"a10 * R1[mad24(4*rows + y1 + 1, R1Step, x1)] +\n"
444"a11 * R1[mad24(4*rows + y1 + 1, R1Step, x1 + 1)];\n"
445"r4 = (R0[mad24(2*rows + y, R0Step, x)] + r4) * 0.5f;\n"
446"r5 = (R0[mad24(3*rows + y, R0Step, x)] + r5) * 0.5f;\n"
447"r6 = (R0[mad24(4*rows + y, R0Step, x)] + r6) * 0.25f;\n"
448"}\n"
449"else\n"
450"{\n"
451"r2 = r3 = 0.f;\n"
452"r4 = R0[mad24(2*rows + y, R0Step, x)];\n"
453"r5 = R0[mad24(3*rows + y, R0Step, x)];\n"
454"r6 = R0[mad24(4*rows + y, R0Step, x)] * 0.5f;\n"
455"}\n"
456"r2 = (R0[mad24(y, R0Step, x)] - r2) * 0.5f;\n"
457"r3 = (R0[mad24(rows + y, R0Step, x)] - r3) * 0.5f;\n"
458"r2 += r4*dy + r6*dx;\n"
459"r3 += r6*dy + r5*dx;\n"
460"float scale =\n"
461"c_border[min(x, BORDER_SIZE)] *\n"
462"c_border[min(y, BORDER_SIZE)] *\n"
463"c_border[min(cols - x - 1, BORDER_SIZE)] *\n"
464"c_border[min(rows - y - 1, BORDER_SIZE)];\n"
465"r2 *= scale;\n"
466"r3 *= scale;\n"
467"r4 *= scale;\n"
468"r5 *= scale;\n"
469"r6 *= scale;\n"
470"M[mad24(y, mStep, x)] = r4*r4 + r6*r6;\n"
471"M[mad24(rows + y, mStep, x)] = (r4 + r5)*r6;\n"
472"M[mad24(2*rows + y, mStep, x)] = r5*r5 + r6*r6;\n"
473"M[mad24(3*rows + y, mStep, x)] = r4*r2 + r6*r3;\n"
474"M[mad24(4*rows + y, mStep, x)] = r6*r2 + r5*r3;\n"
475"}\n"
476"}\n"
477"__kernel void boxFilter5(__global const float * src, int srcStep,\n"
478"__global float * dst, int dstStep,\n"
479"const int rows, const  int cols,\n"
480"const int ksizeHalf,\n"
481"__local float * smem)\n"
482"{\n"
483"const int y = get_global_id(1);\n"
484"const int x = get_global_id(0);\n"
485"const float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));\n"
486"const int smw = bdx + 2*ksizeHalf;\n"
487"__local float *row = smem + 5 * ty * smw;\n"
488"if (y < rows)\n"
489"{\n"
490"for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)\n"
491"{\n"
492"int xExt = (int)(bx * bdx) + i - ksizeHalf;\n"
493"xExt = min(max(xExt, 0), cols - 1);\n"
494"#pragma unroll\n"
495"for (int k = 0; k < 5; ++k)\n"
496"row[k*smw + i] = src[mad24(k*rows + y, srcStep, xExt)];\n"
497"for (int j = 1; j <= ksizeHalf; ++j)\n"
498"#pragma unroll\n"
499"for (int k = 0; k < 5; ++k)\n"
500"row[k*smw + i] +=\n"
501"src[mad24(k*rows + max(y - j, 0), srcStep, xExt)] +\n"
502"src[mad24(k*rows + min(y + j, rows - 1), srcStep, xExt)];\n"
503"}\n"
504"}\n"
505"barrier(CLK_LOCAL_MEM_FENCE);\n"
506"if (y < rows && y >= 0 && x < cols && x >= 0)\n"
507"{\n"
508"row += tx + ksizeHalf;\n"
509"float res[5];\n"
510"#pragma unroll\n"
511"for (int k = 0; k < 5; ++k)\n"
512"res[k] = row[k*smw];\n"
513"for (int i = 1; i <= ksizeHalf; ++i)\n"
514"#pragma unroll\n"
515"for (int k = 0; k < 5; ++k)\n"
516"res[k] += row[k*smw - i] + row[k*smw + i];\n"
517"#pragma unroll\n"
518"for (int k = 0; k < 5; ++k)\n"
519"dst[mad24(k*rows + y, dstStep, x)] = res[k] * boxAreaInv;\n"
520"}\n"
521"}\n"
522"__kernel void updateFlow(__global const float * M, int mStep,\n"
523"__global float * flowx, int xStep,\n"
524"__global float * flowy, int yStep,\n"
525"const int rows, const int cols)\n"
526"{\n"
527"const int y = get_global_id(1);\n"
528"const int x = get_global_id(0);\n"
529"if (y < rows && y >= 0 && x < cols && x >= 0)\n"
530"{\n"
531"float g11 = M[mad24(y, mStep, x)];\n"
532"float g12 = M[mad24(rows + y, mStep, x)];\n"
533"float g22 = M[mad24(2*rows + y, mStep, x)];\n"
534"float h1 =  M[mad24(3*rows + y, mStep, x)];\n"
535"float h2 =  M[mad24(4*rows + y, mStep, x)];\n"
536"float detInv = 1.f / (g11*g22 - g12*g12 + 1e-3f);\n"
537"flowx[mad24(y, xStep, x)] = (g11*h2 - g12*h1) * detInv;\n"
538"flowy[mad24(y, yStep, x)] = (g22*h1 - g12*h2) * detInv;\n"
539"}\n"
540"}\n"
541, "529300e6242f574f83d11a089cc120c0"};
542ProgramSource optical_flow_farneback_oclsrc(optical_flow_farneback.programStr);
543const struct ProgramEntry optical_flow_tvl1={"optical_flow_tvl1",
544"__kernel void centeredGradientKernel(__global const float* src_ptr, int src_col, int src_row, int src_step,\n"
545"__global float* dx, __global float* dy, int d_step)\n"
546"{\n"
547"int x = get_global_id(0);\n"
548"int y = get_global_id(1);\n"
549"if((x < src_col)&&(y < src_row))\n"
550"{\n"
551"int src_x1 = (x + 1) < (src_col -1)? (x + 1) : (src_col - 1);\n"
552"int src_x2 = (x - 1) > 0 ? (x -1) : 0;\n"
553"dx[y * d_step+ x] = 0.5f * (src_ptr[y * src_step + src_x1] - src_ptr[y * src_step+ src_x2]);\n"
554"int src_y1 = (y+1) < (src_row - 1) ? (y + 1) : (src_row - 1);\n"
555"int src_y2 = (y - 1) > 0 ? (y - 1) : 0;\n"
556"dy[y * d_step+ x] = 0.5f * (src_ptr[src_y1 * src_step + x] - src_ptr[src_y2 * src_step+ x]);\n"
557"}\n"
558"}\n"
559"inline float bicubicCoeff(float x_)\n"
560"{\n"
561"float x = fabs(x_);\n"
562"if (x <= 1.0f)\n"
563"return x * x * (1.5f * x - 2.5f) + 1.0f;\n"
564"else if (x < 2.0f)\n"
565"return x * (x * (-0.5f * x + 2.5f) - 4.0f) + 2.0f;\n"
566"else\n"
567"return 0.0f;\n"
568"}\n"
569"__kernel void warpBackwardKernel(__global const float* I0, int I0_step, int I0_col, int I0_row,\n"
570"image2d_t tex_I1, image2d_t tex_I1x, image2d_t tex_I1y,\n"
571"__global const float* u1, int u1_step,\n"
572"__global const float* u2,\n"
573"__global float* I1w,\n"
574"__global float* I1wx, \n"
575"__global float* I1wy, \n"
576"__global float* grad, \n"
577"__global float* rho,\n"
578"int I1w_step,\n"
579"int u2_step,\n"
580"int u1_offset_x,\n"
581"int u1_offset_y,\n"
582"int u2_offset_x,\n"
583"int u2_offset_y)\n"
584"{\n"
585"int x = get_global_id(0);\n"
586"int y = get_global_id(1);\n"
587"if(x < I0_col&&y < I0_row)\n"
588"{\n"
589"float u1Val = u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];\n"
590"float u2Val = u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];\n"
591"float wx = x + u1Val;\n"
592"float wy = y + u2Val;\n"
593"int xmin = ceil(wx - 2.0f);\n"
594"int xmax = floor(wx + 2.0f);\n"
595"int ymin = ceil(wy - 2.0f);\n"
596"int ymax = floor(wy + 2.0f);\n"
597"float sum  = 0.0f;\n"
598"float sumx = 0.0f;\n"
599"float sumy = 0.0f;\n"
600"float wsum = 0.0f;\n"
601"sampler_t sampleri = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
602"for (int cy = ymin; cy <= ymax; ++cy)\n"
603"{\n"
604"for (int cx = xmin; cx <= xmax; ++cx)\n"
605"{\n"
606"float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy);\n"
607"int2 cood = (int2)(cx, cy);\n"
608"sum += w * read_imagef(tex_I1, sampleri, cood).x;\n"
609"sumx += w * read_imagef(tex_I1x, sampleri, cood).x;\n"
610"sumy += w * read_imagef(tex_I1y, sampleri, cood).x;\n"
611"wsum += w;\n"
612"}\n"
613"}\n"
614"float coeff = 1.0f / wsum;\n"
615"float I1wVal  = sum  * coeff;\n"
616"float I1wxVal = sumx * coeff;\n"
617"float I1wyVal = sumy * coeff;\n"
618"I1w[y * I1w_step + x]  = I1wVal;\n"
619"I1wx[y * I1w_step + x] = I1wxVal;\n"
620"I1wy[y * I1w_step + x] = I1wyVal;\n"
621"float Ix2 = I1wxVal * I1wxVal;\n"
622"float Iy2 = I1wyVal * I1wyVal;\n"
623"grad[y * I1w_step + x] = Ix2 + Iy2;\n"
624"float I0Val = I0[y * I0_step + x];\n"
625"rho[y * I1w_step + x] = I1wVal - I1wxVal * u1Val - I1wyVal * u2Val - I0Val;\n"
626"}\n"
627"}\n"
628"inline float readImage(__global float *image,  int x,  int y,  int rows,  int cols, int elemCntPerRow)\n"
629"{\n"
630"int i0 = clamp(x, 0, cols - 1);\n"
631"int j0 = clamp(y, 0, rows - 1);\n"
632"return image[j0 * elemCntPerRow + i0];\n"
633"}\n"
634"__kernel void warpBackwardKernelNoImage2d(__global const float* I0, int I0_step, int I0_col, int I0_row,\n"
635"__global const float* tex_I1, __global const float* tex_I1x, __global const float* tex_I1y,\n"
636"__global const float* u1, int u1_step,\n"
637"__global const float* u2,\n"
638"__global float* I1w,\n"
639"__global float* I1wx, \n"
640"__global float* I1wy, \n"
641"__global float* grad, \n"
642"__global float* rho,\n"
643"int I1w_step,\n"
644"int u2_step,\n"
645"int I1_step,\n"
646"int I1x_step)\n"
647"{\n"
648"int x = get_global_id(0);\n"
649"int y = get_global_id(1);\n"
650"if(x < I0_col&&y < I0_row)\n"
651"{\n"
652"float u1Val = u1[y * u1_step + x];\n"
653"float u2Val = u2[y * u2_step + x];\n"
654"float wx = x + u1Val;\n"
655"float wy = y + u2Val;\n"
656"int xmin = ceil(wx - 2.0f);\n"
657"int xmax = floor(wx + 2.0f);\n"
658"int ymin = ceil(wy - 2.0f);\n"
659"int ymax = floor(wy + 2.0f);\n"
660"float sum  = 0.0f;\n"
661"float sumx = 0.0f;\n"
662"float sumy = 0.0f;\n"
663"float wsum = 0.0f;\n"
664"for (int cy = ymin; cy <= ymax; ++cy)\n"
665"{\n"
666"for (int cx = xmin; cx <= xmax; ++cx)\n"
667"{\n"
668"float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy);\n"
669"int2 cood = (int2)(cx, cy);\n"
670"sum += w * readImage(tex_I1, cood.x, cood.y, I0_col, I0_row, I1_step);\n"
671"sumx += w * readImage(tex_I1x, cood.x, cood.y, I0_col, I0_row, I1x_step);\n"
672"sumy += w * readImage(tex_I1y, cood.x, cood.y, I0_col, I0_row, I1x_step);\n"
673"wsum += w;\n"
674"}\n"
675"}\n"
676"float coeff = 1.0f / wsum;\n"
677"float I1wVal  = sum  * coeff;\n"
678"float I1wxVal = sumx * coeff;\n"
679"float I1wyVal = sumy * coeff;\n"
680"I1w[y * I1w_step + x]  = I1wVal;\n"
681"I1wx[y * I1w_step + x] = I1wxVal;\n"
682"I1wy[y * I1w_step + x] = I1wyVal;\n"
683"float Ix2 = I1wxVal * I1wxVal;\n"
684"float Iy2 = I1wyVal * I1wyVal;\n"
685"grad[y * I1w_step + x] = Ix2 + Iy2;\n"
686"float I0Val = I0[y * I0_step + x];\n"
687"rho[y * I1w_step + x] = I1wVal - I1wxVal * u1Val - I1wyVal * u2Val - I0Val;\n"
688"}\n"
689"}\n"
690"__kernel void estimateDualVariablesKernel(__global const float* u1, int u1_col, int u1_row, int u1_step,\n"
691"__global const float* u2,\n"
692"__global float* p11, int p11_step,\n"
693"__global float* p12,\n"
694"__global float* p21,\n"
695"__global float* p22,\n"
696"float taut,\n"
697"int u2_step,\n"
698"int u1_offset_x,\n"
699"int u1_offset_y,\n"
700"int u2_offset_x,\n"
701"int u2_offset_y)\n"
702"{\n"
703"int x = get_global_id(0);\n"
704"int y = get_global_id(1);\n"
705"if(x < u1_col && y < u1_row)\n"
706"{\n"
707"int src_x1 = (x + 1) < (u1_col - 1) ? (x + 1) : (u1_col - 1);\n"
708"float u1x = u1[(y + u1_offset_y) * u1_step + src_x1 + u1_offset_x] - u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];\n"
709"int src_y1 = (y + 1) < (u1_row - 1) ? (y + 1) : (u1_row - 1);\n"
710"float u1y = u1[(src_y1 + u1_offset_y) * u1_step + x + u1_offset_x] - u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];\n"
711"int src_x2 = (x + 1) < (u1_col - 1) ? (x + 1) : (u1_col - 1);\n"
712"float u2x = u2[(y + u2_offset_y) * u2_step + src_x2 + u2_offset_x] - u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];\n"
713"int src_y2 = (y + 1) <  (u1_row - 1) ? (y + 1) : (u1_row - 1);\n"
714"float u2y = u2[(src_y2 + u2_offset_y) * u2_step + x + u2_offset_x] - u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];\n"
715"float g1 = hypot(u1x, u1y);\n"
716"float g2 = hypot(u2x, u2y);\n"
717"float ng1 = 1.0f + taut * g1;\n"
718"float ng2 = 1.0f + taut * g2;\n"
719"p11[y * p11_step + x] = (p11[y * p11_step + x] + taut * u1x) / ng1;\n"
720"p12[y * p11_step + x] = (p12[y * p11_step + x] + taut * u1y) / ng1;\n"
721"p21[y * p11_step + x] = (p21[y * p11_step + x] + taut * u2x) / ng2;\n"
722"p22[y * p11_step + x] = (p22[y * p11_step + x] + taut * u2y) / ng2;\n"
723"}\n"
724"}\n"
725"inline float divergence(__global const float* v1, __global const float* v2, int y, int x, int v1_step, int v2_step)\n"
726"{\n"
727"if (x > 0 && y > 0)\n"
728"{\n"
729"float v1x = v1[y * v1_step + x] - v1[y * v1_step + x - 1];\n"
730"float v2y = v2[y * v2_step + x] - v2[(y - 1) * v2_step + x];\n"
731"return v1x + v2y;\n"
732"}\n"
733"else\n"
734"{\n"
735"if (y > 0)\n"
736"return v1[y * v1_step + 0] + v2[y * v2_step + 0] - v2[(y - 1) * v2_step + 0];\n"
737"else\n"
738"{\n"
739"if (x > 0)\n"
740"return v1[0 * v1_step + x] - v1[0 * v1_step + x - 1] + v2[0 * v2_step + x];\n"
741"else\n"
742"return v1[0 * v1_step + 0] + v2[0 * v2_step + 0];\n"
743"}\n"
744"}\n"
745"}\n"
746"__kernel void estimateUKernel(__global const float* I1wx, int I1wx_col, int I1wx_row, int I1wx_step,\n"
747"__global const float* I1wy, \n"
748"__global const float* grad, \n"
749"__global const float* rho_c, \n"
750"__global const float* p11, \n"
751"__global const float* p12, \n"
752"__global const float* p21, \n"
753"__global const float* p22, \n"
754"__global float* u1, int u1_step,\n"
755"__global float* u2,\n"
756"__global float* error, float l_t, float theta, int u2_step,\n"
757"int u1_offset_x,\n"
758"int u1_offset_y,\n"
759"int u2_offset_x,\n"
760"int u2_offset_y,\n"
761"char calc_error)\n"
762"{\n"
763"int x = get_global_id(0);\n"
764"int y = get_global_id(1);\n"
765"if(x < I1wx_col && y < I1wx_row)\n"
766"{\n"
767"float I1wxVal = I1wx[y * I1wx_step + x];\n"
768"float I1wyVal = I1wy[y * I1wx_step + x];\n"
769"float gradVal = grad[y * I1wx_step + x];\n"
770"float u1OldVal = u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];\n"
771"float u2OldVal = u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];\n"
772"float rho = rho_c[y * I1wx_step + x] + (I1wxVal * u1OldVal + I1wyVal * u2OldVal);\n"
773"float d1 = 0.0f;\n"
774"float d2 = 0.0f;\n"
775"if (rho < -l_t * gradVal)\n"
776"{\n"
777"d1 = l_t * I1wxVal;\n"
778"d2 = l_t * I1wyVal;\n"
779"}\n"
780"else if (rho > l_t * gradVal)\n"
781"{\n"
782"d1 = -l_t * I1wxVal;\n"
783"d2 = -l_t * I1wyVal;\n"
784"}\n"
785"else if (gradVal > 1.192092896e-07f)\n"
786"{\n"
787"float fi = -rho / gradVal;\n"
788"d1 = fi * I1wxVal;\n"
789"d2 = fi * I1wyVal;\n"
790"}\n"
791"float v1 = u1OldVal + d1;\n"
792"float v2 = u2OldVal + d2;\n"
793"float div_p1 = divergence(p11, p12, y, x, I1wx_step, I1wx_step);\n"
794"float div_p2 = divergence(p21, p22, y, x, I1wx_step, I1wx_step);\n"
795"float u1NewVal = v1 + theta * div_p1;\n"
796"float u2NewVal = v2 + theta * div_p2;\n"
797"u1[(y + u1_offset_y) * u1_step + x + u1_offset_x] = u1NewVal;\n"
798"u2[(y + u2_offset_y) * u2_step + x + u2_offset_x] = u2NewVal;\n"
799"if(calc_error)\n"
800"{\n"
801"float n1 = (u1OldVal - u1NewVal) * (u1OldVal - u1NewVal);\n"
802"float n2 = (u2OldVal - u2NewVal) * (u2OldVal - u2NewVal);\n"
803"error[y * I1wx_step + x] = n1 + n2;\n"
804"}\n"
805"}\n"
806"}\n"
807, "a9d306a49b405703820fae23312ebd28"};
808ProgramSource optical_flow_tvl1_oclsrc(optical_flow_tvl1.programStr);
809const struct ProgramEntry pyrlk={"pyrlk",
810"#define GRIDSIZE    3\n"
811"#define LSx 8\n"
812"#define LSy 8\n"
813"#define LM_W (LSx*GRIDSIZE+2)\n"
814"#define LM_H (LSy*GRIDSIZE+2)\n"
815"#define BUFFER  (LSx*LSy)\n"
816"#define BUFFER2 BUFFER>>1\n"
817"#ifndef WAVE_SIZE\n"
818"#define WAVE_SIZE 1\n"
819"#endif\n"
820"#ifdef CPU\n"
821"inline void reduce3(float val1, float val2, float val3,  __local float* smem1,  __local float* smem2,  __local float* smem3, int tid)\n"
822"{\n"
823"smem1[tid] = val1;\n"
824"smem2[tid] = val2;\n"
825"smem3[tid] = val3;\n"
826"barrier(CLK_LOCAL_MEM_FENCE);\n"
827"for(int i = BUFFER2; i > 0; i >>= 1)\n"
828"{\n"
829"if(tid < i)\n"
830"{\n"
831"smem1[tid] += smem1[tid + i];\n"
832"smem2[tid] += smem2[tid + i];\n"
833"smem3[tid] += smem3[tid + i];\n"
834"}\n"
835"barrier(CLK_LOCAL_MEM_FENCE);\n"
836"}\n"
837"}\n"
838"inline void reduce2(float val1, float val2, volatile __local float* smem1, volatile __local float* smem2, int tid)\n"
839"{\n"
840"smem1[tid] = val1;\n"
841"smem2[tid] = val2;\n"
842"barrier(CLK_LOCAL_MEM_FENCE);\n"
843"for(int i = BUFFER2; i > 0; i >>= 1)\n"
844"{\n"
845"if(tid < i)\n"
846"{\n"
847"smem1[tid] += smem1[tid + i];\n"
848"smem2[tid] += smem2[tid + i];\n"
849"}\n"
850"barrier(CLK_LOCAL_MEM_FENCE);\n"
851"}\n"
852"}\n"
853"inline void reduce1(float val1, volatile __local float* smem1, int tid)\n"
854"{\n"
855"smem1[tid] = val1;\n"
856"barrier(CLK_LOCAL_MEM_FENCE);\n"
857"for(int i = BUFFER2; i > 0; i >>= 1)\n"
858"{\n"
859"if(tid < i)\n"
860"{\n"
861"smem1[tid] += smem1[tid + i];\n"
862"}\n"
863"barrier(CLK_LOCAL_MEM_FENCE);\n"
864"}\n"
865"}\n"
866"#else\n"
867"inline void reduce3(float val1, float val2, float val3,\n"
868"__local volatile float* smem1, __local volatile float* smem2, __local volatile float* smem3, int tid)\n"
869"{\n"
870"smem1[tid] = val1;\n"
871"smem2[tid] = val2;\n"
872"smem3[tid] = val3;\n"
873"barrier(CLK_LOCAL_MEM_FENCE);\n"
874"if (tid < 32)\n"
875"{\n"
876"smem1[tid] += smem1[tid + 32];\n"
877"smem2[tid] += smem2[tid + 32];\n"
878"smem3[tid] += smem3[tid + 32];\n"
879"#if WAVE_SIZE < 32\n"
880"}\n"
881"barrier(CLK_LOCAL_MEM_FENCE);\n"
882"if (tid < 16)\n"
883"{\n"
884"#endif\n"
885"smem1[tid] += smem1[tid + 16];\n"
886"smem2[tid] += smem2[tid + 16];\n"
887"smem3[tid] += smem3[tid + 16];\n"
888"#if WAVE_SIZE <16\n"
889"}\n"
890"barrier(CLK_LOCAL_MEM_FENCE);\n"
891"if (tid<1)\n"
892"{\n"
893"#endif\n"
894"local float8* m1 = (local float8*)smem1;\n"
895"local float8* m2 = (local float8*)smem2;\n"
896"local float8* m3 = (local float8*)smem3;\n"
897"float8 t1 = m1[0]+m1[1];\n"
898"float8 t2 = m2[0]+m2[1];\n"
899"float8 t3 = m3[0]+m3[1];\n"
900"float4 t14 = t1.lo + t1.hi;\n"
901"float4 t24 = t2.lo + t2.hi;\n"
902"float4 t34 = t3.lo + t3.hi;\n"
903"smem1[0] = t14.x+t14.y+t14.z+t14.w;\n"
904"smem2[0] = t24.x+t24.y+t24.z+t24.w;\n"
905"smem3[0] = t34.x+t34.y+t34.z+t34.w;\n"
906"}\n"
907"barrier(CLK_LOCAL_MEM_FENCE);\n"
908"}\n"
909"inline void reduce2(float val1, float val2, __local volatile float* smem1, __local volatile float* smem2, int tid)\n"
910"{\n"
911"smem1[tid] = val1;\n"
912"smem2[tid] = val2;\n"
913"barrier(CLK_LOCAL_MEM_FENCE);\n"
914"if (tid < 32)\n"
915"{\n"
916"smem1[tid] += smem1[tid + 32];\n"
917"smem2[tid] += smem2[tid + 32];\n"
918"#if WAVE_SIZE < 32\n"
919"}\n"
920"barrier(CLK_LOCAL_MEM_FENCE);\n"
921"if (tid < 16)\n"
922"{\n"
923"#endif\n"
924"smem1[tid] += smem1[tid + 16];\n"
925"smem2[tid] += smem2[tid + 16];\n"
926"#if WAVE_SIZE <16\n"
927"}\n"
928"barrier(CLK_LOCAL_MEM_FENCE);\n"
929"if (tid<1)\n"
930"{\n"
931"#endif\n"
932"local float8* m1 = (local float8*)smem1;\n"
933"local float8* m2 = (local float8*)smem2;\n"
934"float8 t1 = m1[0]+m1[1];\n"
935"float8 t2 = m2[0]+m2[1];\n"
936"float4 t14 = t1.lo + t1.hi;\n"
937"float4 t24 = t2.lo + t2.hi;\n"
938"smem1[0] = t14.x+t14.y+t14.z+t14.w;\n"
939"smem2[0] = t24.x+t24.y+t24.z+t24.w;\n"
940"}\n"
941"barrier(CLK_LOCAL_MEM_FENCE);\n"
942"}\n"
943"inline void reduce1(float val1, __local volatile float* smem1, int tid)\n"
944"{\n"
945"smem1[tid] = val1;\n"
946"barrier(CLK_LOCAL_MEM_FENCE);\n"
947"if (tid < 32)\n"
948"{\n"
949"smem1[tid] += smem1[tid + 32];\n"
950"#if WAVE_SIZE < 32\n"
951"}\n"
952"barrier(CLK_LOCAL_MEM_FENCE);\n"
953"if (tid < 16)\n"
954"{\n"
955"#endif\n"
956"smem1[tid] += smem1[tid + 16];\n"
957"#if WAVE_SIZE <16\n"
958"}\n"
959"barrier(CLK_LOCAL_MEM_FENCE);\n"
960"if (tid<1)\n"
961"{\n"
962"#endif\n"
963"local float8* m1 = (local float8*)smem1;\n"
964"float8 t1 = m1[0]+m1[1];\n"
965"float4 t14 = t1.lo + t1.hi;\n"
966"smem1[0] = t14.x+t14.y+t14.z+t14.w;\n"
967"}\n"
968"barrier(CLK_LOCAL_MEM_FENCE);\n"
969"}\n"
970"#endif\n"
971"#define SCALE (1.0f / (1 << 20))\n"
972"#define  THRESHOLD  0.01f\n"
973"__constant sampler_t sampler    = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;\n"
974"#define VAL(_y,_x,_yy,_xx)    (IPatchLocal[(yid+((_y)*LSy)+1+(_yy))*LM_W+(xid+((_x)*LSx)+1+(_xx))])\n"
975"inline void SetPatch(local float* IPatchLocal, int TileY, int TileX,\n"
976"float* Pch, float* Dx, float* Dy,\n"
977"float* A11, float* A12, float* A22, float w)\n"
978"{\n"
979"unsigned int xid=get_local_id(0);\n"
980"unsigned int yid=get_local_id(1);\n"
981"*Pch = VAL(TileY,TileX,0,0);\n"
982"float dIdx = (3.0f*VAL(TileY,TileX,-1,1)+10.0f*VAL(TileY,TileX,0,1)+3.0f*VAL(TileY,TileX,+1,1))-(3.0f*VAL(TileY,TileX,-1,-1)+10.0f*VAL(TileY,TileX,0,-1)+3.0f*VAL(TileY,TileX,+1,-1));\n"
983"float dIdy = (3.0f*VAL(TileY,TileX,1,-1)+10.0f*VAL(TileY,TileX,1,0)+3.0f*VAL(TileY,TileX,1,+1))-(3.0f*VAL(TileY,TileX,-1,-1)+10.0f*VAL(TileY,TileX,-1,0)+3.0f*VAL(TileY,TileX,-1,+1));\n"
984"dIdx *= w;\n"
985"dIdy *= w;\n"
986"*Dx = dIdx;\n"
987"*Dy = dIdy;\n"
988"*A11 += dIdx * dIdx;\n"
989"*A12 += dIdx * dIdy;\n"
990"*A22 += dIdy * dIdy;\n"
991"}\n"
992"#undef VAL\n"
993"inline void GetPatch(image2d_t J, float x, float y,\n"
994"float* Pch, float* Dx, float* Dy,\n"
995"float* b1, float* b2)\n"
996"{\n"
997"float J_val = read_imagef(J, sampler, (float2)(x, y)).x;\n"
998"float diff = (J_val - *Pch) * 32.0f;\n"
999"*b1 += diff**Dx;\n"
1000"*b2 += diff**Dy;\n"
1001"}\n"
1002"inline void GetError(image2d_t J, const float x, const float y, const float* Pch, float* errval)\n"
1003"{\n"
1004"float diff = read_imagef(J, sampler, (float2)(x,y)).x-*Pch;\n"
1005"*errval += fabs(diff);\n"
1006"}\n"
1007"#define READI(_y,_x)    IPatchLocal[(yid+((_y)*LSy))*LM_W+(xid+((_x)*LSx))] = read_imagef(I, sampler, (float2)(Point.x + xid+(_x)*LSx + 0.5f-1, Point.y + yid+(_y)*LSy+ 0.5f-1)).x;\n"
1008"void ReadPatchIToLocalMem(image2d_t I, float2 Point, local float* IPatchLocal)\n"
1009"{\n"
1010"unsigned int xid=get_local_id(0);\n"
1011"unsigned int yid=get_local_id(1);\n"
1012"READI(0,0);READI(0,1);READI(0,2);\n"
1013"READI(1,0);READI(1,1);READI(1,2);\n"
1014"READI(2,0);READI(2,1);READI(2,2);\n"
1015"if(xid<2)\n"
1016"{\n"
1017"READI(0,3);\n"
1018"READI(1,3);\n"
1019"READI(2,3);\n"
1020"}\n"
1021"if(yid<2)\n"
1022"{\n"
1023"READI(3,0);READI(3,1);READI(3,2);\n"
1024"}\n"
1025"if(yid<2 && xid<2)\n"
1026"{\n"
1027"READI(3,3);\n"
1028"}\n"
1029"barrier(CLK_LOCAL_MEM_FENCE);\n"
1030"}\n"
1031"#undef READI\n"
1032"__attribute__((reqd_work_group_size(LSx, LSy, 1)))\n"
1033"__kernel void lkSparse(image2d_t I, image2d_t J,\n"
1034"__global const float2* prevPts, __global float2* nextPts, __global uchar* status, __global float* err,\n"
1035"const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr)\n"
1036"{\n"
1037"__local float smem1[BUFFER];\n"
1038"__local float smem2[BUFFER];\n"
1039"__local float smem3[BUFFER];\n"
1040"unsigned int xid=get_local_id(0);\n"
1041"unsigned int yid=get_local_id(1);\n"
1042"unsigned int gid=get_group_id(0);\n"
1043"unsigned int xsize=get_local_size(0);\n"
1044"unsigned int ysize=get_local_size(1);\n"
1045"int xBase, yBase, k;\n"
1046"float wx = ((xid+2*xsize)<c_winSize_x)?1:0;\n"
1047"float wy = ((yid+2*ysize)<c_winSize_y)?1:0;\n"
1048"float2 c_halfWin = (float2)((c_winSize_x - 1)>>1, (c_winSize_y - 1)>>1);\n"
1049"const int tid = mad24(yid, xsize, xid);\n"
1050"float2 prevPt = prevPts[gid] / (float2)(1 << level);\n"
1051"if (prevPt.x < 0 || prevPt.x >= cols || prevPt.y < 0 || prevPt.y >= rows)\n"
1052"{\n"
1053"if (tid == 0 && level == 0)\n"
1054"{\n"
1055"status[gid] = 0;\n"
1056"}\n"
1057"return;\n"
1058"}\n"
1059"prevPt -= c_halfWin;\n"
1060"float A11 = 0;\n"
1061"float A12 = 0;\n"
1062"float A22 = 0;\n"
1063"float I_patch[GRIDSIZE][GRIDSIZE];\n"
1064"float dIdx_patch[GRIDSIZE][GRIDSIZE];\n"
1065"float dIdy_patch[GRIDSIZE][GRIDSIZE];\n"
1066"local float IPatchLocal[LM_W*LM_H];\n"
1067"ReadPatchIToLocalMem(I,prevPt,IPatchLocal);\n"
1068"{\n"
1069"SetPatch(IPatchLocal, 0, 0,\n"
1070"&I_patch[0][0], &dIdx_patch[0][0], &dIdy_patch[0][0],\n"
1071"&A11, &A12, &A22,1);\n"
1072"SetPatch(IPatchLocal, 0, 1,\n"
1073"&I_patch[0][1], &dIdx_patch[0][1], &dIdy_patch[0][1],\n"
1074"&A11, &A12, &A22,1);\n"
1075"SetPatch(IPatchLocal, 0, 2,\n"
1076"&I_patch[0][2], &dIdx_patch[0][2], &dIdy_patch[0][2],\n"
1077"&A11, &A12, &A22,wx);\n"
1078"}\n"
1079"{\n"
1080"SetPatch(IPatchLocal, 1, 0,\n"
1081"&I_patch[1][0], &dIdx_patch[1][0], &dIdy_patch[1][0],\n"
1082"&A11, &A12, &A22,1);\n"
1083"SetPatch(IPatchLocal, 1,1,\n"
1084"&I_patch[1][1], &dIdx_patch[1][1], &dIdy_patch[1][1],\n"
1085"&A11, &A12, &A22,1);\n"
1086"SetPatch(IPatchLocal, 1,2,\n"
1087"&I_patch[1][2], &dIdx_patch[1][2], &dIdy_patch[1][2],\n"
1088"&A11, &A12, &A22,wx);\n"
1089"}\n"
1090"{\n"
1091"SetPatch(IPatchLocal, 2,0,\n"
1092"&I_patch[2][0], &dIdx_patch[2][0], &dIdy_patch[2][0],\n"
1093"&A11, &A12, &A22,wy);\n"
1094"SetPatch(IPatchLocal, 2,1,\n"
1095"&I_patch[2][1], &dIdx_patch[2][1], &dIdy_patch[2][1],\n"
1096"&A11, &A12, &A22,wy);\n"
1097"SetPatch(IPatchLocal, 2,2,\n"
1098"&I_patch[2][2], &dIdx_patch[2][2], &dIdy_patch[2][2],\n"
1099"&A11, &A12, &A22,wx*wy);\n"
1100"}\n"
1101"reduce3(A11, A12, A22, smem1, smem2, smem3, tid);\n"
1102"A11 = smem1[0];\n"
1103"A12 = smem2[0];\n"
1104"A22 = smem3[0];\n"
1105"barrier(CLK_LOCAL_MEM_FENCE);\n"
1106"float D = A11 * A22 - A12 * A12;\n"
1107"if (D < 1.192092896e-07f)\n"
1108"{\n"
1109"if (tid == 0 && level == 0)\n"
1110"status[gid] = 0;\n"
1111"return;\n"
1112"}\n"
1113"A11 /= D;\n"
1114"A12 /= D;\n"
1115"A22 /= D;\n"
1116"prevPt = nextPts[gid] * 2.0f - c_halfWin;\n"
1117"for (k = 0; k < c_iters; ++k)\n"
1118"{\n"
1119"if (prevPt.x < -c_halfWin.x || prevPt.x >= cols || prevPt.y < -c_halfWin.y || prevPt.y >= rows)\n"
1120"{\n"
1121"if (tid == 0 && level == 0)\n"
1122"status[gid] = 0;\n"
1123"break;\n"
1124"}\n"
1125"float b1 = 0;\n"
1126"float b2 = 0;\n"
1127"yBase=yid;\n"
1128"{\n"
1129"xBase=xid;\n"
1130"GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n"
1131"&I_patch[0][0], &dIdx_patch[0][0], &dIdy_patch[0][0],\n"
1132"&b1, &b2);\n"
1133"xBase+=xsize;\n"
1134"GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n"
1135"&I_patch[0][1], &dIdx_patch[0][1], &dIdy_patch[0][1],\n"
1136"&b1, &b2);\n"
1137"xBase+=xsize;\n"
1138"GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n"
1139"&I_patch[0][2], &dIdx_patch[0][2], &dIdy_patch[0][2],\n"
1140"&b1, &b2);\n"
1141"}\n"
1142"yBase+=ysize;\n"
1143"{\n"
1144"xBase=xid;\n"
1145"GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n"
1146"&I_patch[1][0], &dIdx_patch[1][0], &dIdy_patch[1][0],\n"
1147"&b1, &b2);\n"
1148"xBase+=xsize;\n"
1149"GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n"
1150"&I_patch[1][1], &dIdx_patch[1][1], &dIdy_patch[1][1],\n"
1151"&b1, &b2);\n"
1152"xBase+=xsize;\n"
1153"GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n"
1154"&I_patch[1][2], &dIdx_patch[1][2], &dIdy_patch[1][2],\n"
1155"&b1, &b2);\n"
1156"}\n"
1157"yBase+=ysize;\n"
1158"{\n"
1159"xBase=xid;\n"
1160"GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n"
1161"&I_patch[2][0], &dIdx_patch[2][0], &dIdy_patch[2][0],\n"
1162"&b1, &b2);\n"
1163"xBase+=xsize;\n"
1164"GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n"
1165"&I_patch[2][1], &dIdx_patch[2][1], &dIdy_patch[2][1],\n"
1166"&b1, &b2);\n"
1167"xBase+=xsize;\n"
1168"GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n"
1169"&I_patch[2][2], &dIdx_patch[2][2], &dIdy_patch[2][2],\n"
1170"&b1, &b2);\n"
1171"}\n"
1172"reduce2(b1, b2, smem1, smem2, tid);\n"
1173"b1 = smem1[0];\n"
1174"b2 = smem2[0];\n"
1175"barrier(CLK_LOCAL_MEM_FENCE);\n"
1176"float2 delta;\n"
1177"delta.x = A12 * b2 - A22 * b1;\n"
1178"delta.y = A12 * b1 - A11 * b2;\n"
1179"prevPt += delta;\n"
1180"if (fabs(delta.x) < THRESHOLD && fabs(delta.y) < THRESHOLD)\n"
1181"break;\n"
1182"}\n"
1183"D = 0.0f;\n"
1184"if (calcErr)\n"
1185"{\n"
1186"yBase=yid;\n"
1187"{\n"
1188"xBase=xid;\n"
1189"GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n"
1190"&I_patch[0][0], &D);\n"
1191"xBase+=xsize;\n"
1192"GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n"
1193"&I_patch[0][1], &D);\n"
1194"xBase+=xsize;\n"
1195"if(xBase<c_winSize_x)\n"
1196"GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n"
1197"&I_patch[0][2], &D);\n"
1198"}\n"
1199"yBase+=ysize;\n"
1200"{\n"
1201"xBase=xid;\n"
1202"GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n"
1203"&I_patch[1][0], &D);\n"
1204"xBase+=xsize;\n"
1205"GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n"
1206"&I_patch[1][1], &D);\n"
1207"xBase+=xsize;\n"
1208"if(xBase<c_winSize_x)\n"
1209"GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n"
1210"&I_patch[1][2], &D);\n"
1211"}\n"
1212"yBase+=ysize;\n"
1213"if(yBase<c_winSize_y)\n"
1214"{\n"
1215"xBase=xid;\n"
1216"GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n"
1217"&I_patch[2][0], &D);\n"
1218"xBase+=xsize;\n"
1219"GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n"
1220"&I_patch[2][1], &D);\n"
1221"xBase+=xsize;\n"
1222"if(xBase<c_winSize_x)\n"
1223"GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n"
1224"&I_patch[2][2], &D);\n"
1225"}\n"
1226"reduce1(D, smem1, tid);\n"
1227"}\n"
1228"if (tid == 0)\n"
1229"{\n"
1230"prevPt += c_halfWin;\n"
1231"nextPts[gid] = prevPt;\n"
1232"if (calcErr)\n"
1233"err[gid] = smem1[0] / (float)(c_winSize_x * c_winSize_y);\n"
1234"}\n"
1235"}\n"
1236, "b7099fcbc60bd5528dacc491eadd88c1"};
1237ProgramSource pyrlk_oclsrc(pyrlk.programStr);
1238}
1239}}
1240