1// This file is auto-generated. Do not edit! 2 3#include "precomp.hpp" 4#include "opencl_kernels_video.hpp" 5 6namespace cv 7{ 8namespace ocl 9{ 10namespace video 11{ 12 13const struct ProgramEntry bgfg_mog2={"bgfg_mog2", 14"#if CN==1\n" 15"#define T_MEAN float\n" 16"#define F_ZERO (0.0f)\n" 17"#define cnMode 1\n" 18"#define frameToMean(a, b) (b) = *(a);\n" 19"#define meanToFrame(a, b) *b = convert_uchar_sat(a);\n" 20"inline float sum(float val)\n" 21"{\n" 22"return val;\n" 23"}\n" 24"#else\n" 25"#define T_MEAN float4\n" 26"#define F_ZERO (0.0f, 0.0f, 0.0f, 0.0f)\n" 27"#define cnMode 4\n" 28"#define meanToFrame(a, b)\\\n" 29"b[0] = convert_uchar_sat(a.x); \\\n" 30"b[1] = convert_uchar_sat(a.y); \\\n" 31"b[2] = convert_uchar_sat(a.z);\n" 32"#define frameToMean(a, b)\\\n" 33"b.x = a[0]; \\\n" 34"b.y = a[1]; \\\n" 35"b.z = a[2]; \\\n" 36"b.w = 0.0f;\n" 37"inline float sum(const float4 val)\n" 38"{\n" 39"return (val.x + val.y + val.z);\n" 40"}\n" 41"#endif\n" 42"__kernel void mog2_kernel(__global const uchar* frame, int frame_step, int frame_offset, int frame_row, int frame_col,\n" 43"__global uchar* modesUsed,\n" 44"__global uchar* weight,\n" 45"__global uchar* mean,\n" 46"__global uchar* variance,\n" 47"__global uchar* fgmask, int fgmask_step, int fgmask_offset,\n" 48"float alphaT, float alpha1, float prune,\n" 49"float c_Tb, float c_TB, float c_Tg, float c_varMin,\n" 50"float c_varMax, float c_varInit, float c_tau\n" 51"#ifdef SHADOW_DETECT\n" 52", uchar c_shadowVal\n" 53"#endif\n" 54")\n" 55"{\n" 56"int x = get_global_id(0);\n" 57"int y = get_global_id(1);\n" 58"if( x < frame_col && y < frame_row)\n" 59"{\n" 60"__global const uchar* _frame = (frame + mad24(y, frame_step, mad24(x, CN, frame_offset)));\n" 61"T_MEAN pix;\n" 62"frameToMean(_frame, pix);\n" 63"uchar foreground = 255;\n" 64"bool fitsPDF = false;\n" 65"int pt_idx = mad24(y, frame_col, x);\n" 66"int idx_step = frame_row * frame_col;\n" 67"__global uchar* _modesUsed = modesUsed + pt_idx;\n" 68"uchar nmodes = _modesUsed[0];\n" 69"float totalWeight = 0.0f;\n" 70"__global float* _weight = (__global float*)(weight);\n" 71"__global float* _variance = (__global float*)(variance);\n" 72"__global T_MEAN* _mean = (__global T_MEAN*)(mean);\n" 73"uchar mode = 0;\n" 74"for (; mode < nmodes; ++mode)\n" 75"{\n" 76"int mode_idx = mad24(mode, idx_step, pt_idx);\n" 77"float c_weight = mad(alpha1, _weight[mode_idx], prune);\n" 78"float c_var = _variance[mode_idx];\n" 79"T_MEAN c_mean = _mean[mode_idx];\n" 80"T_MEAN diff = c_mean - pix;\n" 81"float dist2 = dot(diff, diff);\n" 82"if (totalWeight < c_TB && dist2 < c_Tb * c_var)\n" 83"foreground = 0;\n" 84"if (dist2 < c_Tg * c_var)\n" 85"{\n" 86"fitsPDF = true;\n" 87"c_weight += alphaT;\n" 88"float k = alphaT / c_weight;\n" 89"T_MEAN mean_new = mad((T_MEAN)-k, diff, c_mean);\n" 90"float variance_new = clamp(mad(k, (dist2 - c_var), c_var), c_varMin, c_varMax);\n" 91"for (int i = mode; i > 0; --i)\n" 92"{\n" 93"int prev_idx = mode_idx - idx_step;\n" 94"if (c_weight < _weight[prev_idx])\n" 95"break;\n" 96"_weight[mode_idx] = _weight[prev_idx];\n" 97"_variance[mode_idx] = _variance[prev_idx];\n" 98"_mean[mode_idx] = _mean[prev_idx];\n" 99"mode_idx = prev_idx;\n" 100"}\n" 101"_mean[mode_idx] = mean_new;\n" 102"_variance[mode_idx] = variance_new;\n" 103"_weight[mode_idx] = c_weight;\n" 104"totalWeight += c_weight;\n" 105"mode ++;\n" 106"break;\n" 107"}\n" 108"if (c_weight < -prune)\n" 109"c_weight = 0.0f;\n" 110"_weight[mode_idx] = c_weight;\n" 111"totalWeight += c_weight;\n" 112"}\n" 113"for (; mode < nmodes; ++mode)\n" 114"{\n" 115"int mode_idx = mad24(mode, idx_step, pt_idx);\n" 116"float c_weight = mad(alpha1, _weight[mode_idx], prune);\n" 117"if (c_weight < -prune)\n" 118"{\n" 119"c_weight = 0.0f;\n" 120"nmodes = mode;\n" 121"break;\n" 122"}\n" 123"_weight[mode_idx] = c_weight;\n" 124"totalWeight += c_weight;\n" 125"}\n" 126"if (0.f < totalWeight)\n" 127"{\n" 128"totalWeight = 1.f / totalWeight;\n" 129"for (int mode = 0; mode < nmodes; ++mode)\n" 130"_weight[mad24(mode, idx_step, pt_idx)] *= totalWeight;\n" 131"}\n" 132"if (!fitsPDF)\n" 133"{\n" 134"uchar mode = nmodes == (NMIXTURES) ? (NMIXTURES) - 1 : nmodes++;\n" 135"int mode_idx = mad24(mode, idx_step, pt_idx);\n" 136"if (nmodes == 1)\n" 137"_weight[mode_idx] = 1.f;\n" 138"else\n" 139"{\n" 140"_weight[mode_idx] = alphaT;\n" 141"for (int i = pt_idx; i < mode_idx; i += idx_step)\n" 142"_weight[i] *= alpha1;\n" 143"}\n" 144"for (int i = nmodes - 1; i > 0; --i)\n" 145"{\n" 146"int prev_idx = mode_idx - idx_step;\n" 147"if (alphaT < _weight[prev_idx])\n" 148"break;\n" 149"_weight[mode_idx] = _weight[prev_idx];\n" 150"_variance[mode_idx] = _variance[prev_idx];\n" 151"_mean[mode_idx] = _mean[prev_idx];\n" 152"mode_idx = prev_idx;\n" 153"}\n" 154"_mean[mode_idx] = pix;\n" 155"_variance[mode_idx] = c_varInit;\n" 156"}\n" 157"_modesUsed[0] = nmodes;\n" 158"#ifdef SHADOW_DETECT\n" 159"if (foreground)\n" 160"{\n" 161"float tWeight = 0.0f;\n" 162"for (uchar mode = 0; mode < nmodes; ++mode)\n" 163"{\n" 164"int mode_idx = mad24(mode, idx_step, pt_idx);\n" 165"T_MEAN c_mean = _mean[mode_idx];\n" 166"T_MEAN pix_mean = pix * c_mean;\n" 167"float numerator = sum(pix_mean);\n" 168"float denominator = dot(c_mean, c_mean);\n" 169"if (denominator == 0)\n" 170"break;\n" 171"if (numerator <= denominator && numerator >= c_tau * denominator)\n" 172"{\n" 173"float a = numerator / denominator;\n" 174"T_MEAN dD = mad(a, c_mean, -pix);\n" 175"if (dot(dD, dD) < c_Tb * _variance[mode_idx] * a * a)\n" 176"{\n" 177"foreground = c_shadowVal;\n" 178"break;\n" 179"}\n" 180"}\n" 181"tWeight += _weight[mode_idx];\n" 182"if (tWeight > c_TB)\n" 183"break;\n" 184"}\n" 185"}\n" 186"#endif\n" 187"__global uchar* _fgmask = fgmask + mad24(y, fgmask_step, x + fgmask_offset);\n" 188"*_fgmask = (uchar)foreground;\n" 189"}\n" 190"}\n" 191"__kernel void getBackgroundImage2_kernel(__global const uchar* modesUsed,\n" 192"__global const uchar* weight,\n" 193"__global const uchar* mean,\n" 194"__global uchar* dst, int dst_step, int dst_offset, int dst_row, int dst_col,\n" 195"float c_TB)\n" 196"{\n" 197"int x = get_global_id(0);\n" 198"int y = get_global_id(1);\n" 199"if(x < dst_col && y < dst_row)\n" 200"{\n" 201"int pt_idx = mad24(y, dst_col, x);\n" 202"__global const uchar* _modesUsed = modesUsed + pt_idx;\n" 203"uchar nmodes = _modesUsed[0];\n" 204"T_MEAN meanVal = (T_MEAN)F_ZERO;\n" 205"float totalWeight = 0.0f;\n" 206"__global const float* _weight = (__global const float*)weight;\n" 207"__global const T_MEAN* _mean = (__global const T_MEAN*)(mean);\n" 208"int idx_step = dst_row * dst_col;\n" 209"for (uchar mode = 0; mode < nmodes; ++mode)\n" 210"{\n" 211"int mode_idx = mad24(mode, idx_step, pt_idx);\n" 212"float c_weight = _weight[mode_idx];\n" 213"T_MEAN c_mean = _mean[mode_idx];\n" 214"meanVal = mad(c_weight, c_mean, meanVal);\n" 215"totalWeight += c_weight;\n" 216"if (totalWeight > c_TB)\n" 217"break;\n" 218"}\n" 219"if (0.f < totalWeight)\n" 220"meanVal = meanVal / totalWeight;\n" 221"else\n" 222"meanVal = (T_MEAN)(0.f);\n" 223"__global uchar* _dst = dst + mad24(y, dst_step, mad24(x, CN, dst_offset));\n" 224"meanToFrame(meanVal, _dst);\n" 225"}\n" 226"}\n" 227, "b6e3850899862b7f0ab67cb32f1d52e9"}; 228ProgramSource bgfg_mog2_oclsrc(bgfg_mog2.programStr); 229const struct ProgramEntry optical_flow_farneback={"optical_flow_farneback", 230"#define tx (int)get_local_id(0)\n" 231"#define ty get_local_id(1)\n" 232"#define bx get_group_id(0)\n" 233"#define bdx (int)get_local_size(0)\n" 234"#define BORDER_SIZE 5\n" 235"#define MAX_KSIZE_HALF 100\n" 236"#ifndef polyN\n" 237"#define polyN 5\n" 238"#endif\n" 239"#if USE_DOUBLE\n" 240"#ifdef cl_amd_fp64\n" 241"#pragma OPENCL EXTENSION cl_amd_fp64:enable\n" 242"#elif defined (cl_khr_fp64)\n" 243"#pragma OPENCL EXTENSION cl_khr_fp64:enable\n" 244"#endif\n" 245"#define TYPE double\n" 246"#define VECTYPE double4\n" 247"#else\n" 248"#define TYPE float\n" 249"#define VECTYPE float4\n" 250"#endif\n" 251"__kernel void polynomialExpansion(__global __const float * src, int srcStep,\n" 252"__global float * dst, int dstStep,\n" 253"const int rows, const int cols,\n" 254"__global __const float * c_g,\n" 255"__global __const float * c_xg,\n" 256"__global __const float * c_xxg,\n" 257"__local float * smem,\n" 258"const VECTYPE ig)\n" 259"{\n" 260"const int y = get_global_id(1);\n" 261"const int x = bx * (bdx - 2*polyN) + tx - polyN;\n" 262"int xWarped;\n" 263"__local float *row = smem + tx;\n" 264"if (y < rows && y >= 0)\n" 265"{\n" 266"xWarped = min(max(x, 0), cols - 1);\n" 267"row[0] = src[mad24(y, srcStep, xWarped)] * c_g[0];\n" 268"row[bdx] = 0.f;\n" 269"row[2*bdx] = 0.f;\n" 270"#pragma unroll\n" 271"for (int k = 1; k <= polyN; ++k)\n" 272"{\n" 273"float t0 = src[mad24(max(y - k, 0), srcStep, xWarped)];\n" 274"float t1 = src[mad24(min(y + k, rows - 1), srcStep, xWarped)];\n" 275"row[0] += c_g[k] * (t0 + t1);\n" 276"row[bdx] += c_xg[k] * (t1 - t0);\n" 277"row[2*bdx] += c_xxg[k] * (t0 + t1);\n" 278"}\n" 279"}\n" 280"barrier(CLK_LOCAL_MEM_FENCE);\n" 281"if (y < rows && y >= 0 && tx >= polyN && tx + polyN < bdx && x < cols)\n" 282"{\n" 283"TYPE b1 = c_g[0] * row[0];\n" 284"TYPE b3 = c_g[0] * row[bdx];\n" 285"TYPE b5 = c_g[0] * row[2*bdx];\n" 286"TYPE b2 = 0, b4 = 0, b6 = 0;\n" 287"#pragma unroll\n" 288"for (int k = 1; k <= polyN; ++k)\n" 289"{\n" 290"b1 += (row[k] + row[-k]) * c_g[k];\n" 291"b4 += (row[k] + row[-k]) * c_xxg[k];\n" 292"b2 += (row[k] - row[-k]) * c_xg[k];\n" 293"b3 += (row[k + bdx] + row[-k + bdx]) * c_g[k];\n" 294"b6 += (row[k + bdx] - row[-k + bdx]) * c_xg[k];\n" 295"b5 += (row[k + 2*bdx] + row[-k + 2*bdx]) * c_g[k];\n" 296"}\n" 297"dst[mad24(y, dstStep, xWarped)] = (float)(b3*ig.s0);\n" 298"dst[mad24(rows + y, dstStep, xWarped)] = (float)(b2*ig.s0);\n" 299"dst[mad24(2*rows + y, dstStep, xWarped)] = (float)(b1*ig.s1 + b5*ig.s2);\n" 300"dst[mad24(3*rows + y, dstStep, xWarped)] = (float)(b1*ig.s1 + b4*ig.s2);\n" 301"dst[mad24(4*rows + y, dstStep, xWarped)] = (float)(b6*ig.s3);\n" 302"}\n" 303"}\n" 304"inline int idx_row_low(const int y, const int last_row)\n" 305"{\n" 306"return abs(y) % (last_row + 1);\n" 307"}\n" 308"inline int idx_row_high(const int y, const int last_row)\n" 309"{\n" 310"return abs(last_row - abs(last_row - y)) % (last_row + 1);\n" 311"}\n" 312"inline int idx_col_low(const int x, const int last_col)\n" 313"{\n" 314"return abs(x) % (last_col + 1);\n" 315"}\n" 316"inline int idx_col_high(const int x, const int last_col)\n" 317"{\n" 318"return abs(last_col - abs(last_col - x)) % (last_col + 1);\n" 319"}\n" 320"inline int idx_col(const int x, const int last_col)\n" 321"{\n" 322"return idx_col_low(idx_col_high(x, last_col), last_col);\n" 323"}\n" 324"__kernel void gaussianBlur(__global const float * src, int srcStep,\n" 325"__global float * dst, int dstStep, const int rows, const int cols,\n" 326"__global const float * c_gKer, const int ksizeHalf,\n" 327"__local float * smem)\n" 328"{\n" 329"const int y = get_global_id(1);\n" 330"const int x = get_global_id(0);\n" 331"__local float *row = smem + ty * (bdx + 2*ksizeHalf);\n" 332"if (y < rows)\n" 333"{\n" 334"for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)\n" 335"{\n" 336"int xExt = (int)(bx * bdx) + i - ksizeHalf;\n" 337"xExt = idx_col(xExt, cols - 1);\n" 338"row[i] = src[mad24(y, srcStep, xExt)] * c_gKer[0];\n" 339"for (int j = 1; j <= ksizeHalf; ++j)\n" 340"row[i] += (src[mad24(idx_row_low(y - j, rows - 1), srcStep, xExt)]\n" 341"+ src[mad24(idx_row_high(y + j, rows - 1), srcStep, xExt)]) * c_gKer[j];\n" 342"}\n" 343"}\n" 344"barrier(CLK_LOCAL_MEM_FENCE);\n" 345"if (y < rows && y >= 0 && x < cols && x >= 0)\n" 346"{\n" 347"row += tx + ksizeHalf;\n" 348"float res = row[0] * c_gKer[0];\n" 349"for (int i = 1; i <= ksizeHalf; ++i)\n" 350"res += (row[-i] + row[i]) * c_gKer[i];\n" 351"dst[mad24(y, dstStep, x)] = res;\n" 352"}\n" 353"}\n" 354"__kernel void gaussianBlur5(__global const float * src, int srcStep,\n" 355"__global float * dst, int dstStep,\n" 356"const int rows, const int cols,\n" 357"__global const float * c_gKer, const int ksizeHalf,\n" 358"__local float * smem)\n" 359"{\n" 360"const int y = get_global_id(1);\n" 361"const int x = get_global_id(0);\n" 362"const int smw = bdx + 2*ksizeHalf;\n" 363"__local volatile float *row = smem + 5 * ty * smw;\n" 364"if (y < rows)\n" 365"{\n" 366"for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)\n" 367"{\n" 368"int xExt = (int)(bx * bdx) + i - ksizeHalf;\n" 369"xExt = idx_col(xExt, cols - 1);\n" 370"#pragma unroll\n" 371"for (int k = 0; k < 5; ++k)\n" 372"row[k*smw + i] = src[mad24(k*rows + y, srcStep, xExt)] * c_gKer[0];\n" 373"for (int j = 1; j <= ksizeHalf; ++j)\n" 374"#pragma unroll\n" 375"for (int k = 0; k < 5; ++k)\n" 376"row[k*smw + i] +=\n" 377"(src[mad24(k*rows + idx_row_low(y - j, rows - 1), srcStep, xExt)] +\n" 378"src[mad24(k*rows + idx_row_high(y + j, rows - 1), srcStep, xExt)]) * c_gKer[j];\n" 379"}\n" 380"}\n" 381"barrier(CLK_LOCAL_MEM_FENCE);\n" 382"if (y < rows && y >= 0 && x < cols && x >= 0)\n" 383"{\n" 384"row += tx + ksizeHalf;\n" 385"float res[5];\n" 386"#pragma unroll\n" 387"for (int k = 0; k < 5; ++k)\n" 388"res[k] = row[k*smw] * c_gKer[0];\n" 389"for (int i = 1; i <= ksizeHalf; ++i)\n" 390"#pragma unroll\n" 391"for (int k = 0; k < 5; ++k)\n" 392"res[k] += (row[k*smw - i] + row[k*smw + i]) * c_gKer[i];\n" 393"#pragma unroll\n" 394"for (int k = 0; k < 5; ++k)\n" 395"dst[mad24(k*rows + y, dstStep, x)] = res[k];\n" 396"}\n" 397"}\n" 398"__constant float c_border[BORDER_SIZE + 1] = { 0.14f, 0.14f, 0.4472f, 0.4472f, 0.4472f, 1.f };\n" 399"__kernel void updateMatrices(__global const float * flowx, int xStep,\n" 400"__global const float * flowy, int yStep,\n" 401"const int rows, const int cols,\n" 402"__global const float * R0, int R0Step,\n" 403"__global const float * R1, int R1Step,\n" 404"__global float * M, int mStep)\n" 405"{\n" 406"const int y = get_global_id(1);\n" 407"const int x = get_global_id(0);\n" 408"if (y < rows && y >= 0 && x < cols && x >= 0)\n" 409"{\n" 410"float dx = flowx[mad24(y, xStep, x)];\n" 411"float dy = flowy[mad24(y, yStep, x)];\n" 412"float fx = x + dx;\n" 413"float fy = y + dy;\n" 414"int x1 = convert_int(floor(fx));\n" 415"int y1 = convert_int(floor(fy));\n" 416"fx -= x1;\n" 417"fy -= y1;\n" 418"float r2, r3, r4, r5, r6;\n" 419"if (x1 >= 0 && y1 >= 0 && x1 < cols - 1 && y1 < rows - 1)\n" 420"{\n" 421"float a00 = (1.f - fx) * (1.f - fy);\n" 422"float a01 = fx * (1.f - fy);\n" 423"float a10 = (1.f - fx) * fy;\n" 424"float a11 = fx * fy;\n" 425"r2 = a00 * R1[mad24(y1, R1Step, x1)] +\n" 426"a01 * R1[mad24(y1, R1Step, x1 + 1)] +\n" 427"a10 * R1[mad24(y1 + 1, R1Step, x1)] +\n" 428"a11 * R1[mad24(y1 + 1, R1Step, x1 + 1)];\n" 429"r3 = a00 * R1[mad24(rows + y1, R1Step, x1)] +\n" 430"a01 * R1[mad24(rows + y1, R1Step, x1 + 1)] +\n" 431"a10 * R1[mad24(rows + y1 + 1, R1Step, x1)] +\n" 432"a11 * R1[mad24(rows + y1 + 1, R1Step, x1 + 1)];\n" 433"r4 = a00 * R1[mad24(2*rows + y1, R1Step, x1)] +\n" 434"a01 * R1[mad24(2*rows + y1, R1Step, x1 + 1)] +\n" 435"a10 * R1[mad24(2*rows + y1 + 1, R1Step, x1)] +\n" 436"a11 * R1[mad24(2*rows + y1 + 1, R1Step, x1 + 1)];\n" 437"r5 = a00 * R1[mad24(3*rows + y1, R1Step, x1)] +\n" 438"a01 * R1[mad24(3*rows + y1, R1Step, x1 + 1)] +\n" 439"a10 * R1[mad24(3*rows + y1 + 1, R1Step, x1)] +\n" 440"a11 * R1[mad24(3*rows + y1 + 1, R1Step, x1 + 1)];\n" 441"r6 = a00 * R1[mad24(4*rows + y1, R1Step, x1)] +\n" 442"a01 * R1[mad24(4*rows + y1, R1Step, x1 + 1)] +\n" 443"a10 * R1[mad24(4*rows + y1 + 1, R1Step, x1)] +\n" 444"a11 * R1[mad24(4*rows + y1 + 1, R1Step, x1 + 1)];\n" 445"r4 = (R0[mad24(2*rows + y, R0Step, x)] + r4) * 0.5f;\n" 446"r5 = (R0[mad24(3*rows + y, R0Step, x)] + r5) * 0.5f;\n" 447"r6 = (R0[mad24(4*rows + y, R0Step, x)] + r6) * 0.25f;\n" 448"}\n" 449"else\n" 450"{\n" 451"r2 = r3 = 0.f;\n" 452"r4 = R0[mad24(2*rows + y, R0Step, x)];\n" 453"r5 = R0[mad24(3*rows + y, R0Step, x)];\n" 454"r6 = R0[mad24(4*rows + y, R0Step, x)] * 0.5f;\n" 455"}\n" 456"r2 = (R0[mad24(y, R0Step, x)] - r2) * 0.5f;\n" 457"r3 = (R0[mad24(rows + y, R0Step, x)] - r3) * 0.5f;\n" 458"r2 += r4*dy + r6*dx;\n" 459"r3 += r6*dy + r5*dx;\n" 460"float scale =\n" 461"c_border[min(x, BORDER_SIZE)] *\n" 462"c_border[min(y, BORDER_SIZE)] *\n" 463"c_border[min(cols - x - 1, BORDER_SIZE)] *\n" 464"c_border[min(rows - y - 1, BORDER_SIZE)];\n" 465"r2 *= scale;\n" 466"r3 *= scale;\n" 467"r4 *= scale;\n" 468"r5 *= scale;\n" 469"r6 *= scale;\n" 470"M[mad24(y, mStep, x)] = r4*r4 + r6*r6;\n" 471"M[mad24(rows + y, mStep, x)] = (r4 + r5)*r6;\n" 472"M[mad24(2*rows + y, mStep, x)] = r5*r5 + r6*r6;\n" 473"M[mad24(3*rows + y, mStep, x)] = r4*r2 + r6*r3;\n" 474"M[mad24(4*rows + y, mStep, x)] = r6*r2 + r5*r3;\n" 475"}\n" 476"}\n" 477"__kernel void boxFilter5(__global const float * src, int srcStep,\n" 478"__global float * dst, int dstStep,\n" 479"const int rows, const int cols,\n" 480"const int ksizeHalf,\n" 481"__local float * smem)\n" 482"{\n" 483"const int y = get_global_id(1);\n" 484"const int x = get_global_id(0);\n" 485"const float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));\n" 486"const int smw = bdx + 2*ksizeHalf;\n" 487"__local float *row = smem + 5 * ty * smw;\n" 488"if (y < rows)\n" 489"{\n" 490"for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)\n" 491"{\n" 492"int xExt = (int)(bx * bdx) + i - ksizeHalf;\n" 493"xExt = min(max(xExt, 0), cols - 1);\n" 494"#pragma unroll\n" 495"for (int k = 0; k < 5; ++k)\n" 496"row[k*smw + i] = src[mad24(k*rows + y, srcStep, xExt)];\n" 497"for (int j = 1; j <= ksizeHalf; ++j)\n" 498"#pragma unroll\n" 499"for (int k = 0; k < 5; ++k)\n" 500"row[k*smw + i] +=\n" 501"src[mad24(k*rows + max(y - j, 0), srcStep, xExt)] +\n" 502"src[mad24(k*rows + min(y + j, rows - 1), srcStep, xExt)];\n" 503"}\n" 504"}\n" 505"barrier(CLK_LOCAL_MEM_FENCE);\n" 506"if (y < rows && y >= 0 && x < cols && x >= 0)\n" 507"{\n" 508"row += tx + ksizeHalf;\n" 509"float res[5];\n" 510"#pragma unroll\n" 511"for (int k = 0; k < 5; ++k)\n" 512"res[k] = row[k*smw];\n" 513"for (int i = 1; i <= ksizeHalf; ++i)\n" 514"#pragma unroll\n" 515"for (int k = 0; k < 5; ++k)\n" 516"res[k] += row[k*smw - i] + row[k*smw + i];\n" 517"#pragma unroll\n" 518"for (int k = 0; k < 5; ++k)\n" 519"dst[mad24(k*rows + y, dstStep, x)] = res[k] * boxAreaInv;\n" 520"}\n" 521"}\n" 522"__kernel void updateFlow(__global const float * M, int mStep,\n" 523"__global float * flowx, int xStep,\n" 524"__global float * flowy, int yStep,\n" 525"const int rows, const int cols)\n" 526"{\n" 527"const int y = get_global_id(1);\n" 528"const int x = get_global_id(0);\n" 529"if (y < rows && y >= 0 && x < cols && x >= 0)\n" 530"{\n" 531"float g11 = M[mad24(y, mStep, x)];\n" 532"float g12 = M[mad24(rows + y, mStep, x)];\n" 533"float g22 = M[mad24(2*rows + y, mStep, x)];\n" 534"float h1 = M[mad24(3*rows + y, mStep, x)];\n" 535"float h2 = M[mad24(4*rows + y, mStep, x)];\n" 536"float detInv = 1.f / (g11*g22 - g12*g12 + 1e-3f);\n" 537"flowx[mad24(y, xStep, x)] = (g11*h2 - g12*h1) * detInv;\n" 538"flowy[mad24(y, yStep, x)] = (g22*h1 - g12*h2) * detInv;\n" 539"}\n" 540"}\n" 541, "529300e6242f574f83d11a089cc120c0"}; 542ProgramSource optical_flow_farneback_oclsrc(optical_flow_farneback.programStr); 543const struct ProgramEntry optical_flow_tvl1={"optical_flow_tvl1", 544"__kernel void centeredGradientKernel(__global const float* src_ptr, int src_col, int src_row, int src_step,\n" 545"__global float* dx, __global float* dy, int d_step)\n" 546"{\n" 547"int x = get_global_id(0);\n" 548"int y = get_global_id(1);\n" 549"if((x < src_col)&&(y < src_row))\n" 550"{\n" 551"int src_x1 = (x + 1) < (src_col -1)? (x + 1) : (src_col - 1);\n" 552"int src_x2 = (x - 1) > 0 ? (x -1) : 0;\n" 553"dx[y * d_step+ x] = 0.5f * (src_ptr[y * src_step + src_x1] - src_ptr[y * src_step+ src_x2]);\n" 554"int src_y1 = (y+1) < (src_row - 1) ? (y + 1) : (src_row - 1);\n" 555"int src_y2 = (y - 1) > 0 ? (y - 1) : 0;\n" 556"dy[y * d_step+ x] = 0.5f * (src_ptr[src_y1 * src_step + x] - src_ptr[src_y2 * src_step+ x]);\n" 557"}\n" 558"}\n" 559"inline float bicubicCoeff(float x_)\n" 560"{\n" 561"float x = fabs(x_);\n" 562"if (x <= 1.0f)\n" 563"return x * x * (1.5f * x - 2.5f) + 1.0f;\n" 564"else if (x < 2.0f)\n" 565"return x * (x * (-0.5f * x + 2.5f) - 4.0f) + 2.0f;\n" 566"else\n" 567"return 0.0f;\n" 568"}\n" 569"__kernel void warpBackwardKernel(__global const float* I0, int I0_step, int I0_col, int I0_row,\n" 570"image2d_t tex_I1, image2d_t tex_I1x, image2d_t tex_I1y,\n" 571"__global const float* u1, int u1_step,\n" 572"__global const float* u2,\n" 573"__global float* I1w,\n" 574"__global float* I1wx, \n" 575"__global float* I1wy, \n" 576"__global float* grad, \n" 577"__global float* rho,\n" 578"int I1w_step,\n" 579"int u2_step,\n" 580"int u1_offset_x,\n" 581"int u1_offset_y,\n" 582"int u2_offset_x,\n" 583"int u2_offset_y)\n" 584"{\n" 585"int x = get_global_id(0);\n" 586"int y = get_global_id(1);\n" 587"if(x < I0_col&&y < I0_row)\n" 588"{\n" 589"float u1Val = u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];\n" 590"float u2Val = u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];\n" 591"float wx = x + u1Val;\n" 592"float wy = y + u2Val;\n" 593"int xmin = ceil(wx - 2.0f);\n" 594"int xmax = floor(wx + 2.0f);\n" 595"int ymin = ceil(wy - 2.0f);\n" 596"int ymax = floor(wy + 2.0f);\n" 597"float sum = 0.0f;\n" 598"float sumx = 0.0f;\n" 599"float sumy = 0.0f;\n" 600"float wsum = 0.0f;\n" 601"sampler_t sampleri = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n" 602"for (int cy = ymin; cy <= ymax; ++cy)\n" 603"{\n" 604"for (int cx = xmin; cx <= xmax; ++cx)\n" 605"{\n" 606"float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy);\n" 607"int2 cood = (int2)(cx, cy);\n" 608"sum += w * read_imagef(tex_I1, sampleri, cood).x;\n" 609"sumx += w * read_imagef(tex_I1x, sampleri, cood).x;\n" 610"sumy += w * read_imagef(tex_I1y, sampleri, cood).x;\n" 611"wsum += w;\n" 612"}\n" 613"}\n" 614"float coeff = 1.0f / wsum;\n" 615"float I1wVal = sum * coeff;\n" 616"float I1wxVal = sumx * coeff;\n" 617"float I1wyVal = sumy * coeff;\n" 618"I1w[y * I1w_step + x] = I1wVal;\n" 619"I1wx[y * I1w_step + x] = I1wxVal;\n" 620"I1wy[y * I1w_step + x] = I1wyVal;\n" 621"float Ix2 = I1wxVal * I1wxVal;\n" 622"float Iy2 = I1wyVal * I1wyVal;\n" 623"grad[y * I1w_step + x] = Ix2 + Iy2;\n" 624"float I0Val = I0[y * I0_step + x];\n" 625"rho[y * I1w_step + x] = I1wVal - I1wxVal * u1Val - I1wyVal * u2Val - I0Val;\n" 626"}\n" 627"}\n" 628"inline float readImage(__global float *image, int x, int y, int rows, int cols, int elemCntPerRow)\n" 629"{\n" 630"int i0 = clamp(x, 0, cols - 1);\n" 631"int j0 = clamp(y, 0, rows - 1);\n" 632"return image[j0 * elemCntPerRow + i0];\n" 633"}\n" 634"__kernel void warpBackwardKernelNoImage2d(__global const float* I0, int I0_step, int I0_col, int I0_row,\n" 635"__global const float* tex_I1, __global const float* tex_I1x, __global const float* tex_I1y,\n" 636"__global const float* u1, int u1_step,\n" 637"__global const float* u2,\n" 638"__global float* I1w,\n" 639"__global float* I1wx, \n" 640"__global float* I1wy, \n" 641"__global float* grad, \n" 642"__global float* rho,\n" 643"int I1w_step,\n" 644"int u2_step,\n" 645"int I1_step,\n" 646"int I1x_step)\n" 647"{\n" 648"int x = get_global_id(0);\n" 649"int y = get_global_id(1);\n" 650"if(x < I0_col&&y < I0_row)\n" 651"{\n" 652"float u1Val = u1[y * u1_step + x];\n" 653"float u2Val = u2[y * u2_step + x];\n" 654"float wx = x + u1Val;\n" 655"float wy = y + u2Val;\n" 656"int xmin = ceil(wx - 2.0f);\n" 657"int xmax = floor(wx + 2.0f);\n" 658"int ymin = ceil(wy - 2.0f);\n" 659"int ymax = floor(wy + 2.0f);\n" 660"float sum = 0.0f;\n" 661"float sumx = 0.0f;\n" 662"float sumy = 0.0f;\n" 663"float wsum = 0.0f;\n" 664"for (int cy = ymin; cy <= ymax; ++cy)\n" 665"{\n" 666"for (int cx = xmin; cx <= xmax; ++cx)\n" 667"{\n" 668"float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy);\n" 669"int2 cood = (int2)(cx, cy);\n" 670"sum += w * readImage(tex_I1, cood.x, cood.y, I0_col, I0_row, I1_step);\n" 671"sumx += w * readImage(tex_I1x, cood.x, cood.y, I0_col, I0_row, I1x_step);\n" 672"sumy += w * readImage(tex_I1y, cood.x, cood.y, I0_col, I0_row, I1x_step);\n" 673"wsum += w;\n" 674"}\n" 675"}\n" 676"float coeff = 1.0f / wsum;\n" 677"float I1wVal = sum * coeff;\n" 678"float I1wxVal = sumx * coeff;\n" 679"float I1wyVal = sumy * coeff;\n" 680"I1w[y * I1w_step + x] = I1wVal;\n" 681"I1wx[y * I1w_step + x] = I1wxVal;\n" 682"I1wy[y * I1w_step + x] = I1wyVal;\n" 683"float Ix2 = I1wxVal * I1wxVal;\n" 684"float Iy2 = I1wyVal * I1wyVal;\n" 685"grad[y * I1w_step + x] = Ix2 + Iy2;\n" 686"float I0Val = I0[y * I0_step + x];\n" 687"rho[y * I1w_step + x] = I1wVal - I1wxVal * u1Val - I1wyVal * u2Val - I0Val;\n" 688"}\n" 689"}\n" 690"__kernel void estimateDualVariablesKernel(__global const float* u1, int u1_col, int u1_row, int u1_step,\n" 691"__global const float* u2,\n" 692"__global float* p11, int p11_step,\n" 693"__global float* p12,\n" 694"__global float* p21,\n" 695"__global float* p22,\n" 696"float taut,\n" 697"int u2_step,\n" 698"int u1_offset_x,\n" 699"int u1_offset_y,\n" 700"int u2_offset_x,\n" 701"int u2_offset_y)\n" 702"{\n" 703"int x = get_global_id(0);\n" 704"int y = get_global_id(1);\n" 705"if(x < u1_col && y < u1_row)\n" 706"{\n" 707"int src_x1 = (x + 1) < (u1_col - 1) ? (x + 1) : (u1_col - 1);\n" 708"float u1x = u1[(y + u1_offset_y) * u1_step + src_x1 + u1_offset_x] - u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];\n" 709"int src_y1 = (y + 1) < (u1_row - 1) ? (y + 1) : (u1_row - 1);\n" 710"float u1y = u1[(src_y1 + u1_offset_y) * u1_step + x + u1_offset_x] - u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];\n" 711"int src_x2 = (x + 1) < (u1_col - 1) ? (x + 1) : (u1_col - 1);\n" 712"float u2x = u2[(y + u2_offset_y) * u2_step + src_x2 + u2_offset_x] - u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];\n" 713"int src_y2 = (y + 1) < (u1_row - 1) ? (y + 1) : (u1_row - 1);\n" 714"float u2y = u2[(src_y2 + u2_offset_y) * u2_step + x + u2_offset_x] - u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];\n" 715"float g1 = hypot(u1x, u1y);\n" 716"float g2 = hypot(u2x, u2y);\n" 717"float ng1 = 1.0f + taut * g1;\n" 718"float ng2 = 1.0f + taut * g2;\n" 719"p11[y * p11_step + x] = (p11[y * p11_step + x] + taut * u1x) / ng1;\n" 720"p12[y * p11_step + x] = (p12[y * p11_step + x] + taut * u1y) / ng1;\n" 721"p21[y * p11_step + x] = (p21[y * p11_step + x] + taut * u2x) / ng2;\n" 722"p22[y * p11_step + x] = (p22[y * p11_step + x] + taut * u2y) / ng2;\n" 723"}\n" 724"}\n" 725"inline float divergence(__global const float* v1, __global const float* v2, int y, int x, int v1_step, int v2_step)\n" 726"{\n" 727"if (x > 0 && y > 0)\n" 728"{\n" 729"float v1x = v1[y * v1_step + x] - v1[y * v1_step + x - 1];\n" 730"float v2y = v2[y * v2_step + x] - v2[(y - 1) * v2_step + x];\n" 731"return v1x + v2y;\n" 732"}\n" 733"else\n" 734"{\n" 735"if (y > 0)\n" 736"return v1[y * v1_step + 0] + v2[y * v2_step + 0] - v2[(y - 1) * v2_step + 0];\n" 737"else\n" 738"{\n" 739"if (x > 0)\n" 740"return v1[0 * v1_step + x] - v1[0 * v1_step + x - 1] + v2[0 * v2_step + x];\n" 741"else\n" 742"return v1[0 * v1_step + 0] + v2[0 * v2_step + 0];\n" 743"}\n" 744"}\n" 745"}\n" 746"__kernel void estimateUKernel(__global const float* I1wx, int I1wx_col, int I1wx_row, int I1wx_step,\n" 747"__global const float* I1wy, \n" 748"__global const float* grad, \n" 749"__global const float* rho_c, \n" 750"__global const float* p11, \n" 751"__global const float* p12, \n" 752"__global const float* p21, \n" 753"__global const float* p22, \n" 754"__global float* u1, int u1_step,\n" 755"__global float* u2,\n" 756"__global float* error, float l_t, float theta, int u2_step,\n" 757"int u1_offset_x,\n" 758"int u1_offset_y,\n" 759"int u2_offset_x,\n" 760"int u2_offset_y,\n" 761"char calc_error)\n" 762"{\n" 763"int x = get_global_id(0);\n" 764"int y = get_global_id(1);\n" 765"if(x < I1wx_col && y < I1wx_row)\n" 766"{\n" 767"float I1wxVal = I1wx[y * I1wx_step + x];\n" 768"float I1wyVal = I1wy[y * I1wx_step + x];\n" 769"float gradVal = grad[y * I1wx_step + x];\n" 770"float u1OldVal = u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];\n" 771"float u2OldVal = u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];\n" 772"float rho = rho_c[y * I1wx_step + x] + (I1wxVal * u1OldVal + I1wyVal * u2OldVal);\n" 773"float d1 = 0.0f;\n" 774"float d2 = 0.0f;\n" 775"if (rho < -l_t * gradVal)\n" 776"{\n" 777"d1 = l_t * I1wxVal;\n" 778"d2 = l_t * I1wyVal;\n" 779"}\n" 780"else if (rho > l_t * gradVal)\n" 781"{\n" 782"d1 = -l_t * I1wxVal;\n" 783"d2 = -l_t * I1wyVal;\n" 784"}\n" 785"else if (gradVal > 1.192092896e-07f)\n" 786"{\n" 787"float fi = -rho / gradVal;\n" 788"d1 = fi * I1wxVal;\n" 789"d2 = fi * I1wyVal;\n" 790"}\n" 791"float v1 = u1OldVal + d1;\n" 792"float v2 = u2OldVal + d2;\n" 793"float div_p1 = divergence(p11, p12, y, x, I1wx_step, I1wx_step);\n" 794"float div_p2 = divergence(p21, p22, y, x, I1wx_step, I1wx_step);\n" 795"float u1NewVal = v1 + theta * div_p1;\n" 796"float u2NewVal = v2 + theta * div_p2;\n" 797"u1[(y + u1_offset_y) * u1_step + x + u1_offset_x] = u1NewVal;\n" 798"u2[(y + u2_offset_y) * u2_step + x + u2_offset_x] = u2NewVal;\n" 799"if(calc_error)\n" 800"{\n" 801"float n1 = (u1OldVal - u1NewVal) * (u1OldVal - u1NewVal);\n" 802"float n2 = (u2OldVal - u2NewVal) * (u2OldVal - u2NewVal);\n" 803"error[y * I1wx_step + x] = n1 + n2;\n" 804"}\n" 805"}\n" 806"}\n" 807, "a9d306a49b405703820fae23312ebd28"}; 808ProgramSource optical_flow_tvl1_oclsrc(optical_flow_tvl1.programStr); 809const struct ProgramEntry pyrlk={"pyrlk", 810"#define GRIDSIZE 3\n" 811"#define LSx 8\n" 812"#define LSy 8\n" 813"#define LM_W (LSx*GRIDSIZE+2)\n" 814"#define LM_H (LSy*GRIDSIZE+2)\n" 815"#define BUFFER (LSx*LSy)\n" 816"#define BUFFER2 BUFFER>>1\n" 817"#ifndef WAVE_SIZE\n" 818"#define WAVE_SIZE 1\n" 819"#endif\n" 820"#ifdef CPU\n" 821"inline void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid)\n" 822"{\n" 823"smem1[tid] = val1;\n" 824"smem2[tid] = val2;\n" 825"smem3[tid] = val3;\n" 826"barrier(CLK_LOCAL_MEM_FENCE);\n" 827"for(int i = BUFFER2; i > 0; i >>= 1)\n" 828"{\n" 829"if(tid < i)\n" 830"{\n" 831"smem1[tid] += smem1[tid + i];\n" 832"smem2[tid] += smem2[tid + i];\n" 833"smem3[tid] += smem3[tid + i];\n" 834"}\n" 835"barrier(CLK_LOCAL_MEM_FENCE);\n" 836"}\n" 837"}\n" 838"inline void reduce2(float val1, float val2, volatile __local float* smem1, volatile __local float* smem2, int tid)\n" 839"{\n" 840"smem1[tid] = val1;\n" 841"smem2[tid] = val2;\n" 842"barrier(CLK_LOCAL_MEM_FENCE);\n" 843"for(int i = BUFFER2; i > 0; i >>= 1)\n" 844"{\n" 845"if(tid < i)\n" 846"{\n" 847"smem1[tid] += smem1[tid + i];\n" 848"smem2[tid] += smem2[tid + i];\n" 849"}\n" 850"barrier(CLK_LOCAL_MEM_FENCE);\n" 851"}\n" 852"}\n" 853"inline void reduce1(float val1, volatile __local float* smem1, int tid)\n" 854"{\n" 855"smem1[tid] = val1;\n" 856"barrier(CLK_LOCAL_MEM_FENCE);\n" 857"for(int i = BUFFER2; i > 0; i >>= 1)\n" 858"{\n" 859"if(tid < i)\n" 860"{\n" 861"smem1[tid] += smem1[tid + i];\n" 862"}\n" 863"barrier(CLK_LOCAL_MEM_FENCE);\n" 864"}\n" 865"}\n" 866"#else\n" 867"inline void reduce3(float val1, float val2, float val3,\n" 868"__local volatile float* smem1, __local volatile float* smem2, __local volatile float* smem3, int tid)\n" 869"{\n" 870"smem1[tid] = val1;\n" 871"smem2[tid] = val2;\n" 872"smem3[tid] = val3;\n" 873"barrier(CLK_LOCAL_MEM_FENCE);\n" 874"if (tid < 32)\n" 875"{\n" 876"smem1[tid] += smem1[tid + 32];\n" 877"smem2[tid] += smem2[tid + 32];\n" 878"smem3[tid] += smem3[tid + 32];\n" 879"#if WAVE_SIZE < 32\n" 880"}\n" 881"barrier(CLK_LOCAL_MEM_FENCE);\n" 882"if (tid < 16)\n" 883"{\n" 884"#endif\n" 885"smem1[tid] += smem1[tid + 16];\n" 886"smem2[tid] += smem2[tid + 16];\n" 887"smem3[tid] += smem3[tid + 16];\n" 888"#if WAVE_SIZE <16\n" 889"}\n" 890"barrier(CLK_LOCAL_MEM_FENCE);\n" 891"if (tid<1)\n" 892"{\n" 893"#endif\n" 894"local float8* m1 = (local float8*)smem1;\n" 895"local float8* m2 = (local float8*)smem2;\n" 896"local float8* m3 = (local float8*)smem3;\n" 897"float8 t1 = m1[0]+m1[1];\n" 898"float8 t2 = m2[0]+m2[1];\n" 899"float8 t3 = m3[0]+m3[1];\n" 900"float4 t14 = t1.lo + t1.hi;\n" 901"float4 t24 = t2.lo + t2.hi;\n" 902"float4 t34 = t3.lo + t3.hi;\n" 903"smem1[0] = t14.x+t14.y+t14.z+t14.w;\n" 904"smem2[0] = t24.x+t24.y+t24.z+t24.w;\n" 905"smem3[0] = t34.x+t34.y+t34.z+t34.w;\n" 906"}\n" 907"barrier(CLK_LOCAL_MEM_FENCE);\n" 908"}\n" 909"inline void reduce2(float val1, float val2, __local volatile float* smem1, __local volatile float* smem2, int tid)\n" 910"{\n" 911"smem1[tid] = val1;\n" 912"smem2[tid] = val2;\n" 913"barrier(CLK_LOCAL_MEM_FENCE);\n" 914"if (tid < 32)\n" 915"{\n" 916"smem1[tid] += smem1[tid + 32];\n" 917"smem2[tid] += smem2[tid + 32];\n" 918"#if WAVE_SIZE < 32\n" 919"}\n" 920"barrier(CLK_LOCAL_MEM_FENCE);\n" 921"if (tid < 16)\n" 922"{\n" 923"#endif\n" 924"smem1[tid] += smem1[tid + 16];\n" 925"smem2[tid] += smem2[tid + 16];\n" 926"#if WAVE_SIZE <16\n" 927"}\n" 928"barrier(CLK_LOCAL_MEM_FENCE);\n" 929"if (tid<1)\n" 930"{\n" 931"#endif\n" 932"local float8* m1 = (local float8*)smem1;\n" 933"local float8* m2 = (local float8*)smem2;\n" 934"float8 t1 = m1[0]+m1[1];\n" 935"float8 t2 = m2[0]+m2[1];\n" 936"float4 t14 = t1.lo + t1.hi;\n" 937"float4 t24 = t2.lo + t2.hi;\n" 938"smem1[0] = t14.x+t14.y+t14.z+t14.w;\n" 939"smem2[0] = t24.x+t24.y+t24.z+t24.w;\n" 940"}\n" 941"barrier(CLK_LOCAL_MEM_FENCE);\n" 942"}\n" 943"inline void reduce1(float val1, __local volatile float* smem1, int tid)\n" 944"{\n" 945"smem1[tid] = val1;\n" 946"barrier(CLK_LOCAL_MEM_FENCE);\n" 947"if (tid < 32)\n" 948"{\n" 949"smem1[tid] += smem1[tid + 32];\n" 950"#if WAVE_SIZE < 32\n" 951"}\n" 952"barrier(CLK_LOCAL_MEM_FENCE);\n" 953"if (tid < 16)\n" 954"{\n" 955"#endif\n" 956"smem1[tid] += smem1[tid + 16];\n" 957"#if WAVE_SIZE <16\n" 958"}\n" 959"barrier(CLK_LOCAL_MEM_FENCE);\n" 960"if (tid<1)\n" 961"{\n" 962"#endif\n" 963"local float8* m1 = (local float8*)smem1;\n" 964"float8 t1 = m1[0]+m1[1];\n" 965"float4 t14 = t1.lo + t1.hi;\n" 966"smem1[0] = t14.x+t14.y+t14.z+t14.w;\n" 967"}\n" 968"barrier(CLK_LOCAL_MEM_FENCE);\n" 969"}\n" 970"#endif\n" 971"#define SCALE (1.0f / (1 << 20))\n" 972"#define THRESHOLD 0.01f\n" 973"__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;\n" 974"#define VAL(_y,_x,_yy,_xx) (IPatchLocal[(yid+((_y)*LSy)+1+(_yy))*LM_W+(xid+((_x)*LSx)+1+(_xx))])\n" 975"inline void SetPatch(local float* IPatchLocal, int TileY, int TileX,\n" 976"float* Pch, float* Dx, float* Dy,\n" 977"float* A11, float* A12, float* A22, float w)\n" 978"{\n" 979"unsigned int xid=get_local_id(0);\n" 980"unsigned int yid=get_local_id(1);\n" 981"*Pch = VAL(TileY,TileX,0,0);\n" 982"float dIdx = (3.0f*VAL(TileY,TileX,-1,1)+10.0f*VAL(TileY,TileX,0,1)+3.0f*VAL(TileY,TileX,+1,1))-(3.0f*VAL(TileY,TileX,-1,-1)+10.0f*VAL(TileY,TileX,0,-1)+3.0f*VAL(TileY,TileX,+1,-1));\n" 983"float dIdy = (3.0f*VAL(TileY,TileX,1,-1)+10.0f*VAL(TileY,TileX,1,0)+3.0f*VAL(TileY,TileX,1,+1))-(3.0f*VAL(TileY,TileX,-1,-1)+10.0f*VAL(TileY,TileX,-1,0)+3.0f*VAL(TileY,TileX,-1,+1));\n" 984"dIdx *= w;\n" 985"dIdy *= w;\n" 986"*Dx = dIdx;\n" 987"*Dy = dIdy;\n" 988"*A11 += dIdx * dIdx;\n" 989"*A12 += dIdx * dIdy;\n" 990"*A22 += dIdy * dIdy;\n" 991"}\n" 992"#undef VAL\n" 993"inline void GetPatch(image2d_t J, float x, float y,\n" 994"float* Pch, float* Dx, float* Dy,\n" 995"float* b1, float* b2)\n" 996"{\n" 997"float J_val = read_imagef(J, sampler, (float2)(x, y)).x;\n" 998"float diff = (J_val - *Pch) * 32.0f;\n" 999"*b1 += diff**Dx;\n" 1000"*b2 += diff**Dy;\n" 1001"}\n" 1002"inline void GetError(image2d_t J, const float x, const float y, const float* Pch, float* errval)\n" 1003"{\n" 1004"float diff = read_imagef(J, sampler, (float2)(x,y)).x-*Pch;\n" 1005"*errval += fabs(diff);\n" 1006"}\n" 1007"#define READI(_y,_x) IPatchLocal[(yid+((_y)*LSy))*LM_W+(xid+((_x)*LSx))] = read_imagef(I, sampler, (float2)(Point.x + xid+(_x)*LSx + 0.5f-1, Point.y + yid+(_y)*LSy+ 0.5f-1)).x;\n" 1008"void ReadPatchIToLocalMem(image2d_t I, float2 Point, local float* IPatchLocal)\n" 1009"{\n" 1010"unsigned int xid=get_local_id(0);\n" 1011"unsigned int yid=get_local_id(1);\n" 1012"READI(0,0);READI(0,1);READI(0,2);\n" 1013"READI(1,0);READI(1,1);READI(1,2);\n" 1014"READI(2,0);READI(2,1);READI(2,2);\n" 1015"if(xid<2)\n" 1016"{\n" 1017"READI(0,3);\n" 1018"READI(1,3);\n" 1019"READI(2,3);\n" 1020"}\n" 1021"if(yid<2)\n" 1022"{\n" 1023"READI(3,0);READI(3,1);READI(3,2);\n" 1024"}\n" 1025"if(yid<2 && xid<2)\n" 1026"{\n" 1027"READI(3,3);\n" 1028"}\n" 1029"barrier(CLK_LOCAL_MEM_FENCE);\n" 1030"}\n" 1031"#undef READI\n" 1032"__attribute__((reqd_work_group_size(LSx, LSy, 1)))\n" 1033"__kernel void lkSparse(image2d_t I, image2d_t J,\n" 1034"__global const float2* prevPts, __global float2* nextPts, __global uchar* status, __global float* err,\n" 1035"const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr)\n" 1036"{\n" 1037"__local float smem1[BUFFER];\n" 1038"__local float smem2[BUFFER];\n" 1039"__local float smem3[BUFFER];\n" 1040"unsigned int xid=get_local_id(0);\n" 1041"unsigned int yid=get_local_id(1);\n" 1042"unsigned int gid=get_group_id(0);\n" 1043"unsigned int xsize=get_local_size(0);\n" 1044"unsigned int ysize=get_local_size(1);\n" 1045"int xBase, yBase, k;\n" 1046"float wx = ((xid+2*xsize)<c_winSize_x)?1:0;\n" 1047"float wy = ((yid+2*ysize)<c_winSize_y)?1:0;\n" 1048"float2 c_halfWin = (float2)((c_winSize_x - 1)>>1, (c_winSize_y - 1)>>1);\n" 1049"const int tid = mad24(yid, xsize, xid);\n" 1050"float2 prevPt = prevPts[gid] / (float2)(1 << level);\n" 1051"if (prevPt.x < 0 || prevPt.x >= cols || prevPt.y < 0 || prevPt.y >= rows)\n" 1052"{\n" 1053"if (tid == 0 && level == 0)\n" 1054"{\n" 1055"status[gid] = 0;\n" 1056"}\n" 1057"return;\n" 1058"}\n" 1059"prevPt -= c_halfWin;\n" 1060"float A11 = 0;\n" 1061"float A12 = 0;\n" 1062"float A22 = 0;\n" 1063"float I_patch[GRIDSIZE][GRIDSIZE];\n" 1064"float dIdx_patch[GRIDSIZE][GRIDSIZE];\n" 1065"float dIdy_patch[GRIDSIZE][GRIDSIZE];\n" 1066"local float IPatchLocal[LM_W*LM_H];\n" 1067"ReadPatchIToLocalMem(I,prevPt,IPatchLocal);\n" 1068"{\n" 1069"SetPatch(IPatchLocal, 0, 0,\n" 1070"&I_patch[0][0], &dIdx_patch[0][0], &dIdy_patch[0][0],\n" 1071"&A11, &A12, &A22,1);\n" 1072"SetPatch(IPatchLocal, 0, 1,\n" 1073"&I_patch[0][1], &dIdx_patch[0][1], &dIdy_patch[0][1],\n" 1074"&A11, &A12, &A22,1);\n" 1075"SetPatch(IPatchLocal, 0, 2,\n" 1076"&I_patch[0][2], &dIdx_patch[0][2], &dIdy_patch[0][2],\n" 1077"&A11, &A12, &A22,wx);\n" 1078"}\n" 1079"{\n" 1080"SetPatch(IPatchLocal, 1, 0,\n" 1081"&I_patch[1][0], &dIdx_patch[1][0], &dIdy_patch[1][0],\n" 1082"&A11, &A12, &A22,1);\n" 1083"SetPatch(IPatchLocal, 1,1,\n" 1084"&I_patch[1][1], &dIdx_patch[1][1], &dIdy_patch[1][1],\n" 1085"&A11, &A12, &A22,1);\n" 1086"SetPatch(IPatchLocal, 1,2,\n" 1087"&I_patch[1][2], &dIdx_patch[1][2], &dIdy_patch[1][2],\n" 1088"&A11, &A12, &A22,wx);\n" 1089"}\n" 1090"{\n" 1091"SetPatch(IPatchLocal, 2,0,\n" 1092"&I_patch[2][0], &dIdx_patch[2][0], &dIdy_patch[2][0],\n" 1093"&A11, &A12, &A22,wy);\n" 1094"SetPatch(IPatchLocal, 2,1,\n" 1095"&I_patch[2][1], &dIdx_patch[2][1], &dIdy_patch[2][1],\n" 1096"&A11, &A12, &A22,wy);\n" 1097"SetPatch(IPatchLocal, 2,2,\n" 1098"&I_patch[2][2], &dIdx_patch[2][2], &dIdy_patch[2][2],\n" 1099"&A11, &A12, &A22,wx*wy);\n" 1100"}\n" 1101"reduce3(A11, A12, A22, smem1, smem2, smem3, tid);\n" 1102"A11 = smem1[0];\n" 1103"A12 = smem2[0];\n" 1104"A22 = smem3[0];\n" 1105"barrier(CLK_LOCAL_MEM_FENCE);\n" 1106"float D = A11 * A22 - A12 * A12;\n" 1107"if (D < 1.192092896e-07f)\n" 1108"{\n" 1109"if (tid == 0 && level == 0)\n" 1110"status[gid] = 0;\n" 1111"return;\n" 1112"}\n" 1113"A11 /= D;\n" 1114"A12 /= D;\n" 1115"A22 /= D;\n" 1116"prevPt = nextPts[gid] * 2.0f - c_halfWin;\n" 1117"for (k = 0; k < c_iters; ++k)\n" 1118"{\n" 1119"if (prevPt.x < -c_halfWin.x || prevPt.x >= cols || prevPt.y < -c_halfWin.y || prevPt.y >= rows)\n" 1120"{\n" 1121"if (tid == 0 && level == 0)\n" 1122"status[gid] = 0;\n" 1123"break;\n" 1124"}\n" 1125"float b1 = 0;\n" 1126"float b2 = 0;\n" 1127"yBase=yid;\n" 1128"{\n" 1129"xBase=xid;\n" 1130"GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n" 1131"&I_patch[0][0], &dIdx_patch[0][0], &dIdy_patch[0][0],\n" 1132"&b1, &b2);\n" 1133"xBase+=xsize;\n" 1134"GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n" 1135"&I_patch[0][1], &dIdx_patch[0][1], &dIdy_patch[0][1],\n" 1136"&b1, &b2);\n" 1137"xBase+=xsize;\n" 1138"GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n" 1139"&I_patch[0][2], &dIdx_patch[0][2], &dIdy_patch[0][2],\n" 1140"&b1, &b2);\n" 1141"}\n" 1142"yBase+=ysize;\n" 1143"{\n" 1144"xBase=xid;\n" 1145"GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n" 1146"&I_patch[1][0], &dIdx_patch[1][0], &dIdy_patch[1][0],\n" 1147"&b1, &b2);\n" 1148"xBase+=xsize;\n" 1149"GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n" 1150"&I_patch[1][1], &dIdx_patch[1][1], &dIdy_patch[1][1],\n" 1151"&b1, &b2);\n" 1152"xBase+=xsize;\n" 1153"GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n" 1154"&I_patch[1][2], &dIdx_patch[1][2], &dIdy_patch[1][2],\n" 1155"&b1, &b2);\n" 1156"}\n" 1157"yBase+=ysize;\n" 1158"{\n" 1159"xBase=xid;\n" 1160"GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n" 1161"&I_patch[2][0], &dIdx_patch[2][0], &dIdy_patch[2][0],\n" 1162"&b1, &b2);\n" 1163"xBase+=xsize;\n" 1164"GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n" 1165"&I_patch[2][1], &dIdx_patch[2][1], &dIdy_patch[2][1],\n" 1166"&b1, &b2);\n" 1167"xBase+=xsize;\n" 1168"GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n" 1169"&I_patch[2][2], &dIdx_patch[2][2], &dIdy_patch[2][2],\n" 1170"&b1, &b2);\n" 1171"}\n" 1172"reduce2(b1, b2, smem1, smem2, tid);\n" 1173"b1 = smem1[0];\n" 1174"b2 = smem2[0];\n" 1175"barrier(CLK_LOCAL_MEM_FENCE);\n" 1176"float2 delta;\n" 1177"delta.x = A12 * b2 - A22 * b1;\n" 1178"delta.y = A12 * b1 - A11 * b2;\n" 1179"prevPt += delta;\n" 1180"if (fabs(delta.x) < THRESHOLD && fabs(delta.y) < THRESHOLD)\n" 1181"break;\n" 1182"}\n" 1183"D = 0.0f;\n" 1184"if (calcErr)\n" 1185"{\n" 1186"yBase=yid;\n" 1187"{\n" 1188"xBase=xid;\n" 1189"GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n" 1190"&I_patch[0][0], &D);\n" 1191"xBase+=xsize;\n" 1192"GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n" 1193"&I_patch[0][1], &D);\n" 1194"xBase+=xsize;\n" 1195"if(xBase<c_winSize_x)\n" 1196"GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n" 1197"&I_patch[0][2], &D);\n" 1198"}\n" 1199"yBase+=ysize;\n" 1200"{\n" 1201"xBase=xid;\n" 1202"GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n" 1203"&I_patch[1][0], &D);\n" 1204"xBase+=xsize;\n" 1205"GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n" 1206"&I_patch[1][1], &D);\n" 1207"xBase+=xsize;\n" 1208"if(xBase<c_winSize_x)\n" 1209"GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n" 1210"&I_patch[1][2], &D);\n" 1211"}\n" 1212"yBase+=ysize;\n" 1213"if(yBase<c_winSize_y)\n" 1214"{\n" 1215"xBase=xid;\n" 1216"GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n" 1217"&I_patch[2][0], &D);\n" 1218"xBase+=xsize;\n" 1219"GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n" 1220"&I_patch[2][1], &D);\n" 1221"xBase+=xsize;\n" 1222"if(xBase<c_winSize_x)\n" 1223"GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n" 1224"&I_patch[2][2], &D);\n" 1225"}\n" 1226"reduce1(D, smem1, tid);\n" 1227"}\n" 1228"if (tid == 0)\n" 1229"{\n" 1230"prevPt += c_halfWin;\n" 1231"nextPts[gid] = prevPt;\n" 1232"if (calcErr)\n" 1233"err[gid] = smem1[0] / (float)(c_winSize_x * c_winSize_y);\n" 1234"}\n" 1235"}\n" 1236, "b7099fcbc60bd5528dacc491eadd88c1"}; 1237ProgramSource pyrlk_oclsrc(pyrlk.programStr); 1238} 1239}} 1240