// This file is auto-generated. Do not edit! #include "opencv2/core.hpp" #include "cvconfig.h" #include "opencl_kernels_video.hpp" #ifdef HAVE_OPENCL namespace cv { namespace ocl { namespace video { static const char* const moduleName = "video"; struct cv::ocl::internal::ProgramEntry bgfg_knn_oclsrc={moduleName, "bgfg_knn", "#if CN==1\n" "#define T_MEAN float\n" "#define F_ZERO (0.0f)\n" "#define frameToMean(a, b) (b) = *(a);\n" "#define meanToFrame(a, b) *b = convert_uchar_sat(a);\n" "#else\n" "#define T_MEAN float4\n" "#define F_ZERO (0.0f, 0.0f, 0.0f, 0.0f)\n" "#define meanToFrame(a, b)\\\n" "b[0] = convert_uchar_sat(a.x); \\\n" "b[1] = convert_uchar_sat(a.y); \\\n" "b[2] = convert_uchar_sat(a.z);\n" "#define frameToMean(a, b)\\\n" "b.x = a[0]; \\\n" "b.y = a[1]; \\\n" "b.z = a[2]; \\\n" "b.w = 0.0f;\n" "#endif\n" "__kernel void knn_kernel(__global const uchar* frame, int frame_step, int frame_offset, int frame_row, int frame_col,\n" "__global const uchar* nNextLongUpdate,\n" "__global const uchar* nNextMidUpdate,\n" "__global const uchar* nNextShortUpdate,\n" "__global uchar* aModelIndexLong,\n" "__global uchar* aModelIndexMid,\n" "__global uchar* aModelIndexShort,\n" "__global uchar* flag,\n" "__global uchar* sample,\n" "__global uchar* fgmask, int fgmask_step, int fgmask_offset,\n" "int nLongCounter, int nMidCounter, int nShortCounter,\n" "float c_Tb, int c_nkNN, float c_tau\n" "#ifdef SHADOW_DETECT\n" ", uchar c_shadowVal\n" "#endif\n" ")\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1);\n" "if( x < frame_col && y < frame_row)\n" "{\n" "__global const uchar* _frame = (frame + mad24(y, frame_step, mad24(x, CN, frame_offset)));\n" "T_MEAN pix;\n" "frameToMean(_frame, pix);\n" "uchar foreground = 255;\n" "int Pbf = 0;\n" "int Pb = 0;\n" "uchar include = 0;\n" "int pt_idx = mad24(y, frame_col, x);\n" "int idx_step = frame_row * frame_col;\n" "__global T_MEAN* _sample = (__global T_MEAN*)(sample);\n" "for (uchar n = 0; n < (NSAMPLES) * 3 ; ++n)\n" "{\n" "int n_idx = mad24(n, idx_step, pt_idx);\n" "T_MEAN c_mean = _sample[n_idx];\n" "uchar c_flag = flag[n_idx];\n" "T_MEAN diff = c_mean - pix;\n" "float dist2 = dot(diff, diff);\n" "if (dist2 < c_Tb)\n" "{\n" "Pbf++;\n" "if (c_flag)\n" "{\n" "Pb++;\n" "if (Pb >= c_nkNN)\n" "{\n" "include = 1;\n" "foreground = 0;\n" "break;\n" "}\n" "}\n" "}\n" "}\n" "if (Pbf >= c_nkNN)\n" "{\n" "include = 1;\n" "}\n" "#ifdef SHADOW_DETECT\n" "if (foreground)\n" "{\n" "int Ps = 0;\n" "for (uchar n = 0; n < (NSAMPLES) * 3 ; ++n)\n" "{\n" "int n_idx = mad24(n, idx_step, pt_idx);\n" "uchar c_flag = flag[n_idx];\n" "if (c_flag)\n" "{\n" "T_MEAN c_mean = _sample[n_idx];\n" "float numerator = dot(pix, c_mean);\n" "float denominator = dot(c_mean, c_mean);\n" "if (denominator == 0)\n" "break;\n" "if (numerator <= denominator && numerator >= c_tau * denominator)\n" "{\n" "float a = numerator / denominator;\n" "T_MEAN dD = mad(a, c_mean, -pix);\n" "if (dot(dD, dD) < c_Tb * a * a)\n" "{\n" "Ps++;\n" "if (Ps >= c_nkNN)\n" "{\n" "foreground = c_shadowVal;\n" "break;\n" "}\n" "}\n" "}\n" "}\n" "}\n" "}\n" "#endif\n" "__global uchar* _fgmask = fgmask + mad24(y, fgmask_step, x + fgmask_offset);\n" "*_fgmask = (uchar)foreground;\n" "__global const uchar* _nNextLongUpdate = nNextLongUpdate + pt_idx;\n" "__global const uchar* _nNextMidUpdate = nNextMidUpdate + pt_idx;\n" "__global const uchar* _nNextShortUpdate = nNextShortUpdate + pt_idx;\n" "__global uchar* _aModelIndexLong = aModelIndexLong + pt_idx;\n" "__global uchar* _aModelIndexMid = aModelIndexMid + pt_idx;\n" "__global uchar* _aModelIndexShort = aModelIndexShort + pt_idx;\n" "uchar nextLongUpdate = _nNextLongUpdate[0];\n" "uchar nextMidUpdate = _nNextMidUpdate[0];\n" "uchar nextShortUpdate = _nNextShortUpdate[0];\n" "uchar modelIndexLong = _aModelIndexLong[0];\n" "uchar modelIndexMid = _aModelIndexMid[0];\n" "uchar modelIndexShort = _aModelIndexShort[0];\n" "int offsetLong = mad24(mad24(2, (NSAMPLES), modelIndexLong), idx_step, pt_idx);\n" "int offsetMid = mad24((NSAMPLES)+modelIndexMid, idx_step, pt_idx);\n" "int offsetShort = mad24(modelIndexShort, idx_step, pt_idx);\n" "if (nextLongUpdate == nLongCounter)\n" "{\n" "_sample[offsetLong] = _sample[offsetMid];\n" "flag[offsetLong] = flag[offsetMid];\n" "_aModelIndexLong[0] = (modelIndexLong >= ((NSAMPLES)-1)) ? 0 : (modelIndexLong + 1);\n" "}\n" "if (nextMidUpdate == nMidCounter)\n" "{\n" "_sample[offsetMid] = _sample[offsetShort];\n" "flag[offsetMid] = flag[offsetShort];\n" "_aModelIndexMid[0] = (modelIndexMid >= ((NSAMPLES)-1)) ? 0 : (modelIndexMid + 1);\n" "}\n" "if (nextShortUpdate == nShortCounter)\n" "{\n" "_sample[offsetShort] = pix;\n" "flag[offsetShort] = include;\n" "_aModelIndexShort[0] = (modelIndexShort >= ((NSAMPLES)-1)) ? 0 : (modelIndexShort + 1);\n" "}\n" "}\n" "}\n" "__kernel void getBackgroundImage2_kernel(__global const uchar* flag,\n" "__global const uchar* sample,\n" "__global uchar* dst, int dst_step, int dst_offset, int dst_row, int dst_col)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1);\n" "if(x < dst_col && y < dst_row)\n" "{\n" "int pt_idx = mad24(y, dst_col, x);\n" "T_MEAN meanVal = (T_MEAN)F_ZERO;\n" "__global T_MEAN* _sample = (__global T_MEAN*)(sample);\n" "int idx_step = dst_row * dst_col;\n" "for (uchar n = 0; n < (NSAMPLES) * 3 ; ++n)\n" "{\n" "int n_idx = mad24(n, idx_step, pt_idx);\n" "uchar c_flag = flag[n_idx];\n" "if(c_flag)\n" "{\n" "meanVal = _sample[n_idx];\n" "break;\n" "}\n" "}\n" "__global uchar* _dst = dst + mad24(y, dst_step, mad24(x, CN, dst_offset));\n" "meanToFrame(meanVal, _dst);\n" "}\n" "}\n" , "a192721ae727b25afde8d854a0679ee4", NULL}; struct cv::ocl::internal::ProgramEntry bgfg_mog2_oclsrc={moduleName, "bgfg_mog2", "#if CN==1\n" "#define T_MEAN float\n" "#define F_ZERO (0.0f)\n" "#define cnMode 1\n" "#define frameToMean(a, b) (b) = *(a);\n" "#if FL==0\n" "#define meanToFrame(a, b) *b = convert_uchar_sat(a);\n" "#else\n" "#define meanToFrame(a, b) *b = (float)a;\n" "#endif\n" "#else\n" "#define T_MEAN float4\n" "#define F_ZERO (0.0f, 0.0f, 0.0f, 0.0f)\n" "#define cnMode 4\n" "#if FL == 0\n" "#define meanToFrame(a, b)\\\n" "b[0] = convert_uchar_sat(a.x); \\\n" "b[1] = convert_uchar_sat(a.y); \\\n" "b[2] = convert_uchar_sat(a.z);\n" "#else\n" "#define meanToFrame(a, b)\\\n" "b[0] = a.x; \\\n" "b[1] = a.y; \\\n" "b[2] = a.z;\n" "#endif\n" "#define frameToMean(a, b)\\\n" "b.x = a[0]; \\\n" "b.y = a[1]; \\\n" "b.z = a[2]; \\\n" "b.w = 0.0f;\n" "#endif\n" "__kernel void mog2_kernel(__global const uchar* frame, int frame_step, int frame_offset, int frame_row, int frame_col,\n" "__global uchar* modesUsed,\n" "__global uchar* weight,\n" "__global uchar* mean,\n" "__global uchar* variance,\n" "__global uchar* fgmask, int fgmask_step, int fgmask_offset,\n" "float alphaT, float alpha1, float prune,\n" "float c_Tb, float c_TB, float c_Tg, float c_varMin,\n" "float c_varMax, float c_varInit, float c_tau\n" "#ifdef SHADOW_DETECT\n" ", uchar c_shadowVal\n" "#endif\n" ")\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1);\n" "if( x < frame_col && y < frame_row)\n" "{\n" "#if FL==0\n" "__global const uchar* _frame = (frame + mad24(y, frame_step, mad24(x, CN, frame_offset)));\n" "#else\n" "__global const float* _frame = ((__global const float*)( frame + mad24(y, frame_step, frame_offset)) + mad24(x, CN, 0));\n" "#endif\n" "T_MEAN pix;\n" "frameToMean(_frame, pix);\n" "uchar foreground = 255;\n" "bool fitsPDF = false;\n" "int pt_idx = mad24(y, frame_col, x);\n" "int idx_step = frame_row * frame_col;\n" "__global uchar* _modesUsed = modesUsed + pt_idx;\n" "uchar nmodes = _modesUsed[0];\n" "float totalWeight = 0.0f;\n" "__global float* _weight = (__global float*)(weight);\n" "__global float* _variance = (__global float*)(variance);\n" "__global T_MEAN* _mean = (__global T_MEAN*)(mean);\n" "uchar mode = 0;\n" "for (; mode < nmodes; ++mode)\n" "{\n" "int mode_idx = mad24(mode, idx_step, pt_idx);\n" "float c_weight = mad(alpha1, _weight[mode_idx], prune);\n" "float c_var = _variance[mode_idx];\n" "T_MEAN c_mean = _mean[mode_idx];\n" "T_MEAN diff = c_mean - pix;\n" "float dist2 = dot(diff, diff);\n" "if (totalWeight < c_TB && dist2 < c_Tb * c_var)\n" "foreground = 0;\n" "if (dist2 < c_Tg * c_var)\n" "{\n" "fitsPDF = true;\n" "c_weight += alphaT;\n" "float k = alphaT / c_weight;\n" "T_MEAN mean_new = mad((T_MEAN)-k, diff, c_mean);\n" "float variance_new = clamp(mad(k, (dist2 - c_var), c_var), c_varMin, c_varMax);\n" "for (int i = mode; i > 0; --i)\n" "{\n" "int prev_idx = mode_idx - idx_step;\n" "if (c_weight < _weight[prev_idx])\n" "break;\n" "_weight[mode_idx] = _weight[prev_idx];\n" "_variance[mode_idx] = _variance[prev_idx];\n" "_mean[mode_idx] = _mean[prev_idx];\n" "mode_idx = prev_idx;\n" "}\n" "_mean[mode_idx] = mean_new;\n" "_variance[mode_idx] = variance_new;\n" "_weight[mode_idx] = c_weight;\n" "totalWeight += c_weight;\n" "mode ++;\n" "break;\n" "}\n" "if (c_weight < -prune)\n" "c_weight = 0.0f;\n" "_weight[mode_idx] = c_weight;\n" "totalWeight += c_weight;\n" "}\n" "for (; mode < nmodes; ++mode)\n" "{\n" "int mode_idx = mad24(mode, idx_step, pt_idx);\n" "float c_weight = mad(alpha1, _weight[mode_idx], prune);\n" "if (c_weight < -prune)\n" "{\n" "c_weight = 0.0f;\n" "nmodes = mode;\n" "break;\n" "}\n" "_weight[mode_idx] = c_weight;\n" "totalWeight += c_weight;\n" "}\n" "if (0.f < totalWeight)\n" "{\n" "totalWeight = 1.f / totalWeight;\n" "for (int mode = 0; mode < nmodes; ++mode)\n" "_weight[mad24(mode, idx_step, pt_idx)] *= totalWeight;\n" "}\n" "if (!fitsPDF)\n" "{\n" "uchar mode = nmodes == (NMIXTURES) ? (NMIXTURES) - 1 : nmodes++;\n" "int mode_idx = mad24(mode, idx_step, pt_idx);\n" "if (nmodes == 1)\n" "_weight[mode_idx] = 1.f;\n" "else\n" "{\n" "_weight[mode_idx] = alphaT;\n" "for (int i = pt_idx; i < mode_idx; i += idx_step)\n" "_weight[i] *= alpha1;\n" "}\n" "for (int i = nmodes - 1; i > 0; --i)\n" "{\n" "int prev_idx = mode_idx - idx_step;\n" "if (alphaT < _weight[prev_idx])\n" "break;\n" "_weight[mode_idx] = _weight[prev_idx];\n" "_variance[mode_idx] = _variance[prev_idx];\n" "_mean[mode_idx] = _mean[prev_idx];\n" "mode_idx = prev_idx;\n" "}\n" "_mean[mode_idx] = pix;\n" "_variance[mode_idx] = c_varInit;\n" "}\n" "_modesUsed[0] = nmodes;\n" "#ifdef SHADOW_DETECT\n" "if (foreground)\n" "{\n" "float tWeight = 0.0f;\n" "for (uchar mode = 0; mode < nmodes; ++mode)\n" "{\n" "int mode_idx = mad24(mode, idx_step, pt_idx);\n" "T_MEAN c_mean = _mean[mode_idx];\n" "float numerator = dot(pix, c_mean);\n" "float denominator = dot(c_mean, c_mean);\n" "if (denominator == 0)\n" "break;\n" "if (numerator <= denominator && numerator >= c_tau * denominator)\n" "{\n" "float a = numerator / denominator;\n" "T_MEAN dD = mad(a, c_mean, -pix);\n" "if (dot(dD, dD) < c_Tb * _variance[mode_idx] * a * a)\n" "{\n" "foreground = c_shadowVal;\n" "break;\n" "}\n" "}\n" "tWeight += _weight[mode_idx];\n" "if (tWeight > c_TB)\n" "break;\n" "}\n" "}\n" "#endif\n" "__global uchar* _fgmask = fgmask + mad24(y, fgmask_step, x + fgmask_offset);\n" "*_fgmask = (uchar)foreground;\n" "}\n" "}\n" "__kernel void getBackgroundImage2_kernel(__global const uchar* modesUsed,\n" "__global const uchar* weight,\n" "__global const uchar* mean,\n" "__global uchar* dst, int dst_step, int dst_offset, int dst_row, int dst_col,\n" "float c_TB)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1);\n" "if(x < dst_col && y < dst_row)\n" "{\n" "int pt_idx = mad24(y, dst_col, x);\n" "__global const uchar* _modesUsed = modesUsed + pt_idx;\n" "uchar nmodes = _modesUsed[0];\n" "T_MEAN meanVal = (T_MEAN)F_ZERO;\n" "float totalWeight = 0.0f;\n" "__global const float* _weight = (__global const float*)weight;\n" "__global const T_MEAN* _mean = (__global const T_MEAN*)(mean);\n" "int idx_step = dst_row * dst_col;\n" "for (uchar mode = 0; mode < nmodes; ++mode)\n" "{\n" "int mode_idx = mad24(mode, idx_step, pt_idx);\n" "float c_weight = _weight[mode_idx];\n" "T_MEAN c_mean = _mean[mode_idx];\n" "meanVal = mad(c_weight, c_mean, meanVal);\n" "totalWeight += c_weight;\n" "if (totalWeight > c_TB)\n" "break;\n" "}\n" "if (0.f < totalWeight)\n" "meanVal = meanVal / totalWeight;\n" "else\n" "meanVal = (T_MEAN)(0.f);\n" "#if FL==0\n" "__global uchar* _dst = dst + mad24(y, dst_step, mad24(x, CN, dst_offset));\n" "meanToFrame(meanVal, _dst);\n" "#else\n" "__global float* _dst = ((__global float*)( dst + mad24(y, dst_step, dst_offset)) + mad24(x, CN, 0));\n" "meanToFrame(meanVal, _dst);\n" "#endif\n" "}\n" "}\n" , "39b7e7b52e8eb53029cf2337ae3d904f", NULL}; struct cv::ocl::internal::ProgramEntry dis_flow_oclsrc={moduleName, "dis_flow", "#define EPS 0.001f\n" "#define INF 1E+10F\n" "#define DIS_PATCH_SIZE_HALF (DIS_PATCH_SIZE / 2)\n" "#ifndef DIS_BORDER_SIZE\n" "__kernel void dis_precomputeStructureTensor_hor(__global const short *I0x,\n" "__global const short *I0y,\n" "int w, int h, int ws,\n" "__global float *I0xx_aux_ptr,\n" "__global float *I0yy_aux_ptr,\n" "__global float *I0xy_aux_ptr,\n" "__global float *I0x_aux_ptr,\n" "__global float *I0y_aux_ptr)\n" "{\n" "int i = get_global_id(0);\n" "if (i >= h) return;\n" "const __global short *x_row = I0x + i * w;\n" "const __global short *y_row = I0y + i * w;\n" "float sum_xx = 0.0f, sum_yy = 0.0f, sum_xy = 0.0f, sum_x = 0.0f, sum_y = 0.0f;\n" "float8 x_vec = convert_float8(vload8(0, x_row));\n" "float8 y_vec = convert_float8(vload8(0, y_row));\n" "sum_xx = dot(x_vec.lo, x_vec.lo) + dot(x_vec.hi, x_vec.hi);\n" "sum_yy = dot(y_vec.lo, y_vec.lo) + dot(y_vec.hi, y_vec.hi);\n" "sum_xy = dot(x_vec.lo, y_vec.lo) + dot(x_vec.hi, y_vec.hi);\n" "sum_x = dot(x_vec.lo, 1.0f) + dot(x_vec.hi, 1.0f);\n" "sum_y = dot(y_vec.lo, 1.0f) + dot(y_vec.hi, 1.0f);\n" "I0xx_aux_ptr[i * ws] = sum_xx;\n" "I0yy_aux_ptr[i * ws] = sum_yy;\n" "I0xy_aux_ptr[i * ws] = sum_xy;\n" "I0x_aux_ptr[i * ws] = sum_x;\n" "I0y_aux_ptr[i * ws] = sum_y;\n" "int js = 1;\n" "for (int j = DIS_PATCH_SIZE; j < w; j++)\n" "{\n" "short x_val1 = x_row[j];\n" "short x_val2 = x_row[j - DIS_PATCH_SIZE];\n" "short y_val1 = y_row[j];\n" "short y_val2 = y_row[j - DIS_PATCH_SIZE];\n" "sum_xx += (x_val1 * x_val1 - x_val2 * x_val2);\n" "sum_yy += (y_val1 * y_val1 - y_val2 * y_val2);\n" "sum_xy += (x_val1 * y_val1 - x_val2 * y_val2);\n" "sum_x += (x_val1 - x_val2);\n" "sum_y += (y_val1 - y_val2);\n" "if ((j - DIS_PATCH_SIZE + 1) % DIS_PATCH_STRIDE == 0)\n" "{\n" "int index = i * ws + js;\n" "I0xx_aux_ptr[index] = sum_xx;\n" "I0yy_aux_ptr[index] = sum_yy;\n" "I0xy_aux_ptr[index] = sum_xy;\n" "I0x_aux_ptr[index] = sum_x;\n" "I0y_aux_ptr[index] = sum_y;\n" "js++;\n" "}\n" "}\n" "}\n" "__kernel void dis_precomputeStructureTensor_ver(__global const float *I0xx_aux_ptr,\n" "__global const float *I0yy_aux_ptr,\n" "__global const float *I0xy_aux_ptr,\n" "__global const float *I0x_aux_ptr,\n" "__global const float *I0y_aux_ptr,\n" "int w, int h, int ws,\n" "__global float *I0xx_ptr,\n" "__global float *I0yy_ptr,\n" "__global float *I0xy_ptr,\n" "__global float *I0x_ptr,\n" "__global float *I0y_ptr)\n" "{\n" "int j = get_global_id(0);\n" "if (j >= ws) return;\n" "float sum_xx, sum_yy, sum_xy, sum_x, sum_y;\n" "sum_xx = sum_yy = sum_xy = sum_x = sum_y = 0.0f;\n" "for (int i = 0; i < DIS_PATCH_SIZE; i++)\n" "{\n" "sum_xx += I0xx_aux_ptr[i * ws + j];\n" "sum_yy += I0yy_aux_ptr[i * ws + j];\n" "sum_xy += I0xy_aux_ptr[i * ws + j];\n" "sum_x += I0x_aux_ptr[i * ws + j];\n" "sum_y += I0y_aux_ptr[i * ws + j];\n" "}\n" "I0xx_ptr[j] = sum_xx;\n" "I0yy_ptr[j] = sum_yy;\n" "I0xy_ptr[j] = sum_xy;\n" "I0x_ptr[j] = sum_x;\n" "I0y_ptr[j] = sum_y;\n" "int is = 1;\n" "for (int i = DIS_PATCH_SIZE; i < h; i++)\n" "{\n" "sum_xx += (I0xx_aux_ptr[i * ws + j] - I0xx_aux_ptr[(i - DIS_PATCH_SIZE) * ws + j]);\n" "sum_yy += (I0yy_aux_ptr[i * ws + j] - I0yy_aux_ptr[(i - DIS_PATCH_SIZE) * ws + j]);\n" "sum_xy += (I0xy_aux_ptr[i * ws + j] - I0xy_aux_ptr[(i - DIS_PATCH_SIZE) * ws + j]);\n" "sum_x += (I0x_aux_ptr[i * ws + j] - I0x_aux_ptr[(i - DIS_PATCH_SIZE) * ws + j]);\n" "sum_y += (I0y_aux_ptr[i * ws + j] - I0y_aux_ptr[(i - DIS_PATCH_SIZE) * ws + j]);\n" "if ((i - DIS_PATCH_SIZE + 1) % DIS_PATCH_STRIDE == 0)\n" "{\n" "I0xx_ptr[is * ws + j] = sum_xx;\n" "I0yy_ptr[is * ws + j] = sum_yy;\n" "I0xy_ptr[is * ws + j] = sum_xy;\n" "I0x_ptr[is * ws + j] = sum_x;\n" "I0y_ptr[is * ws + j] = sum_y;\n" "is++;\n" "}\n" "}\n" "}\n" "__kernel void dis_densification(__global const float2 *S_ptr,\n" "__global const uchar *i0, __global const uchar *i1,\n" "int w, int h, int ws,\n" "__global float2 *U_ptr)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1);\n" "int i, j;\n" "if (x >= w || y >= h) return;\n" "int start_is, end_is;\n" "int start_js, end_js;\n" "end_is = min(y / DIS_PATCH_STRIDE, (h - DIS_PATCH_SIZE) / DIS_PATCH_STRIDE);\n" "start_is = max(0, y - DIS_PATCH_SIZE + DIS_PATCH_STRIDE) / DIS_PATCH_STRIDE;\n" "start_is = min(start_is, end_is);\n" "end_js = min(x / DIS_PATCH_STRIDE, (w - DIS_PATCH_SIZE) / DIS_PATCH_STRIDE);\n" "start_js = max(0, x - DIS_PATCH_SIZE + DIS_PATCH_STRIDE) / DIS_PATCH_STRIDE;\n" "start_js = min(start_js, end_js);\n" "float sum_coef = 0.0f;\n" "float2 sum_U = (float2)(0.0f, 0.0f);\n" "int i_l, i_u;\n" "int j_l, j_u;\n" "float i_m, j_m, diff;\n" "i = y;\n" "j = x;\n" "for (int is = start_is; is <= end_is; is++)\n" "for (int js = start_js; js <= end_js; js++)\n" "{\n" "float2 s_val = S_ptr[is * ws + js];\n" "uchar2 i1_vec1, i1_vec2;\n" "j_m = min(max(j + s_val.x, 0.0f), w - 1.0f - EPS);\n" "i_m = min(max(i + s_val.y, 0.0f), h - 1.0f - EPS);\n" "j_l = (int)j_m;\n" "j_u = j_l + 1;\n" "i_l = (int)i_m;\n" "i_u = i_l + 1;\n" "i1_vec1 = vload2(0, i1 + i_u * w + j_l);\n" "i1_vec2 = vload2(0, i1 + i_l * w + j_l);\n" "diff = (j_m - j_l) * (i_m - i_l) * i1_vec1.y +\n" "(j_u - j_m) * (i_m - i_l) * i1_vec1.x +\n" "(j_m - j_l) * (i_u - i_m) * i1_vec2.y +\n" "(j_u - j_m) * (i_u - i_m) * i1_vec2.x - i0[i * w + j];\n" "float coef = 1.0f / max(1.0f, fabs(diff));\n" "sum_U += coef * s_val;\n" "sum_coef += coef;\n" "}\n" "float inv_sum_coef = 1.0 / sum_coef;\n" "U_ptr[i * w + j] = sum_U * inv_sum_coef;\n" "}\n" "#else\n" "#define INIT_BILINEAR_WEIGHTS(Ux, Uy) \\\n" "i_I1 = clamp(i + Uy + DIS_BORDER_SIZE, i_lower_limit, i_upper_limit); \\\n" "j_I1 = clamp(j + Ux + DIS_BORDER_SIZE, j_lower_limit, j_upper_limit); \\\n" "{ \\\n" "float di = i_I1 - floor(i_I1); \\\n" "float dj = j_I1 - floor(j_I1); \\\n" "w11 = di * dj; \\\n" "w10 = di * (1 - dj); \\\n" "w01 = (1 - di) * dj; \\\n" "w00 = (1 - di) * (1 - dj); \\\n" "}\n" "float computeSSDMeanNorm(const __global uchar *I0_ptr, const __global uchar *I1_ptr,\n" "int I0_stride, int I1_stride,\n" "float w00, float w01, float w10, float w11, int i\n" "#ifndef CV_USE_SUBGROUPS\n" ", __local float2 *smem \n" "#endif\n" ")\n" "{\n" "float sum_diff = 0.0f, sum_diff_sq = 0.0f;\n" "int n = DIS_PATCH_SIZE * DIS_PATCH_SIZE;\n" "uchar8 I1_vec1, I1_vec2, I0_vec;\n" "uchar I1_val1, I1_val2;\n" "I0_vec = vload8(0, I0_ptr + i * I0_stride);\n" "I1_vec1 = vload8(0, I1_ptr + i * I1_stride);\n" "I1_vec2 = vload8(0, I1_ptr + (i + 1) * I1_stride);\n" "I1_val1 = I1_ptr[i * I1_stride + 8];\n" "I1_val2 = I1_ptr[(i + 1) * I1_stride + 8];\n" "float8 vec = w00 * convert_float8(I1_vec1) + w01 * convert_float8((uchar8)(I1_vec1.s123, I1_vec1.s4567, I1_val1)) +\n" "w10 * convert_float8(I1_vec2) + w11 * convert_float8((uchar8)(I1_vec2.s123, I1_vec2.s4567, I1_val2)) -\n" "convert_float8(I0_vec);\n" "sum_diff = (dot(vec.lo, 1.0) + dot(vec.hi, 1.0));\n" "sum_diff_sq = (dot(vec.lo, vec.lo) + dot(vec.hi, vec.hi));\n" "#ifdef CV_USE_SUBGROUPS\n" "sum_diff = sub_group_reduce_add(sum_diff);\n" "sum_diff_sq = sub_group_reduce_add(sum_diff_sq);\n" "#else\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "smem[i] = (float2)(sum_diff, sum_diff_sq);\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if (i < 4)\n" "smem[i] += smem[i + 4];\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if (i < 2)\n" "smem[i] += smem[i + 2];\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if (i == 0)\n" "smem[0] += smem[1];\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "float2 reduce_add_result = smem[0];\n" "sum_diff = reduce_add_result.x;\n" "sum_diff_sq = reduce_add_result.y;\n" "#endif\n" "return sum_diff_sq - sum_diff * sum_diff / n;\n" "}\n" "__attribute__((reqd_work_group_size(8, 1, 1)))\n" "__kernel void dis_patch_inverse_search_fwd_1(__global const float2 *U_ptr,\n" "__global const uchar *I0_ptr, __global const uchar *I1_ptr,\n" "int w, int h, int ws, int hs,\n" "__global float2 *S_ptr)\n" "{\n" "int id = get_global_id(0);\n" "int is = get_group_id(0);\n" "int i = is * DIS_PATCH_STRIDE;\n" "int j = 0;\n" "int w_ext = w + 2 * DIS_BORDER_SIZE;\n" "float i_lower_limit = DIS_BORDER_SIZE - DIS_PATCH_SIZE + 1.0f;\n" "float i_upper_limit = DIS_BORDER_SIZE + h - 1.0f;\n" "float j_lower_limit = DIS_BORDER_SIZE - DIS_PATCH_SIZE + 1.0f;\n" "float j_upper_limit = DIS_BORDER_SIZE + w - 1.0f;\n" "float2 prev_U = U_ptr[(i + DIS_PATCH_SIZE_HALF) * w + j + DIS_PATCH_SIZE_HALF];\n" "S_ptr[is * ws] = prev_U;\n" "j += DIS_PATCH_STRIDE;\n" "#ifdef CV_USE_SUBGROUPS\n" "int sid = get_sub_group_local_id();\n" "#define EXTRA_ARGS_computeSSDMeanNorm sid\n" "#else\n" "__local float2 smem[8];\n" "int sid = get_local_id(0);\n" "#define EXTRA_ARGS_computeSSDMeanNorm sid, smem\n" "#endif\n" "for (int js = 1; js < ws; js++, j += DIS_PATCH_STRIDE)\n" "{\n" "float2 U = U_ptr[(i + DIS_PATCH_SIZE_HALF) * w + j + DIS_PATCH_SIZE_HALF];\n" "float i_I1, j_I1, w00, w01, w10, w11;\n" "INIT_BILINEAR_WEIGHTS(U.x, U.y);\n" "float min_SSD = computeSSDMeanNorm(\n" "I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,\n" "w, w_ext, w00, w01, w10, w11, EXTRA_ARGS_computeSSDMeanNorm);\n" "INIT_BILINEAR_WEIGHTS(prev_U.x, prev_U.y);\n" "float cur_SSD = computeSSDMeanNorm(\n" "I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,\n" "w, w_ext, w00, w01, w10, w11, EXTRA_ARGS_computeSSDMeanNorm);\n" "prev_U = (cur_SSD < min_SSD) ? prev_U : U;\n" "S_ptr[is * ws + js] = prev_U;\n" "}\n" "#undef EXTRA_ARGS_computeSSDMeanNorm\n" "}\n" "#endif\n" "float4 processPatchMeanNorm(const __global uchar *I0_ptr, const __global uchar *I1_ptr,\n" "const __global short *I0x_ptr, const __global short *I0y_ptr,\n" "int I0_stride, int I1_stride, float w00, float w01, float w10,\n" "float w11, float x_grad_sum, float y_grad_sum)\n" "{\n" "const float inv_n = 1.0f / (float)(DIS_PATCH_SIZE * DIS_PATCH_SIZE);\n" "float sum_diff = 0.0, sum_diff_sq = 0.0;\n" "float sum_I0x_mul = 0.0, sum_I0y_mul = 0.0;\n" "uchar8 I1_vec1;\n" "uchar8 I1_vec2 = vload8(0, I1_ptr);\n" "uchar I1_val1;\n" "uchar I1_val2 = I1_ptr[DIS_PATCH_SIZE];\n" "for (int i = 0; i < 8; i++)\n" "{\n" "uchar8 I0_vec = vload8(0, I0_ptr + i * I0_stride);\n" "I1_vec1 = I1_vec2;\n" "I1_vec2 = vload8(0, I1_ptr + (i + 1) * I1_stride);\n" "I1_val1 = I1_val2;\n" "I1_val2 = I1_ptr[(i + 1) * I1_stride + DIS_PATCH_SIZE];\n" "float8 vec = w00 * convert_float8(I1_vec1) + w01 * convert_float8((uchar8)(I1_vec1.s123, I1_vec1.s4567, I1_val1)) +\n" "w10 * convert_float8(I1_vec2) + w11 * convert_float8((uchar8)(I1_vec2.s123, I1_vec2.s4567, I1_val2)) -\n" "convert_float8(I0_vec);\n" "sum_diff += (dot(vec.lo, 1.0) + dot(vec.hi, 1.0));\n" "sum_diff_sq += (dot(vec.lo, vec.lo) + dot(vec.hi, vec.hi));\n" "short8 I0x_vec = vload8(0, I0x_ptr + i * I0_stride);\n" "short8 I0y_vec = vload8(0, I0y_ptr + i * I0_stride);\n" "sum_I0x_mul += dot(vec.lo, convert_float4(I0x_vec.lo));\n" "sum_I0x_mul += dot(vec.hi, convert_float4(I0x_vec.hi));\n" "sum_I0y_mul += dot(vec.lo, convert_float4(I0y_vec.lo));\n" "sum_I0y_mul += dot(vec.hi, convert_float4(I0y_vec.hi));\n" "}\n" "float dst_dUx = sum_I0x_mul - sum_diff * x_grad_sum * inv_n;\n" "float dst_dUy = sum_I0y_mul - sum_diff * y_grad_sum * inv_n;\n" "float SSD = sum_diff_sq - sum_diff * sum_diff * inv_n;\n" "return (float4)(SSD, dst_dUx, dst_dUy, 0);\n" "}\n" "#ifdef DIS_BORDER_SIZE\n" "__kernel void dis_patch_inverse_search_fwd_2(__global const float2 *U_ptr,\n" "__global const uchar *I0_ptr, __global const uchar *I1_ptr,\n" "__global const short *I0x_ptr, __global const short *I0y_ptr,\n" "__global const float *xx_ptr, __global const float *yy_ptr,\n" "__global const float *xy_ptr,\n" "__global const float *x_ptr, __global const float *y_ptr,\n" "int w, int h, int ws, int hs, int num_inner_iter,\n" "__global float2 *S_ptr)\n" "{\n" "int js = get_global_id(0);\n" "int is = get_global_id(1);\n" "int i = is * DIS_PATCH_STRIDE;\n" "int j = js * DIS_PATCH_STRIDE;\n" "const int psz = DIS_PATCH_SIZE;\n" "int w_ext = w + 2 * DIS_BORDER_SIZE;\n" "int index = is * ws + js;\n" "if (js >= ws || is >= hs) return;\n" "float2 U0 = S_ptr[index];\n" "float2 cur_U = U0;\n" "float cur_xx = xx_ptr[index];\n" "float cur_yy = yy_ptr[index];\n" "float cur_xy = xy_ptr[index];\n" "float detH = cur_xx * cur_yy - cur_xy * cur_xy;\n" "float inv_detH = (fabs(detH) < EPS) ? 1.0 / EPS : 1.0 / detH;\n" "float invH11 = cur_yy * inv_detH;\n" "float invH12 = -cur_xy * inv_detH;\n" "float invH22 = cur_xx * inv_detH;\n" "float prev_SSD = INF;\n" "float x_grad_sum = x_ptr[index];\n" "float y_grad_sum = y_ptr[index];\n" "const float i_lower_limit = DIS_BORDER_SIZE - DIS_PATCH_SIZE + 1.0f;\n" "const float i_upper_limit = DIS_BORDER_SIZE + h - 1.0f;\n" "const float j_lower_limit = DIS_BORDER_SIZE - DIS_PATCH_SIZE + 1.0f;\n" "const float j_upper_limit = DIS_BORDER_SIZE + w - 1.0f;\n" "for (int t = 0; t < num_inner_iter; t++)\n" "{\n" "float i_I1, j_I1, w00, w01, w10, w11;\n" "INIT_BILINEAR_WEIGHTS(cur_U.x, cur_U.y);\n" "float4 res = processPatchMeanNorm(\n" "I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,\n" "I0x_ptr + i * w + j, I0y_ptr + i * w + j,\n" "w, w_ext, w00, w01, w10, w11,\n" "x_grad_sum, y_grad_sum);\n" "float SSD = res.x;\n" "float dUx = res.y;\n" "float dUy = res.z;\n" "float dx = invH11 * dUx + invH12 * dUy;\n" "float dy = invH12 * dUx + invH22 * dUy;\n" "cur_U -= (float2)(dx, dy);\n" "if (SSD >= prev_SSD)\n" "break;\n" "prev_SSD = SSD;\n" "}\n" "float2 vec = cur_U - U0;\n" "S_ptr[index] = (dot(vec, vec) <= (float)(DIS_PATCH_SIZE * DIS_PATCH_SIZE)) ? cur_U : U0;\n" "}\n" "__attribute__((reqd_work_group_size(8, 1, 1)))\n" "__kernel void dis_patch_inverse_search_bwd_1(__global const uchar *I0_ptr, __global const uchar *I1_ptr,\n" "int w, int h, int ws, int hs,\n" "__global float2 *S_ptr)\n" "{\n" "int id = get_global_id(0);\n" "int is = get_group_id(0);\n" "is = (hs - 1 - is);\n" "int i = is * DIS_PATCH_STRIDE;\n" "int j = (ws - 2) * DIS_PATCH_STRIDE;\n" "const int w_ext = w + 2 * DIS_BORDER_SIZE;\n" "const float i_lower_limit = DIS_BORDER_SIZE - DIS_PATCH_SIZE + 1.0f;\n" "const float i_upper_limit = DIS_BORDER_SIZE + h - 1.0f;\n" "const float j_lower_limit = DIS_BORDER_SIZE - DIS_PATCH_SIZE + 1.0f;\n" "const float j_upper_limit = DIS_BORDER_SIZE + w - 1.0f;\n" "#ifdef CV_USE_SUBGROUPS\n" "int sid = get_sub_group_local_id();\n" "#define EXTRA_ARGS_computeSSDMeanNorm sid\n" "#else\n" "__local float2 smem[8];\n" "int sid = get_local_id(0);\n" "#define EXTRA_ARGS_computeSSDMeanNorm sid, smem\n" "#endif\n" "for (int js = (ws - 2); js > -1; js--, j -= DIS_PATCH_STRIDE)\n" "{\n" "float2 U0 = S_ptr[is * ws + js];\n" "float2 U1 = S_ptr[is * ws + js + 1];\n" "float i_I1, j_I1, w00, w01, w10, w11;\n" "INIT_BILINEAR_WEIGHTS(U0.x, U0.y);\n" "float min_SSD = computeSSDMeanNorm(\n" "I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,\n" "w, w_ext, w00, w01, w10, w11, EXTRA_ARGS_computeSSDMeanNorm);\n" "INIT_BILINEAR_WEIGHTS(U1.x, U1.y);\n" "float cur_SSD = computeSSDMeanNorm(\n" "I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,\n" "w, w_ext, w00, w01, w10, w11, EXTRA_ARGS_computeSSDMeanNorm);\n" "S_ptr[is * ws + js] = (cur_SSD < min_SSD) ? U1 : U0;\n" "}\n" "#undef EXTRA_ARGS_computeSSDMeanNorm\n" "}\n" "__kernel void dis_patch_inverse_search_bwd_2(__global const uchar *I0_ptr, __global const uchar *I1_ptr,\n" "__global const short *I0x_ptr, __global const short *I0y_ptr,\n" "__global const float *xx_ptr, __global const float *yy_ptr,\n" "__global const float *xy_ptr,\n" "__global const float *x_ptr, __global const float *y_ptr,\n" "int w, int h, int ws, int hs, int num_inner_iter,\n" "__global float2 *S_ptr)\n" "{\n" "int js = get_global_id(0);\n" "int is = get_global_id(1);\n" "if (js >= ws || is >= hs) return;\n" "js = (ws - 1 - js);\n" "is = (hs - 1 - is);\n" "int j = js * DIS_PATCH_STRIDE;\n" "int i = is * DIS_PATCH_STRIDE;\n" "int w_ext = w + 2 * DIS_BORDER_SIZE;\n" "int index = is * ws + js;\n" "float2 U0 = S_ptr[index];\n" "float2 cur_U = U0;\n" "float cur_xx = xx_ptr[index];\n" "float cur_yy = yy_ptr[index];\n" "float cur_xy = xy_ptr[index];\n" "float detH = cur_xx * cur_yy - cur_xy * cur_xy;\n" "float inv_detH = (fabs(detH) < EPS) ? 1.0 / EPS : 1.0 / detH;\n" "float invH11 = cur_yy * inv_detH;\n" "float invH12 = -cur_xy * inv_detH;\n" "float invH22 = cur_xx * inv_detH;\n" "float prev_SSD = INF;\n" "float x_grad_sum = x_ptr[index];\n" "float y_grad_sum = y_ptr[index];\n" "const float i_lower_limit = DIS_BORDER_SIZE - DIS_PATCH_SIZE + 1.0f;\n" "const float i_upper_limit = DIS_BORDER_SIZE + h - 1.0f;\n" "const float j_lower_limit = DIS_BORDER_SIZE - DIS_PATCH_SIZE + 1.0f;\n" "const float j_upper_limit = DIS_BORDER_SIZE + w - 1.0f;\n" "for (int t = 0; t < num_inner_iter; t++)\n" "{\n" "float i_I1, j_I1, w00, w01, w10, w11;\n" "INIT_BILINEAR_WEIGHTS(cur_U.x, cur_U.y);\n" "float4 res = processPatchMeanNorm(\n" "I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,\n" "I0x_ptr + i * w + j, I0y_ptr + i * w + j,\n" "w, w_ext, w00, w01, w10, w11,\n" "x_grad_sum, y_grad_sum);\n" "float SSD = res.x;\n" "float dUx = res.y;\n" "float dUy = res.z;\n" "float dx = invH11 * dUx + invH12 * dUy;\n" "float dy = invH12 * dUx + invH22 * dUy;\n" "cur_U -= (float2)(dx, dy);\n" "if (SSD >= prev_SSD)\n" "break;\n" "prev_SSD = SSD;\n" "}\n" "float2 vec = cur_U - U0;\n" "S_ptr[index] = ((dot(vec, vec)) <= (float)(DIS_PATCH_SIZE * DIS_PATCH_SIZE)) ? cur_U : U0;\n" "}\n" "#endif\n" , "39015c9b775dfd3350237f6206822f49", NULL}; struct cv::ocl::internal::ProgramEntry optical_flow_farneback_oclsrc={moduleName, "optical_flow_farneback", "#define tx (int)get_local_id(0)\n" "#define ty get_local_id(1)\n" "#define bx get_group_id(0)\n" "#define bdx (int)get_local_size(0)\n" "#define BORDER_SIZE 5\n" "#define MAX_KSIZE_HALF 100\n" "#ifndef polyN\n" "#define polyN 5\n" "#endif\n" "#if USE_DOUBLE\n" "#ifdef cl_amd_fp64\n" "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n" "#elif defined (cl_khr_fp64)\n" "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n" "#endif\n" "#define TYPE double\n" "#define VECTYPE double4\n" "#else\n" "#define TYPE float\n" "#define VECTYPE float4\n" "#endif\n" "__kernel void polynomialExpansion(__global __const float * src, int srcStep,\n" "__global float * dst, int dstStep,\n" "const int rows, const int cols,\n" "__global __const float * c_g,\n" "__global __const float * c_xg,\n" "__global __const float * c_xxg,\n" "__local float * smem,\n" "const VECTYPE ig)\n" "{\n" "const int y = get_global_id(1);\n" "const int x = bx * (bdx - 2*polyN) + tx - polyN;\n" "int xWarped;\n" "__local float *row = smem + tx;\n" "if (y < rows && y >= 0)\n" "{\n" "xWarped = min(max(x, 0), cols - 1);\n" "row[0] = src[mad24(y, srcStep, xWarped)] * c_g[0];\n" "row[bdx] = 0.f;\n" "row[2*bdx] = 0.f;\n" "#pragma unroll\n" "for (int k = 1; k <= polyN; ++k)\n" "{\n" "float t0 = src[mad24(max(y - k, 0), srcStep, xWarped)];\n" "float t1 = src[mad24(min(y + k, rows - 1), srcStep, xWarped)];\n" "row[0] += c_g[k] * (t0 + t1);\n" "row[bdx] += c_xg[k] * (t1 - t0);\n" "row[2*bdx] += c_xxg[k] * (t0 + t1);\n" "}\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if (y < rows && y >= 0 && tx >= polyN && tx + polyN < bdx && x < cols)\n" "{\n" "TYPE b1 = c_g[0] * row[0];\n" "TYPE b3 = c_g[0] * row[bdx];\n" "TYPE b5 = c_g[0] * row[2*bdx];\n" "TYPE b2 = 0, b4 = 0, b6 = 0;\n" "#pragma unroll\n" "for (int k = 1; k <= polyN; ++k)\n" "{\n" "b1 += (row[k] + row[-k]) * c_g[k];\n" "b4 += (row[k] + row[-k]) * c_xxg[k];\n" "b2 += (row[k] - row[-k]) * c_xg[k];\n" "b3 += (row[k + bdx] + row[-k + bdx]) * c_g[k];\n" "b6 += (row[k + bdx] - row[-k + bdx]) * c_xg[k];\n" "b5 += (row[k + 2*bdx] + row[-k + 2*bdx]) * c_g[k];\n" "}\n" "dst[mad24(y, dstStep, xWarped)] = (float)(b3*ig.s0);\n" "dst[mad24(rows + y, dstStep, xWarped)] = (float)(b2*ig.s0);\n" "dst[mad24(2*rows + y, dstStep, xWarped)] = (float)(b1*ig.s1 + b5*ig.s2);\n" "dst[mad24(3*rows + y, dstStep, xWarped)] = (float)(b1*ig.s1 + b4*ig.s2);\n" "dst[mad24(4*rows + y, dstStep, xWarped)] = (float)(b6*ig.s3);\n" "}\n" "}\n" "inline int idx_row_low(const int y, const int last_row)\n" "{\n" "return abs(y) % (last_row + 1);\n" "}\n" "inline int idx_row_high(const int y, const int last_row)\n" "{\n" "return abs(last_row - abs(last_row - y)) % (last_row + 1);\n" "}\n" "inline int idx_col_low(const int x, const int last_col)\n" "{\n" "return abs(x) % (last_col + 1);\n" "}\n" "inline int idx_col_high(const int x, const int last_col)\n" "{\n" "return abs(last_col - abs(last_col - x)) % (last_col + 1);\n" "}\n" "inline int idx_col(const int x, const int last_col)\n" "{\n" "return idx_col_low(idx_col_high(x, last_col), last_col);\n" "}\n" "__kernel void gaussianBlur(__global const float * src, int srcStep,\n" "__global float * dst, int dstStep, const int rows, const int cols,\n" "__global const float * c_gKer, const int ksizeHalf,\n" "__local float * smem)\n" "{\n" "const int y = get_global_id(1);\n" "const int x = get_global_id(0);\n" "__local float *row = smem + ty * (bdx + 2*ksizeHalf);\n" "if (y < rows)\n" "{\n" "for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)\n" "{\n" "int xExt = (int)(bx * bdx) + i - ksizeHalf;\n" "xExt = idx_col(xExt, cols - 1);\n" "row[i] = src[mad24(y, srcStep, xExt)] * c_gKer[0];\n" "for (int j = 1; j <= ksizeHalf; ++j)\n" "row[i] += (src[mad24(idx_row_low(y - j, rows - 1), srcStep, xExt)]\n" "+ src[mad24(idx_row_high(y + j, rows - 1), srcStep, xExt)]) * c_gKer[j];\n" "}\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if (y < rows && y >= 0 && x < cols && x >= 0)\n" "{\n" "row += tx + ksizeHalf;\n" "float res = row[0] * c_gKer[0];\n" "for (int i = 1; i <= ksizeHalf; ++i)\n" "res += (row[-i] + row[i]) * c_gKer[i];\n" "dst[mad24(y, dstStep, x)] = res;\n" "}\n" "}\n" "__kernel void gaussianBlur5(__global const float * src, int srcStep,\n" "__global float * dst, int dstStep,\n" "const int rows, const int cols,\n" "__global const float * c_gKer, const int ksizeHalf,\n" "__local float * smem)\n" "{\n" "const int y = get_global_id(1);\n" "const int x = get_global_id(0);\n" "const int smw = bdx + 2*ksizeHalf;\n" "__local volatile float *row = smem + 5 * ty * smw;\n" "if (y < rows)\n" "{\n" "for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)\n" "{\n" "int xExt = (int)(bx * bdx) + i - ksizeHalf;\n" "xExt = idx_col(xExt, cols - 1);\n" "#pragma unroll\n" "for (int k = 0; k < 5; ++k)\n" "row[k*smw + i] = src[mad24(k*rows + y, srcStep, xExt)] * c_gKer[0];\n" "for (int j = 1; j <= ksizeHalf; ++j)\n" "#pragma unroll\n" "for (int k = 0; k < 5; ++k)\n" "row[k*smw + i] +=\n" "(src[mad24(k*rows + idx_row_low(y - j, rows - 1), srcStep, xExt)] +\n" "src[mad24(k*rows + idx_row_high(y + j, rows - 1), srcStep, xExt)]) * c_gKer[j];\n" "}\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if (y < rows && y >= 0 && x < cols && x >= 0)\n" "{\n" "row += tx + ksizeHalf;\n" "float res[5];\n" "#pragma unroll\n" "for (int k = 0; k < 5; ++k)\n" "res[k] = row[k*smw] * c_gKer[0];\n" "for (int i = 1; i <= ksizeHalf; ++i)\n" "#pragma unroll\n" "for (int k = 0; k < 5; ++k)\n" "res[k] += (row[k*smw - i] + row[k*smw + i]) * c_gKer[i];\n" "#pragma unroll\n" "for (int k = 0; k < 5; ++k)\n" "dst[mad24(k*rows + y, dstStep, x)] = res[k];\n" "}\n" "}\n" "__constant float c_border[BORDER_SIZE + 1] = { 0.14f, 0.14f, 0.4472f, 0.4472f, 0.4472f, 1.f };\n" "__kernel void updateMatrices(__global const float * flowx, int xStep,\n" "__global const float * flowy, int yStep,\n" "const int rows, const int cols,\n" "__global const float * R0, int R0Step,\n" "__global const float * R1, int R1Step,\n" "__global float * M, int mStep)\n" "{\n" "const int y = get_global_id(1);\n" "const int x = get_global_id(0);\n" "if (y < rows && y >= 0 && x < cols && x >= 0)\n" "{\n" "float dx = flowx[mad24(y, xStep, x)];\n" "float dy = flowy[mad24(y, yStep, x)];\n" "float fx = x + dx;\n" "float fy = y + dy;\n" "int x1 = convert_int(floor(fx));\n" "int y1 = convert_int(floor(fy));\n" "fx -= x1;\n" "fy -= y1;\n" "float r2, r3, r4, r5, r6;\n" "if (x1 >= 0 && y1 >= 0 && x1 < cols - 1 && y1 < rows - 1)\n" "{\n" "float a00 = (1.f - fx) * (1.f - fy);\n" "float a01 = fx * (1.f - fy);\n" "float a10 = (1.f - fx) * fy;\n" "float a11 = fx * fy;\n" "r2 = a00 * R1[mad24(y1, R1Step, x1)] +\n" "a01 * R1[mad24(y1, R1Step, x1 + 1)] +\n" "a10 * R1[mad24(y1 + 1, R1Step, x1)] +\n" "a11 * R1[mad24(y1 + 1, R1Step, x1 + 1)];\n" "r3 = a00 * R1[mad24(rows + y1, R1Step, x1)] +\n" "a01 * R1[mad24(rows + y1, R1Step, x1 + 1)] +\n" "a10 * R1[mad24(rows + y1 + 1, R1Step, x1)] +\n" "a11 * R1[mad24(rows + y1 + 1, R1Step, x1 + 1)];\n" "r4 = a00 * R1[mad24(2*rows + y1, R1Step, x1)] +\n" "a01 * R1[mad24(2*rows + y1, R1Step, x1 + 1)] +\n" "a10 * R1[mad24(2*rows + y1 + 1, R1Step, x1)] +\n" "a11 * R1[mad24(2*rows + y1 + 1, R1Step, x1 + 1)];\n" "r5 = a00 * R1[mad24(3*rows + y1, R1Step, x1)] +\n" "a01 * R1[mad24(3*rows + y1, R1Step, x1 + 1)] +\n" "a10 * R1[mad24(3*rows + y1 + 1, R1Step, x1)] +\n" "a11 * R1[mad24(3*rows + y1 + 1, R1Step, x1 + 1)];\n" "r6 = a00 * R1[mad24(4*rows + y1, R1Step, x1)] +\n" "a01 * R1[mad24(4*rows + y1, R1Step, x1 + 1)] +\n" "a10 * R1[mad24(4*rows + y1 + 1, R1Step, x1)] +\n" "a11 * R1[mad24(4*rows + y1 + 1, R1Step, x1 + 1)];\n" "r4 = (R0[mad24(2*rows + y, R0Step, x)] + r4) * 0.5f;\n" "r5 = (R0[mad24(3*rows + y, R0Step, x)] + r5) * 0.5f;\n" "r6 = (R0[mad24(4*rows + y, R0Step, x)] + r6) * 0.25f;\n" "}\n" "else\n" "{\n" "r2 = r3 = 0.f;\n" "r4 = R0[mad24(2*rows + y, R0Step, x)];\n" "r5 = R0[mad24(3*rows + y, R0Step, x)];\n" "r6 = R0[mad24(4*rows + y, R0Step, x)] * 0.5f;\n" "}\n" "r2 = (R0[mad24(y, R0Step, x)] - r2) * 0.5f;\n" "r3 = (R0[mad24(rows + y, R0Step, x)] - r3) * 0.5f;\n" "r2 += r4*dy + r6*dx;\n" "r3 += r6*dy + r5*dx;\n" "float scale =\n" "c_border[min(x, BORDER_SIZE)] *\n" "c_border[min(y, BORDER_SIZE)] *\n" "c_border[min(cols - x - 1, BORDER_SIZE)] *\n" "c_border[min(rows - y - 1, BORDER_SIZE)];\n" "r2 *= scale;\n" "r3 *= scale;\n" "r4 *= scale;\n" "r5 *= scale;\n" "r6 *= scale;\n" "M[mad24(y, mStep, x)] = r4*r4 + r6*r6;\n" "M[mad24(rows + y, mStep, x)] = (r4 + r5)*r6;\n" "M[mad24(2*rows + y, mStep, x)] = r5*r5 + r6*r6;\n" "M[mad24(3*rows + y, mStep, x)] = r4*r2 + r6*r3;\n" "M[mad24(4*rows + y, mStep, x)] = r6*r2 + r5*r3;\n" "}\n" "}\n" "__kernel void boxFilter5(__global const float * src, int srcStep,\n" "__global float * dst, int dstStep,\n" "const int rows, const int cols,\n" "const int ksizeHalf,\n" "__local float * smem)\n" "{\n" "const int y = get_global_id(1);\n" "const int x = get_global_id(0);\n" "const float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));\n" "const int smw = bdx + 2*ksizeHalf;\n" "__local float *row = smem + 5 * ty * smw;\n" "if (y < rows)\n" "{\n" "for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)\n" "{\n" "int xExt = (int)(bx * bdx) + i - ksizeHalf;\n" "xExt = min(max(xExt, 0), cols - 1);\n" "#pragma unroll\n" "for (int k = 0; k < 5; ++k)\n" "row[k*smw + i] = src[mad24(k*rows + y, srcStep, xExt)];\n" "for (int j = 1; j <= ksizeHalf; ++j)\n" "#pragma unroll\n" "for (int k = 0; k < 5; ++k)\n" "row[k*smw + i] +=\n" "src[mad24(k*rows + max(y - j, 0), srcStep, xExt)] +\n" "src[mad24(k*rows + min(y + j, rows - 1), srcStep, xExt)];\n" "}\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if (y < rows && y >= 0 && x < cols && x >= 0)\n" "{\n" "row += tx + ksizeHalf;\n" "float res[5];\n" "#pragma unroll\n" "for (int k = 0; k < 5; ++k)\n" "res[k] = row[k*smw];\n" "for (int i = 1; i <= ksizeHalf; ++i)\n" "#pragma unroll\n" "for (int k = 0; k < 5; ++k)\n" "res[k] += row[k*smw - i] + row[k*smw + i];\n" "#pragma unroll\n" "for (int k = 0; k < 5; ++k)\n" "dst[mad24(k*rows + y, dstStep, x)] = res[k] * boxAreaInv;\n" "}\n" "}\n" "__kernel void updateFlow(__global const float * M, int mStep,\n" "__global float * flowx, int xStep,\n" "__global float * flowy, int yStep,\n" "const int rows, const int cols)\n" "{\n" "const int y = get_global_id(1);\n" "const int x = get_global_id(0);\n" "if (y < rows && y >= 0 && x < cols && x >= 0)\n" "{\n" "float g11 = M[mad24(y, mStep, x)];\n" "float g12 = M[mad24(rows + y, mStep, x)];\n" "float g22 = M[mad24(2*rows + y, mStep, x)];\n" "float h1 = M[mad24(3*rows + y, mStep, x)];\n" "float h2 = M[mad24(4*rows + y, mStep, x)];\n" "float detInv = 1.f / (g11*g22 - g12*g12 + 1e-3f);\n" "flowx[mad24(y, xStep, x)] = (g11*h2 - g12*h1) * detInv;\n" "flowy[mad24(y, yStep, x)] = (g22*h1 - g12*h2) * detInv;\n" "}\n" "}\n" , "529300e6242f574f83d11a089cc120c0", NULL}; struct cv::ocl::internal::ProgramEntry pyrlk_oclsrc={moduleName, "pyrlk", "#define GRIDSIZE 3\n" "#define LSx 8\n" "#define LSy 8\n" "#define LM_W (LSx*GRIDSIZE+2)\n" "#define LM_H (LSy*GRIDSIZE+2)\n" "#define BUFFER (LSx*LSy)\n" "#define BUFFER2 BUFFER>>1\n" "#ifdef CPU\n" "inline void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid)\n" "{\n" "smem1[tid] = val1;\n" "smem2[tid] = val2;\n" "smem3[tid] = val3;\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "for(int i = BUFFER2; i > 0; i >>= 1)\n" "{\n" "if(tid < i)\n" "{\n" "smem1[tid] += smem1[tid + i];\n" "smem2[tid] += smem2[tid + i];\n" "smem3[tid] += smem3[tid + i];\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "}\n" "}\n" "inline void reduce2(float val1, float val2, __local float* smem1, __local float* smem2, int tid)\n" "{\n" "smem1[tid] = val1;\n" "smem2[tid] = val2;\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "for(int i = BUFFER2; i > 0; i >>= 1)\n" "{\n" "if(tid < i)\n" "{\n" "smem1[tid] += smem1[tid + i];\n" "smem2[tid] += smem2[tid + i];\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "}\n" "}\n" "inline void reduce1(float val1, __local float* smem1, int tid)\n" "{\n" "smem1[tid] = val1;\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "for(int i = BUFFER2; i > 0; i >>= 1)\n" "{\n" "if(tid < i)\n" "{\n" "smem1[tid] += smem1[tid + i];\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "}\n" "}\n" "#else\n" "inline void reduce3(float val1, float val2, float val3,\n" "__local float* smem1, __local float* smem2, __local float* smem3, int tid)\n" "{\n" "smem1[tid] = val1;\n" "smem2[tid] = val2;\n" "smem3[tid] = val3;\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if (tid < 32)\n" "{\n" "smem1[tid] += smem1[tid + 32];\n" "smem2[tid] += smem2[tid + 32];\n" "smem3[tid] += smem3[tid + 32];\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if (tid < 16)\n" "{\n" "smem1[tid] += smem1[tid + 16];\n" "smem2[tid] += smem2[tid + 16];\n" "smem3[tid] += smem3[tid + 16];\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if (tid < 8)\n" "{\n" "smem1[tid] += smem1[tid + 8];\n" "smem2[tid] += smem2[tid + 8];\n" "smem3[tid] += smem3[tid + 8];\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if (tid < 4)\n" "{\n" "smem1[tid] += smem1[tid + 4];\n" "smem2[tid] += smem2[tid + 4];\n" "smem3[tid] += smem3[tid + 4];\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if (tid == 0)\n" "{\n" "smem1[0] = (smem1[0] + smem1[1]) + (smem1[2] + smem1[3]);\n" "smem2[0] = (smem2[0] + smem2[1]) + (smem2[2] + smem2[3]);\n" "smem3[0] = (smem3[0] + smem3[1]) + (smem3[2] + smem3[3]);\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "}\n" "inline void reduce2(float val1, float val2, __local float* smem1, __local float* smem2, int tid)\n" "{\n" "smem1[tid] = val1;\n" "smem2[tid] = val2;\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if (tid < 32)\n" "{\n" "smem1[tid] += smem1[tid + 32];\n" "smem2[tid] += smem2[tid + 32];\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if (tid < 16)\n" "{\n" "smem1[tid] += smem1[tid + 16];\n" "smem2[tid] += smem2[tid + 16];\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if (tid < 8)\n" "{\n" "smem1[tid] += smem1[tid + 8];\n" "smem2[tid] += smem2[tid + 8];\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if (tid < 4)\n" "{\n" "smem1[tid] += smem1[tid + 4];\n" "smem2[tid] += smem2[tid + 4];\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if (tid == 0)\n" "{\n" "smem1[0] = (smem1[0] + smem1[1]) + (smem1[2] + smem1[3]);\n" "smem2[0] = (smem2[0] + smem2[1]) + (smem2[2] + smem2[3]);\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "}\n" "inline void reduce1(float val1, __local float* smem1, int tid)\n" "{\n" "smem1[tid] = val1;\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if (tid < 32)\n" "{\n" "smem1[tid] += smem1[tid + 32];\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if (tid < 16)\n" "{\n" "smem1[tid] += smem1[tid + 16];\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if (tid < 8)\n" "{\n" "smem1[tid] += smem1[tid + 8];\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if (tid < 4)\n" "{\n" "smem1[tid] += smem1[tid + 4];\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if (tid == 0)\n" "{\n" "smem1[0] = (smem1[0] + smem1[1]) + (smem1[2] + smem1[3]);\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "}\n" "#endif\n" "#define SCALE (1.0f / (1 << 20))\n" "#define THRESHOLD 0.01f\n" "__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;\n" "#define VAL(_y,_x,_yy,_xx) (IPatchLocal[mad24(((_y) + (_yy)), LM_W, ((_x) + (_xx)))])\n" "inline void SetPatch(local float* IPatchLocal, int TileY, int TileX,\n" "float* Pch, float* Dx, float* Dy,\n" "float* A11, float* A12, float* A22, float w)\n" "{\n" "int xid=get_local_id(0);\n" "int yid=get_local_id(1);\n" "int xBase = mad24(TileX, LSx, (xid + 1));\n" "int yBase = mad24(TileY, LSy, (yid + 1));\n" "*Pch = VAL(yBase,xBase,0,0);\n" "*Dx = mad((VAL(yBase,xBase,-1,1) + VAL(yBase,xBase,+1,1) - VAL(yBase,xBase,-1,-1) - VAL(yBase,xBase,+1,-1)), 3.0f, (VAL(yBase,xBase,0,1) - VAL(yBase,xBase,0,-1)) * 10.0f) * w;\n" "*Dy = mad((VAL(yBase,xBase,1,-1) + VAL(yBase,xBase,1,+1) - VAL(yBase,xBase,-1,-1) - VAL(yBase,xBase,-1,+1)), 3.0f, (VAL(yBase,xBase,1,0) - VAL(yBase,xBase,-1,0)) * 10.0f) * w;\n" "*A11 = mad(*Dx, *Dx, *A11);\n" "*A12 = mad(*Dx, *Dy, *A12);\n" "*A22 = mad(*Dy, *Dy, *A22);\n" "}\n" "#undef VAL\n" "inline void GetPatch(image2d_t J, float x, float y,\n" "float* Pch, float* Dx, float* Dy,\n" "float* b1, float* b2)\n" "{\n" "float diff = read_imagef(J, sampler, (float2)(x,y)).x-*Pch;\n" "*b1 = mad(diff, *Dx, *b1);\n" "*b2 = mad(diff, *Dy, *b2);\n" "}\n" "inline void GetError(image2d_t J, const float x, const float y, const float* Pch, float* errval, float w)\n" "{\n" "float diff = ((((read_imagef(J, sampler, (float2)(x,y)).x * 16384) + 256) / 512) - (((*Pch * 16384) + 256) /512)) * w;\n" "*errval += fabs(diff);\n" "}\n" "#define READI(_y,_x) IPatchLocal[mad24(mad24((_y), LSy, yid), LM_W, mad24((_x), LSx, xid))] = read_imagef(I, sampler, (float2)(mad((float)(_x), (float)LSx, Point.x + xid - 0.5f), mad((float)(_y), (float)LSy, Point.y + yid - 0.5f))).x;\n" "void ReadPatchIToLocalMem(image2d_t I, float2 Point, local float* IPatchLocal)\n" "{\n" "int xid=get_local_id(0);\n" "int yid=get_local_id(1);\n" "READI(0,0);READI(0,1);READI(0,2);\n" "READI(1,0);READI(1,1);READI(1,2);\n" "READI(2,0);READI(2,1);READI(2,2);\n" "if(xid<2)\n" "{\n" "READI(0,3);\n" "READI(1,3);\n" "READI(2,3);\n" "}\n" "if(yid<2)\n" "{\n" "READI(3,0);READI(3,1);READI(3,2);\n" "}\n" "if(yid<2 && xid<2)\n" "{\n" "READI(3,3);\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "}\n" "#undef READI\n" "__attribute__((reqd_work_group_size(LSx, LSy, 1)))\n" "__kernel void lkSparse(image2d_t I, image2d_t J,\n" "__global const float2* prevPts, __global float2* nextPts, __global uchar* status, __global float* err,\n" "const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr)\n" "{\n" "__local float smem1[BUFFER];\n" "__local float smem2[BUFFER];\n" "__local float smem3[BUFFER];\n" "int xid=get_local_id(0);\n" "int yid=get_local_id(1);\n" "int gid=get_group_id(0);\n" "int xsize=get_local_size(0);\n" "int ysize=get_local_size(1);\n" "int k;\n" "#ifdef CPU\n" "float wx0 = 1.0f;\n" "float wy0 = 1.0f;\n" "int xBase = mad24(xsize, 2, xid);\n" "int yBase = mad24(ysize, 2, yid);\n" "float wx1 = (xBase < c_winSize_x) ? 1 : 0;\n" "float wy1 = (yBase < c_winSize_y) ? 1 : 0;\n" "#else\n" "#if WSX == 1\n" "float wx0 = 1.0f;\n" "int xBase = mad24(xsize, 2, xid);\n" "float wx1 = (xBase < c_winSize_x) ? 1 : 0;\n" "#else\n" "int xBase = mad24(xsize, 1, xid);\n" "float wx0 = (xBase < c_winSize_x) ? 1 : 0;\n" "float wx1 = 0.0f;\n" "#endif\n" "#if WSY == 1\n" "float wy0 = 1.0f;\n" "int yBase = mad24(ysize, 2, yid);\n" "float wy1 = (yBase < c_winSize_y) ? 1 : 0;\n" "#else\n" "int yBase = mad24(ysize, 1, yid);\n" "float wy0 = (yBase < c_winSize_y) ? 1 : 0;\n" "float wy1 = 0.0f;\n" "#endif\n" "#endif\n" "float2 c_halfWin = (float2)((c_winSize_x - 1)>>1, (c_winSize_y - 1)>>1);\n" "const int tid = mad24(yid, xsize, xid);\n" "float2 prevPt = prevPts[gid] / (float2)(1 << level);\n" "if (prevPt.x < 0 || prevPt.x >= cols || prevPt.y < 0 || prevPt.y >= rows)\n" "{\n" "if (tid == 0 && level == 0)\n" "{\n" "status[gid] = 0;\n" "}\n" "return;\n" "}\n" "prevPt -= c_halfWin;\n" "float A11 = 0;\n" "float A12 = 0;\n" "float A22 = 0;\n" "float I_patch[GRIDSIZE][GRIDSIZE];\n" "float dIdx_patch[GRIDSIZE][GRIDSIZE];\n" "float dIdy_patch[GRIDSIZE][GRIDSIZE];\n" "local float IPatchLocal[LM_W*LM_H];\n" "ReadPatchIToLocalMem(I,prevPt,IPatchLocal);\n" "{\n" "SetPatch(IPatchLocal, 0, 0,\n" "&I_patch[0][0], &dIdx_patch[0][0], &dIdy_patch[0][0],\n" "&A11, &A12, &A22,1);\n" "SetPatch(IPatchLocal, 0, 1,\n" "&I_patch[0][1], &dIdx_patch[0][1], &dIdy_patch[0][1],\n" "&A11, &A12, &A22,wx0);\n" "SetPatch(IPatchLocal, 0, 2,\n" "&I_patch[0][2], &dIdx_patch[0][2], &dIdy_patch[0][2],\n" "&A11, &A12, &A22,wx1);\n" "}\n" "{\n" "SetPatch(IPatchLocal, 1, 0,\n" "&I_patch[1][0], &dIdx_patch[1][0], &dIdy_patch[1][0],\n" "&A11, &A12, &A22,wy0);\n" "SetPatch(IPatchLocal, 1,1,\n" "&I_patch[1][1], &dIdx_patch[1][1], &dIdy_patch[1][1],\n" "&A11, &A12, &A22,wx0*wy0);\n" "SetPatch(IPatchLocal, 1,2,\n" "&I_patch[1][2], &dIdx_patch[1][2], &dIdy_patch[1][2],\n" "&A11, &A12, &A22,wx1*wy0);\n" "}\n" "{\n" "SetPatch(IPatchLocal, 2,0,\n" "&I_patch[2][0], &dIdx_patch[2][0], &dIdy_patch[2][0],\n" "&A11, &A12, &A22,wy1);\n" "SetPatch(IPatchLocal, 2,1,\n" "&I_patch[2][1], &dIdx_patch[2][1], &dIdy_patch[2][1],\n" "&A11, &A12, &A22,wx0*wy1);\n" "SetPatch(IPatchLocal, 2,2,\n" "&I_patch[2][2], &dIdx_patch[2][2], &dIdy_patch[2][2],\n" "&A11, &A12, &A22,wx1*wy1);\n" "}\n" "reduce3(A11, A12, A22, smem1, smem2, smem3, tid);\n" "A11 = smem1[0];\n" "A12 = smem2[0];\n" "A22 = smem3[0];\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "float D = mad(A11, A22, - A12 * A12);\n" "if (D < 1.192092896e-07f)\n" "{\n" "if (tid == 0 && level == 0)\n" "status[gid] = 0;\n" "return;\n" "}\n" "A11 /= D;\n" "A12 /= D;\n" "A22 /= D;\n" "prevPt = mad(nextPts[gid], 2.0f, - c_halfWin);\n" "float2 offset0 = (float2)(xid + 0.5f, yid + 0.5f);\n" "float2 offset1 = (float2)(xsize, ysize);\n" "float2 loc0 = prevPt + offset0;\n" "float2 loc1 = loc0 + offset1;\n" "float2 loc2 = loc1 + offset1;\n" "for (k = 0; k < c_iters; ++k)\n" "{\n" "if (prevPt.x < -c_halfWin.x || prevPt.x >= cols || prevPt.y < -c_halfWin.y || prevPt.y >= rows)\n" "{\n" "if (tid == 0 && level == 0)\n" "status[gid] = 0;\n" "break;\n" "}\n" "float b1 = 0;\n" "float b2 = 0;\n" "{\n" "GetPatch(J, loc0.x, loc0.y,\n" "&I_patch[0][0], &dIdx_patch[0][0], &dIdy_patch[0][0],\n" "&b1, &b2);\n" "GetPatch(J, loc1.x, loc0.y,\n" "&I_patch[0][1], &dIdx_patch[0][1], &dIdy_patch[0][1],\n" "&b1, &b2);\n" "GetPatch(J, loc2.x, loc0.y,\n" "&I_patch[0][2], &dIdx_patch[0][2], &dIdy_patch[0][2],\n" "&b1, &b2);\n" "}\n" "{\n" "GetPatch(J, loc0.x, loc1.y,\n" "&I_patch[1][0], &dIdx_patch[1][0], &dIdy_patch[1][0],\n" "&b1, &b2);\n" "GetPatch(J, loc1.x, loc1.y,\n" "&I_patch[1][1], &dIdx_patch[1][1], &dIdy_patch[1][1],\n" "&b1, &b2);\n" "GetPatch(J, loc2.x, loc1.y,\n" "&I_patch[1][2], &dIdx_patch[1][2], &dIdy_patch[1][2],\n" "&b1, &b2);\n" "}\n" "{\n" "GetPatch(J, loc0.x, loc2.y,\n" "&I_patch[2][0], &dIdx_patch[2][0], &dIdy_patch[2][0],\n" "&b1, &b2);\n" "GetPatch(J, loc1.x, loc2.y,\n" "&I_patch[2][1], &dIdx_patch[2][1], &dIdy_patch[2][1],\n" "&b1, &b2);\n" "GetPatch(J, loc2.x, loc2.y,\n" "&I_patch[2][2], &dIdx_patch[2][2], &dIdy_patch[2][2],\n" "&b1, &b2);\n" "}\n" "reduce2(b1, b2, smem1, smem2, tid);\n" "b1 = smem1[0];\n" "b2 = smem2[0];\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "float2 delta;\n" "delta.x = mad(A12, b2, - A22 * b1) * 32.0f;\n" "delta.y = mad(A12, b1, - A11 * b2) * 32.0f;\n" "prevPt += delta;\n" "loc0 += delta;\n" "loc1 += delta;\n" "loc2 += delta;\n" "if (fabs(delta.x) < THRESHOLD && fabs(delta.y) < THRESHOLD)\n" "break;\n" "}\n" "D = 0.0f;\n" "if (calcErr)\n" "{\n" "{\n" "GetError(J, loc0.x, loc0.y, &I_patch[0][0], &D, 1);\n" "GetError(J, loc1.x, loc0.y, &I_patch[0][1], &D, wx0);\n" "}\n" "{\n" "GetError(J, loc0.x, loc1.y, &I_patch[1][0], &D, wy0);\n" "GetError(J, loc1.x, loc1.y, &I_patch[1][1], &D, wx0*wy0);\n" "}\n" "if(xBase < c_winSize_x)\n" "{\n" "GetError(J, loc2.x, loc0.y, &I_patch[0][2], &D, wx1);\n" "GetError(J, loc2.x, loc1.y, &I_patch[1][2], &D, wx1*wy0);\n" "}\n" "if(yBase < c_winSize_y)\n" "{\n" "GetError(J, loc0.x, loc2.y, &I_patch[2][0], &D, wy1);\n" "GetError(J, loc1.x, loc2.y, &I_patch[2][1], &D, wx0*wy1);\n" "if(xBase < c_winSize_x)\n" "GetError(J, loc2.x, loc2.y, &I_patch[2][2], &D, wx1*wy1);\n" "}\n" "reduce1(D, smem1, tid);\n" "}\n" "if (tid == 0)\n" "{\n" "prevPt += c_halfWin;\n" "nextPts[gid] = prevPt;\n" "if (calcErr)\n" "err[gid] = smem1[0] / (float)(32 * c_winSize_x * c_winSize_y);\n" "}\n" "}\n" , "fa578a4fdcd31c3e32eedab3e323ac05", NULL}; }}} #endif