// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

//#define CV_USE_SUBGROUPS

#define EPS 0.001f
#define INF 1E+10F

//#define DIS_BORDER_SIZE xxx
//#define DIS_PATCH_SIZE xxx
//#define DIS_PATCH_STRIDE xxx

#define DIS_PATCH_SIZE_HALF (DIS_PATCH_SIZE / 2)

#ifndef DIS_BORDER_SIZE

__kernel void dis_precomputeStructureTensor_hor(__global const short *I0x,
                                                __global const short *I0y,
                                                int w, int h, int ws,
                                                __global float *I0xx_aux_ptr,
                                                __global float *I0yy_aux_ptr,
                                                __global float *I0xy_aux_ptr,
                                                __global float *I0x_aux_ptr,
                                                __global float *I0y_aux_ptr)
{

    int i = get_global_id(0);

    if (i >= h) return;

    const __global short *x_row = I0x + i * w;
    const __global short *y_row = I0y + i * w;

    float sum_xx = 0.0f, sum_yy = 0.0f, sum_xy = 0.0f, sum_x = 0.0f, sum_y = 0.0f;
    float8 x_vec = convert_float8(vload8(0, x_row));
    float8 y_vec = convert_float8(vload8(0, y_row));
    sum_xx = dot(x_vec.lo, x_vec.lo) + dot(x_vec.hi, x_vec.hi);
    sum_yy = dot(y_vec.lo, y_vec.lo) + dot(y_vec.hi, y_vec.hi);
    sum_xy = dot(x_vec.lo, y_vec.lo) + dot(x_vec.hi, y_vec.hi);
    sum_x = dot(x_vec.lo, 1.0f) + dot(x_vec.hi, 1.0f);
    sum_y = dot(y_vec.lo, 1.0f) + dot(y_vec.hi, 1.0f);

    I0xx_aux_ptr[i * ws] = sum_xx;
    I0yy_aux_ptr[i * ws] = sum_yy;
    I0xy_aux_ptr[i * ws] = sum_xy;
    I0x_aux_ptr[i * ws] = sum_x;
    I0y_aux_ptr[i * ws] = sum_y;

    int js = 1;
    for (int j = DIS_PATCH_SIZE; j < w; j++)
    {
        short x_val1 = x_row[j];
        short x_val2 = x_row[j - DIS_PATCH_SIZE];
        short y_val1 = y_row[j];
        short y_val2 = y_row[j - DIS_PATCH_SIZE];
        sum_xx += (x_val1 * x_val1 - x_val2 * x_val2);
        sum_yy += (y_val1 * y_val1 - y_val2 * y_val2);
        sum_xy += (x_val1 * y_val1 - x_val2 * y_val2);
        sum_x += (x_val1 - x_val2);
        sum_y += (y_val1 - y_val2);
        if ((j - DIS_PATCH_SIZE + 1) % DIS_PATCH_STRIDE == 0)
        {
            int index = i * ws + js;
            I0xx_aux_ptr[index] = sum_xx;
            I0yy_aux_ptr[index] = sum_yy;
            I0xy_aux_ptr[index] = sum_xy;
            I0x_aux_ptr[index] = sum_x;
            I0y_aux_ptr[index] = sum_y;
            js++;
        }
    }
}

__kernel void dis_precomputeStructureTensor_ver(__global const float *I0xx_aux_ptr,
                                                __global const float *I0yy_aux_ptr,
                                                __global const float *I0xy_aux_ptr,
                                                __global const float *I0x_aux_ptr,
                                                __global const float *I0y_aux_ptr,
                                                int w, int h, int ws,
                                                __global float *I0xx_ptr,
                                                __global float *I0yy_ptr,
                                                __global float *I0xy_ptr,
                                                __global float *I0x_ptr,
                                                __global float *I0y_ptr)
{
    int j = get_global_id(0);

    if (j >= ws) return;

    float sum_xx, sum_yy, sum_xy, sum_x, sum_y;
    sum_xx = sum_yy = sum_xy = sum_x = sum_y = 0.0f;

    for (int i = 0; i < DIS_PATCH_SIZE; i++)
    {
        sum_xx += I0xx_aux_ptr[i * ws + j];
        sum_yy += I0yy_aux_ptr[i * ws + j];
        sum_xy += I0xy_aux_ptr[i * ws + j];
        sum_x  += I0x_aux_ptr[i * ws + j];
        sum_y  += I0y_aux_ptr[i * ws + j];
    }
    I0xx_ptr[j] = sum_xx;
    I0yy_ptr[j] = sum_yy;
    I0xy_ptr[j] = sum_xy;
    I0x_ptr[j] = sum_x;
    I0y_ptr[j] = sum_y;

    int is = 1;
    for (int i = DIS_PATCH_SIZE; i < h; i++)
    {
        sum_xx += (I0xx_aux_ptr[i * ws + j] - I0xx_aux_ptr[(i - DIS_PATCH_SIZE) * ws + j]);
        sum_yy += (I0yy_aux_ptr[i * ws + j] - I0yy_aux_ptr[(i - DIS_PATCH_SIZE) * ws + j]);
        sum_xy += (I0xy_aux_ptr[i * ws + j] - I0xy_aux_ptr[(i - DIS_PATCH_SIZE) * ws + j]);
        sum_x  += (I0x_aux_ptr[i * ws + j] - I0x_aux_ptr[(i - DIS_PATCH_SIZE) * ws + j]);
        sum_y  += (I0y_aux_ptr[i * ws + j] - I0y_aux_ptr[(i - DIS_PATCH_SIZE) * ws + j]);

        if ((i - DIS_PATCH_SIZE + 1) % DIS_PATCH_STRIDE == 0)
        {
            I0xx_ptr[is * ws + j] = sum_xx;
            I0yy_ptr[is * ws + j] = sum_yy;
            I0xy_ptr[is * ws + j] = sum_xy;
            I0x_ptr[is * ws + j] = sum_x;
            I0y_ptr[is * ws + j] = sum_y;
            is++;
        }
    }
}

__kernel void dis_densification(__global const float2 *S_ptr,
                                __global const uchar *i0, __global const uchar *i1,
                                int w, int h, int ws,
                                __global float2 *U_ptr)
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    int i, j;

    if (x >= w || y >= h) return;

    int start_is, end_is;
    int start_js, end_js;

    end_is = min(y / DIS_PATCH_STRIDE, (h - DIS_PATCH_SIZE) / DIS_PATCH_STRIDE);
    start_is = max(0, y - DIS_PATCH_SIZE + DIS_PATCH_STRIDE) / DIS_PATCH_STRIDE;
    start_is = min(start_is, end_is);

    end_js = min(x / DIS_PATCH_STRIDE, (w - DIS_PATCH_SIZE) / DIS_PATCH_STRIDE);
    start_js = max(0, x - DIS_PATCH_SIZE + DIS_PATCH_STRIDE) / DIS_PATCH_STRIDE;
    start_js = min(start_js, end_js);

    float sum_coef = 0.0f;
    float2 sum_U = (float2)(0.0f, 0.0f);

    int i_l, i_u;
    int j_l, j_u;
    float i_m, j_m, diff;

    i = y;
    j = x;

    /* Iterate through all the patches that overlap the current location (i,j) */
    for (int is = start_is; is <= end_is; is++)
        for (int js = start_js; js <= end_js; js++)
        {
            float2 s_val = S_ptr[is * ws + js];
            uchar2 i1_vec1, i1_vec2;

            j_m = min(max(j + s_val.x, 0.0f), w - 1.0f - EPS);
            i_m = min(max(i + s_val.y, 0.0f), h - 1.0f - EPS);
            j_l = (int)j_m;
            j_u = j_l + 1;
            i_l = (int)i_m;
            i_u = i_l + 1;
            i1_vec1 = vload2(0, i1 + i_u * w + j_l);
            i1_vec2 = vload2(0, i1 + i_l * w + j_l);
            diff = (j_m - j_l) * (i_m - i_l) * i1_vec1.y +
                   (j_u - j_m) * (i_m - i_l) * i1_vec1.x +
                   (j_m - j_l) * (i_u - i_m) * i1_vec2.y +
                   (j_u - j_m) * (i_u - i_m) * i1_vec2.x - i0[i * w + j];
            float coef = 1.0f / max(1.0f, fabs(diff));
            sum_U += coef * s_val;
            sum_coef += coef;
        }

    float inv_sum_coef = 1.0 / sum_coef;
    U_ptr[i * w + j] = sum_U * inv_sum_coef;
}

#else // DIS_BORDER_SIZE

#define INIT_BILINEAR_WEIGHTS(Ux, Uy) \
    i_I1 = clamp(i + Uy + DIS_BORDER_SIZE, i_lower_limit, i_upper_limit); \
    j_I1 = clamp(j + Ux + DIS_BORDER_SIZE, j_lower_limit, j_upper_limit); \
    { \
        float di = i_I1 - floor(i_I1); \
        float dj = j_I1 - floor(j_I1); \
        w11 = di       * dj; \
        w10 = di       * (1 - dj); \
        w01 = (1 - di) * dj; \
        w00 = (1 - di) * (1 - dj); \
    }

float computeSSDMeanNorm(const __global uchar *I0_ptr, const __global uchar *I1_ptr,
                         int I0_stride, int I1_stride,
                         float w00, float w01, float w10, float w11, int i
#ifndef CV_USE_SUBGROUPS
                         , __local float2 *smem /*[8]*/
#endif
)
{
    float sum_diff = 0.0f, sum_diff_sq = 0.0f;
    int n = DIS_PATCH_SIZE * DIS_PATCH_SIZE;

    uchar8 I1_vec1, I1_vec2, I0_vec;
    uchar I1_val1, I1_val2;

    I0_vec  = vload8(0, I0_ptr + i * I0_stride);
    I1_vec1 = vload8(0, I1_ptr + i * I1_stride);
    I1_vec2 = vload8(0, I1_ptr + (i + 1) * I1_stride);
    I1_val1 = I1_ptr[i * I1_stride + 8];
    I1_val2 = I1_ptr[(i + 1) * I1_stride + 8];

    float8 vec = w00 * convert_float8(I1_vec1) + w01 * convert_float8((uchar8)(I1_vec1.s123, I1_vec1.s4567, I1_val1)) +
                 w10 * convert_float8(I1_vec2) + w11 * convert_float8((uchar8)(I1_vec2.s123, I1_vec2.s4567, I1_val2)) -
                 convert_float8(I0_vec);

    sum_diff = (dot(vec.lo, 1.0) + dot(vec.hi, 1.0));
    sum_diff_sq = (dot(vec.lo, vec.lo) + dot(vec.hi, vec.hi));

#ifdef CV_USE_SUBGROUPS
    sum_diff = sub_group_reduce_add(sum_diff);
    sum_diff_sq = sub_group_reduce_add(sum_diff_sq);
#else
    barrier(CLK_LOCAL_MEM_FENCE);
    smem[i] = (float2)(sum_diff, sum_diff_sq);
    barrier(CLK_LOCAL_MEM_FENCE);
    if (i < 4)
        smem[i] += smem[i + 4];
    barrier(CLK_LOCAL_MEM_FENCE);
    if (i < 2)
        smem[i] += smem[i + 2];
    barrier(CLK_LOCAL_MEM_FENCE);
    if (i == 0)
        smem[0] += smem[1];
    barrier(CLK_LOCAL_MEM_FENCE);
    float2 reduce_add_result = smem[0];
    sum_diff = reduce_add_result.x;
    sum_diff_sq = reduce_add_result.y;
#endif

    return sum_diff_sq - sum_diff * sum_diff / n;
}

__attribute__((reqd_work_group_size(8, 1, 1)))
__kernel void dis_patch_inverse_search_fwd_1(__global const float2 *U_ptr,
                                             __global const uchar *I0_ptr, __global const uchar *I1_ptr,
                                             int w, int h, int ws, int hs,
                                             __global float2 *S_ptr)
{
    int id = get_global_id(0);
    int is = get_group_id(0);

    int i = is * DIS_PATCH_STRIDE;
    int j = 0;
    int w_ext = w + 2 * DIS_BORDER_SIZE;

    float i_lower_limit = DIS_BORDER_SIZE - DIS_PATCH_SIZE + 1.0f;
    float i_upper_limit = DIS_BORDER_SIZE + h - 1.0f;
    float j_lower_limit = DIS_BORDER_SIZE - DIS_PATCH_SIZE + 1.0f;
    float j_upper_limit = DIS_BORDER_SIZE + w - 1.0f;

    float2 prev_U = U_ptr[(i + DIS_PATCH_SIZE_HALF) * w + j + DIS_PATCH_SIZE_HALF];
    S_ptr[is * ws] = prev_U;
    j += DIS_PATCH_STRIDE;

#ifdef CV_USE_SUBGROUPS
    int sid = get_sub_group_local_id();
#define EXTRA_ARGS_computeSSDMeanNorm sid
#else
    __local float2 smem[8];
    int sid = get_local_id(0);
#define EXTRA_ARGS_computeSSDMeanNorm sid, smem
#endif
    for (int js = 1; js < ws; js++, j += DIS_PATCH_STRIDE)
    {
        float2 U = U_ptr[(i + DIS_PATCH_SIZE_HALF) * w + j + DIS_PATCH_SIZE_HALF];

        float i_I1, j_I1, w00, w01, w10, w11;

        INIT_BILINEAR_WEIGHTS(U.x, U.y);
        float min_SSD = computeSSDMeanNorm(
                I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
                w, w_ext, w00, w01, w10, w11, EXTRA_ARGS_computeSSDMeanNorm);

        INIT_BILINEAR_WEIGHTS(prev_U.x, prev_U.y);
        float cur_SSD = computeSSDMeanNorm(
                I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
                w, w_ext, w00, w01, w10, w11, EXTRA_ARGS_computeSSDMeanNorm);

        prev_U = (cur_SSD < min_SSD) ? prev_U : U;
        S_ptr[is * ws + js] = prev_U;
    }

#undef EXTRA_ARGS_computeSSDMeanNorm
}
#endif // DIS_BORDER_SIZE

float4 processPatchMeanNorm(const __global uchar *I0_ptr, const __global uchar *I1_ptr,
                            const __global short *I0x_ptr, const __global short *I0y_ptr,
                            int I0_stride, int I1_stride, float w00, float w01, float w10,
                            float w11, float x_grad_sum, float y_grad_sum)
{
    const float inv_n = 1.0f / (float)(DIS_PATCH_SIZE * DIS_PATCH_SIZE);

    float sum_diff = 0.0, sum_diff_sq = 0.0;
    float sum_I0x_mul = 0.0, sum_I0y_mul = 0.0;

    uchar8 I1_vec1;
    uchar8 I1_vec2 = vload8(0, I1_ptr);
    uchar I1_val1;
    uchar I1_val2 = I1_ptr[DIS_PATCH_SIZE];

    for (int i = 0; i < 8; i++)
    {
        uchar8 I0_vec = vload8(0, I0_ptr + i * I0_stride);

        I1_vec1 = I1_vec2;
        I1_vec2 = vload8(0, I1_ptr + (i + 1) * I1_stride);
        I1_val1 = I1_val2;
        I1_val2 = I1_ptr[(i + 1) * I1_stride + DIS_PATCH_SIZE];

        float8 vec = w00 * convert_float8(I1_vec1) + w01 * convert_float8((uchar8)(I1_vec1.s123, I1_vec1.s4567, I1_val1)) +
                     w10 * convert_float8(I1_vec2) + w11 * convert_float8((uchar8)(I1_vec2.s123, I1_vec2.s4567, I1_val2)) -
                     convert_float8(I0_vec);

        sum_diff += (dot(vec.lo, 1.0) + dot(vec.hi, 1.0));
        sum_diff_sq += (dot(vec.lo, vec.lo) + dot(vec.hi, vec.hi));

        short8 I0x_vec = vload8(0, I0x_ptr + i * I0_stride);
        short8 I0y_vec = vload8(0, I0y_ptr + i * I0_stride);

        sum_I0x_mul += dot(vec.lo, convert_float4(I0x_vec.lo));
        sum_I0x_mul += dot(vec.hi, convert_float4(I0x_vec.hi));
        sum_I0y_mul += dot(vec.lo, convert_float4(I0y_vec.lo));
        sum_I0y_mul += dot(vec.hi, convert_float4(I0y_vec.hi));
    }

    float dst_dUx = sum_I0x_mul - sum_diff * x_grad_sum * inv_n;
    float dst_dUy = sum_I0y_mul - sum_diff * y_grad_sum * inv_n;
    float SSD = sum_diff_sq - sum_diff * sum_diff * inv_n;

    return (float4)(SSD, dst_dUx, dst_dUy, 0);
}

#ifdef DIS_BORDER_SIZE
__kernel void dis_patch_inverse_search_fwd_2(__global const float2 *U_ptr,
                                             __global const uchar *I0_ptr, __global const uchar *I1_ptr,
                                             __global const short *I0x_ptr, __global const short *I0y_ptr,
                                             __global const float *xx_ptr, __global const float *yy_ptr,
                                             __global const float *xy_ptr,
                                             __global const float *x_ptr, __global const float *y_ptr,
                                             int w, int h, int ws, int hs, int num_inner_iter,
                                             __global float2 *S_ptr)
{
    int js = get_global_id(0);
    int is = get_global_id(1);
    int i = is * DIS_PATCH_STRIDE;
    int j = js * DIS_PATCH_STRIDE;
    const int psz = DIS_PATCH_SIZE;
    int w_ext = w + 2 * DIS_BORDER_SIZE;
    int index = is * ws + js;

    if (js >= ws || is >= hs) return;

    float2 U0 = S_ptr[index];
    float2 cur_U = U0;
    float cur_xx = xx_ptr[index];
    float cur_yy = yy_ptr[index];
    float cur_xy = xy_ptr[index];
    float detH = cur_xx * cur_yy - cur_xy * cur_xy;

    float inv_detH = (fabs(detH) < EPS) ? 1.0 / EPS : 1.0 / detH;
    float invH11 = cur_yy * inv_detH;
    float invH12 = -cur_xy * inv_detH;
    float invH22 = cur_xx * inv_detH;

    float prev_SSD = INF;
    float x_grad_sum = x_ptr[index];
    float y_grad_sum = y_ptr[index];

    const float i_lower_limit = DIS_BORDER_SIZE - DIS_PATCH_SIZE + 1.0f;
    const float i_upper_limit = DIS_BORDER_SIZE + h - 1.0f;
    const float j_lower_limit = DIS_BORDER_SIZE - DIS_PATCH_SIZE + 1.0f;
    const float j_upper_limit = DIS_BORDER_SIZE + w - 1.0f;

    for (int t = 0; t < num_inner_iter; t++)
    {
        float i_I1, j_I1, w00, w01, w10, w11;
        INIT_BILINEAR_WEIGHTS(cur_U.x, cur_U.y);
        float4 res = processPatchMeanNorm(
                I0_ptr  + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
                I0x_ptr + i * w + j, I0y_ptr + i * w + j,
                w, w_ext, w00, w01, w10, w11,
                x_grad_sum, y_grad_sum);

        float SSD = res.x;
        float dUx = res.y;
        float dUy = res.z;
        float dx = invH11 * dUx + invH12 * dUy;
        float dy = invH12 * dUx + invH22 * dUy;

        cur_U -= (float2)(dx, dy);

        if (SSD >= prev_SSD)
            break;
        prev_SSD = SSD;
    }

    float2 vec = cur_U - U0;
    S_ptr[index] = (dot(vec, vec) <= (float)(DIS_PATCH_SIZE * DIS_PATCH_SIZE)) ? cur_U : U0;
}

__attribute__((reqd_work_group_size(8, 1, 1)))
__kernel void dis_patch_inverse_search_bwd_1(__global const uchar *I0_ptr, __global const uchar *I1_ptr,
                                             int w, int h, int ws, int hs,
                                             __global float2 *S_ptr)
{
    int id = get_global_id(0);
    int is = get_group_id(0);

    is = (hs - 1 - is);
    int i = is * DIS_PATCH_STRIDE;
    int j = (ws - 2) * DIS_PATCH_STRIDE;
    const int w_ext = w + 2 * DIS_BORDER_SIZE;

    const float i_lower_limit = DIS_BORDER_SIZE - DIS_PATCH_SIZE + 1.0f;
    const float i_upper_limit = DIS_BORDER_SIZE + h - 1.0f;
    const float j_lower_limit = DIS_BORDER_SIZE - DIS_PATCH_SIZE + 1.0f;
    const float j_upper_limit = DIS_BORDER_SIZE + w - 1.0f;

#ifdef CV_USE_SUBGROUPS
    int sid = get_sub_group_local_id();
#define EXTRA_ARGS_computeSSDMeanNorm sid
#else
    __local float2 smem[8];
    int sid = get_local_id(0);
#define EXTRA_ARGS_computeSSDMeanNorm sid, smem
#endif

    for (int js = (ws - 2); js > -1; js--, j -= DIS_PATCH_STRIDE)
    {
        float2 U0 = S_ptr[is * ws + js];
        float2 U1 = S_ptr[is * ws + js + 1];

        float i_I1, j_I1, w00, w01, w10, w11;

        INIT_BILINEAR_WEIGHTS(U0.x, U0.y);
        float min_SSD = computeSSDMeanNorm(
                I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
                w, w_ext, w00, w01, w10, w11, EXTRA_ARGS_computeSSDMeanNorm);

        INIT_BILINEAR_WEIGHTS(U1.x, U1.y);
        float cur_SSD = computeSSDMeanNorm(
                I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
                w, w_ext, w00, w01, w10, w11, EXTRA_ARGS_computeSSDMeanNorm);

        S_ptr[is * ws + js] = (cur_SSD < min_SSD) ? U1 : U0;
    }

#undef EXTRA_ARGS_computeSSDMeanNorm
}

__kernel void dis_patch_inverse_search_bwd_2(__global const uchar *I0_ptr, __global const uchar *I1_ptr,
                                             __global const short *I0x_ptr, __global const short *I0y_ptr,
                                             __global const float *xx_ptr, __global const float *yy_ptr,
                                             __global const float *xy_ptr,
                                             __global const float *x_ptr, __global const float *y_ptr,
                                             int w, int h, int ws, int hs, int num_inner_iter,
                                             __global float2 *S_ptr)
{
    int js = get_global_id(0);
    int is = get_global_id(1);
    if (js >= ws || is >= hs) return;

    js = (ws - 1 - js);
    is = (hs - 1 - is);

    int j = js * DIS_PATCH_STRIDE;
    int i = is * DIS_PATCH_STRIDE;
    int w_ext = w + 2 * DIS_BORDER_SIZE;
    int index = is * ws + js;

    float2 U0 = S_ptr[index];
    float2 cur_U = U0;
    float cur_xx = xx_ptr[index];
    float cur_yy = yy_ptr[index];
    float cur_xy = xy_ptr[index];
    float detH = cur_xx * cur_yy - cur_xy * cur_xy;

    float inv_detH = (fabs(detH) < EPS) ? 1.0 / EPS : 1.0 / detH;
    float invH11 = cur_yy * inv_detH;
    float invH12 = -cur_xy * inv_detH;
    float invH22 = cur_xx * inv_detH;

    float prev_SSD = INF;
    float x_grad_sum = x_ptr[index];
    float y_grad_sum = y_ptr[index];

    const float i_lower_limit = DIS_BORDER_SIZE - DIS_PATCH_SIZE + 1.0f;
    const float i_upper_limit = DIS_BORDER_SIZE + h - 1.0f;
    const float j_lower_limit = DIS_BORDER_SIZE - DIS_PATCH_SIZE + 1.0f;
    const float j_upper_limit = DIS_BORDER_SIZE + w - 1.0f;

    for (int t = 0; t < num_inner_iter; t++)
    {
        float i_I1, j_I1, w00, w01, w10, w11;
        INIT_BILINEAR_WEIGHTS(cur_U.x, cur_U.y);
        float4 res = processPatchMeanNorm(
                I0_ptr  + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
                I0x_ptr + i * w + j, I0y_ptr + i * w + j,
                w, w_ext, w00, w01, w10, w11,
                x_grad_sum, y_grad_sum);

        float SSD = res.x;
        float dUx = res.y;
        float dUy = res.z;
        float dx = invH11 * dUx + invH12 * dUy;
        float dy = invH12 * dUx + invH22 * dUy;

        cur_U -= (float2)(dx, dy);

        if (SSD >= prev_SSD)
            break;
        prev_SSD = SSD;
    }

    float2 vec = cur_U - U0;
    S_ptr[index] = ((dot(vec, vec)) <= (float)(DIS_PATCH_SIZE * DIS_PATCH_SIZE)) ? cur_U : U0;
}
#endif // DIS_BORDER_SIZE