// This file is auto-generated. Do not edit!
//
// Each ProgramEntry below embeds the text of one OpenCL program as a run of
// adjacent C string literals. Comments written between the literals are
// discarded by the C++ compiler and never become part of the embedded text;
// nothing may be added inside the literals themselves.
#include "opencv2/core.hpp"
#include "cvconfig.h"
#include "opencl_kernels_dnn.hpp"
#ifdef HAVE_OPENCL
namespace cv { namespace ocl { namespace dnn {
// Module name used as the first initializer of every ProgramEntry in this file.
static const char* const moduleName = "dnn";

// ---------------------------------------------------------------------------
// Program "activations": element-wise activation kernels.
//
// Initializer order: module name, program name, program text, a 32-hex-digit
// string, NULL.
// NOTE(review): the hex string is presumably a digest of the program text
// produced by the generator - confirm before relying on it.
//
// The element type `T` and the conversion `convert_T` are used but not defined
// in this text; presumably the host supplies them as build options - TODO
// confirm. CONCAT/TEMPLATE are defined here but not referenced by any kernel
// of this program.
//
// Every kernel takes its element index from get_global_id(0) and does nothing
// for indices at or beyond the element count argument.
// ---------------------------------------------------------------------------
struct cv::ocl::internal::ProgramEntry activations_oclsrc={moduleName, "activations",
"#define CONCAT(A,B) A##_##B\n"
"#define TEMPLATE(name,type) CONCAT(name,type)\n"
"#define KERNEL_ARG_DTYPE float\n"
"#if defined(cl_khr_fp16)\n"
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
"#endif\n"
// ReLUForward: out = in when in > 0, otherwise in * negative_slope.
// When RELU_NO_SLOPE is defined the negative_slope argument is not part of
// the signature and non-positive inputs produce 0.
"__kernel void ReLUForward(const int count, __global const T* in, __global T* out\n"
"#ifndef RELU_NO_SLOPE\n"
", KERNEL_ARG_DTYPE negative_slope\n"
"#endif\n"
") {\n"
"int index = get_global_id(0);\n"
"if(index < count)\n"
"#ifndef RELU_NO_SLOPE\n"
"out[index] = in[index] > 0 ? in[index] : in[index] * negative_slope;\n"
"#else\n"
"out[index] = in[index] > 0 ? in[index] : 0;\n"
"#endif\n"
"}\n"
// ReLU6Forward: clamps each element between convert_T(minValue) and
// convert_T(maxValue).
"__kernel void ReLU6Forward(const int count, __global const T* in, __global T* out,\n"
"const KERNEL_ARG_DTYPE minValue, const KERNEL_ARG_DTYPE maxValue)\n"
"{\n"
"int index = get_global_id(0);\n"
"if(index < count)\n"
"{\n"
"T x = in[index];\n"
"out[index] = clamp(x, convert_T(minValue), convert_T(maxValue));\n"
"}\n"
"}\n"
// PReLUForward: like ReLUForward with a slope, but the slope is read from
// slope_data[c] with c = (index / plane_size) % channels. c is computed before
// the bounds check and only used inside it.
"__kernel void PReLUForward(const int count, const int channels, const int plane_size,\n"
"__global const T* in, __global T* out,\n"
"__global const KERNEL_ARG_DTYPE* slope_data)\n"
"{\n"
"int index = get_global_id(0);\n"
"int c = (index / plane_size) % channels;\n"
"if(index < count)\n"
"out[index] = in[index] > 0 ? in[index] : in[index] * slope_data[c];\n"
"}\n"
// TanHForward: out = tanh(in). Unlike the sibling kernels, `in` is declared
// without const here.
"__kernel void TanHForward(const int count, __global T* in, __global T* out) {\n"
"int index = get_global_id(0);\n"
"if(index < count)\n"
"out[index] = tanh(in[index]);\n"
"}\n"
// SigmoidForward: out = 1 / (1 + exp(-in)).
"__kernel void SigmoidForward(const int count, __global const T* in, __global T* out) {\n"
"int index = get_global_id(0);\n"
"if(index < count)\n"
"out[index] = 1.0f / (1.0f + exp(-in[index]));\n"
"}\n"
// SwishForward: out = in / (1 + exp(-in)).
"__kernel void SwishForward(const int count, __global const T* in, __global T* out) {\n"
"int index = get_global_id(0);\n"
"if(index < count)\n"
"out[index] = in[index] / (1.0f + exp(-in[index]));\n"
"}\n"
// MishForward: out = in * tanh(log(1 + exp(in))).
"__kernel void MishForward(const int count, __global const T* in, __global T* out) {\n"
"int index = get_global_id(0);\n"
"if(index < count)\n"
"out[index] = in[index] * tanh(log(1.0f + exp(in[index])));\n"
"}\n"
// BNLLForward: for x > 0 writes x + log(1 + exp(-x)), otherwise
// log(1 + exp(x)); both branches only exponentiate a non-positive value.
"__kernel void BNLLForward(const int n, __global const T* in, __global T* out) {\n"
"int index = get_global_id(0);\n"
"if (index < n) {\n"
"T x = in[index];\n"
"out[index] = x > 0 ? x + log(1.0f + exp(-x)) : log(1.0f + exp(x));\n"
"}\n"
"}\n"
// AbsValForward: out = fabs(in).
"__kernel void AbsValForward(const int n, __global const T* in, __global T* out) {\n"
"int index = get_global_id(0);\n"
"if (index < n)\n"
"out[index] = fabs(in[index]);\n"
"}\n"
// PowForward: out = pow(shift + scale * in, power).
"__kernel void PowForward(const int n, __global const T* in, __global T* out,\n"
"const KERNEL_ARG_DTYPE power,\n"
"const KERNEL_ARG_DTYPE scale,\n"
"const KERNEL_ARG_DTYPE shift)\n"
"{\n"
"int index = get_global_id(0);\n"
"if (index < n)\n"
"out[index] = pow(shift + scale * in[index], power);\n"
"}\n"
// ELUForward: out = in when in >= 0, otherwise exp(in) - 1.
"__kernel void ELUForward(const int n, __global const T* in, __global T* out)\n"
"{\n"
"int index = get_global_id(0);\n"
"if (index < n)\n"
"{\n"
"T src = in[index];\n"
"out[index] = (src >= 0.f) ? src : exp(src) - 1;\n"
"}\n"
"}\n"
// ExpForward: out = exp(normShift + normScale * in).
"__kernel void ExpForward(const int n, __global const T* in, __global T* out,\n"
"const KERNEL_ARG_DTYPE normScale,\n"
"const KERNEL_ARG_DTYPE normShift)\n"
"{\n"
"int index = get_global_id(0);\n"
"if (index < n)\n"
"{\n"
"out[index] = exp(normShift + normScale * in[index]);\n"
"}\n"
"}\n"
, "69e28bd964980d395339a63e2aabfe86", NULL};

// ---------------------------------------------------------------------------
// Program "batchnorm": dst = src * weight[row % channels] + bias[row % channels].
//
// NUM (8, 4 or 1) selects the load/store width, the float vector type and the
// kernel name (batch_norm8 / batch_norm4 / batch_norm1). For any other value of
// NUM none of load, store, float_type, convert_f or BATCH_NORM is defined.
// NUM, Dtype and convert_T are not defined in this text; presumably they are
// build options supplied by the host - TODO confirm.
// ---------------------------------------------------------------------------
struct cv::ocl::internal::ProgramEntry batchnorm_oclsrc={moduleName, "batchnorm",
"#if defined(cl_khr_fp16)\n"
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
"#endif\n"
"#if NUM == 8\n"
"#define load(src, index) vload8(0, src + index)\n"
"#define store(vec, dst, index) vstore8(vec, 0, dst + index)\n"
"#define float_type float8\n"
"#define convert_f convert_float8\n"
"#define BATCH_NORM batch_norm8\n"
"#elif NUM == 4\n"
"#define load(src, index) vload4(0, src + index)\n"
"#define store(vec, dst, index) vstore4(vec, 0, dst + index)\n"
"#define float_type float4\n"
"#define convert_f convert_float4\n"
"#define BATCH_NORM batch_norm4\n"
"#elif NUM == 1\n"
"#define load(src, index) src[index]\n"
"#define store(vec, dst, index) dst[index] = vec\n"
"#define float_type float\n"
"#define convert_f convert_float\n"
"#define BATCH_NORM batch_norm1\n"
"#endif\n"
// x = get_global_id(0) is the row, y = get_global_id(1) * NUM is the first
// column handled by this work-item; the flat element index is x * cols + y.
// Work-items with x >= rows or y >= cols return without writing.
// NOTE(review): the guard tests only the first column y, while load/store touch
// NUM consecutive elements starting there; presumably the host only selects
// NUM > 1 when cols is a multiple of NUM - confirm against the caller.
"__kernel void BATCH_NORM(__global const Dtype* src,\n"
"const int rows,\n"
"const int cols,\n"
"const int channels,\n"
"__global const float* weight,\n"
"__global const float* bias,\n"
"__global Dtype* dst)\n"
"{\n"
"int x = get_global_id(0);\n"
"int y = get_global_id(1) * NUM;\n"
"int index = x * cols + y;\n"
"if (x >= rows || y >= cols)\n"
"return;\n"
"float w = weight[x % channels];\n"
"float b = bias[x % channels];\n"
"float_type src_vec = convert_f(load(src, index));\n"
"float_type dst_vec = src_vec * w + (float_type)b;\n"
"store(convert_T(dst_vec), dst, index);\n"
"}\n"
, "c84913b518980a1dc7a4f1f41f7f95fc", NULL};

// ---------------------------------------------------------------------------
// Program "col2im": one work-item per data_im element (index < n).
//
// The three buffer pointers are first advanced by their *_offset arguments.
// For element `index` the kernel derives w, h (shifted by PAD_W / PAD_H) and c,
// sums data_col[offset + h_col * coeff_h + w_col * coeff_w] over
// h_col in [h_col_start, h_col_end) and w_col in [w_col_start, w_col_end),
// then stores the sum plus biasvec[c] into data_im[index].
// The `channels` argument is not referenced in the kernel body.
// T, PAD_W, PAD_H, KERNEL_H, KERNEL_W, STRIDE_H and STRIDE_W are not defined in
// this text; presumably they are build options - TODO confirm.
// ---------------------------------------------------------------------------
struct cv::ocl::internal::ProgramEntry col2im_oclsrc={moduleName, "col2im",
"__kernel void col2im(const int n, __global const T* data_col,\n"
"const int data_col_offset,\n"
"const int channels,\n"
"const int height, const int width,\n"
"const int height_col, const int width_col,\n"
"const int coeff_h, const int coeff_w,\n"
"__global const T* biasvec,\n"
"const int bias_offset,\n"
"__global T* data_im,\n"
"const int data_im_offset)\n"
"{\n"
"data_col = data_col + data_col_offset;\n"
"biasvec = biasvec + bias_offset;\n"
"data_im = data_im + data_im_offset;\n"
"int index = get_global_id(0);\n"
"if(index < n)\n"
"{\n"
"T val = 0.f;\n"
"int w = index % width + PAD_W;\n"
"int h = (index / width) % height + PAD_H;\n"
"int c = index / (width * height);\n"
"int h_col_start = (h < KERNEL_H) ? 0 : (h - KERNEL_H) / STRIDE_H + 1;\n"
"int h_col_end = min(h / STRIDE_H + 1, height_col);\n"
"int plane_size_col = height_col * width_col;\n"
"int offset = (c * KERNEL_H * KERNEL_W + h * KERNEL_W + w) * plane_size_col;\n"
"int w_col_start = (w < KERNEL_W) ? 0 : (w - KERNEL_W) / STRIDE_W + 1;\n"
"int w_col_end = min(w / STRIDE_W + 1, width_col);\n"
"for (int h_col = h_col_start; h_col < h_col_end; ++h_col)\n"
"for (int w_col = w_col_start; w_col < w_col_end; ++w_col)\n"
"val += data_col[offset + h_col * coeff_h + w_col * coeff_w];\n"
"data_im[index] = val + biasvec[c];\n"
"}\n"
"}\n"
, "ce817c6699c25771483253b686f98562", NULL};

// ---------------------------------------------------------------------------
// Program "concat": copies in_data into its slot of out_data.
//
// The kernel name is built by TEMPLATE(concat, Dtype), i.e. "concat_" followed
// by whatever Dtype expands to. Dtype is not defined in this text; presumably
// it is a build option - TODO confirm.
// Each work-item starts at get_global_id(0) and advances by get_global_size(0)
// until index reaches nthreads. For every index:
//   total_concat_size = concat_size * bottom_concat_axis
//   concat_num        = index / total_concat_size
//   concat_index      = index % total_concat_size
//   top_index         = concat_index
//                     + (concat_num * top_concat_axis + offset_concat_axis) * concat_size
// and out_data[top_index] receives in_data[index].
// The num_concats argument is not referenced in the kernel body.
// ---------------------------------------------------------------------------
struct cv::ocl::internal::ProgramEntry concat_oclsrc={moduleName, "concat",
"#if defined(cl_khr_fp16)\n"
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
"#endif\n"
"#define CONCAT(A,B) A##_##B\n"
"#define TEMPLATE(name,type) CONCAT(name,type)\n"
"__kernel void TEMPLATE(concat, Dtype)(const int nthreads,\n"
"__global const Dtype* in_data,\n"
"const int num_concats,\n"
"const int concat_size,\n"
"const int top_concat_axis,\n"
"const int bottom_concat_axis,\n"
"const int offset_concat_axis,\n"
"__global Dtype* out_data)\n"
"{\n"
"for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))\n"
"{\n"
"const int total_concat_size = concat_size * bottom_concat_axis;\n"
"const int concat_num = index / total_concat_size;\n"
"const int concat_index = index % total_concat_size;\n"
"const int top_index = concat_index +\n"
"(concat_num * top_concat_axis + offset_concat_axis) * concat_size;\n"
"out_data[top_index] = in_data[index];\n"
"}\n"
"}\n"
, "504946fb5e8e715dcede68425e93486a", NULL};
struct cv::ocl::internal::ProgramEntry conv_layer_spatial_oclsrc={moduleName, "conv_layer_spatial", "#if defined(cl_khr_fp16)\n" "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" "#endif\n" "#define KERNEL_ARG_DTYPE float\n" "#define TYPE_FLOAT 1\n" "#define TYPE_HALF 2\n" "#if defined(FUSED_CONV_RELU)\n" "#define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (negative_slope)))\n" "#define FUSED_ARG KERNEL_ARG_DTYPE negative_slope,\n" "#elif defined(FUSED_CONV_PRELU)\n" "#define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (negative_slope[c])))\n" "#define FUSED_ARG __global const KERNEL_ARG_DTYPE* negative_slope,\n" "#elif defined(FUSED_CONV_POWER)\n" "#define ACTIVATION_RELU_FUNCTION(x, c) pow(x, (Dtype)power)\n" "#define FUSED_ARG KERNEL_ARG_DTYPE power,\n" "#elif defined(FUSED_CONV_TANH)\n" "#define ACTIVATION_RELU_FUNCTION(x, c) tanh(x)\n" "#define FUSED_ARG\n" "#elif defined(FUSED_CONV_RELU6)\n" "#define ACTIVATION_RELU_FUNCTION(x, c) (clamp((Dtype)(x), (Dtype)min_value, (Dtype)max_value))\n" "#define FUSED_ARG KERNEL_ARG_DTYPE min_value, KERNEL_ARG_DTYPE max_value,\n" "#else\n" "#define ACTIVATION_RELU_FUNCTION(x, c) (x)\n" "#define FUSED_ARG\n" "#endif\n" "#ifdef FUSED_CONV_ELTWISE\n" "#define ACTIVATION_FUNCTION(_dst_, _offset_, _data_, _channel_) do { \\\n" "const Dtype _x_ = eltwise_data[(_offset_)] + (_data_); \\\n" "(_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_x_, _channel_); \\\n" "} while(0)\n" "#define ELTWISE_DATA_ARG __global Dtype* eltwise_data,\n" "#define ELTWISE_DATA_ARG_WITH_OFFSET __global Dtype* eltwise_ptr, int eltwise_offset,\n" "#else\n" "#define 
ACTIVATION_FUNCTION(_dst_, _offset_, _data_, _channel_) do { \\\n" "const Dtype _x_ = (_data_); \\\n" "(_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_x_, _channel_); \\\n" "} while(0)\n" "#define ELTWISE_DATA_ARG\n" "#define ELTWISE_DATA_ARG_WITH_OFFSET\n" "#endif\n" "#if APPLY_BIAS\n" "#define BIAS_KERNEL_ARG __global Dtype * biases_base,\n" "#define BIAS_KERNEL_ARG_WITH_OFFSET __global Dtype * biases_base_ptr, int biases_base_offset,\n" "#else\n" "#define BIAS_KERNEL_ARG\n" "#define BIAS_KERNEL_ARG_WITH_OFFSET\n" "#endif\n" "#define __CAT(x, y) x##y\n" "#define CAT(x, y) __CAT(x, y)\n" "#define LOOP0(VAR, STMT)\n" "#define LOOP1(VAR, STMT) (STMT); (VAR)++;\n" "#define LOOP2(VAR, STMT) LOOP1(VAR, STMT); (STMT); (VAR)++;\n" "#define LOOP3(VAR, STMT) LOOP2(VAR, STMT); (STMT); (VAR)++;\n" "#define LOOP4(VAR, STMT) LOOP3(VAR, STMT); (STMT); (VAR)++;\n" "#define LOOP5(VAR, STMT) LOOP4(VAR, STMT); (STMT); (VAR)++;\n" "#define LOOP6(VAR, STMT) LOOP5(VAR, STMT); (STMT); (VAR)++;\n" "#define LOOP7(VAR, STMT) LOOP6(VAR, STMT); (STMT); (VAR)++;\n" "#define LOOP8(VAR, STMT) LOOP7(VAR, STMT); (STMT); (VAR)++;\n" "#define LOOP9(VAR, STMT) LOOP8(VAR, STMT); (STMT); (VAR)++;\n" "#define LOOP10(VAR, STMT) LOOP9(VAR, STMT); (STMT); (VAR)++;\n" "#define LOOP11(VAR, STMT) LOOP10(VAR, STMT); (STMT); (VAR)++;\n" "#define LOOP12(VAR, STMT) LOOP11(VAR, STMT); (STMT); (VAR)++;\n" "#define LOOP13(VAR, STMT) LOOP12(VAR, STMT); (STMT); (VAR)++;\n" "#define LOOP14(VAR, STMT) LOOP13(VAR, STMT); (STMT); (VAR)++;\n" "#define LOOP15(VAR, STMT) LOOP14(VAR, STMT); (STMT); (VAR)++;\n" "#define LOOP16(VAR, STMT) LOOP15(VAR, STMT); (STMT); (VAR)++;\n" "#define LOOP(N, VAR, STMT) CAT(LOOP, N)((VAR), (STMT))\n" "#if defined(convolve_simd) || defined(Conv_Interleaved)\n" "#if TYPE == TYPE_HALF\n" "#define INT_TYPE ushort\n" "#define INT_TYPE2 ushort2\n" "#define INT_TYPE4 ushort4\n" "#define INT_TYPE8 ushort8\n" "#define SUB_GROUP_BLOCK_READ2 intel_sub_group_block_read_us2\n" "#define 
SUB_GROUP_BLOCK_READ4 intel_sub_group_block_read_us4\n" "#define SUB_GROUP_BLOCK_READ8 intel_sub_group_block_read_us8\n" "#define SUB_GROUP_BLOCK_READ intel_sub_group_block_read_us\n" "#else\n" "#define INT_TYPE uint\n" "#define INT_TYPE2 uint2\n" "#define INT_TYPE4 uint4\n" "#define INT_TYPE8 uint8\n" "#define SUB_GROUP_BLOCK_READ2 intel_sub_group_block_read2\n" "#define SUB_GROUP_BLOCK_READ4 intel_sub_group_block_read4\n" "#define SUB_GROUP_BLOCK_READ8 intel_sub_group_block_read8\n" "#define SUB_GROUP_BLOCK_READ intel_sub_group_block_read\n" "#endif\n" "#endif\n" "#ifdef KERNEL_BASIC\n" "__kernel void ConvolveBasic(\n" "ELTWISE_DATA_ARG\n" "FUSED_ARG\n" "__global Dtype* image_data,\n" "int image_offset,\n" "__global Dtype* kernel_data,\n" "int kernel_offset,\n" "__global Dtype* bias,\n" "const int bias_offset,\n" "__global Dtype* convolved_image_base,\n" "const int convolved_image_base_offset,\n" "const int convolved_image_offset,\n" "const ushort input_width,\n" "const ushort input_height,\n" "const ushort output_width,\n" "const ushort output_height,\n" "const ushort pad_w,\n" "const ushort pad_h\n" ")\n" "{\n" "__global Dtype* convolved_image = convolved_image_base + convolved_image_base_offset;\n" "const int out_idx = get_global_id(0);\n" "const int plane_size = output_width * output_height;\n" "const int out_plane_idx = out_idx % plane_size;\n" "const int outputZ = out_idx / plane_size;\n" "const int outputY = out_plane_idx / output_width;\n" "const int outputX = out_plane_idx % output_width;\n" "if (outputZ < OUTPUT_Z)\n" "{\n" "Dtype sum = 0.0f;\n" "const int org_y = outputY * STRIDE_Y - pad_h;\n" "const int org_x = outputX * STRIDE_X - pad_w;\n" "const int currentKernelOffset = kernel_offset + outputZ*KERNEL_HEIGHT*KERNEL_WIDTH*CHANNELS;\n" "const int local_image_offset = org_y * input_width + org_x;\n" "const int imageSize = input_width * input_height;\n" "__global Dtype* image_dataPtr = (image_data + (image_offset + local_image_offset));\n" "__global 
Dtype* kernel_dataPtr = (kernel_data + (currentKernelOffset));\n" "for (int c = 0; c < CHANNELS; c++)\n" "{\n" "for (int y = 0; y < KERNEL_HEIGHT; y++)\n" "{\n" "int y_ = org_y + y * DILATION_Y;\n" "for (int x = 0; x < KERNEL_WIDTH; x++)\n" "{\n" "int x_ = org_x + x * DILATION_X;\n" "if (y_ >= 0 && y_ < input_height && x_ >= 0 && x_ < input_width)\n" "{\n" "sum = mad(image_dataPtr[x * DILATION_X], kernel_dataPtr[x], sum);\n" "}\n" "}\n" "image_dataPtr += input_width * DILATION_Y;\n" "kernel_dataPtr += KERNEL_WIDTH;\n" "}\n" "image_dataPtr += imageSize - input_width*KERNEL_HEIGHT*DILATION_Y;\n" "}\n" "int offset = convolved_image_offset + out_idx;\n" "#if APPLY_BIAS\n" "int biasIndex = bias_offset + outputZ;\n" "ACTIVATION_FUNCTION(convolved_image, offset, sum + bias[biasIndex], biasIndex);\n" "#else\n" "ACTIVATION_FUNCTION(convolved_image, offset, sum, outputZ);\n" "#endif\n" "}\n" "}\n" "#elif defined KERNEL_IDLF\n" "__attribute__((reqd_work_group_size(1, 1, SIMD_SIZE)))\n" "__attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))\n" "__kernel void\n" "convolve_simd(\n" "ELTWISE_DATA_ARG_WITH_OFFSET\n" "FUSED_ARG\n" "__global Dtype* inputs_ptr, const int inputs_offset,\n" "__global Dtype* weights_ptr, const int weights_offset,\n" "BIAS_KERNEL_ARG_WITH_OFFSET\n" "__global Dtype* outputs_base, const int outputs_offset,\n" "const ushort input_width,\n" "const ushort input_height,\n" "const ushort output_width,\n" "const ushort output_height)\n" "{\n" "__global Dtype* inputs = inputs_ptr + inputs_offset;\n" "__global Dtype* weights = weights_ptr + weights_offset;\n" "#if APPLY_BIAS\n" "__global Dtype* biases_base = biases_base_ptr + biases_base_offset;\n" "#endif\n" "__global Dtype* outputs = outputs_base + outputs_offset;\n" "#ifdef FUSED_CONV_ELTWISE\n" "__global Dtype* eltwise_data = eltwise_ptr + eltwise_offset;\n" "#endif\n" "unsigned int oc = get_global_id(0) * OUT_BLOCK_WIDTH;\n" "unsigned int or = get_global_id(1) * OUT_BLOCK_HEIGHT;\n" "unsigned int fm = 
get_global_id(2);\n" "unsigned int fmg = get_group_id(2);\n" "unsigned int lid = get_local_id(2);\n" "Dtype out[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT] = { 0.0f };\n" "unsigned int weight_addr = (fmg % FILTERS_IN_GROUP) *\n" "INPUT_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT * SIMD_SIZE + lid;\n" "unsigned int num_in_batch = fm / ALIGNED_NUM_FILTERS;\n" "unsigned int input_batch_offset = num_in_batch * INPUT_PITCH * TOTAL_INPUT_DEPTH_SIZE;\n" "int curr_y = or * STRIDE_Y;\n" "int curr_x = oc * STRIDE_X + lid;\n" "int in_addr = input_batch_offset\n" "+ (curr_y - INPUT_PAD_H) * INPUT_WIDTH\n" "+ curr_x - INPUT_PAD_W;\n" "const int in_limit = (get_global_size(2) / ALIGNED_NUM_FILTERS) * TOTAL_INPUT_DEPTH_SIZE * INPUT_PITCH - 1;\n" "Dtype in_buf[INVEC_SIZE];\n" "for(int kd = 0; kd < INPUT_DEPTH; kd++)\n" "{\n" "#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0\n" "const bool cx_out_of_range = !(curr_x >= INPUT_PAD_W && curr_x < INPUT_WIDTH + INPUT_PAD_W);\n" "int in_offset = in_addr;\n" "__attribute__((opencl_unroll_hint(INVEC_SIZE)))\n" "for (int reg = 0; reg < INVEC_SIZE; reg++, in_offset += INPUT_WIDTH)\n" "{\n" "Dtype input = inputs[clamp(in_offset, 0, in_limit)];\n" "int cy = curr_y + reg;\n" "in_buf[reg] = (cx_out_of_range || cy < INPUT_PAD_H || cy >= INPUT_HEIGHT + INPUT_PAD_H) ? 
0 : input;\n" "}\n" "#else\n" "int in_offset = in_addr;\n" "__attribute__((opencl_unroll_hint(INVEC_SIZE)))\n" "for (int reg = 0; reg < INVEC_SIZE; reg++, in_offset += INPUT_WIDTH)\n" "{\n" "in_buf[reg] = inputs[min(in_offset, in_limit)];\n" "}\n" "#endif\n" "in_addr += INPUT_PITCH;\n" "#define BLOCK_IN(n, c) intel_sub_group_shuffle(in_buf[n], (c))\n" "int kr = 0;\n" "LOOP(KERNEL_HEIGHT, kr,\n" "{\n" "int kc = 0;\n" "LOOP(KERNEL_WIDTH, kc,\n" "{\n" "Dtype weight_value = weights[weight_addr];\n" "weight_addr += SIMD_SIZE;\n" "for (int br=0; br < OUT_BLOCK_HEIGHT; br++)\n" "{\n" "for(int bc=0; bc < OUT_BLOCK_WIDTH; bc++)\n" "{\n" "Dtype input = BLOCK_IN((br * STRIDE_Y + kr * DILATION_Y), bc * STRIDE_X + kc * DILATION_X);\n" "out[br * OUT_BLOCK_WIDTH + bc] = mad(weight_value, input, out[br * OUT_BLOCK_WIDTH + bc]);\n" "}\n" "}\n" "});\n" "});\n" "}\n" "fm = fm % ALIGNED_NUM_FILTERS;\n" "#if LEFT_FILTERS > 0\n" "if (fm < NUM_FILTERS)\n" "#endif\n" "{\n" "unsigned int out_addr = (num_in_batch * TOTAL_OUTPUT_DEPTH + fm) * OUTPUT_PITCH;\n" "out_addr += or * output_width + oc;\n" "#if APPLY_BIAS\n" "Dtype bias = biases_base[fm];\n" "#else\n" "Dtype bias = 0;\n" "#endif\n" "for(unsigned int r = 0; r < OUT_BLOCK_HEIGHT; r++)\n" "{\n" "if (r + or >= output_height) break;\n" "for(unsigned int c = 0; c < OUT_BLOCK_WIDTH; c++)\n" "{\n" "if (c + oc >= output_width) break;\n" "ACTIVATION_FUNCTION(outputs, out_addr + r * output_width + c, bias + out[r * OUT_BLOCK_WIDTH + c], fm);\n" "}\n" "}\n" "}\n" "}\n" "#elif defined KERNEL_GEMM_LIKE\n" "#if APPLY_BIAS\n" "#define SUBGROUP_GET_BIAS(k, i) intel_sub_group_shuffle(bias[k], i)\n" "#else\n" "#define SUBGROUP_GET_BIAS(k, i) ((Dtype)0)\n" "#endif\n" "#ifdef Conv_Interleaved\n" "typedef struct float1 { float s0; } float1;\n" "typedef struct float5 { float s0; float s1; float s2; float s3; float s4; } float5;\n" "typedef struct float6 { float s0; float s1; float s2; float s3; float s4; float s5; } float6;\n" "typedef struct float7 { 
float s0; float s1; float s2; float s3; float s4; float s5; float s6; } float7;\n" "typedef struct float9 { float s0; float s1; float s2; float s3; float s4; float s5; float s6; float s7; float s8; } float9;\n" "typedef struct float10 { float s0; float s1; float s2; float s3; float s4; float s5;\n" "float s6; float s7; float s8; float s9;} float10;\n" "typedef struct float11 { float s0; float s1; float s2; float s3; float s4; float s5;\n" "float s6; float s7; float s8; float s9; float sa;} float11;\n" "typedef struct float12 { float s0; float s1; float s2; float s3; float s4; float s5;\n" "float s6; float s7; float s8; float s9; float sa; float sb; } float12;\n" "typedef struct float13 { float s0; float s1; float s2; float s3; float s4; float s5;\n" "float s6; float s7; float s8; float s9; float sa; float sb; float sc;} float13;\n" "typedef struct float14 { float s0; float s1; float s2; float s3; float s4; float s5;\n" "float s6; float s7; float s8; float s9; float sa; float sb; float sc; float sd; } float14;\n" "typedef struct float15 { float s0; float s1; float s2; float s3; float s4; float s5;\n" "float s6; float s7; float s8; float s9; float sa; float sb; float sc; float sd; float se; } float15;\n" "typedef struct float0 { float s0; } float0;\n" "typedef struct half1 { half s0; } half1;\n" "typedef struct half5 { half s0; half s1; half s2; half s3; half s4; } half5;\n" "typedef struct half6 { half s0; half s1; half s2; half s3; half s4; half s5; } half6;\n" "typedef struct half7 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; } half7;\n" "typedef struct half9 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; half s7; half s8; } half9;\n" "typedef struct half10 { half s0; half s1; half s2; half s3; half s4; half s5;\n" "half s6; half s7; half s8; half s9; } half10;\n" "typedef struct half11 { half s0; half s1; half s2; half s3; half s4; half s5;\n" "half s6; half s7; half s8; half s9; half sa; } half11;\n" "typedef struct half12 { 
half s0; half s1; half s2; half s3; half s4; half s5;\n" "half s6; half s7; half s8; half s9; half sa; half sb; } half12;\n" "typedef struct half13 { half s0; half s1; half s2; half s3; half s4; half s5;\n" "half s6; half s7; half s8; half s9; half sa; half sb; half sc; } half13;\n" "typedef struct half14 { half s0; half s1; half s2; half s3; half s4; half s5;\n" "half s6; half s7; half s8; half s9; half sa; half sb; half sc; half sd; } half14;\n" "typedef struct half15 { half s0; half s1; half s2; half s3; half s4; half s5;\n" "half s6; half s7; half s8; half s9; half sa; half sb; half sc; half sd; half se; } half15;\n" "typedef struct half0 { half s0; } half0;\n" "#define OUT_PITCH_X output_width\n" "#define ROW_PITCH input_width\n" "#define GEMM_LIKE_KERNEL_ARGS \\\n" "ELTWISE_DATA_ARG_WITH_OFFSET \\\n" "FUSED_ARG \\\n" "const __global Dtype *src0_ptr, const unsigned int src0_offset, const unsigned int src0_limit, \\\n" "const __global Dtype *src1_ptr, const unsigned int src1_offset, const unsigned int src1_limit, \\\n" "BIAS_KERNEL_ARG_WITH_OFFSET \\\n" "__global Dtype *dst_base, const unsigned int dst_offset, const unsigned int dst_limit, \\\n" "const ushort input_width, \\\n" "const ushort input_height, \\\n" "const ushort output_width, \\\n" "const ushort output_height, \\\n" "const int out_pitch_y, \\\n" "const int out_pitch_z, \\\n" "const int aligned_input_size, \\\n" "const int slice_pitch\n" "#endif\n" "#ifdef GEMM_LIKE_CONV_32_1\n" "#define TILE_M 1\n" "#define TILE_K KERNEL_WIDTH\n" "#define TILE_N 32\n" "__attribute__((intel_reqd_sub_group_size(8)))\n" "__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)\n" "{\n" "const __global Dtype *src0 = src0_ptr + src0_offset;\n" "const __global Dtype *src1 = src1_ptr + src1_offset;\n" "#if APPLY_BIAS\n" "__global Dtype* biases_base = biases_base_ptr + biases_base_offset;\n" "#endif\n" "__global Dtype *dst = dst_base + dst_offset;\n" "#ifdef FUSED_CONV_ELTWISE\n" "__global Dtype* eltwise_data = eltwise_ptr + 
eltwise_offset;\n" "#endif\n" "const int group_x = get_group_id(0);\n" "const int group_y = get_group_id(1);\n" "const int global_x = get_global_id(0);\n" "const int global_y = get_global_id(1);\n" "const int global_z = get_global_id(2);\n" "int interleaved_y;\n" "int kernel_y;\n" "int kernel_idx;\n" "#define DOT_PRODUCT_8( _result, _rowA, colB ) \\\n" "{ \\\n" "_result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 ); \\\n" "_result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 ); \\\n" "_result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 ); \\\n" "_result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 ); \\\n" "_result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 ); \\\n" "_result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 ); \\\n" "_result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 ); \\\n" "_result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 ); \\\n" "}\n" "typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;\n" "#if 0\n" "#define OPTIMIZE_READ 1\n" "#else\n" "#define OPTIMIZE_READ 0\n" "#endif\n" "if( TILE_N_LAST == 0 || global_x < WIDTH1 / TILE_N )\n" "{\n" "Dtype8 blockC00 = 0.f;\n" "Dtype8 blockC10 = 0.f;\n" "Dtype8 blockC20 = 0.f;\n" "Dtype8 blockC30 = 0.f;\n" "int curr_x = ( global_y % output_width ) * STRIDE_X;\n" "int curr_y = ( global_y / output_width ) * STRIDE_Y;\n" "#if !OPTIMIZE_READ\n" "int saved_y = curr_y;\n" "#endif\n" "const __global Dtype *src0_read = src0\n" "+ aligned_input_size * global_z\n" "+ (curr_y - INPUT_PAD_H) * ROW_PITCH\n" "+ (curr_x - INPUT_PAD_W);\n" "const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2);\n" "int patch_depth = 0;\n" "do\n" "{\n" "int patch_row = 0;\n" "#if !OPTIMIZE_READ\n" "curr_y = saved_y;\n" "#endif\n" "do\n" "{\n" "const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;\n" "#if OPTIMIZE_READ\n" "#if KERNEL_WIDTH == 3\n" "Dtype_t blockA00 = vload3(0, src0_read);\n" "Dtype* pblockA00 = 
(Dtype*)(&blockA00);\n" "#else\n" "#if 0\n" "if ((int)(src0_read - src0) >= src0_limit - KERNEL_WIDTH)\n" "{\n" "printf(\"CATCH: src0_read-src0: %d limit=%d curr_y,curr_x=%d,%d\\n\", (int)(src0_read - src0), src0_limit, curr_y, curr_x);\n" "}\n" "#endif\n" "Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ];\n" "Dtype* pblockA00 = (Dtype*)(&blockA00);\n" "#endif\n" "#else\n" "Dtype_t blockA00;\n" "Dtype* pblockA00 = (Dtype*)(&blockA00);\n" "int pos = 0;\n" "LOOP(KERNEL_WIDTH, pos,\n" "{\n" "if (curr_y >= INPUT_PAD_H &&\n" "curr_y < input_height + INPUT_PAD_H &&\n" "curr_x + pos * DILATION_X >= INPUT_PAD_W &&\n" "curr_x + pos * DILATION_X < input_width + INPUT_PAD_W)\n" "pblockA00[pos] = src0_read[pos * DILATION_X];\n" "else\n" "pblockA00[pos] = 0;\n" "})\n" "curr_y += DILATION_Y;\n" "#endif\n" "src0_read += (ROW_PITCH * DILATION_Y);\n" "Dtype blockB00[KERNEL_WIDTH*4];\n" "Dtype8* p8BlockB00 = (Dtype8*)blockB00;\n" "Dtype4* p4BlockB00 = (Dtype4*)blockB00;\n" "Dtype* pBlockB00 = (Dtype* )blockB00;\n" "interleaved_y = 0;\n" "LOOP(KERNEL_WIDTH_DIV2, interleaved_y,\n" "{\n" "p8BlockB00[interleaved_y] = as_Dtype8( SUB_GROUP_BLOCK_READ8( (const __global INT_TYPE *)src1_read ) );\n" "src1_read += WIDTH1 * 2;\n" "} )\n" "if ( kernel_width_is_odd )\n" "{\n" "p4BlockB00[KERNEL_WIDTH - 1] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE *)src1_read ) );\n" "src1_read += WIDTH1 * 2;\n" "}\n" "kernel_idx = 0;\n" "interleaved_y = 0;\n" "LOOP(KERNEL_WIDTH_DIV2, interleaved_y,\n" "{\n" "kernel_y = interleaved_y * 2;\n" "DOT_PRODUCT_8( blockC00, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;\n" "DOT_PRODUCT_8( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;\n" "DOT_PRODUCT_8( blockC10, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;\n" "DOT_PRODUCT_8( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;\n" "DOT_PRODUCT_8( blockC20, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); 
kernel_idx++;\n" "DOT_PRODUCT_8( blockC20, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;\n" "DOT_PRODUCT_8( blockC30, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;\n" "DOT_PRODUCT_8( blockC30, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;\n" "} )\n" "kernel_y = interleaved_y * 2;\n" "if ( kernel_width_is_odd )\n" "{\n" "DOT_PRODUCT_8( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;\n" "DOT_PRODUCT_8( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;\n" "DOT_PRODUCT_8( blockC20, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;\n" "DOT_PRODUCT_8( blockC30, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;\n" "}\n" "}\n" "while( ++patch_row < KERNEL_HEIGHT );\n" "src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y);\n" "}\n" "while ( ++patch_depth < INPUT_DEPTH );\n" "int out_offset = global_z * out_pitch_z\n" "+ ( group_x * TILE_N ) * out_pitch_y\n" "+ ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X\n" "+ ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT;\n" "__global Dtype *out = dst + out_offset;\n" "#if APPLY_BIAS\n" "Dtype bias[4];\n" "Dtype4 *bias_vec;\n" "bias_vec = (Dtype4*)bias;\n" "*bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));\n" "if (group_x > 0xFFFFFFFEul) {\n" "dst[0] = bias[0] + bias[1] + bias[2] + bias[3];\n" "}\n" "#else\n" "const Dtype bias[4] = {0, 0, 0, 0};\n" "#endif\n" "if (global_y * TILE_M < output_width * output_height )\n" "{\n" "for (int i = 0; i < 8; i++)\n" "{\n" "ACTIVATION_FUNCTION(dst, out_offset + ( 0 + i ) * out_pitch_y, blockC00[i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i);\n" "ACTIVATION_FUNCTION(dst, out_offset + ( 8 + i ) * out_pitch_y, blockC10[i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + 8 + i);\n" "ACTIVATION_FUNCTION(dst, out_offset + ( 16 + i ) * out_pitch_y, blockC20[i] + SUBGROUP_GET_BIAS(2, i), 
group_x * TILE_N + 16 + i);\n" "ACTIVATION_FUNCTION(dst, out_offset + ( 24 + i ) * out_pitch_y, blockC30[i] + SUBGROUP_GET_BIAS(3, i), group_x * TILE_N + 24 + i);\n" "}\n" "}\n" "}\n" "#if TILE_N_LAST > 0\n" "else\n" "{\n" "int i = 0;\n" "Dtype8 blockC[TILE_N_LAST_DIV8];\n" "LOOP(TILE_N_LAST_DIV8, i,\n" "{\n" "blockC[i] = 0.f;\n" "} )\n" "int curr_x = ( global_y % output_width ) * STRIDE_X;\n" "int curr_y = ( global_y / output_width ) * STRIDE_Y;\n" "#if !OPTIMIZE_READ\n" "int saved_y = curr_y;\n" "#endif\n" "const __global Dtype *src0_read = src0\n" "+ aligned_input_size * global_z\n" "+ (curr_y - INPUT_PAD_H) * ROW_PITCH\n" "+ (curr_x - INPUT_PAD_W);\n" "const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2);\n" "int patch_depth = 0;\n" "do\n" "{\n" "int patch_row = 0;\n" "#if !OPTIMIZE_READ\n" "curr_y = saved_y;\n" "#endif\n" "do\n" "{\n" "const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;\n" "#if OPTIMIZE_READ\n" "Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ];\n" "Dtype* pblockA00 = (Dtype*)(&blockA00);\n" "#else\n" "Dtype_t blockA00;\n" "Dtype* pblockA00 = (Dtype*)(&blockA00);\n" "int pos = 0;\n" "LOOP(KERNEL_WIDTH, pos,\n" "{\n" "if (curr_y >= INPUT_PAD_H &&\n" "curr_y < input_height + INPUT_PAD_H &&\n" "curr_x + pos * DILATION_X >= INPUT_PAD_W &&\n" "curr_x + pos * DILATION_X < input_width + INPUT_PAD_W)\n" "pblockA00[pos] = src0_read[pos * DILATION_X];\n" "else\n" "pblockA00[pos] = 0;\n" "})\n" "curr_y += DILATION_Y;\n" "#endif\n" "src0_read += (ROW_PITCH * DILATION_Y);\n" "Dtype blockB[KERNEL_WIDTH * TILE_N_LAST_DIV8];\n" "interleaved_y = 0;\n" "LOOP(KERNEL_WIDTH_DIV2, interleaved_y,\n" "{\n" "#if TILE_N_LAST_DIV8 == 1\n" "Dtype2* p2BlockB = (Dtype2* )blockB;\n" "p2BlockB[interleaved_y] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );\n" "#elif TILE_N_LAST_DIV8 == 2\n" "Dtype4* p4BlockB = (Dtype4* )blockB;\n" "p4BlockB[interleaved_y] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global 
INT_TYPE*)src1_read ) );\n" "#elif TILE_N_LAST_DIV8 == 3\n" "Dtype6* p6BlockB = (Dtype6* )blockB;\n" "(*((Dtype8*)(&p6BlockB[interleaved_y]))).s0123 = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );\n" "(*((Dtype8*)(&p6BlockB[interleaved_y]))).s45 = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)(src1_read + 4 * 8) ) );\n" "#endif\n" "src1_read += WIDTH1 * 2;\n" "} )\n" "if ( kernel_width_is_odd )\n" "{\n" "#if TILE_N_LAST_DIV8 == 1\n" "Dtype* pBlockB = (Dtype* )blockB;\n" "pBlockB[KERNEL_WIDTH - 1] = as_Dtype( SUB_GROUP_BLOCK_READ( (const __global INT_TYPE*)src1_read ) );\n" "#elif TILE_N_LAST_DIV8 == 2\n" "Dtype2* p2BlockB = (Dtype2* )blockB;\n" "p2BlockB[KERNEL_WIDTH - 1] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );\n" "#elif TILE_N_LAST_DIV8 == 3\n" "Dtype3* p3BlockB = (Dtype3* )blockB;\n" "p3BlockB[KERNEL_WIDTH - 1].s01 = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );\n" "p3BlockB[KERNEL_WIDTH - 1].s2 = as_Dtype( SUB_GROUP_BLOCK_READ( (const __global INT_TYPE*) (src1_read + 2 * 8) ) );\n" "#endif\n" "src1_read += WIDTH1 * 2;\n" "}\n" "Dtype* pBlockB = (Dtype*)blockB;\n" "kernel_idx = 0;\n" "interleaved_y = 0;\n" "LOOP(KERNEL_WIDTH_DIV2, interleaved_y,\n" "{\n" "kernel_y = interleaved_y * 2;\n" "DOT_PRODUCT_8( blockC[0], pblockA00[kernel_y ], pBlockB[kernel_idx] ); kernel_idx++;\n" "DOT_PRODUCT_8( blockC[0], pblockA00[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;\n" "#if TILE_N_LAST_DIV8 >= 2\n" "DOT_PRODUCT_8( blockC[1], pblockA00[kernel_y ], pBlockB[kernel_idx] ); kernel_idx++;\n" "DOT_PRODUCT_8( blockC[1], pblockA00[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;\n" "#if TILE_N_LAST_DIV8 >= 3\n" "DOT_PRODUCT_8( blockC[2], pblockA00[kernel_y ], pBlockB[kernel_idx] ); kernel_idx++;\n" "DOT_PRODUCT_8( blockC[2], pblockA00[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;\n" "#endif\n" "#endif\n" "} )\n" "kernel_y = interleaved_y * 2;\n" "if ( 
kernel_width_is_odd )\n" "{\n" "DOT_PRODUCT_8( blockC[0], pblockA00[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;\n" "#if TILE_N_LAST_DIV8 >= 2\n" "DOT_PRODUCT_8( blockC[1], pblockA00[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;\n" "#if TILE_N_LAST_DIV8 >= 3\n" "DOT_PRODUCT_8( blockC[2], pblockA00[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;\n" "#endif\n" "#endif\n" "}\n" "}\n" "while( ++patch_row < KERNEL_HEIGHT );\n" "src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );\n" "}\n" "while ( ++patch_depth < INPUT_DEPTH );\n" "int out_offset = global_z * out_pitch_z\n" "+ ( group_x * TILE_N ) * out_pitch_y\n" "+ ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X\n" "+ ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT;\n" "__global Dtype *out = dst + out_offset;\n" "#if APPLY_BIAS\n" "Dtype bias[4];\n" "Dtype4 *bias_vec;\n" "bias_vec = (Dtype4*)bias;\n" "*bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));\n" "if (group_x > 0xFFFFFFFEul) {\n" "dst[0] = bias[0] + bias[1] + bias[2] + bias[3];\n" "}\n" "#else\n" "const Dtype bias[4] = {0, 0, 0, 0};\n" "#endif\n" "if (global_y * TILE_M < output_width * output_height )\n" "{\n" "for (int i = 0; i < 8; i++)\n" "{\n" "if ( TILE_N_LAST_DIV8 > 0 )\n" "{\n" "ACTIVATION_FUNCTION(dst, out_offset + ( 0+i) * out_pitch_y, blockC[0][i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i);\n" "}\n" "if ( TILE_N_LAST_DIV8 > 1 )\n" "{\n" "ACTIVATION_FUNCTION(dst, out_offset + ( 8+i) * out_pitch_y, blockC[1][i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 8);\n" "}\n" "if ( TILE_N_LAST_DIV8 > 2 )\n" "{\n" "ACTIVATION_FUNCTION(dst, out_offset + (16+i) * out_pitch_y, blockC[2][i] + SUBGROUP_GET_BIAS(2, i), group_x * TILE_N + i + 16);\n" "}\n" "if ( TILE_N_LAST_DIV8 > 3 )\n" "{\n" "ACTIVATION_FUNCTION(dst, out_offset + (24+i) * out_pitch_y, blockC[3][i] + SUBGROUP_GET_BIAS(3, i), group_x * TILE_N + i + 24);\n" "}\n" "}\n" "}\n" "}\n" 
"#endif\n" "}\n" "#endif\n" "#ifdef GEMM_LIKE_CONV_32_2\n" "#define TILE_M 2\n" "#define TILE_K KERNEL_WIDTH\n" "#define TILE_N 32\n" "__attribute__((intel_reqd_sub_group_size(8)))\n" "__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)\n" "{\n" "const __global Dtype *src0 = src0_ptr + src0_offset;\n" "const __global Dtype *src1 = src1_ptr + src1_offset;\n" "#if APPLY_BIAS\n" "__global Dtype* biases_base = biases_base_ptr + biases_base_offset;\n" "#endif\n" "__global Dtype *dst = dst_base + dst_offset;\n" "#ifdef FUSED_CONV_ELTWISE\n" "__global Dtype* eltwise_data = eltwise_ptr + eltwise_offset;\n" "#endif\n" "const int group_x = get_group_id(0);\n" "const int group_y = get_group_id(1);\n" "const int global_x = get_global_id(0);\n" "const int global_y = get_global_id(1);\n" "const int global_z = get_global_id(2);\n" "int interleaved_y;\n" "int kernel_y;\n" "int kernel_idx;\n" "#define DOT_PRODUCT_8( _result, _rowA, colB ) \\\n" "{ \\\n" "_result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 ); \\\n" "_result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 ); \\\n" "_result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 ); \\\n" "_result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 ); \\\n" "_result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 ); \\\n" "_result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 ); \\\n" "_result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 ); \\\n" "_result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 ); \\\n" "}\n" "typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;\n" "if( TILE_N_LAST == 0 || global_x < WIDTH1 / TILE_N )\n" "{\n" "Dtype8 blockC00 = 0.f;\n" "Dtype8 blockC10 = 0.f;\n" "Dtype8 blockC20 = 0.f;\n" "Dtype8 blockC30 = 0.f;\n" "Dtype8 blockC01 = 0.f;\n" "Dtype8 blockC11 = 0.f;\n" "Dtype8 blockC21 = 0.f;\n" "Dtype8 blockC31 = 0.f;\n" "int curr_x0 = ( ( global_y * TILE_M + 0 ) % output_width ) * STRIDE_X;\n" "int curr_x1 = 
( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X;\n" "int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y;\n" "int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y;\n" "#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0\n" "int saved_y0 = curr_y0;\n" "int saved_y1 = curr_y1;\n" "#endif\n" "const __global Dtype *src0_read0 = src0\n" "+ aligned_input_size * global_z\n" "+ (curr_y0 - INPUT_PAD_H) * ROW_PITCH\n" "+ curr_x0 - INPUT_PAD_W;\n" "const __global Dtype *src0_read1 = src0\n" "+ aligned_input_size * global_z\n" "+ (curr_y1 - INPUT_PAD_H) * ROW_PITCH\n" "+ curr_x1 - INPUT_PAD_W;\n" "const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2);\n" "int patch_depth = 0;\n" "do\n" "{\n" "int patch_row = 0;\n" "do\n" "{\n" "const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;\n" "#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0\n" "#if KERNEL_WIDTH == 3\n" "Dtype_t blockA00 = vload3(0, src0_read0); src0_read0 += ROW_PITCH;\n" "Dtype_t blockA01 = vload3(0, src0_read1); src0_read1 += ROW_PITCH;\n" "Dtype* pblockA00 = (Dtype*)(&blockA00);\n" "Dtype* pblockA01 = (Dtype*)(&blockA01);\n" "#else\n" "Dtype_t blockA00 = { (Dtype)0.f };\n" "Dtype_t blockA01 = { (Dtype)0.f };\n" "Dtype* pblockA00 = (Dtype*)(&blockA00);\n" "Dtype* pblockA01 = (Dtype*)(&blockA01);\n" "int pos = 0;\n" "LOOP(KERNEL_WIDTH, pos,\n" "{\n" "if (curr_x0 + pos < input_width)\n" "pblockA00[pos] = src0_read0[pos];\n" "if (curr_x1 + pos < input_width)\n" "pblockA01[pos] = src0_read1[pos];\n" "})\n" "src0_read0 += ROW_PITCH;\n" "src0_read1 += ROW_PITCH;\n" "#endif\n" "#else\n" "Dtype_t blockA00;\n" "Dtype* pblockA00 = (Dtype*)(&blockA00);\n" "int pos = 0;\n" "LOOP(KERNEL_WIDTH, pos,\n" "{\n" "if (curr_y0 >= INPUT_PAD_H &&\n" "curr_y0 < input_height + INPUT_PAD_H &&\n" "curr_x0 + pos * DILATION_X >= 
INPUT_PAD_W &&\n" "curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W)\n" "pblockA00[pos] = src0_read0[pos * DILATION_X];\n" "else\n" "pblockA00[pos] = 0;\n" "})\n" "curr_y0 += DILATION_Y;\n" "Dtype_t blockA01;\n" "Dtype* pblockA01 = (Dtype*)(&blockA01);\n" "pos = 0;\n" "LOOP(KERNEL_WIDTH, pos,\n" "{\n" "if (curr_y1 >= INPUT_PAD_H &&\n" "curr_y1 < input_height + INPUT_PAD_H &&\n" "curr_x1 + pos * DILATION_X >= INPUT_PAD_W &&\n" "curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W)\n" "pblockA01[pos] = src0_read1[pos * DILATION_X];\n" "else\n" "pblockA01[pos] = 0;\n" "})\n" "curr_y1 += DILATION_Y;\n" "src0_read0 += (ROW_PITCH * DILATION_Y);\n" "src0_read1 += (ROW_PITCH * DILATION_Y);\n" "#endif\n" "Dtype blockB00[KERNEL_WIDTH*4];\n" "Dtype8* p8BlockB00 = (Dtype8*)blockB00;\n" "Dtype4* p4BlockB00 = (Dtype4*)blockB00;\n" "Dtype* pBlockB00 = (Dtype* )blockB00;\n" "interleaved_y = 0;\n" "LOOP(KERNEL_WIDTH_DIV2, interleaved_y,\n" "{\n" "p8BlockB00[interleaved_y] = as_Dtype8( SUB_GROUP_BLOCK_READ8( (const __global INT_TYPE*)src1_read ) );\n" "src1_read += WIDTH1 * 2;\n" "} )\n" "if ( kernel_width_is_odd )\n" "{\n" "p4BlockB00[KERNEL_WIDTH - 1] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );\n" "src1_read += WIDTH1 * 2;\n" "}\n" "kernel_idx = 0;\n" "interleaved_y = 0;\n" "LOOP(KERNEL_WIDTH_DIV2, interleaved_y,\n" "{\n" "kernel_y = interleaved_y * 2;\n" "DOT_PRODUCT_8( blockC00, pblockA00[kernel_y ], pBlockB00[kernel_idx] );\n" "DOT_PRODUCT_8( blockC01, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;\n" "DOT_PRODUCT_8( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );\n" "DOT_PRODUCT_8( blockC01, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;\n" "DOT_PRODUCT_8( blockC10, pblockA00[kernel_y ], pBlockB00[kernel_idx] );\n" "DOT_PRODUCT_8( blockC11, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;\n" "DOT_PRODUCT_8( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );\n" 
"DOT_PRODUCT_8( blockC11, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;\n" "DOT_PRODUCT_8( blockC20, pblockA00[kernel_y ], pBlockB00[kernel_idx] );\n" "DOT_PRODUCT_8( blockC21, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;\n" "DOT_PRODUCT_8( blockC20, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );\n" "DOT_PRODUCT_8( blockC21, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;\n" "DOT_PRODUCT_8( blockC30, pblockA00[kernel_y ], pBlockB00[kernel_idx] );\n" "DOT_PRODUCT_8( blockC31, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;\n" "DOT_PRODUCT_8( blockC30, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );\n" "DOT_PRODUCT_8( blockC31, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;\n" "} )\n" "if ( kernel_width_is_odd )\n" "{\n" "kernel_y = interleaved_y * 2;\n" "DOT_PRODUCT_8( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] );\n" "DOT_PRODUCT_8( blockC01, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;\n" "DOT_PRODUCT_8( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] );\n" "DOT_PRODUCT_8( blockC11, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;\n" "DOT_PRODUCT_8( blockC20, pblockA00[kernel_y], pBlockB00[kernel_idx] );\n" "DOT_PRODUCT_8( blockC21, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;\n" "DOT_PRODUCT_8( blockC30, pblockA00[kernel_y], pBlockB00[kernel_idx] );\n" "DOT_PRODUCT_8( blockC31, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;\n" "}\n" "}\n" "while( ++patch_row < KERNEL_HEIGHT );\n" "#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0\n" "curr_y0 = saved_y0;\n" "curr_y1 = saved_y1;\n" "#endif\n" "src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );\n" "src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );\n" "}\n" "while ( ++patch_depth < INPUT_DEPTH );\n" "int out0_offset = global_z * out_pitch_z\n" "+ ( 
group_x * TILE_N ) * out_pitch_y\n" "+ ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X\n" "+ ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT;\n" "int out1_offset = global_z * out_pitch_z\n" "+ ( group_x * TILE_N ) * out_pitch_y\n" "+ ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X\n" "+ ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT;\n" "#if APPLY_BIAS\n" "Dtype bias[4];\n" "Dtype4 *bias_vec;\n" "bias_vec = (Dtype4*)bias;\n" "*bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));\n" "if (group_x > 0xFFFFFFFEul) {\n" "dst[0] = bias[0] + bias[1] + bias[2] + bias[3];\n" "}\n" "#else\n" "const Dtype bias[4] = {0, 0, 0, 0};\n" "#endif\n" "if( global_y * TILE_M < output_width * output_height )\n" "{\n" "for( int i = 0; i < 8; i++ )\n" "{\n" "ACTIVATION_FUNCTION(dst, out0_offset + ( 0+i) * out_pitch_y, blockC00[i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i);\n" "ACTIVATION_FUNCTION(dst, out0_offset + ( 8+i) * out_pitch_y, blockC10[i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 8);\n" "ACTIVATION_FUNCTION(dst, out0_offset + (16+i) * out_pitch_y, blockC20[i] + SUBGROUP_GET_BIAS(2, i), group_x * TILE_N + i + 16);\n" "ACTIVATION_FUNCTION(dst, out0_offset + (24+i) * out_pitch_y, blockC30[i] + SUBGROUP_GET_BIAS(3, i), group_x * TILE_N + i + 24);\n" "}\n" "}\n" "if( global_y * TILE_M + 1 < output_width * output_height )\n" "{\n" "for( int i = 0; i < 8; i++ )\n" "{\n" "ACTIVATION_FUNCTION(dst, out1_offset + ( 0+i) * out_pitch_y, blockC01[i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i);\n" "ACTIVATION_FUNCTION(dst, out1_offset + ( 8+i) * out_pitch_y, blockC11[i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 8);\n" "ACTIVATION_FUNCTION(dst, out1_offset + (16+i) * out_pitch_y, blockC21[i] + SUBGROUP_GET_BIAS(2, i), group_x * TILE_N + i + 16);\n" "ACTIVATION_FUNCTION(dst, out1_offset + (24+i) * out_pitch_y, blockC31[i] + 
SUBGROUP_GET_BIAS(3, i), group_x * TILE_N + i + 24);\n" "}\n" "}\n" "}\n" "#if TILE_N_LAST > 0\n" "else\n" "{\n" "int i = 0;\n" "Dtype8 blockC0[TILE_N_LAST_DIV8];\n" "Dtype8 blockC1[TILE_N_LAST_DIV8];\n" "LOOP(TILE_N_LAST_DIV8, i,\n" "{\n" "blockC0[i] = 0.f;\n" "blockC1[i] = 0.f;\n" "} )\n" "int curr_x0 = ( ( global_y * TILE_M + 0 ) % output_width ) * STRIDE_X;\n" "int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X;\n" "int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y;\n" "int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y;\n" "#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0\n" "int saved_y0 = curr_y0;\n" "int saved_y1 = curr_y1;\n" "#endif\n" "const __global Dtype *src0_read0 = src0\n" "+ aligned_input_size * global_z\n" "+ (curr_y0 - INPUT_PAD_H) * ROW_PITCH\n" "+ curr_x0 - INPUT_PAD_W;\n" "const __global Dtype *src0_read1 = src0\n" "+ aligned_input_size * global_z\n" "+ (curr_y1 - INPUT_PAD_H) * ROW_PITCH\n" "+ curr_x1 - INPUT_PAD_W;\n" "const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2);\n" "int patch_depth = 0;\n" "do\n" "{\n" "int patch_row = 0;\n" "do\n" "{\n" "const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;\n" "#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0\n" "Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read0 )[ 0 ]; src0_read0 += ROW_PITCH;\n" "Dtype_t blockA01 = ( (const __global Dtype_t*)src0_read1 )[ 0 ]; src0_read1 += ROW_PITCH;\n" "Dtype* pblockA00 = (Dtype*)(&blockA00);\n" "Dtype* pblockA01 = (Dtype*)(&blockA01);\n" "#else\n" "Dtype_t blockA00;\n" "Dtype* pblockA00 = (Dtype*)(&blockA00);\n" "int pos = 0;\n" "LOOP(KERNEL_WIDTH, pos,\n" "{\n" "if (curr_y0 >= INPUT_PAD_H &&\n" "curr_y0 < input_height + INPUT_PAD_H &&\n" "curr_x0 + pos * DILATION_X >= INPUT_PAD_W &&\n" "curr_x0 + pos * DILATION_X < 
input_width + INPUT_PAD_W)\n" "pblockA00[pos] = src0_read0[pos * DILATION_X];\n" "else\n" "pblockA00[pos] = 0;\n" "})\n" "curr_y0 += DILATION_Y;\n" "Dtype_t blockA01;\n" "Dtype* pblockA01 = (Dtype*)(&blockA01);\n" "pos = 0;\n" "LOOP(KERNEL_WIDTH, pos,\n" "{\n" "if (curr_y1 >= INPUT_PAD_H &&\n" "curr_y1 < input_height + INPUT_PAD_H &&\n" "curr_x1 + pos * DILATION_X >= INPUT_PAD_W &&\n" "curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W)\n" "pblockA01[pos] = src0_read1[pos * DILATION_X];\n" "else\n" "pblockA01[pos] = 0;\n" "})\n" "curr_y1 += DILATION_Y;\n" "src0_read0 += (ROW_PITCH * DILATION_Y);\n" "src0_read1 += (ROW_PITCH * DILATION_Y);\n" "#endif\n" "Dtype blockB[KERNEL_WIDTH * TILE_N_LAST_DIV8];\n" "interleaved_y = 0;\n" "LOOP(KERNEL_WIDTH_DIV2, interleaved_y,\n" "{\n" "#if TILE_N_LAST_DIV8 == 1\n" "Dtype2* p2BlockB = (Dtype2* )blockB;\n" "p2BlockB[interleaved_y] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );\n" "#elif TILE_N_LAST_DIV8 == 2\n" "Dtype4* p4BlockB = (Dtype4* )blockB;\n" "p4BlockB[interleaved_y] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );\n" "#elif TILE_N_LAST_DIV8 == 3\n" "Dtype6* p6BlockB = (Dtype6* )blockB;\n" "(*((Dtype8*)(&p6BlockB[interleaved_y]))).s0123 = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );\n" "(*((Dtype8*)(&p6BlockB[interleaved_y]))).s45 = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)(src1_read + 4 * 8) ) );\n" "#endif\n" "src1_read += WIDTH1 * 2;\n" "} )\n" "if ( kernel_width_is_odd )\n" "{\n" "#if TILE_N_LAST_DIV8 == 1\n" "Dtype* pBlockB = (Dtype* )blockB;\n" "pBlockB[KERNEL_WIDTH - 1] = as_Dtype( SUB_GROUP_BLOCK_READ( (const __global INT_TYPE*)src1_read ) );\n" "#elif TILE_N_LAST_DIV8 == 2\n" "Dtype2* p2BlockB = (Dtype2* )blockB;\n" "p2BlockB[KERNEL_WIDTH - 1] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );\n" "#elif TILE_N_LAST_DIV8 == 3\n" "Dtype3* p3BlockB = (Dtype3* )blockB;\n" 
"p3BlockB[KERNEL_WIDTH - 1].s01 = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );\n" "p3BlockB[KERNEL_WIDTH - 1].s2 = as_Dtype( SUB_GROUP_BLOCK_READ( (const __global INT_TYPE*) (src1_read + 8) ) );\n" "#endif\n" "src1_read += WIDTH1 * 2;\n" "}\n" "Dtype* pBlockB = (Dtype*)blockB;\n" "kernel_idx = 0;\n" "interleaved_y = 0;\n" "LOOP(KERNEL_WIDTH_DIV2, interleaved_y,\n" "{\n" "kernel_y = interleaved_y * 2;\n" "DOT_PRODUCT_8( blockC0[0], pblockA00[kernel_y ], pBlockB[kernel_idx] );\n" "DOT_PRODUCT_8( blockC1[0], pblockA01[kernel_y ], pBlockB[kernel_idx] ); kernel_idx++;\n" "DOT_PRODUCT_8( blockC0[0], pblockA00[kernel_y + 1], pBlockB[kernel_idx] );\n" "DOT_PRODUCT_8( blockC1[0], pblockA01[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;\n" "#if TILE_N_LAST_DIV8 >= 2\n" "DOT_PRODUCT_8( blockC0[1], pblockA00[kernel_y ], pBlockB[kernel_idx] );\n" "DOT_PRODUCT_8( blockC1[1], pblockA01[kernel_y ], pBlockB[kernel_idx] ); kernel_idx++;\n" "DOT_PRODUCT_8( blockC0[1], pblockA00[kernel_y + 1], pBlockB[kernel_idx] );\n" "DOT_PRODUCT_8( blockC1[1], pblockA01[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;\n" "#if TILE_N_LAST_DIV8 >= 3\n" "DOT_PRODUCT_8( blockC0[2], pblockA00[kernel_y ], pBlockB[kernel_idx] );\n" "DOT_PRODUCT_8( blockC1[2], pblockA01[kernel_y ], pBlockB[kernel_idx] ); kernel_idx++;\n" "DOT_PRODUCT_8( blockC0[2], pblockA00[kernel_y + 1], pBlockB[kernel_idx] );\n" "DOT_PRODUCT_8( blockC1[2], pblockA01[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;\n" "#endif\n" "#endif\n" "} )\n" "kernel_y = interleaved_y * 2;\n" "if ( kernel_width_is_odd )\n" "{\n" "DOT_PRODUCT_8( blockC0[0], pblockA00[kernel_y], pBlockB[kernel_idx] );\n" "DOT_PRODUCT_8( blockC1[0], pblockA01[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;\n" "#if TILE_N_LAST_DIV8 >= 2\n" "DOT_PRODUCT_8( blockC0[1], pblockA00[kernel_y], pBlockB[kernel_idx] );\n" "DOT_PRODUCT_8( blockC1[1], pblockA01[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;\n" "#if TILE_N_LAST_DIV8 >= 3\n" 
"DOT_PRODUCT_8( blockC0[2], pblockA00[kernel_y], pBlockB[kernel_idx] );\n" "DOT_PRODUCT_8( blockC1[2], pblockA01[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;\n" "#endif\n" "#endif\n" "}\n" "}\n" "while( ++patch_row < KERNEL_HEIGHT );\n" "#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0\n" "curr_y0 = saved_y0;\n" "curr_y1 = saved_y1;\n" "#endif\n" "src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );\n" "src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );\n" "}\n" "while ( ++patch_depth < INPUT_DEPTH );\n" "int out0_offset = global_z * out_pitch_z\n" "+ ( group_x * TILE_N ) * out_pitch_y\n" "+ ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X\n" "+ ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT;\n" "int out1_offset = global_z * out_pitch_z\n" "+ ( group_x * TILE_N ) * out_pitch_y\n" "+ ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X\n" "+ ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT;\n" "__global Dtype *out1 = dst + out1_offset;\n" "#if APPLY_BIAS\n" "Dtype bias[4];\n" "Dtype4 *bias_vec;\n" "bias_vec = (Dtype4*)bias;\n" "*bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));\n" "if (group_x > 0xFFFFFFFEul) {\n" "dst[0] = bias[0] + bias[1] + bias[2] + bias[3];\n" "}\n" "#else\n" "const Dtype bias[4] = {0, 0, 0, 0};\n" "#endif\n" "if( global_y * TILE_M < output_width * output_height )\n" "{\n" "for( int i = 0; i < 8; i++ )\n" "{\n" "if ( TILE_N_LAST_DIV8 > 0 )\n" "{\n" "ACTIVATION_FUNCTION(dst, out0_offset + ( 0+i) * out_pitch_y, blockC0[0][i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i);\n" "}\n" "if ( TILE_N_LAST_DIV8 > 1 )\n" "{\n" "ACTIVATION_FUNCTION(dst, out0_offset + ( 8+i) * out_pitch_y, blockC0[1][i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 8);\n" "}\n" "if ( TILE_N_LAST_DIV8 > 2 )\n" 
"{\n" "ACTIVATION_FUNCTION(dst, out0_offset + (16+i) * out_pitch_y, blockC0[2][i] + SUBGROUP_GET_BIAS(2, i), group_x * TILE_N + i + 16);\n" "}\n" "if ( TILE_N_LAST_DIV8 > 3 )\n" "{\n" "ACTIVATION_FUNCTION(dst, out0_offset + (24+i) * out_pitch_y, blockC0[3][i] + SUBGROUP_GET_BIAS(3, i), group_x * TILE_N + i + 24);\n" "}\n" "}\n" "}\n" "if( global_y * TILE_M + 1 < output_width * output_height )\n" "{\n" "for( int i = 0; i < 8; i++ )\n" "{\n" "if ( TILE_N_LAST_DIV8 > 0 )\n" "{\n" "ACTIVATION_FUNCTION(dst, out1_offset + ( 0+i) * out_pitch_y, blockC1[0][i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i);\n" "}\n" "if ( TILE_N_LAST_DIV8 > 1 )\n" "{\n" "ACTIVATION_FUNCTION(dst, out1_offset + ( 8+i) * out_pitch_y, blockC1[1][i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 8);\n" "}\n" "if ( TILE_N_LAST_DIV8 > 2 )\n" "{\n" "ACTIVATION_FUNCTION(dst, out1_offset + (16+i) * out_pitch_y, blockC1[2][i] + SUBGROUP_GET_BIAS(2, i), group_x * TILE_N + i + 16);\n" "}\n" "if ( TILE_N_LAST_DIV8 > 3 )\n" "{\n" "ACTIVATION_FUNCTION(dst, out1_offset + (24+i) * out_pitch_y, blockC1[3][i] + SUBGROUP_GET_BIAS(3, i), group_x * TILE_N + i + 24);\n" "}\n" "}\n" "}\n" "}\n" "#endif\n" "}\n" "#endif\n" "#if defined(GEMM_LIKE_CONV_32_2_SIMD16) || defined(GEMM_LIKE_CONV_32_1_SIMD16)\n" "#define INTERLEAVED_SIMD16_OUTPUT(_out_, _offset_, _m_) do {\\\n" "if (global_y * TILE_M < output_width * output_height ) \\\n" "{ \\\n" "if ( ( OUT_DEPTH % TILE_N ) == 0 ) {\\\n" "for (int i = 0; i < 16; i++) \\\n" "{ \\\n" "ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_ [i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i); \\\n" "ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_ [i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 16); \\\n" "} \\\n" "} \\\n" "else if( ( OUT_DEPTH % 16 ) == 0 ) { \\\n" "if ( ( global_x + 1 ) < get_global_size(0) ) { \\\n" "for ( int i = 0; i < 16; i++ ) \\\n" "{ \\\n" "ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * 
out_pitch_y, blockC0 ##_m_ [i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i); \\\n" "ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_ [i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 16); \\\n" "} \\\n" "} \\\n" "else { \\\n" "for (int i = 0; i < 16; i++) \\\n" "{ \\\n" "ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_ [i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i); \\\n" "} \\\n" "} \\\n" "} \\\n" "else { \\\n" "if ( ( global_x + 1 ) < get_global_size(0) ) \\\n" "{ \\\n" "for ( int i = 0; i < 16; i++ ) \\\n" "{ \\\n" "ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_[i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i); \\\n" "ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_[i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 16); \\\n" "} \\\n" "} \\\n" "else { \\\n" "if ( (OUT_DEPTH % TILE_N) > 16 ) { \\\n" "for (int i = 0; i < 16 ; i++) \\\n" "{ \\\n" "ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_[i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i); \\\n" "} \\\n" "for (int i = 0; i < OUT_DEPTH % 16 ; i++) \\\n" "{ \\\n" "ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_[i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 16); \\\n" "} \\\n" "} \\\n" "else { \\\n" "for (int i = 0; i < OUT_DEPTH % 16 ; i++) \\\n" "{ \\\n" "ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_[i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i); \\\n" "} \\\n" "} \\\n" "} \\\n" "} \\\n" "} \\\n" "}while(0)\n" "#endif\n" "#ifdef GEMM_LIKE_CONV_32_1_SIMD16\n" "#define TILE_M 1\n" "#define TILE_K KERNEL_WIDTH\n" "#define TILE_N 32\n" "__attribute__((intel_reqd_sub_group_size(16)))\n" "__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)\n" "{\n" "const __global Dtype *src0 = src0_ptr + src0_offset;\n" "const __global Dtype *src1 = src1_ptr + src1_offset;\n" "#if APPLY_BIAS\n" "__global Dtype* 
biases_base = biases_base_ptr + biases_base_offset;\n" "#endif\n" "__global Dtype *dst = dst_base + dst_offset;\n" "#ifdef FUSED_CONV_ELTWISE\n" "__global Dtype* eltwise_data = eltwise_ptr + eltwise_offset;\n" "#endif\n" "const int group_x = get_group_id(0);\n" "const int group_y = get_group_id(1);\n" "const int global_x = get_global_id(0);\n" "const int global_y = get_global_id(1);\n" "const int global_z = get_global_id(2);\n" "int interleaved_y;\n" "int kernel_y;\n" "int kernel_idx;\n" "Dtype16 blockC00 = 0.f;\n" "Dtype16 blockC10 = 0.f;\n" "int curr_x = ( global_y % output_width ) * STRIDE_X;\n" "int curr_y = ( global_y / output_width ) * STRIDE_Y;\n" "#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0\n" "int saved_y = curr_y;\n" "#endif\n" "const __global Dtype *src0_read = src0\n" "+ aligned_input_size * global_z\n" "+ (curr_y - INPUT_PAD_H) * ROW_PITCH\n" "+ curr_x - INPUT_PAD_W;\n" "const __global Dtype *src0_read_orig = src0_read;\n" "const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2 );\n" "#define DOT_PRODUCT_16( _result, _rowA, colB ) \\\n" "{ \\\n" "_result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 ); \\\n" "_result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 ); \\\n" "_result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 ); \\\n" "_result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 ); \\\n" "_result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 ); \\\n" "_result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 ); \\\n" "_result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 ); \\\n" "_result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 ); \\\n" "_result.s8 = mad( _rowA, sub_group_broadcast( colB, 8 ), _result.s8 ); \\\n" "_result.s9 = mad( _rowA, sub_group_broadcast( colB, 9 ), _result.s9 ); \\\n" "_result.sa = mad( _rowA, sub_group_broadcast( 
colB, 10 ), _result.sa ); \\\n" "_result.sb = mad( _rowA, sub_group_broadcast( colB, 11 ), _result.sb ); \\\n" "_result.sc = mad( _rowA, sub_group_broadcast( colB, 12 ), _result.sc ); \\\n" "_result.sd = mad( _rowA, sub_group_broadcast( colB, 13 ), _result.sd ); \\\n" "_result.se = mad( _rowA, sub_group_broadcast( colB, 14 ), _result.se ); \\\n" "_result.sf = mad( _rowA, sub_group_broadcast( colB, 15 ), _result.sf ); \\\n" "}\n" "typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;\n" "int patch_depth = 0;\n" "__attribute__((opencl_unroll_hint(1)))\n" "do\n" "{\n" "int patch_row = 0;\n" "#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0\n" "curr_y = saved_y;\n" "#endif\n" "__attribute__((opencl_unroll_hint(1)))\n" "do\n" "{\n" "const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;\n" "#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0\n" "#if KERNEL_WIDTH == 3\n" "Dtype_t blockA00 = vload3(0, src0_read);\n" "Dtype* pblockA00 = (Dtype*)(&blockA00);\n" "#else\n" "Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ];\n" "Dtype* pblockA00 = (Dtype*)(&blockA00);\n" "#endif\n" "#else\n" "Dtype_t blockA00;\n" "Dtype* pblockA00 = (Dtype*)(&blockA00);\n" "int pos = 0;\n" "LOOP(KERNEL_WIDTH, pos,\n" "{\n" "if (curr_y >= INPUT_PAD_H &&\n" "curr_y < input_height + INPUT_PAD_H &&\n" "curr_x + pos * DILATION_X >= INPUT_PAD_W &&\n" "curr_x + pos * DILATION_X < input_width + INPUT_PAD_W)\n" "pblockA00[pos] = src0_read[pos * DILATION_X];\n" "else\n" "pblockA00[pos] = 0;\n" "})\n" "curr_y += DILATION_Y;\n" "#endif\n" "src0_read += ROW_PITCH * DILATION_Y;\n" "INT_TYPE blockB00[KERNEL_WIDTH * 2];\n" "INT_TYPE4* p4BlockB00 = (INT_TYPE4*)blockB00;\n" "INT_TYPE2* p2BlockB00 = (INT_TYPE2*)blockB00;\n" "Dtype* pBlockB00 = (Dtype*)blockB00;\n" "interleaved_y = 0;\n" "LOOP(KERNEL_WIDTH_DIV2, interleaved_y,\n" "{\n" 
"p4BlockB00[interleaved_y] = SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read );\n" "src1_read += WIDTH1 * 2;\n" "} )\n" "if ( kernel_width_is_odd )\n" "{\n" "p2BlockB00[KERNEL_WIDTH - 1] = SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read );\n" "src1_read += WIDTH1 * 2;\n" "}\n" "kernel_idx = 0;\n" "interleaved_y = 0;\n" "LOOP(KERNEL_WIDTH_DIV2, interleaved_y,\n" "{\n" "kernel_y = interleaved_y * 2;\n" "DOT_PRODUCT_16( blockC00, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;\n" "DOT_PRODUCT_16( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;\n" "DOT_PRODUCT_16( blockC10, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;\n" "DOT_PRODUCT_16( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;\n" "} )\n" "if ( kernel_width_is_odd )\n" "{\n" "kernel_y = interleaved_y * 2;\n" "DOT_PRODUCT_16( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;\n" "DOT_PRODUCT_16( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;\n" "}\n" "}\n" "while( ++patch_row < KERNEL_HEIGHT );\n" "src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );\n" "}\n" "while ( ++patch_depth < INPUT_DEPTH );\n" "int out_offset = global_z * out_pitch_z\n" "+ ( group_x * TILE_N ) * out_pitch_y\n" "+ ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X\n" "+ ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT;\n" "__global Dtype *out = dst + out_offset;\n" "#if APPLY_BIAS\n" "Dtype bias[2];\n" "Dtype2 *bias_vec;\n" "bias_vec = (Dtype2*)bias;\n" "*bias_vec = as_Dtype2(SUB_GROUP_BLOCK_READ2((__global INT_TYPE *)biases_base + group_x * TILE_N));\n" "if (group_x > 0xFFFFFFFEul) {\n" "dst[0] = bias[0] + bias[1];\n" "}\n" "#else\n" "const Dtype bias[2] = {0, 0};\n" "#endif\n" "INTERLEAVED_SIMD16_OUTPUT(dst, out_offset, 0);\n" "}\n" "#endif\n" "#ifdef GEMM_LIKE_CONV_32_2_SIMD16\n" "#define TILE_M 2\n" "#define TILE_K KERNEL_WIDTH\n" "#define 
TILE_N 32\n" "__attribute__((intel_reqd_sub_group_size(16)))\n" "__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)\n" "{\n" "const __global Dtype *src0 = src0_ptr + src0_offset;\n" "const __global Dtype *src1 = src1_ptr + src1_offset;\n" "#if APPLY_BIAS\n" "__global Dtype* biases_base = biases_base_ptr + biases_base_offset;\n" "#endif\n" "__global Dtype *dst = dst_base + dst_offset;\n" "#ifdef FUSED_CONV_ELTWISE\n" "__global Dtype* eltwise_data = eltwise_ptr + eltwise_offset;\n" "#endif\n" "const int group_x = get_group_id(0);\n" "const int group_y = get_group_id(1);\n" "const int global_x = get_global_id(0);\n" "const int global_y = get_global_id(1);\n" "const int global_z = get_global_id(2);\n" "int interleaved_y;\n" "int kernel_y;\n" "int kernel_idx;\n" "#define DOT_PRODUCT_16( _result, _rowA, colB ) \\\n" "{ \\\n" "_result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 ); \\\n" "_result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 ); \\\n" "_result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 ); \\\n" "_result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 ); \\\n" "_result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 ); \\\n" "_result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 ); \\\n" "_result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 ); \\\n" "_result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 ); \\\n" "_result.s8 = mad( _rowA, sub_group_broadcast( colB, 8 ), _result.s8 ); \\\n" "_result.s9 = mad( _rowA, sub_group_broadcast( colB, 9 ), _result.s9 ); \\\n" "_result.sa = mad( _rowA, sub_group_broadcast( colB, 10 ), _result.sa ); \\\n" "_result.sb = mad( _rowA, sub_group_broadcast( colB, 11 ), _result.sb ); \\\n" "_result.sc = mad( _rowA, sub_group_broadcast( colB, 12 ), _result.sc ); \\\n" "_result.sd = mad( _rowA, sub_group_broadcast( colB, 13 ), _result.sd ); \\\n" "_result.se = mad( _rowA, sub_group_broadcast( colB, 14 ), _result.se 
); \\\n" "_result.sf = mad( _rowA, sub_group_broadcast( colB, 15 ), _result.sf ); \\\n" "}\n" "typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;\n" "{\n" "Dtype16 blockC00 = 0.f;\n" "Dtype16 blockC10 = 0.f;\n" "Dtype16 blockC01 = 0.f;\n" "Dtype16 blockC11 = 0.f;\n" "int curr_x0 = ( ( global_y * TILE_M + 0 ) % output_width ) * STRIDE_X;\n" "int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X;\n" "int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y;\n" "int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y;\n" "#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0\n" "int saved_y0 = curr_y0;\n" "int saved_y1 = curr_y1;\n" "#endif\n" "const __global Dtype *src0_read0 = src0\n" "+ aligned_input_size * global_z\n" "+ (curr_y0 - INPUT_PAD_H) * ROW_PITCH\n" "+ curr_x0 - INPUT_PAD_W;\n" "const __global Dtype *src0_read1 = src0\n" "+ aligned_input_size * global_z\n" "+ (curr_y1 - INPUT_PAD_H) * ROW_PITCH\n" "+ curr_x1 - INPUT_PAD_W;\n" "const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2);\n" "int patch_depth = 0;\n" "do\n" "{\n" "int patch_row = 0;\n" "do\n" "{\n" "const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;\n" "#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0\n" "Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read0 )[ 0 ]; src0_read0 += ROW_PITCH;\n" "Dtype_t blockA01 = ( (const __global Dtype_t*)src0_read1 )[ 0 ]; src0_read1 += ROW_PITCH;\n" "Dtype* pblockA00 = (Dtype*)(&blockA00);\n" "Dtype* pblockA01 = (Dtype*)(&blockA01);\n" "#else\n" "Dtype_t blockA00;\n" "Dtype* pblockA00 = (Dtype*)(&blockA00);\n" "int pos = 0;\n" "LOOP(KERNEL_WIDTH, pos,\n" "{\n" "if (curr_y0 >= INPUT_PAD_H &&\n" "curr_y0 < input_height + INPUT_PAD_H &&\n" "curr_x0 + pos * DILATION_X >= INPUT_PAD_W &&\n" "curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W)\n" 
"pblockA00[pos] = src0_read0[pos * DILATION_X];\n" "else\n" "pblockA00[pos] = 0;\n" "})\n" "curr_y0 += DILATION_Y;\n" "Dtype_t blockA01;\n" "Dtype* pblockA01 = (Dtype*)(&blockA01);\n" "pos = 0;\n" "LOOP(KERNEL_WIDTH, pos,\n" "{\n" "if (curr_y1 >= INPUT_PAD_H &&\n" "curr_y1 < input_height + INPUT_PAD_H &&\n" "curr_x1 + pos * DILATION_X >= INPUT_PAD_W &&\n" "curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W)\n" "pblockA01[pos] = src0_read1[pos * DILATION_X];\n" "else\n" "pblockA01[pos] = 0;\n" "})\n" "curr_y1 += DILATION_Y;\n" "src0_read0 += (ROW_PITCH * DILATION_Y);\n" "src0_read1 += (ROW_PITCH * DILATION_Y);\n" "#endif\n" "Dtype blockB00[KERNEL_WIDTH*2];\n" "Dtype4* p4BlockB00 = (Dtype4*)blockB00;\n" "Dtype2* p2BlockB00 = (Dtype2*)blockB00;\n" "Dtype* pBlockB00 = (Dtype* )blockB00;\n" "interleaved_y = 0;\n" "LOOP(KERNEL_WIDTH_DIV2, interleaved_y,\n" "{\n" "p4BlockB00[interleaved_y] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );\n" "src1_read += WIDTH1 * 2;\n" "} )\n" "if ( kernel_width_is_odd )\n" "{\n" "p2BlockB00[KERNEL_WIDTH - 1] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );\n" "src1_read += WIDTH1 * 2;\n" "}\n" "kernel_idx = 0;\n" "interleaved_y = 0;\n" "LOOP(KERNEL_WIDTH_DIV2, interleaved_y,\n" "{\n" "kernel_y = interleaved_y * 2;\n" "DOT_PRODUCT_16( blockC00, pblockA00[kernel_y ], pBlockB00[kernel_idx] );\n" "DOT_PRODUCT_16( blockC01, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;\n" "DOT_PRODUCT_16( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );\n" "DOT_PRODUCT_16( blockC01, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;\n" "DOT_PRODUCT_16( blockC10, pblockA00[kernel_y ], pBlockB00[kernel_idx] );\n" "DOT_PRODUCT_16( blockC11, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;\n" "DOT_PRODUCT_16( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );\n" "DOT_PRODUCT_16( blockC11, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); 
kernel_idx++;\n" "} )\n" "if ( kernel_width_is_odd )\n" "{\n" "kernel_y = interleaved_y * 2;\n" "DOT_PRODUCT_16( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] );\n" "DOT_PRODUCT_16( blockC01, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;\n" "DOT_PRODUCT_16( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] );\n" "DOT_PRODUCT_16( blockC11, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;\n" "}\n" "}\n" "while( ++patch_row < KERNEL_HEIGHT );\n" "#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0\n" "curr_y0 = saved_y0;\n" "curr_y1 = saved_y1;\n" "#endif\n" "src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y);\n" "src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y);\n" "}\n" "while ( ++patch_depth < INPUT_DEPTH );\n" "int out0_offset = global_z * out_pitch_z\n" "+ ( group_x * TILE_N ) * out_pitch_y\n" "+ ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X\n" "+ ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT;\n" "int out1_offset = global_z * out_pitch_z\n" "+ ( group_x * TILE_N ) * out_pitch_y\n" "+ ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X\n" "+ ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT;\n" "#if APPLY_BIAS\n" "Dtype bias[2];\n" "Dtype2 *bias_vec;\n" "bias_vec = (Dtype2*)bias;\n" "*bias_vec = as_Dtype2(SUB_GROUP_BLOCK_READ2((__global INT_TYPE *)biases_base + group_x * TILE_N));\n" "if (group_x > 0xFFFFFFFEul) {\n" "dst[0] = bias[0] + bias[1];\n" "}\n" "#else\n" "const Dtype bias[2] = {0, 0};\n" "#endif\n" "INTERLEAVED_SIMD16_OUTPUT(dst, out0_offset, 0);\n" "INTERLEAVED_SIMD16_OUTPUT(dst, out1_offset, 1);\n" "}\n" "}\n" "#endif\n" "#elif defined KERNEL_DWCONV\n" "__kernel void DWCONV(\n" "ELTWISE_DATA_ARG\n" "FUSED_ARG\n" "__global Dtype* image_data,\n" "__global Dtype* kernel_data,\n" "BIAS_KERNEL_ARG\n" "__global Dtype* 
convolved_image_base,\n" "const int convolved_image_offset,\n" "const ushort input_width,\n" "const ushort input_height,\n" "const ushort output_width,\n" "const ushort output_height) {\n" "__global Dtype* convolved_image = convolved_image_base + convolved_image_offset;\n" "const int out_idx = get_global_id(0);\n" "const int plane_size = output_width * output_height;\n" "const int out_plane_idx = out_idx % plane_size;\n" "const int outputZ = out_idx / plane_size;\n" "const int outputY = out_plane_idx / output_width;\n" "const int outputX = out_plane_idx % output_width;\n" "if (outputZ < OUTPUT_Z)\n" "{\n" "Dtype sum = 0.;\n" "const int org_y = outputY * STRIDE_Y - INPUT_PAD_H;\n" "const int org_x = outputX * STRIDE_X - INPUT_PAD_W;\n" "const int currentKernelOffset = KERNEL_SIZE*(outputZ%CHANNELS);\n" "const int biasIndex=outputZ%CHANNELS;\n" "const int local_image_offset = org_y*input_width + org_x;\n" "const int imageSize = input_width*input_height;\n" "__global Dtype* image_dataPtrFloat = (image_data + (imageSize*outputZ + local_image_offset));\n" "__global Dtype* kernel_dataPtrFloat = (kernel_data + (currentKernelOffset));\n" "for(int y = 0; y < KERNEL_H; y++)\n" "{\n" "for(int x = 0; x < KERNEL_W; x++)\n" "{\n" "if(!(org_y + y * DILATION_Y >= 0 && org_y + y * DILATION_Y < input_height && org_x + x * DILATION_X >= 0 && org_x + x * DILATION_X < input_width))\n" "{\n" "continue;\n" "}\n" "sum += image_dataPtrFloat[x * DILATION_X] * kernel_dataPtrFloat[x];\n" "}\n" "image_dataPtrFloat += input_width * DILATION_Y;\n" "kernel_dataPtrFloat += KERNEL_W;\n" "}\n" "#if APPLY_BIAS\n" "int offset = outputZ*output_height*output_width + outputY*output_width + outputX;\n" "ACTIVATION_FUNCTION(convolved_image, offset, sum + biases_base[biasIndex], biasIndex);\n" "#else\n" "int offset = outputZ*output_height*output_width + outputY*output_width + outputX;\n" "ACTIVATION_FUNCTION(convolved_image, offset, sum, biasIndex);\n" "#endif\n" "}\n" "}\n" "#endif\n" , 
"3c78cbca36e239b2dec7831380734a49", NULL}; struct cv::ocl::internal::ProgramEntry conv_spatial_helper_oclsrc={moduleName, "conv_spatial_helper", "#ifdef HALF_SUPPORT\n" "#ifdef cl_khr_fp16\n" "#pragma OPENCL EXTENSION cl_khr_fp16:enable\n" "#endif\n" "#endif\n" "#define CONCAT(A,B) A##_##B\n" "#define TEMPLATE(name,type) CONCAT(name,type)\n" "__kernel void TEMPLATE(copyWeightsSwizzled, Dtype)\n" "(__global Dtype* weightIn,\n" "__global Dtype* weightOut,\n" "const int kernel_w,\n" "const int kernel_h,\n" "const int channels,\n" "const int outputs,\n" "const int swizzleFactor) {\n" "unsigned int sX = get_global_id(0);\n" "int filter = sX / (kernel_w*kernel_h*channels);\n" "int kernel_X = sX % kernel_w;\n" "int kernel_Y = (sX / kernel_w) % kernel_h;\n" "int kernel_C = (sX / (kernel_w * kernel_h)) % channels;\n" "int FP = filter / swizzleFactor;\n" "int F1 = filter % swizzleFactor;\n" "int idxOut = FP*(kernel_w*kernel_h*channels*swizzleFactor) + kernel_C*(kernel_w*kernel_h*swizzleFactor) + kernel_Y*(kernel_w*swizzleFactor) + kernel_X*swizzleFactor + F1;\n" "int idxIn = filter*(kernel_w*kernel_h*channels) + kernel_C*(kernel_w*kernel_h) + kernel_Y*kernel_w + kernel_X;\n" "Dtype v = (filter < outputs) ? 
weightIn[idxIn] : (Dtype)0;\n" "weightOut[idxOut] = v;\n" "}\n" , "e973c981815e5a6c4cc7675de1232b3b", NULL}; struct cv::ocl::internal::ProgramEntry detection_output_oclsrc={moduleName, "detection_output", "#define Dtype float\n" "#define Dtype4 float4\n" "__kernel void DecodeBBoxesCORNER(const int nthreads,\n" "__global const Dtype* loc_data,\n" "__global const Dtype* prior_data,\n" "const int variance_encoded_in_target,\n" "const int num_priors,\n" "const int share_location,\n" "const int num_loc_classes,\n" "const int background_label_id,\n" "const int clip_bbox,\n" "const int locPredTransposed,\n" "__global Dtype* bbox_data)\n" "{\n" "for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))\n" "{\n" "Dtype bbox_xmin, bbox_ymin, bbox_xmax, bbox_ymax;\n" "const int i = index % 4;\n" "const int p = ((index / 4 / num_loc_classes) % num_priors) * 4;\n" "const int c = (index / 4) % num_loc_classes;\n" "int label = share_location ? -1 : c;\n" "if (label == background_label_id)\n" "return;\n" "Dtype4 loc_vec = vload4(0, loc_data + index - i);\n" "Dtype4 bbox_vec, prior_variance;\n" "if (variance_encoded_in_target)\n" "{\n" "bbox_vec = loc_vec;\n" "} else {\n" "const int start_index = num_priors * 4 + p;\n" "prior_variance = vload4(0, prior_data + start_index);\n" "bbox_vec = loc_vec * prior_variance;\n" "}\n" "if (locPredTransposed)\n" "{\n" "bbox_ymin = bbox_vec.x;\n" "bbox_xmin = bbox_vec.y;\n" "bbox_ymax = bbox_vec.z;\n" "bbox_xmax = bbox_vec.w;\n" "} else {\n" "bbox_xmin = bbox_vec.x;\n" "bbox_ymin = bbox_vec.y;\n" "bbox_xmax = bbox_vec.z;\n" "bbox_ymax = bbox_vec.w;\n" "}\n" "Dtype4 prior_vec = vload4(0, prior_data + p);\n" "Dtype val;\n" "switch (i)\n" "{\n" "case 0:\n" "val = prior_vec.x + bbox_xmin;\n" "break;\n" "case 1:\n" "val = prior_vec.y + bbox_ymin;\n" "break;\n" "case 2:\n" "val = prior_vec.z + bbox_xmax;\n" "break;\n" "case 3:\n" "val = prior_vec.w + bbox_ymax;\n" "break;\n" "}\n" "if (clip_bbox)\n" "val = max(min(val, 
(Dtype)1.), (Dtype)0.);\n" "bbox_data[index] = val;\n" "}\n" "}\n" "__kernel void DecodeBBoxesCENTER_SIZE(const int nthreads,\n" "__global const Dtype* loc_data,\n" "__global const Dtype* prior_data,\n" "const int variance_encoded_in_target,\n" "const int num_priors,\n" "const int share_location,\n" "const int num_loc_classes,\n" "const int background_label_id,\n" "const int clip_bbox,\n" "const int locPredTransposed,\n" "__global Dtype* bbox_data)\n" "{\n" "for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))\n" "{\n" "Dtype bbox_xmin, bbox_ymin, bbox_xmax, bbox_ymax;\n" "const int i = index % 4;\n" "const int p = ((index / 4 / num_loc_classes) % num_priors) * 4;\n" "const int c = (index / 4) % num_loc_classes;\n" "int label = share_location ? -1 : c;\n" "if (label == background_label_id)\n" "return;\n" "Dtype4 loc_vec = vload4(0, loc_data + index - i);\n" "Dtype4 bbox_vec, prior_variance;\n" "if (variance_encoded_in_target)\n" "{\n" "bbox_vec = loc_vec;\n" "} else {\n" "const int start_index = num_priors * 4 + p;\n" "prior_variance = vload4(0, prior_data + start_index);\n" "bbox_vec = loc_vec * prior_variance;\n" "}\n" "if (locPredTransposed)\n" "{\n" "bbox_ymin = bbox_vec.x;\n" "bbox_xmin = bbox_vec.y;\n" "bbox_ymax = bbox_vec.z;\n" "bbox_xmax = bbox_vec.w;\n" "} else {\n" "bbox_xmin = bbox_vec.x;\n" "bbox_ymin = bbox_vec.y;\n" "bbox_xmax = bbox_vec.z;\n" "bbox_ymax = bbox_vec.w;\n" "}\n" "Dtype4 prior_vec = vload4(0, prior_data + p);\n" "Dtype prior_width = prior_vec.z - prior_vec.x;\n" "Dtype prior_height = prior_vec.w - prior_vec.y;\n" "Dtype prior_center_x = (prior_vec.x + prior_vec.z) * .5;\n" "Dtype prior_center_y = (prior_vec.y + prior_vec.w) * .5;\n" "Dtype decode_bbox_center_x, decode_bbox_center_y;\n" "Dtype decode_bbox_width, decode_bbox_height;\n" "decode_bbox_center_x = bbox_xmin * prior_width + prior_center_x;\n" "decode_bbox_center_y = bbox_ymin * prior_height + prior_center_y;\n" "decode_bbox_width = exp(bbox_xmax) * 
prior_width;\n" "decode_bbox_height = exp(bbox_ymax) * prior_height;\n" "Dtype val;\n" "switch (i)\n" "{\n" "case 0:\n" "val = decode_bbox_center_x - decode_bbox_width * .5;\n" "break;\n" "case 1:\n" "val = decode_bbox_center_y - decode_bbox_height * .5;\n" "break;\n" "case 2:\n" "val = decode_bbox_center_x + decode_bbox_width * .5;\n" "break;\n" "case 3:\n" "val = decode_bbox_center_y + decode_bbox_height * .5;\n" "break;\n" "}\n" "if (clip_bbox)\n" "val = max(min(val, (Dtype)1.), (Dtype)0.);\n" "bbox_data[index] = val;\n" "}\n" "}\n" , "0817e73f5a1af5ed94be692d3f7a2ee3", NULL}; struct cv::ocl::internal::ProgramEntry dummy_oclsrc={moduleName, "dummy", "__kernel void dummy_kernel()\n" "{\n" "}\n" , "697bd1a0f09685d066b8946e159d42bc", NULL}; struct cv::ocl::internal::ProgramEntry eltwise_oclsrc={moduleName, "eltwise", "#if defined(cl_khr_fp16)\n" "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" "#endif\n" "__kernel void op_sum4(__global const Dtype * A,\n" "__global const Dtype * B,\n" "unsigned int A_col_size,\n" "const float coeff1,\n" "const float coeff2,\n" "__global Dtype * C)\n" "{\n" "unsigned int row_gid = get_group_id(0);\n" "unsigned int lid = get_local_id(0);\n" "const __global Dtype *src0_read = A + row_gid * 4 * A_col_size;\n" "const __global Dtype *src1_read = B + row_gid * 4 * A_col_size;\n" "__global Dtype *dst0_read = C + row_gid * 4 * A_col_size;\n" "Dtype4 a0, a1, a2, a3;\n" "Dtype4 dot0, dot1, dot2, dot3;\n" "unsigned int i = lid;\n" "while( i < A_col_size / 4)\n" "{\n" "const Dtype4 b0 = vload4(i, src1_read);\n" "const Dtype4 b1 = vload4(i, src1_read + A_col_size);\n" "const Dtype4 b2 = vload4(i, src1_read + 2 * A_col_size);\n" "const Dtype4 b3 = vload4(i, src1_read + 3 * A_col_size);\n" "#if LOOP == 0\n" "a0 = vload4(i, src0_read);\n" "a1 = vload4(i, src0_read + A_col_size);\n" "a2 = vload4(i, src0_read + 2 * A_col_size);\n" "a3 = vload4(i, src0_read + 3 * A_col_size);\n" "dot0 = a0 * (Dtype4)coeff1 + b0 * (Dtype4)coeff2;\n" "dot1 = a1 * 
(Dtype4)coeff1 + b1 * (Dtype4)coeff2;\n" "dot2 = a2 * (Dtype4)coeff1 + b2 * (Dtype4)coeff2;\n" "dot3 = a3 * (Dtype4)coeff1 + b3 * (Dtype4)coeff2;\n" "#else\n" "a0 = vload4(i, dst0_read);\n" "a1 = vload4(i, dst0_read + A_col_size);\n" "a2 = vload4(i, dst0_read + 2 * A_col_size);\n" "a3 = vload4(i, dst0_read + 3 * A_col_size);\n" "dot0 = a0 + b0 * (Dtype4)coeff2;\n" "dot1 = a1 + b1 * (Dtype4)coeff2;\n" "dot2 = a2 + b2 * (Dtype4)coeff2;\n" "dot3 = a3 + b3 * (Dtype4)coeff2;\n" "#endif\n" "vstore4(dot0, i, dst0_read);\n" "vstore4(dot1, i, dst0_read + A_col_size);\n" "vstore4(dot2, i, dst0_read + 2 * A_col_size);\n" "vstore4(dot3, i, dst0_read + 3 * A_col_size);\n" "i += get_local_size(0);\n" "}\n" "}\n" , "c01078058d3ab56727d0b26c2965434e", NULL}; struct cv::ocl::internal::ProgramEntry gemm_buffer_oclsrc={moduleName, "gemm_buffer", "#if defined(cl_khr_fp16)\n" "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" "#endif\n" "#define CONCAT(A,B) A##_##B\n" "#define TEMPLATE(name,type) CONCAT(name,type)\n" "#define KERNEL_ARG_DTYPE float\n" "#define TYPE_FLOAT 1\n" "#define TYPE_HALF 2\n" "#if TYPE == TYPE_HALF\n" "#define Dtype half\n" "#define Dtype2 half2\n" "#define Dtype4 half4\n" "#define Dtype8 half8\n" "#define Dtype16 half16\n" "#define as_Dtype as_half\n" "#define as_Dtype2 as_half2\n" "#define as_Dtype4 as_half4\n" "#define as_Dtype8 as_half8\n" "#define as_Dtype16 as_half16\n" "#else\n" "#define Dtype float\n" "#define Dtype2 float2\n" "#define Dtype4 float4\n" "#define Dtype8 float8\n" "#define Dtype16 float16\n" "#define as_Dtype as_float\n" "#define as_Dtype2 as_float2\n" "#define as_Dtype4 as_float4\n" "#define as_Dtype8 as_float8\n" "#define as_Dtype16 as_float16\n" "#endif\n" "#if TYPE == TYPE_HALF\n" "#define SHUFFLE_TYPE2(val) as_ushort2(val)\n" "#define SHUFFLE_TYPE8(val) as_ushort8(val)\n" "#define SIMD_SIZE_GEMM 16\n" "#else\n" "#define SHUFFLE_TYPE2(val) val\n" "#define SHUFFLE_TYPE8(val) val\n" "#define SIMD_SIZE_GEMM 8\n" "#endif\n" "#if 
defined(cl_intel_subgroups)\n" "#pragma OPENCL EXTENSION cl_intel_subgroups : enable\n" "#endif\n" "#ifdef ZERO_BETA\n" "#define BETA_ZERO_CHECK(b0, v) (b0)\n" "#else\n" "#define BETA_ZERO_CHECK(b0, v) (v)\n" "#endif\n" "#define VEC_SIZE 4\n" "#define LWG_HEIGHT 4\n" "#define TILE_M 8\n" "#if TYPE == TYPE_HALF\n" "#define TILE_K 32\n" "#define TILE_N 64\n" "#else\n" "#define TILE_K 16\n" "#define TILE_N 32\n" "#endif\n" "__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, LWG_HEIGHT, 1)))\n" "__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM)))\n" "__kernel void TEMPLATE(gemm_buffer_NN, Dtype)(\n" "const __global Dtype *src0, int off0,\n" "const __global Dtype *src1, int off1,\n" "__global Dtype *dst, int offd,\n" "int M,\n" "int N,\n" "int K,\n" "KERNEL_ARG_DTYPE alpha_in,\n" "KERNEL_ARG_DTYPE beta_in,\n" "int start_index)\n" "{\n" "const Dtype alpha = (Dtype)alpha_in;\n" "const Dtype beta = (Dtype)beta_in;\n" "const int group_x = get_group_id(0);\n" "const int group_y = get_group_id(1);\n" "const int local_x = get_local_id(0);\n" "const int local_y = get_local_id(1);\n" "const int global_x = get_global_id(0);\n" "const int global_y = get_global_id(1);\n" "Dtype4 brow;\n" "Dtype2 arow0, arow1, arow2, arow3, arow4, arow5, arow6, arow7;\n" "__global Dtype *dst_write0 = dst + local_x * VEC_SIZE + (group_x * TILE_N) + (group_y * LWG_HEIGHT * TILE_M + local_y * TILE_M) * N + offd;\n" "const __global Dtype *src0_read = src0 + local_x * (TILE_K / SIMD_SIZE_GEMM) + (group_y * LWG_HEIGHT * TILE_M + local_y * TILE_M) * K + start_index + off0;\n" "const __global Dtype *src1_read0 = src1 + local_x * VEC_SIZE + (group_x * TILE_N) + start_index * N + off1;\n" "int border = -(group_y * LWG_HEIGHT * TILE_M + local_y * TILE_M);\n" "int row0 = mad24(global_y, TILE_M, 0) < M ? 0 : border;\n" "int row1 = mad24(global_y, TILE_M, 1) < M ? 1 : border;\n" "int row2 = mad24(global_y, TILE_M, 2) < M ? 2 : border;\n" "int row3 = mad24(global_y, TILE_M, 3) < M ? 
3 : border;\n" "int row4 = mad24(global_y, TILE_M, 4) < M ? 4 : border;\n" "int row5 = mad24(global_y, TILE_M, 5) < M ? 5 : border;\n" "int row6 = mad24(global_y, TILE_M, 6) < M ? 6 : border;\n" "int row7 = mad24(global_y, TILE_M, 7) < M ? 7 : border;\n" "Dtype4 dot00 = (start_index != 0) ? vload4(0, dst_write0) : BETA_ZERO_CHECK((Dtype4)0, beta * vload4(0, dst_write0));\n" "Dtype4 dot01 = (start_index != 0) ? vload4(0, dst_write0 + 1 * N) : BETA_ZERO_CHECK((Dtype4)0, beta * vload4(0, dst_write0 + 1 * N));\n" "Dtype4 dot02 = (start_index != 0) ? vload4(0, dst_write0 + 2 * N) : BETA_ZERO_CHECK((Dtype4)0, beta * vload4(0, dst_write0 + 2 * N));\n" "Dtype4 dot03 = (start_index != 0) ? vload4(0, dst_write0 + 3 * N) : BETA_ZERO_CHECK((Dtype4)0, beta * vload4(0, dst_write0 + 3 * N));\n" "Dtype4 dot04 = (start_index != 0) ? vload4(0, dst_write0 + 4 * N) : BETA_ZERO_CHECK((Dtype4)0, beta * vload4(0, dst_write0 + 4 * N));\n" "Dtype4 dot05 = (start_index != 0) ? vload4(0, dst_write0 + 5 * N) : BETA_ZERO_CHECK((Dtype4)0, beta * vload4(0, dst_write0 + 5 * N));\n" "Dtype4 dot06 = (start_index != 0) ? vload4(0, dst_write0 + 6 * N) : BETA_ZERO_CHECK((Dtype4)0, beta * vload4(0, dst_write0 + 6 * N));\n" "Dtype4 dot07 = (start_index != 0) ? 
vload4(0, dst_write0 + 7 * N) : BETA_ZERO_CHECK((Dtype4)0, beta * vload4(0, dst_write0 + 7 * N));\n" "int end_index = min(start_index + 256, K);\n" "int w = start_index;\n" "while( w + TILE_K <= end_index ) {\n" "arow0 = alpha * vload2(0, src0_read + row0 * K);\n" "arow1 = alpha * vload2(0, src0_read + row1 * K);\n" "arow2 = alpha * vload2(0, src0_read + row2 * K);\n" "arow3 = alpha * vload2(0, src0_read + row3 * K);\n" "arow4 = alpha * vload2(0, src0_read + row4 * K);\n" "arow5 = alpha * vload2(0, src0_read + row5 * K);\n" "arow6 = alpha * vload2(0, src0_read + row6 * K);\n" "arow7 = alpha * vload2(0, src0_read + row7 * K);\n" "#define MM_DOT_PRODUCT( index, suffix ) \\\n" "brow = vload4(0, src1_read0); src1_read0 += N; \\\n" "dot00 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow0), index )).s##suffix), brow, dot00 ); \\\n" "dot01 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow1), index )).s##suffix), brow, dot01 ); \\\n" "dot02 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow2), index )).s##suffix), brow, dot02 ); \\\n" "dot03 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow3), index )).s##suffix), brow, dot03 ); \\\n" "dot04 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow4), index )).s##suffix), brow, dot04 ); \\\n" "dot05 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow5), index )).s##suffix), brow, dot05 ); \\\n" "dot06 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow6), index )).s##suffix), brow, dot06 ); \\\n" "dot07 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow7), index )).s##suffix), brow, dot07 );\n" "MM_DOT_PRODUCT(0, 0);\n" "MM_DOT_PRODUCT(0, 1);\n" "MM_DOT_PRODUCT(1, 0);\n" "MM_DOT_PRODUCT(1, 1);\n" "MM_DOT_PRODUCT(2, 0);\n" "MM_DOT_PRODUCT(2, 1);\n" "MM_DOT_PRODUCT(3, 0);\n" "MM_DOT_PRODUCT(3, 1);\n" "MM_DOT_PRODUCT(4, 0);\n" "MM_DOT_PRODUCT(4, 1);\n" "MM_DOT_PRODUCT(5, 0);\n" 
"MM_DOT_PRODUCT(5, 1);\n" "MM_DOT_PRODUCT(6, 0);\n" "MM_DOT_PRODUCT(6, 1);\n" "MM_DOT_PRODUCT(7, 0);\n" "MM_DOT_PRODUCT(7, 1);\n" "#if TYPE == TYPE_HALF\n" "MM_DOT_PRODUCT(8, 0);\n" "MM_DOT_PRODUCT(8, 1);\n" "MM_DOT_PRODUCT(9, 0);\n" "MM_DOT_PRODUCT(9, 1);\n" "MM_DOT_PRODUCT(10, 0);\n" "MM_DOT_PRODUCT(10, 1);\n" "MM_DOT_PRODUCT(11, 0);\n" "MM_DOT_PRODUCT(11, 1);\n" "MM_DOT_PRODUCT(12, 0);\n" "MM_DOT_PRODUCT(12, 1);\n" "MM_DOT_PRODUCT(13, 0);\n" "MM_DOT_PRODUCT(13, 1);\n" "MM_DOT_PRODUCT(14, 0);\n" "MM_DOT_PRODUCT(14, 1);\n" "MM_DOT_PRODUCT(15, 0);\n" "MM_DOT_PRODUCT(15, 1);\n" "#endif\n" "#undef MM_DOT_PRODUCT\n" "src0_read += TILE_K;\n" "w += TILE_K;\n" "}\n" "if(w < end_index) {\n" "arow0.x = ((w + local_x * 2) < K) ? alpha * (src0_read + row0 * K)[0] : 0.0f;\n" "arow0.y = ((w + local_x * 2 + 1) < K) ? alpha * (src0_read + row0 * K)[1] : 0.0f;\n" "arow1.x = ((w + local_x * 2) < K) ? alpha * (src0_read + row1 * K)[0] : 0.0f;\n" "arow1.y = ((w + local_x * 2 + 1) < K) ? alpha * (src0_read + row1 * K)[1] : 0.0f;\n" "arow2.x = ((w + local_x * 2) < K) ? alpha * (src0_read + row2 * K)[0] : 0.0f;\n" "arow2.y = ((w + local_x * 2 + 1) < K) ? alpha * (src0_read + row2 * K)[1] : 0.0f;\n" "arow3.x = ((w + local_x * 2) < K) ? alpha * (src0_read + row3 * K)[0] : 0.0f;\n" "arow3.y = ((w + local_x * 2 + 1) < K) ? alpha * (src0_read + row3 * K)[1] : 0.0f;\n" "arow4.x = ((w + local_x * 2) < K) ? alpha * (src0_read + row4 * K)[0] : 0.0f;\n" "arow4.y = ((w + local_x * 2 + 1) < K) ? alpha * (src0_read + row4 * K)[1] : 0.0f;\n" "arow5.x = ((w + local_x * 2) < K) ? alpha * (src0_read + row5 * K)[0] : 0.0f;\n" "arow5.y = ((w + local_x * 2 + 1) < K) ? alpha * (src0_read + row5 * K)[1] : 0.0f;\n" "arow6.x = ((w + local_x * 2) < K) ? alpha * (src0_read + row6 * K)[0] : 0.0f;\n" "arow6.y = ((w + local_x * 2 + 1) < K) ? alpha * (src0_read + row6 * K)[1] : 0.0f;\n" "arow7.x = ((w + local_x * 2) < K) ? alpha * (src0_read + row7 * K)[0] : 0.0f;\n" "arow7.y = ((w + local_x * 2 + 1) < K) ? 
alpha * (src0_read + row7 * K)[1] : 0.0f;\n" "#define MM_DOT_PRODUCT( index, suffix ) \\\n" "brow = (w < K) ? vload4(0, src1_read0) : (Dtype4)0.0f; src1_read0 += N; w++; \\\n" "dot00 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow0), index )).s##suffix), brow, dot00 ); \\\n" "dot01 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow1), index )).s##suffix), brow, dot01 ); \\\n" "dot02 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow2), index )).s##suffix), brow, dot02 ); \\\n" "dot03 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow3), index )).s##suffix), brow, dot03 ); \\\n" "dot04 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow4), index )).s##suffix), brow, dot04 ); \\\n" "dot05 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow5), index )).s##suffix), brow, dot05 ); \\\n" "dot06 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow6), index )).s##suffix), brow, dot06 ); \\\n" "dot07 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow7), index )).s##suffix), brow, dot07 );\n" "MM_DOT_PRODUCT(0, 0);\n" "MM_DOT_PRODUCT(0, 1);\n" "MM_DOT_PRODUCT(1, 0);\n" "MM_DOT_PRODUCT(1, 1);\n" "MM_DOT_PRODUCT(2, 0);\n" "MM_DOT_PRODUCT(2, 1);\n" "MM_DOT_PRODUCT(3, 0);\n" "MM_DOT_PRODUCT(3, 1);\n" "MM_DOT_PRODUCT(4, 0);\n" "MM_DOT_PRODUCT(4, 1);\n" "MM_DOT_PRODUCT(5, 0);\n" "MM_DOT_PRODUCT(5, 1);\n" "MM_DOT_PRODUCT(6, 0);\n" "MM_DOT_PRODUCT(6, 1);\n" "MM_DOT_PRODUCT(7, 0);\n" "MM_DOT_PRODUCT(7, 1);\n" "#if TYPE == TYPE_HALF\n" "MM_DOT_PRODUCT(8, 0);\n" "MM_DOT_PRODUCT(8, 1);\n" "MM_DOT_PRODUCT(9, 0);\n" "MM_DOT_PRODUCT(9, 1);\n" "MM_DOT_PRODUCT(10, 0);\n" "MM_DOT_PRODUCT(10, 1);\n" "MM_DOT_PRODUCT(11, 0);\n" "MM_DOT_PRODUCT(11, 1);\n" "MM_DOT_PRODUCT(12, 0);\n" "MM_DOT_PRODUCT(12, 1);\n" "MM_DOT_PRODUCT(13, 0);\n" "MM_DOT_PRODUCT(13, 1);\n" "MM_DOT_PRODUCT(14, 0);\n" "MM_DOT_PRODUCT(14, 1);\n" "MM_DOT_PRODUCT(15, 0);\n" 
"MM_DOT_PRODUCT(15, 1);\n" "#endif\n" "#undef MM_DOT_PRODUCT\n" "}\n" "if(global_x * 4 < N && global_y * 8 < M) {\n" "if(mad24(global_x, 4, 3) < N) {\n" "vstore4(dot00, 0, dst_write0); dst_write0 += N;\n" "if(mad24(global_y, 8, 1) < M) { vstore4(dot01, 0, dst_write0); dst_write0 += N; }\n" "else return;\n" "if(mad24(global_y, 8, 2) < M) { vstore4(dot02, 0, dst_write0); dst_write0 += N; }\n" "else return;\n" "if(mad24(global_y, 8, 3) < M) { vstore4(dot03, 0, dst_write0); dst_write0 += N; }\n" "else return;\n" "if(mad24(global_y, 8, 4) < M) { vstore4(dot04, 0, dst_write0); dst_write0 += N; }\n" "else return;\n" "if(mad24(global_y, 8, 5) < M) { vstore4(dot05, 0, dst_write0); dst_write0 += N; }\n" "else return;\n" "if(mad24(global_y, 8, 6) < M) { vstore4(dot06, 0, dst_write0); dst_write0 += N; }\n" "else return;\n" "if(mad24(global_y, 8, 7) < M) { vstore4(dot07, 0, dst_write0); }\n" "} else if(mad24(global_x, 4, 2) < N) {\n" "vstore2(dot00.xy, 0, dst_write0);\n" "dst_write0[2] = dot00.z;\n" "dst_write0 += N;\n" "if(mad24(global_y, 8, 1) < M) {\n" "vstore2(dot01.xy, 0, dst_write0);\n" "dst_write0[2] = dot01.z;\n" "dst_write0 += N;\n" "} else\n" "return;\n" "if(mad24(global_y, 8, 2) < M) {\n" "vstore2(dot02.xy, 0, dst_write0);\n" "dst_write0[2] = dot02.z;\n" "dst_write0 += N;\n" "} else\n" "return;\n" "if(mad24(global_y, 8, 3) < M) {\n" "vstore2(dot03.xy, 0, dst_write0);\n" "dst_write0[2] = dot03.z;\n" "dst_write0 += N;\n" "} else\n" "return;\n" "if(mad24(global_y, 8, 4) < M) {\n" "vstore2(dot04.xy, 0, dst_write0);\n" "dst_write0[2] = dot04.z;\n" "dst_write0 += N;\n" "} else\n" "return;\n" "if(mad24(global_y, 8, 5) < M) {\n" "vstore2(dot05.xy, 0, dst_write0);\n" "dst_write0[2] = dot05.z;\n" "dst_write0 += N;\n" "} else\n" "return;\n" "if(mad24(global_y, 8, 6) < M) {\n" "vstore2(dot06.xy, 0, dst_write0);\n" "dst_write0[2] = dot06.z;\n" "dst_write0 += N;\n" "} else\n" "return;\n" "if(mad24(global_y, 8, 7) < M) {\n" "vstore2(dot07.xy, 0, dst_write0);\n" "dst_write0[2] = 
dot07.z;\n" "}\n" "} else if(mad24(global_x, 4, 1) < N) {\n" "vstore2(dot00.xy, 0, dst_write0); dst_write0 += N;\n" "if(mad24(global_y, 8, 1) < M) { vstore2(dot01.xy, 0, dst_write0); dst_write0 += N; }\n" "else return;\n" "if(mad24(global_y, 8, 2) < M) { vstore2(dot02.xy, 0, dst_write0); dst_write0 += N; }\n" "else return;\n" "if(mad24(global_y, 8, 3) < M) { vstore2(dot03.xy, 0, dst_write0); dst_write0 += N; }\n" "else return;\n" "if(mad24(global_y, 8, 4) < M) { vstore2(dot04.xy, 0, dst_write0); dst_write0 += N; }\n" "else return;\n" "if(mad24(global_y, 8, 5) < M) { vstore2(dot05.xy, 0, dst_write0); dst_write0 += N; }\n" "else return;\n" "if(mad24(global_y, 8, 6) < M) { vstore2(dot06.xy, 0, dst_write0); dst_write0 += N; }\n" "else return;\n" "if(mad24(global_y, 8, 7) < M) { vstore2(dot07.xy, 0, dst_write0); }\n" "} else {\n" "dst_write0[0] = dot00.x; dst_write0 += N;\n" "if(mad24(global_y, 8, 1) < M) { dst_write0[0] = dot01.x; dst_write0 += N; }\n" "else return;\n" "if(mad24(global_y, 8, 2) < M) { dst_write0[0] = dot02.x; dst_write0 += N; }\n" "else return;\n" "if(mad24(global_y, 8, 3) < M) { dst_write0[0] = dot03.x; dst_write0 += N; }\n" "else return;\n" "if(mad24(global_y, 8, 4) < M) { dst_write0[0] = dot04.x; dst_write0 += N; }\n" "else return;\n" "if(mad24(global_y, 8, 5) < M) { dst_write0[0] = dot05.x; dst_write0 += N; }\n" "else return;\n" "if(mad24(global_y, 8, 6) < M) { dst_write0[0] = dot06.x; dst_write0 += N; }\n" "else return;\n" "if(mad24(global_y, 8, 7) < M) { dst_write0[0] = dot07.x; }\n" "}\n" "}\n" "}\n" "#undef VEC_SIZE\n" "#undef LWG_HEIGHT\n" "#undef TILE_M\n" "#undef TILE_K\n" "#undef TILE_N\n" "#define VEC_SIZE 1\n" "#define TILE_M 8\n" "#define TILE_N 8\n" "#define SLM_BLOCK 128\n" "#if TYPE == TYPE_HALF\n" "#define LWG_HEIGHT 2\n" "#define TILE_K 64\n" "#else\n" "#define LWG_HEIGHT 4\n" "#define TILE_K 32\n" "#endif\n" "#if TYPE == TYPE_HALF\n" "__attribute__((reqd_work_group_size(8, LWG_HEIGHT, 1)))\n" 
"__attribute__((intel_reqd_sub_group_size(8)))\n" "__kernel void TEMPLATE(gemm_buffer_NT, Dtype)(\n" "const __global Dtype *src0, int off0,\n" "const __global Dtype *src1, int off1,\n" "__global Dtype *dst, int offd,\n" "int M,\n" "int N,\n" "int K,\n" "KERNEL_ARG_DTYPE alpha_in,\n" "KERNEL_ARG_DTYPE beta_in)\n" "{\n" "const Dtype alpha = (Dtype)alpha_in;\n" "const Dtype beta = (Dtype)beta_in;\n" "const int group_x = get_group_id(0);\n" "const int group_y = get_group_id(1);\n" "const int local_x = get_local_id(0);\n" "const int local_y = get_local_id(1);\n" "const int global_x = get_global_id(0);\n" "const int global_y = get_global_id(1);\n" "Dtype8 dot00 = 0.f;\n" "Dtype8 dot01 = 0.f;\n" "Dtype8 dot02 = 0.f;\n" "Dtype8 dot03 = 0.f;\n" "Dtype8 dot04 = 0.f;\n" "Dtype8 dot05 = 0.f;\n" "Dtype8 dot06 = 0.f;\n" "Dtype8 dot07 = 0.f;\n" "Dtype8 brow0;\n" "Dtype8 brow1;\n" "Dtype8 brow2;\n" "Dtype8 brow3;\n" "Dtype8 brow4;\n" "Dtype8 brow5;\n" "Dtype8 brow6;\n" "Dtype8 brow7;\n" "__global Dtype *dst_write0 = dst + local_x * VEC_SIZE + (group_x * TILE_N) + (group_y * LWG_HEIGHT * TILE_M + local_y * TILE_M) * N + offd;\n" "const __global Dtype *src0_read = src0 + local_x * (TILE_K / 8) + (group_y * LWG_HEIGHT * TILE_M + local_y * TILE_M) * K + off0;\n" "const __global Dtype *src1_read0 = src1 + (group_x * TILE_N) * K + off1;\n" "__local Dtype slm_brow[8 * SLM_BLOCK];\n" "__local Dtype* slm_brow0;\n" "int local_index = mad24(local_y, 8, local_x) * 8;\n" "int w;\n" "for(int b_tile = 0; b_tile < K; b_tile += SLM_BLOCK) {\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "vstore4(vload4(0, (__global float *)(src1_read0 + mad24(0, K, local_index))), 0, (__local float *)(slm_brow + mad24(0, SLM_BLOCK, local_index)));\n" "vstore4(vload4(0, (__global float *)(src1_read0 + mad24(1, K, local_index))), 0, (__local float *)(slm_brow + mad24(1, SLM_BLOCK, local_index)));\n" "vstore4(vload4(0, (__global float *)(src1_read0 + mad24(2, K, local_index))), 0, (__local float *)(slm_brow + mad24(2, 
SLM_BLOCK, local_index)));\n" "vstore4(vload4(0, (__global float *)(src1_read0 + mad24(3, K, local_index))), 0, (__local float *)(slm_brow + mad24(3, SLM_BLOCK, local_index)));\n" "vstore4(vload4(0, (__global float *)(src1_read0 + mad24(4, K, local_index))), 0, (__local float *)(slm_brow + mad24(4, SLM_BLOCK, local_index)));\n" "vstore4(vload4(0, (__global float *)(src1_read0 + mad24(5, K, local_index))), 0, (__local float *)(slm_brow + mad24(5, SLM_BLOCK, local_index)));\n" "vstore4(vload4(0, (__global float *)(src1_read0 + mad24(6, K, local_index))), 0, (__local float *)(slm_brow + mad24(6, SLM_BLOCK, local_index)));\n" "vstore4(vload4(0, (__global float *)(src1_read0 + mad24(7, K, local_index))), 0, (__local float *)(slm_brow + mad24(7, SLM_BLOCK, local_index)));\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "slm_brow0 = slm_brow + local_x * (TILE_K / 8);\n" "w = b_tile;\n" "int end_w = min(b_tile + SLM_BLOCK, K);\n" "while( w + TILE_K <= end_w ) {\n" "Dtype8 arow;\n" "brow0 = as_half8(vload4(0, (__local float *)(slm_brow0 + 0 * SLM_BLOCK)));\n" "brow1 = as_half8(vload4(0, (__local float *)(slm_brow0 + 1 * SLM_BLOCK)));\n" "brow2 = as_half8(vload4(0, (__local float *)(slm_brow0 + 2 * SLM_BLOCK)));\n" "brow3 = as_half8(vload4(0, (__local float *)(slm_brow0 + 3 * SLM_BLOCK)));\n" "brow4 = as_half8(vload4(0, (__local float *)(slm_brow0 + 4 * SLM_BLOCK)));\n" "brow5 = as_half8(vload4(0, (__local float *)(slm_brow0 + 5 * SLM_BLOCK)));\n" "brow6 = as_half8(vload4(0, (__local float *)(slm_brow0 + 6 * SLM_BLOCK)));\n" "brow7 = as_half8(vload4(0, (__local float *)(slm_brow0 + 7 * SLM_BLOCK)));\n" "#define MM_DOT_PRODUCT( _row, _dot ) \\\n" "arow = as_half8(vload4(0, (__global float *)(src0_read + _row * K))); \\\n" "_dot = mad( (Dtype8)(arow.s0), (Dtype8)(brow0.s0, brow1.s0, brow2.s0, brow3.s0, brow4.s0, brow5.s0, brow6.s0, brow7.s0), _dot ); \\\n" "_dot = mad( (Dtype8)(arow.s1), (Dtype8)(brow0.s1, brow1.s1, brow2.s1, brow3.s1, brow4.s1, brow5.s1, brow6.s1, brow7.s1), _dot ); 
\\\n" "_dot = mad( (Dtype8)(arow.s2), (Dtype8)(brow0.s2, brow1.s2, brow2.s2, brow3.s2, brow4.s2, brow5.s2, brow6.s2, brow7.s2), _dot ); \\\n" "_dot = mad( (Dtype8)(arow.s3), (Dtype8)(brow0.s3, brow1.s3, brow2.s3, brow3.s3, brow4.s3, brow5.s3, brow6.s3, brow7.s3), _dot ); \\\n" "_dot = mad( (Dtype8)(arow.s4), (Dtype8)(brow0.s4, brow1.s4, brow2.s4, brow3.s4, brow4.s4, brow5.s4, brow6.s4, brow7.s4), _dot ); \\\n" "_dot = mad( (Dtype8)(arow.s5), (Dtype8)(brow0.s5, brow1.s5, brow2.s5, brow3.s5, brow4.s5, brow5.s5, brow6.s5, brow7.s5), _dot ); \\\n" "_dot = mad( (Dtype8)(arow.s6), (Dtype8)(brow0.s6, brow1.s6, brow2.s6, brow3.s6, brow4.s6, brow5.s6, brow6.s6, brow7.s6), _dot ); \\\n" "_dot = mad( (Dtype8)(arow.s7), (Dtype8)(brow0.s7, brow1.s7, brow2.s7, brow3.s7, brow4.s7, brow5.s7, brow6.s7, brow7.s7), _dot );\n" "MM_DOT_PRODUCT( 0, dot00 );\n" "MM_DOT_PRODUCT( 1, dot01 );\n" "MM_DOT_PRODUCT( 2, dot02 );\n" "MM_DOT_PRODUCT( 3, dot03 );\n" "MM_DOT_PRODUCT( 4, dot04 );\n" "MM_DOT_PRODUCT( 5, dot05 );\n" "MM_DOT_PRODUCT( 6, dot06 );\n" "MM_DOT_PRODUCT( 7, dot07 );\n" "#undef MM_DOT_PRODUCT\n" "src0_read += TILE_K;\n" "slm_brow0 += TILE_K;\n" "w += TILE_K;\n" "}\n" "src1_read0 += SLM_BLOCK;\n" "}\n" "if(w < K) {\n" "Dtype8 arow;\n" "#define READ_BROW(_brow, _row) \\\n" "_brow = as_half8(vload4(0, (__local float *)(slm_brow0 + _row * SLM_BLOCK))); \\\n" "_brow.s0 = (mad24(local_x, 8, w) < K) ? _brow.s0 : 0.0f; \\\n" "_brow.s1 = (mad24(local_x, 8, w + 1) < K) ? _brow.s1 : 0.0f; \\\n" "_brow.s2 = (mad24(local_x, 8, w + 2) < K) ? _brow.s2 : 0.0f; \\\n" "_brow.s3 = (mad24(local_x, 8, w + 3) < K) ? _brow.s3 : 0.0f; \\\n" "_brow.s4 = (mad24(local_x, 8, w + 4) < K) ? _brow.s4 : 0.0f; \\\n" "_brow.s5 = (mad24(local_x, 8, w + 5) < K) ? _brow.s5 : 0.0f; \\\n" "_brow.s6 = (mad24(local_x, 8, w + 6) < K) ? _brow.s6 : 0.0f; \\\n" "_brow.s7 = (mad24(local_x, 8, w + 7) < K) ? 
_brow.s7 : 0.0f;\n" "READ_BROW(brow0, 0);\n" "READ_BROW(brow1, 1);\n" "READ_BROW(brow2, 2);\n" "READ_BROW(brow3, 3);\n" "READ_BROW(brow4, 4);\n" "READ_BROW(brow5, 5);\n" "READ_BROW(brow6, 6);\n" "READ_BROW(brow7, 7);\n" "#undef READ_BROW\n" "#define MM_DOT_PRODUCT( _row, _dot ) \\\n" "arow = as_half8(vload4(0, (__global float *)(src0_read + _row * K))); \\\n" "arow.s0 = (mad24(local_x, 8, w) < K) ? arow.s0 : 0.0f; \\\n" "arow.s1 = (mad24(local_x, 8, w + 1) < K) ? arow.s1 : 0.0f; \\\n" "arow.s2 = (mad24(local_x, 8, w + 2) < K) ? arow.s2 : 0.0f; \\\n" "arow.s3 = (mad24(local_x, 8, w + 3) < K) ? arow.s3 : 0.0f; \\\n" "arow.s4 = (mad24(local_x, 8, w + 4) < K) ? arow.s4 : 0.0f; \\\n" "arow.s5 = (mad24(local_x, 8, w + 5) < K) ? arow.s5 : 0.0f; \\\n" "arow.s6 = (mad24(local_x, 8, w + 6) < K) ? arow.s6 : 0.0f; \\\n" "arow.s7 = (mad24(local_x, 8, w + 7) < K) ? arow.s7 : 0.0f; \\\n" "_dot = mad( (Dtype8)(arow.s0), (Dtype8)(brow0.s0, brow1.s0, brow2.s0, brow3.s0, brow4.s0, brow5.s0, brow6.s0, brow7.s0), _dot ); \\\n" "_dot = mad( (Dtype8)(arow.s1), (Dtype8)(brow0.s1, brow1.s1, brow2.s1, brow3.s1, brow4.s1, brow5.s1, brow6.s1, brow7.s1), _dot ); \\\n" "_dot = mad( (Dtype8)(arow.s2), (Dtype8)(brow0.s2, brow1.s2, brow2.s2, brow3.s2, brow4.s2, brow5.s2, brow6.s2, brow7.s2), _dot ); \\\n" "_dot = mad( (Dtype8)(arow.s3), (Dtype8)(brow0.s3, brow1.s3, brow2.s3, brow3.s3, brow4.s3, brow5.s3, brow6.s3, brow7.s3), _dot ); \\\n" "_dot = mad( (Dtype8)(arow.s4), (Dtype8)(brow0.s4, brow1.s4, brow2.s4, brow3.s4, brow4.s4, brow5.s4, brow6.s4, brow7.s4), _dot ); \\\n" "_dot = mad( (Dtype8)(arow.s5), (Dtype8)(brow0.s5, brow1.s5, brow2.s5, brow3.s5, brow4.s5, brow5.s5, brow6.s5, brow7.s5), _dot ); \\\n" "_dot = mad( (Dtype8)(arow.s6), (Dtype8)(brow0.s6, brow1.s6, brow2.s6, brow3.s6, brow4.s6, brow5.s6, brow6.s6, brow7.s6), _dot ); \\\n" "_dot = mad( (Dtype8)(arow.s7), (Dtype8)(brow0.s7, brow1.s7, brow2.s7, brow3.s7, brow4.s7, brow5.s7, brow6.s7, brow7.s7), _dot );\n" "MM_DOT_PRODUCT( 0, dot00 
);\n" "MM_DOT_PRODUCT( 1, dot01 );\n" "MM_DOT_PRODUCT( 2, dot02 );\n" "MM_DOT_PRODUCT( 3, dot03 );\n" "MM_DOT_PRODUCT( 4, dot04 );\n" "MM_DOT_PRODUCT( 5, dot05 );\n" "MM_DOT_PRODUCT( 6, dot06 );\n" "MM_DOT_PRODUCT( 7, dot07 );\n" "#undef MM_DOT_PRODUCT\n" "}\n" "#define REDUCE(_dot) \\\n" "_dot = as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 0)) + as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 1)) + as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 2)) + as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 3)) + \\\n" "as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 4)) + as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 5)) + as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 6)) + as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 7));\n" "REDUCE(dot00);\n" "REDUCE(dot01);\n" "REDUCE(dot02);\n" "REDUCE(dot03);\n" "REDUCE(dot04);\n" "REDUCE(dot05);\n" "REDUCE(dot06);\n" "REDUCE(dot07);\n" "#undef REDUCE\n" "Dtype output = 0.0f;\n" "#define OUTPUT( _dot) \\\n" "output = (local_x == 0) ? _dot.s0 : output; \\\n" "output = (local_x == 1) ? _dot.s1 : output; \\\n" "output = (local_x == 2) ? _dot.s2 : output; \\\n" "output = (local_x == 3) ? _dot.s3 : output; \\\n" "output = (local_x == 4) ? _dot.s4 : output; \\\n" "output = (local_x == 5) ? _dot.s5 : output; \\\n" "output = (local_x == 6) ? _dot.s6 : output; \\\n" "output = (local_x == 7) ? 
_dot.s7 : output; \\\n" "dst_write0[0] = BETA_ZERO_CHECK(alpha * output, mad(output, alpha, beta * dst_write0[0])); \\\n" "dst_write0 += N;\n" "if(global_x < N && global_y * 8 < M) {\n" "OUTPUT(dot00);\n" "if(mad24(global_y, 8, 1) < M) { OUTPUT(dot01); }\n" "if(mad24(global_y, 8, 2) < M) { OUTPUT(dot02); }\n" "if(mad24(global_y, 8, 3) < M) { OUTPUT(dot03); }\n" "if(mad24(global_y, 8, 4) < M) { OUTPUT(dot04); }\n" "if(mad24(global_y, 8, 5) < M) { OUTPUT(dot05); }\n" "if(mad24(global_y, 8, 6) < M) { OUTPUT(dot06); }\n" "if(mad24(global_y, 8, 7) < M) { OUTPUT(dot07); }\n" "}\n" "#undef OUTPUT\n" "}\n" "#else\n" "__attribute__((reqd_work_group_size(8, LWG_HEIGHT, 1)))\n" "__attribute__((intel_reqd_sub_group_size(8)))\n" "__kernel void TEMPLATE(gemm_buffer_NT, Dtype)(\n" "const __global Dtype *src0, int off0,\n" "const __global Dtype *src1, int off1,\n" "__global Dtype *dst, int offd,\n" "int M,\n" "int N,\n" "int K,\n" "KERNEL_ARG_DTYPE alpha_in,\n" "KERNEL_ARG_DTYPE beta_in)\n" "{\n" "const Dtype alpha = (Dtype)alpha_in;\n" "const Dtype beta = (Dtype)beta_in;\n" "const int group_x = get_group_id(0);\n" "const int group_y = get_group_id(1);\n" "const int local_x = get_local_id(0);\n" "const int local_y = get_local_id(1);\n" "const int global_x = get_global_id(0);\n" "const int global_y = get_global_id(1);\n" "Dtype8 dot00 = 0.f;\n" "Dtype8 dot01 = 0.f;\n" "Dtype8 dot02 = 0.f;\n" "Dtype8 dot03 = 0.f;\n" "Dtype8 dot04 = 0.f;\n" "Dtype8 dot05 = 0.f;\n" "Dtype8 dot06 = 0.f;\n" "Dtype8 dot07 = 0.f;\n" "Dtype4 brow0;\n" "Dtype4 brow1;\n" "Dtype4 brow2;\n" "Dtype4 brow3;\n" "Dtype4 brow4;\n" "Dtype4 brow5;\n" "Dtype4 brow6;\n" "Dtype4 brow7;\n" "__global Dtype *dst_write0 = dst + local_x * VEC_SIZE + (group_x * TILE_N) + (group_y * LWG_HEIGHT * TILE_M + local_y * TILE_M) * N + offd;\n" "const __global Dtype *src0_read = src0 + local_x * (TILE_K / 8) + (group_y * LWG_HEIGHT * TILE_M + local_y * TILE_M) * K + off0;\n" "const __global Dtype *src1_read0 = src1 + (group_x * 
TILE_N) * K + off1;\n" "__local Dtype slm_brow[8 * SLM_BLOCK];\n" "__local Dtype* slm_brow0;\n" "int local_index = mad24(local_y, 8, local_x) * 4;\n" "int w;\n" "for(int b_tile = 0; b_tile < K; b_tile += SLM_BLOCK) {\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "vstore4(vload4(0, src1_read0 + mad24(0, K, local_index)), 0, slm_brow + mad24(0, SLM_BLOCK, local_index));\n" "vstore4(vload4(0, src1_read0 + mad24(1, K, local_index)), 0, slm_brow + mad24(1, SLM_BLOCK, local_index));\n" "vstore4(vload4(0, src1_read0 + mad24(2, K, local_index)), 0, slm_brow + mad24(2, SLM_BLOCK, local_index));\n" "vstore4(vload4(0, src1_read0 + mad24(3, K, local_index)), 0, slm_brow + mad24(3, SLM_BLOCK, local_index));\n" "vstore4(vload4(0, src1_read0 + mad24(4, K, local_index)), 0, slm_brow + mad24(4, SLM_BLOCK, local_index));\n" "vstore4(vload4(0, src1_read0 + mad24(5, K, local_index)), 0, slm_brow + mad24(5, SLM_BLOCK, local_index));\n" "vstore4(vload4(0, src1_read0 + mad24(6, K, local_index)), 0, slm_brow + mad24(6, SLM_BLOCK, local_index));\n" "vstore4(vload4(0, src1_read0 + mad24(7, K, local_index)), 0, slm_brow + mad24(7, SLM_BLOCK, local_index));\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "slm_brow0 = slm_brow + local_x * (TILE_K / 8);\n" "w = b_tile;\n" "int end_w = min(b_tile + SLM_BLOCK, K);\n" "while( w + TILE_K <= end_w ) {\n" "Dtype4 arow;\n" "brow0 = vload4(0, slm_brow0 + 0 * SLM_BLOCK);\n" "brow1 = vload4(0, slm_brow0 + 1 * SLM_BLOCK);\n" "brow2 = vload4(0, slm_brow0 + 2 * SLM_BLOCK);\n" "brow3 = vload4(0, slm_brow0 + 3 * SLM_BLOCK);\n" "brow4 = vload4(0, slm_brow0 + 4 * SLM_BLOCK);\n" "brow5 = vload4(0, slm_brow0 + 5 * SLM_BLOCK);\n" "brow6 = vload4(0, slm_brow0 + 6 * SLM_BLOCK);\n" "brow7 = vload4(0, slm_brow0 + 7 * SLM_BLOCK);\n" "#define MM_DOT_PRODUCT( _row, _dot ) \\\n" "arow = vload4(0, src0_read + _row * K); \\\n" "_dot = mad( (Dtype8)(arow.x), (Dtype8)(brow0.x, brow1.x, brow2.x, brow3.x, brow4.x, brow5.x, brow6.x, brow7.x), _dot ); \\\n" "_dot = mad( (Dtype8)(arow.y), 
(Dtype8)(brow0.y, brow1.y, brow2.y, brow3.y, brow4.y, brow5.y, brow6.y, brow7.y), _dot ); \\\n" "_dot = mad( (Dtype8)(arow.z), (Dtype8)(brow0.z, brow1.z, brow2.z, brow3.z, brow4.z, brow5.z, brow6.z, brow7.z), _dot ); \\\n" "_dot = mad( (Dtype8)(arow.w), (Dtype8)(brow0.w, brow1.w, brow2.w, brow3.w, brow4.w, brow5.w, brow6.w, brow7.w), _dot );\n" "MM_DOT_PRODUCT( 0, dot00 );\n" "MM_DOT_PRODUCT( 1, dot01 );\n" "MM_DOT_PRODUCT( 2, dot02 );\n" "MM_DOT_PRODUCT( 3, dot03 );\n" "MM_DOT_PRODUCT( 4, dot04 );\n" "MM_DOT_PRODUCT( 5, dot05 );\n" "MM_DOT_PRODUCT( 6, dot06 );\n" "MM_DOT_PRODUCT( 7, dot07 );\n" "#undef MM_DOT_PRODUCT\n" "src0_read += TILE_K;\n" "slm_brow0 += TILE_K;\n" "w += TILE_K;\n" "}\n" "src1_read0 += SLM_BLOCK;\n" "}\n" "if(w < K) {\n" "Dtype4 arow;\n" "#define READ_BROW(_brow, _row) \\\n" "_brow = vload4(0, slm_brow0 + _row * SLM_BLOCK); \\\n" "_brow.x = (mad24(local_x, 4, w) < K) ? _brow.x : 0.0f; \\\n" "_brow.y = (mad24(local_x, 4, w + 1) < K) ? _brow.y : 0.0f; \\\n" "_brow.z = (mad24(local_x, 4, w + 2) < K) ? _brow.z : 0.0f; \\\n" "_brow.w = (mad24(local_x, 4, w + 3) < K) ? _brow.w : 0.0f;\n" "READ_BROW(brow0, 0);\n" "READ_BROW(brow1, 1);\n" "READ_BROW(brow2, 2);\n" "READ_BROW(brow3, 3);\n" "READ_BROW(brow4, 4);\n" "READ_BROW(brow5, 5);\n" "READ_BROW(brow6, 6);\n" "READ_BROW(brow7, 7);\n" "#undef READ_BROW\n" "#define MM_DOT_PRODUCT( _row, _dot ) \\\n" "arow = vload4(0, src0_read + _row * K); \\\n" "arow.x = (mad24(local_x, 4, w) < K) ? arow.x : 0.0f; \\\n" "arow.y = (mad24(local_x, 4, w + 1) < K) ? arow.y : 0.0f; \\\n" "arow.z = (mad24(local_x, 4, w + 2) < K) ? arow.z : 0.0f; \\\n" "arow.w = (mad24(local_x, 4, w + 3) < K) ? 
arow.w : 0.0f; \\\n" "_dot = mad( (Dtype8)(arow.x), (Dtype8)(brow0.x, brow1.x, brow2.x, brow3.x, brow4.x, brow5.x, brow6.x, brow7.x), _dot ); \\\n" "_dot = mad( (Dtype8)(arow.y), (Dtype8)(brow0.y, brow1.y, brow2.y, brow3.y, brow4.y, brow5.y, brow6.y, brow7.y), _dot ); \\\n" "_dot = mad( (Dtype8)(arow.z), (Dtype8)(brow0.z, brow1.z, brow2.z, brow3.z, brow4.z, brow5.z, brow6.z, brow7.z), _dot ); \\\n" "_dot = mad( (Dtype8)(arow.w), (Dtype8)(brow0.w, brow1.w, brow2.w, brow3.w, brow4.w, brow5.w, brow6.w, brow7.w), _dot );\n" "MM_DOT_PRODUCT( 0, dot00 );\n" "MM_DOT_PRODUCT( 1, dot01 );\n" "MM_DOT_PRODUCT( 2, dot02 );\n" "MM_DOT_PRODUCT( 3, dot03 );\n" "MM_DOT_PRODUCT( 4, dot04 );\n" "MM_DOT_PRODUCT( 5, dot05 );\n" "MM_DOT_PRODUCT( 6, dot06 );\n" "MM_DOT_PRODUCT( 7, dot07 );\n" "#undef MM_DOT_PRODUCT\n" "}\n" "#define REDUCE(_dot) \\\n" "_dot = as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 0)) + as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 1)) + as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 2)) + as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 3)) + \\\n" "as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 4)) + as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 5)) + as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 6)) + as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 7));\n" "REDUCE(dot00);\n" "REDUCE(dot01);\n" "REDUCE(dot02);\n" "REDUCE(dot03);\n" "REDUCE(dot04);\n" "REDUCE(dot05);\n" "REDUCE(dot06);\n" "REDUCE(dot07);\n" "#undef REDUCE\n" "Dtype output = 0.0f;\n" "#define OUTPUT( _dot) \\\n" "output = (local_x == 0) ? _dot.s0 : output; \\\n" "output = (local_x == 1) ? _dot.s1 : output; \\\n" "output = (local_x == 2) ? _dot.s2 : output; \\\n" "output = (local_x == 3) ? _dot.s3 : output; \\\n" "output = (local_x == 4) ? _dot.s4 : output; \\\n" "output = (local_x == 5) ? _dot.s5 : output; \\\n" "output = (local_x == 6) ? _dot.s6 : output; \\\n" "output = (local_x == 7) ? 
_dot.s7 : output; \\\n" "dst_write0[0] = BETA_ZERO_CHECK(alpha * output, mad(output, alpha, beta * dst_write0[0])); \\\n" "dst_write0 += N;\n" "if(global_x < N && global_y * 8 < M) {\n" "OUTPUT(dot00);\n" "if(mad24(global_y, 8, 1) < M) { OUTPUT(dot01); }\n" "if(mad24(global_y, 8, 2) < M) { OUTPUT(dot02); }\n" "if(mad24(global_y, 8, 3) < M) { OUTPUT(dot03); }\n" "if(mad24(global_y, 8, 4) < M) { OUTPUT(dot04); }\n" "if(mad24(global_y, 8, 5) < M) { OUTPUT(dot05); }\n" "if(mad24(global_y, 8, 6) < M) { OUTPUT(dot06); }\n" "if(mad24(global_y, 8, 7) < M) { OUTPUT(dot07); }\n" "}\n" "#undef OUTPUT\n" "}\n" "#endif\n" "#undef VEC_SIZE\n" "#undef LWG_HEIGHT\n" "#undef TILE_M\n" "#undef TILE_K\n" "#undef TILE_N\n" "#undef SLM_BLOCK\n" "#define SLM_SIZE 64\n" "void TEMPLATE(gemm_buffer_NT_M_2_edgerows,Dtype)(\n" "const __global Dtype* srca_read0,\n" "const __global Dtype* srca_read1,\n" "const __global Dtype* srcb_read,\n" "__local Dtype4* work0,\n" "__local Dtype4* work1,\n" "int N,\n" "int K,\n" "int x_gid,\n" "int lid,\n" "Dtype alpha,\n" "Dtype beta,\n" "__global Dtype* dstc0,\n" "__global Dtype* dstc1)\n" "{\n" "__local Dtype* work_each0 = (__local Dtype*)work0;\n" "__local Dtype* work_each1 = (__local Dtype*)work1;\n" "int rows = N - x_gid * 4;\n" "Dtype4 dot0[3] = {(Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.)};\n" "Dtype4 dot1[3] = {(Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.)};\n" "int i = lid;\n" "while( i < K / 4) {\n" "const Dtype4 b0 = {srca_read0[i*4], srca_read0[(i*4+1)], srca_read0[(i*4+2)], srca_read0[(i*4+3)]};\n" "const Dtype4 b1 = {srca_read1[i*4], srca_read1[(i*4+1)], srca_read1[(i*4+2)], srca_read1[(i*4+3)]};\n" "#pragma unroll\n" "for(int j = 0; j < rows; ++j) {\n" "Dtype4 a = vload4(i, srcb_read + j * K);\n" "dot0[j] += b0 * a;\n" "dot1[j] += b1 * a;\n" "}\n" "i += get_local_size(0);\n" "}\n" "#pragma unroll\n" "for(int j = 0; j < rows; ++j) {\n" "work_each0[lid * 4 + j] = dot0[j].x + dot0[j].y + dot0[j].z + dot0[j].w;\n" "work_each1[lid * 4 + j] = dot1[j].x + 
dot1[j].y + dot1[j].z + dot1[j].w;\n" "}\n" "if(i == K / 4) {\n" "short tail_items = K % 4;\n" "if(tail_items != 0) {\n" "const __global Dtype *srcb_tail = srcb_read + i * 4;\n" "const __global Dtype *srca_tail0 = srca_read0 + i * 4;\n" "const __global Dtype *srca_tail1 = srca_read1 + i * 4;\n" "#pragma unroll\n" "for(short i = 0; i < tail_items; ++i) {\n" "const Dtype at0 = srca_tail0[i];\n" "const Dtype at1 = srca_tail1[i];\n" "#pragma unroll\n" "for(int j = 0; j < rows; ++j) {\n" "work_each0[lid * 4 + j] += at0 * srcb_tail[i + j * K];\n" "work_each1[lid * 4 + j] += at1 * srcb_tail[i + j * K];\n" "}\n" "}\n" "}\n" "}\n" "for(int stride = get_local_size(0) >> 1; stride > 0 ; stride >>= 1) {\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if(lid < stride) {\n" "work0[lid] += work0[lid+stride];\n" "work1[lid] += work1[lid+stride];\n" "}\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if(lid == 0) {\n" "#pragma unroll\n" "for(int j = 0; j < rows; ++j) {\n" "#ifdef ZERO_BETA\n" "Dtype a0 = alpha * work_each0[j];\n" "Dtype a1 = alpha * work_each1[j];\n" "#else\n" "Dtype a0 = alpha * work_each0[j] + beta * dstc0[(x_gid * 4 + j)];\n" "Dtype a1 = alpha * work_each1[j] + beta * dstc1[(x_gid * 4 + j)];\n" "#endif\n" "dstc0[(x_gid * 4 + j)] = a0;\n" "dstc1[(x_gid * 4 + j)] = a1;\n" "}\n" "}\n" "}\n" "__kernel void TEMPLATE(gemm_buffer_NT_M_2,Dtype)(\n" "__global const Dtype * A,\n" "int offA,\n" "__global const Dtype * B,\n" "int offB,\n" "__global Dtype * C,\n" "int offC,\n" "int M,\n" "int N,\n" "int K,\n" "KERNEL_ARG_DTYPE alpha_f,\n" "KERNEL_ARG_DTYPE beta_f)\n" "{\n" "Dtype alpha = (Dtype)alpha_f;\n" "Dtype beta = (Dtype)beta_f;\n" "int x_gid = get_group_id(0);\n" "int lid = get_local_id(0);\n" "const __global Dtype *srca_read0 = A + offA;\n" "const __global Dtype *srca_read1 = srca_read0 + K;\n" "const __global Dtype *srcb_read = B + x_gid * 4 * K + offB;\n" "__global Dtype4 *dstc0 = (__global Dtype4*)(C + offC);\n" "__global Dtype4 *dstc1 = (__global Dtype4*)((__global 
Dtype*)(dstc0) + N);\n" "__local Dtype4 work0[SLM_SIZE];\n" "__local Dtype4 work1[SLM_SIZE];\n" "__local Dtype* work_each0 = (__local Dtype*)work0;\n" "__local Dtype* work_each1 = (__local Dtype*)work1;\n" "if(x_gid == N / 4) {\n" "TEMPLATE(gemm_buffer_NT_M_2_edgerows,Dtype) \\\n" "(srca_read0, srca_read1, srcb_read, work0, work1, N, K, x_gid, lid, alpha, beta, (__global Dtype*)dstc0, (__global Dtype*)dstc1);\n" "} else {\n" "Dtype4 dot0[4] = {(Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.)};\n" "Dtype4 dot1[4] = {(Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.)};\n" "int i = lid;\n" "while( i < K / 4) {\n" "const Dtype4 b0 = vload4(i, srca_read0);\n" "const Dtype4 b1 = vload4(i, srca_read1);\n" "#pragma unroll\n" "for(int j = 0; j < 4; ++j) {\n" "Dtype4 a = vload4(i, srcb_read + j * K);\n" "dot0[j] += b0 * a;\n" "dot1[j] += b1 * a;\n" "}\n" "i += get_local_size(0);\n" "}\n" "#pragma unroll\n" "for(int j = 0; j < 4; ++j) {\n" "work_each0[lid * 4 + j] = dot0[j].x + dot0[j].y + dot0[j].z + dot0[j].w;\n" "work_each1[lid * 4 + j] = dot1[j].x + dot1[j].y + dot1[j].z + dot1[j].w;\n" "}\n" "if(i == K / 4) {\n" "short tail_items = K % 4;\n" "if(tail_items != 0) {\n" "const __global Dtype *srcb_tail = srcb_read + i * 4;\n" "const __global Dtype *srca_tail0 = srca_read0 + i * 4;\n" "const __global Dtype *srca_tail1 = srca_read1 + i * 4;\n" "#pragma unroll\n" "for(short i = 0; i < tail_items; ++i) {\n" "const Dtype at0 = srca_tail0[i];\n" "const Dtype at1 = srca_tail1[i];\n" "#pragma unroll\n" "for(int j = 0; j < 4; ++j) {\n" "work_each0[lid * 4 + j] += at0 * srcb_tail[i + j * K];\n" "work_each1[lid * 4 + j] += at1 * srcb_tail[i + j * K];\n" "}\n" "}\n" "}\n" "}\n" "for(int stride = get_local_size(0) >> 1; stride > 0 ; stride >>= 1) {\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if(lid < stride) {\n" "work0[lid] += work0[lid+stride];\n" "work1[lid] += work1[lid+stride];\n" "}\n" "}\n" "if(lid == 0)\n" "{\n" "#ifdef ZERO_BETA\n" "dstc0[x_gid] = alpha * work0[0];\n" 
"dstc1[x_gid] = alpha * work1[0];\n" "#else\n" "dstc0[x_gid] = alpha * work0[0] + beta * dstc0[x_gid];\n" "dstc1[x_gid] = alpha * work1[0] + beta * dstc1[x_gid];\n" "#endif\n" "}\n" "}\n" "}\n" "#undef SLM_SIZE\n" "#define SLM_SIZE 32\n" "void TEMPLATE(gemm_buffer_NT_M_4_edgerows,Dtype)(\n" "const __global Dtype* srca_read0,\n" "const __global Dtype* srca_read1,\n" "const __global Dtype* srca_read2,\n" "const __global Dtype* srca_read3,\n" "const __global Dtype* srcb_read,\n" "__local Dtype4* work0,\n" "__local Dtype4* work1,\n" "__local Dtype4* work2,\n" "__local Dtype4* work3,\n" "int N,\n" "int K,\n" "int x_gid,\n" "int lid,\n" "Dtype alpha,\n" "Dtype beta,\n" "__global Dtype* dstc0,\n" "__global Dtype* dstc1,\n" "__global Dtype* dstc2,\n" "__global Dtype* dstc3)\n" "{\n" "__local Dtype* work_each0 = (__local Dtype*)(work0 + lid);\n" "__local Dtype* work_each1 = (__local Dtype*)(work1 + lid);\n" "__local Dtype* work_each2 = (__local Dtype*)(work2 + lid);\n" "__local Dtype* work_each3 = (__local Dtype*)(work3 + lid);\n" "int rows = N - x_gid * 4;\n" "Dtype4 dot0[3] = {(Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.)};\n" "Dtype4 dot1[3] = {(Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.)};\n" "Dtype4 dot2[3] = {(Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.)};\n" "Dtype4 dot3[3] = {(Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.)};\n" "int i = lid;\n" "while( i < K / 4) {\n" "const Dtype4 a0 = {srca_read0[i*4], srca_read0[(i*4+1)], srca_read0[(i*4+2)], srca_read0[(i*4+3)]};\n" "const Dtype4 a1 = {srca_read1[i*4], srca_read1[(i*4+1)], srca_read1[(i*4+2)], srca_read1[(i*4+3)]};\n" "const Dtype4 a2 = {srca_read2[i*4], srca_read2[(i*4+1)], srca_read2[(i*4+2)], srca_read2[(i*4+3)]};\n" "const Dtype4 a3 = {srca_read3[i*4], srca_read3[(i*4+1)], srca_read3[(i*4+2)], srca_read3[(i*4+3)]};\n" "#pragma unrol\n" "for(int j = 0; j < rows; ++j) {\n" "dot0[j] += a0 * vload4(i, srcb_read + j * K);\n" "dot1[j] += a1 * vload4(i, srcb_read + j * K);\n" "dot2[j] += a2 * vload4(i, srcb_read + j * K);\n" "dot3[j] 
+= a3 * vload4(i, srcb_read + j * K);\n" "}\n" "i += get_local_size(0);\n" "}\n" "#pragma unroll\n" "for(int j = 0; j < rows; ++j) {\n" "work_each0[j] = dot0[j].x + dot0[j].y + dot0[j].z + dot0[j].w;\n" "work_each1[j] = dot1[j].x + dot1[j].y + dot1[j].z + dot1[j].w;\n" "work_each2[j] = dot2[j].x + dot2[j].y + dot2[j].z + dot2[j].w;\n" "work_each3[j] = dot3[j].x + dot3[j].y + dot3[j].z + dot3[j].w;\n" "}\n" "if(i == K / 4) {\n" "short tail_items = K % 4;\n" "if(tail_items != 0) {\n" "const __global Dtype *srcb_tail = srcb_read + i * 4;\n" "const __global Dtype *srca_tail0 = srca_read0 + i * 4;\n" "const __global Dtype *srca_tail1 = srca_read1 + i * 4;\n" "const __global Dtype *srca_tail2 = srca_read2 + i * 4;\n" "const __global Dtype *srca_tail3 = srca_read3 + i * 4;\n" "#pragma unroll\n" "for(short i = 0; i < tail_items; ++i) {\n" "const Dtype at0 = srca_tail0[i];\n" "const Dtype at1 = srca_tail1[i];\n" "const Dtype at2 = srca_tail2[i];\n" "const Dtype at3 = srca_tail3[i];\n" "#pragma unroll\n" "for(int j = 0; j < rows; ++j) {\n" "work_each0[j] += at0 * srcb_tail[i + j * K];\n" "work_each1[j] += at1 * srcb_tail[i + j * K];\n" "work_each2[j] += at2 * srcb_tail[i + j * K];\n" "work_each3[j] += at3 * srcb_tail[i + j * K];\n" "}\n" "}\n" "}\n" "}\n" "for(int stride = get_local_size(0) >> 1; stride > 0 ; stride >>= 1) {\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if(lid < stride) {\n" "work0[lid] += work0[lid+stride];\n" "work1[lid] += work1[lid+stride];\n" "work2[lid] += work2[lid+stride];\n" "work3[lid] += work3[lid+stride];\n" "}\n" "}\n" "if(lid == 0) {\n" "#pragma unroll\n" "for(int j = 0; j < rows; ++j) {\n" "#ifdef ZERO_BETA\n" "dstc0[(x_gid * 4 + j)] = alpha * work_each0[j];\n" "dstc1[(x_gid * 4 + j)] = alpha * work_each1[j];\n" "dstc2[(x_gid * 4 + j)] = alpha * work_each2[j];\n" "dstc3[(x_gid * 4 + j)] = alpha * work_each3[j];\n" "#else\n" "dstc0[(x_gid * 4 + j)] = alpha * work_each0[j] + beta * dstc0[(x_gid * 4 + j)];\n" "dstc1[(x_gid * 4 + j)] = alpha * 
work_each1[j] + beta * dstc1[(x_gid * 4 + j)];\n" "dstc2[(x_gid * 4 + j)] = alpha * work_each2[j] + beta * dstc2[(x_gid * 4 + j)];\n" "dstc3[(x_gid * 4 + j)] = alpha * work_each3[j] + beta * dstc3[(x_gid * 4 + j)];\n" "#endif\n" "}\n" "}\n" "}\n" "__kernel void TEMPLATE(gemm_buffer_NT_M_4,Dtype)(\n" "__global const Dtype * A,\n" "int offA,\n" "__global const Dtype * B,\n" "int offB,\n" "__global Dtype * C,\n" "int offC,\n" "int M,\n" "int N,\n" "int K,\n" "KERNEL_ARG_DTYPE alpha_f,\n" "KERNEL_ARG_DTYPE beta_f)\n" "{\n" "Dtype alpha = (Dtype)alpha_f;\n" "Dtype beta = (Dtype)beta_f;\n" "int x_gid = get_group_id(0);\n" "int lid = get_local_id(0);\n" "int lsize = get_local_size(0);\n" "const __global Dtype *srca_read0 = A + offA;\n" "const __global Dtype *srca_read1 = srca_read0 + K;\n" "const __global Dtype *srca_read2 = srca_read1 + K;\n" "const __global Dtype *srca_read3 = srca_read2 + K;\n" "const __global Dtype *srcb_read = B + x_gid * 4 * K + offB;\n" "__global Dtype4 *dstc0 = (__global Dtype4*)(C + offC);\n" "__global Dtype4 *dstc1 = (__global Dtype4*)((__global Dtype*)(dstc0) + N);\n" "__global Dtype4 *dstc2 = (__global Dtype4*)((__global Dtype*)(dstc1) + N);\n" "__global Dtype4 *dstc3 = (__global Dtype4*)((__global Dtype*)(dstc2) + N);\n" "__local Dtype4 work0[SLM_SIZE];\n" "__local Dtype4 work1[SLM_SIZE];\n" "__local Dtype4 work2[SLM_SIZE];\n" "__local Dtype4 work3[SLM_SIZE];\n" "__local Dtype* work_each0 = (__local Dtype*)(work0 + lid);\n" "__local Dtype* work_each1 = (__local Dtype*)(work1 + lid);\n" "__local Dtype* work_each2 = (__local Dtype*)(work2 + lid);\n" "__local Dtype* work_each3 = (__local Dtype*)(work3 + lid);\n" "if(x_gid == N / 4) {\n" "TEMPLATE(gemm_buffer_NT_M_4_edgerows,Dtype) \\\n" "(srca_read0, srca_read1, srca_read2, srca_read3, srcb_read, \\\n" "work0, work1, work2, work3, N, K, x_gid, lid, alpha, beta, \\\n" "(__global Dtype*)dstc0, (__global Dtype*)dstc1, (__global Dtype*)dstc2, (__global Dtype*)dstc3);\n" "} else {\n" "Dtype4 dot0[4] 
= {(Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.)};\n" "Dtype4 dot1[4] = {(Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.)};\n" "Dtype4 dot2[4] = {(Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.)};\n" "Dtype4 dot3[4] = {(Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.)};\n" "int kid = lid;\n" "while( kid < K / 4) {\n" "const Dtype4 b0 = vload4(kid, srca_read0);\n" "const Dtype4 b1 = vload4(kid, srca_read1);\n" "const Dtype4 b2 = vload4(kid, srca_read2);\n" "const Dtype4 b3 = vload4(kid, srca_read3);\n" "#pragma unroll\n" "for(int j = 0; j < 4; ++j) {\n" "Dtype4 a = vload4(kid, srcb_read + j * K);\n" "dot0[j] += b0 * a;\n" "dot1[j] += b1 * a;\n" "dot2[j] += b2 * a;\n" "dot3[j] += b3 * a;\n" "}\n" "kid += lsize;\n" "}\n" "#pragma unroll\n" "for(int j = 0; j < 4; ++j) {\n" "work_each0[j] = dot0[j].x + dot0[j].y + dot0[j].z + dot0[j].w;\n" "work_each1[j] = dot1[j].x + dot1[j].y + dot1[j].z + dot1[j].w;\n" "work_each2[j] = dot2[j].x + dot2[j].y + dot2[j].z + dot2[j].w;\n" "work_each3[j] = dot3[j].x + dot3[j].y + dot3[j].z + dot3[j].w;\n" "}\n" "if(kid == (K >> 2)) {\n" "short tail_items = K % 4;\n" "if(tail_items != 0) {\n" "int offset = kid << 2;\n" "const __global Dtype *srcb_tail = srcb_read + offset;\n" "const __global Dtype *srca_tail0 = srca_read0 + offset;\n" "const __global Dtype *srca_tail1 = srca_read1 + offset;\n" "const __global Dtype *srca_tail2 = srca_read2 + offset;\n" "const __global Dtype *srca_tail3 = srca_read3 + offset;\n" "#pragma unroll\n" "for(short i = 0; i < tail_items; ++i) {\n" "const Dtype at0 = srca_tail0[i];\n" "const Dtype at1 = srca_tail1[i];\n" "const Dtype at2 = srca_tail2[i];\n" "const Dtype at3 = srca_tail3[i];\n" "#pragma unroll\n" "for(int j = 0; j < 4; ++j) {\n" "work_each0[j] += at0 * srcb_tail[i + j * K];\n" "work_each1[j] += at1 * srcb_tail[i + j * K];\n" "work_each2[j] += at2 * srcb_tail[i + j * K];\n" "work_each3[j] += at3 * srcb_tail[i + j * K];\n" "}\n" "}\n" "}\n" "}\n" "for(int stride = 
get_local_size(0) >> 1; stride > 0 ; stride >>= 1) {\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if(lid < stride) {\n" "work0[lid] += work0[lid+stride];\n" "work1[lid] += work1[lid+stride];\n" "work2[lid] += work2[lid+stride];\n" "work3[lid] += work3[lid+stride];\n" "}\n" "}\n" "if(lid == 0) {\n" "#ifdef ZERO_BETA\n" "dstc0[x_gid] = alpha * work0[0];\n" "dstc1[x_gid] = alpha * work1[0];\n" "dstc2[x_gid] = alpha * work2[0];\n" "dstc3[x_gid] = alpha * work3[0];\n" "#else\n" "dstc0[x_gid] = alpha * work0[0] + beta * dstc0[x_gid];\n" "dstc1[x_gid] = alpha * work1[0] + beta * dstc1[x_gid];\n" "dstc2[x_gid] = alpha * work2[0] + beta * dstc2[x_gid];\n" "dstc3[x_gid] = alpha * work3[0] + beta * dstc3[x_gid];\n" "#endif\n" "}\n" "}\n" "}\n" "#undef SLM_SIZE\n" "#define SLM_SIZE 16\n" "__kernel void TEMPLATE(gemm_buffer_NT_M_8,Dtype)(\n" "__global const Dtype * A,\n" "int offA,\n" "__global const Dtype * B,\n" "int offB,\n" "__global Dtype * C,\n" "int offC,\n" "int M,\n" "int N,\n" "int K,\n" "KERNEL_ARG_DTYPE alpha_f,\n" "KERNEL_ARG_DTYPE beta_f)\n" "{\n" "Dtype alpha = (Dtype)alpha_f;\n" "Dtype beta = (Dtype)beta_f;\n" "int x_gid = get_group_id(0);\n" "int lid = get_local_id(0);\n" "int lsize = get_local_size(0);\n" "const __global Dtype *srca_read0 = A + offA;\n" "const __global Dtype *srca_read1 = srca_read0 + K;\n" "const __global Dtype *srca_read2 = srca_read1 + K;\n" "const __global Dtype *srca_read3 = srca_read2 + K;\n" "const __global Dtype *srca_read4 = srca_read3 + K;\n" "const __global Dtype *srca_read5 = srca_read4 + K;\n" "const __global Dtype *srca_read6 = srca_read5 + K;\n" "const __global Dtype *srca_read7 = srca_read6 + K;\n" "const __global Dtype *srcb_read = B + x_gid * K + offB;\n" "__global Dtype *dstc0 = C + offC;\n" "__global Dtype *dstc1 = dstc0 + N;\n" "__global Dtype *dstc2 = dstc1 + N;\n" "__global Dtype *dstc3 = dstc2 + N;\n" "__global Dtype *dstc4 = dstc3 + N;\n" "__global Dtype *dstc5 = dstc4 + N;\n" "__global Dtype *dstc6 = dstc5 + N;\n" "__global 
Dtype *dstc7 = dstc6 + N;\n" "__local Dtype work0[SLM_SIZE];\n" "__local Dtype work1[SLM_SIZE];\n" "__local Dtype work2[SLM_SIZE];\n" "__local Dtype work3[SLM_SIZE];\n" "__local Dtype work4[SLM_SIZE];\n" "__local Dtype work5[SLM_SIZE];\n" "__local Dtype work6[SLM_SIZE];\n" "__local Dtype work7[SLM_SIZE];\n" "Dtype4 dot0 = (Dtype4)(0.);\n" "Dtype4 dot1 = (Dtype4)(0.);\n" "Dtype4 dot2 = (Dtype4)(0.);\n" "Dtype4 dot3 = (Dtype4)(0.);\n" "Dtype4 dot4 = (Dtype4)(0.);\n" "Dtype4 dot5 = (Dtype4)(0.);\n" "Dtype4 dot6 = (Dtype4)(0.);\n" "Dtype4 dot7 = (Dtype4)(0.);\n" "int kid = lid;\n" "while( kid < K / 4) {\n" "const Dtype4 a0 = vload4(kid, srca_read0);\n" "const Dtype4 a1 = vload4(kid, srca_read1);\n" "const Dtype4 a2 = vload4(kid, srca_read2);\n" "const Dtype4 a3 = vload4(kid, srca_read3);\n" "const Dtype4 a4 = vload4(kid, srca_read4);\n" "const Dtype4 a5 = vload4(kid, srca_read5);\n" "const Dtype4 a6 = vload4(kid, srca_read6);\n" "const Dtype4 a7 = vload4(kid, srca_read7);\n" "Dtype4 b = vload4(kid, srcb_read);\n" "dot0 += a0 * b;\n" "dot1 += a1 * b;\n" "dot2 += a2 * b;\n" "dot3 += a3 * b;\n" "dot4 += a4 * b;\n" "dot5 += a5 * b;\n" "dot6 += a6 * b;\n" "dot7 += a7 * b;\n" "kid += lsize;\n" "}\n" "work0[lid] = dot0.x + dot0.y + dot0.z + dot0.w;\n" "work1[lid] = dot1.x + dot1.y + dot1.z + dot1.w;\n" "work2[lid] = dot2.x + dot2.y + dot2.z + dot2.w;\n" "work3[lid] = dot3.x + dot3.y + dot3.z + dot3.w;\n" "work4[lid] = dot4.x + dot4.y + dot4.z + dot4.w;\n" "work5[lid] = dot5.x + dot5.y + dot5.z + dot5.w;\n" "work6[lid] = dot6.x + dot6.y + dot6.z + dot6.w;\n" "work7[lid] = dot7.x + dot7.y + dot7.z + dot7.w;\n" "if(kid == (K >> 2)) {\n" "short tail_items = K % 4;\n" "if(tail_items != 0) {\n" "int offset = kid << 2;\n" "const __global Dtype *srcb_tail = srcb_read + offset;\n" "const __global Dtype *srca_tail0 = srca_read0 + offset;\n" "const __global Dtype *srca_tail1 = srca_read1 + offset;\n" "const __global Dtype *srca_tail2 = srca_read2 + offset;\n" "const __global Dtype 
*srca_tail3 = srca_read3 + offset;\n" "const __global Dtype *srca_tail4 = srca_read4 + offset;\n" "const __global Dtype *srca_tail5 = srca_read5 + offset;\n" "const __global Dtype *srca_tail6 = srca_read6 + offset;\n" "const __global Dtype *srca_tail7 = srca_read7 + offset;\n" "#pragma unroll\n" "for(short item = 0; item < tail_items; ++item) {\n" "work0[lid] += srca_tail0[item] * srcb_tail[item];\n" "work1[lid] += srca_tail1[item] * srcb_tail[item];\n" "work2[lid] += srca_tail2[item] * srcb_tail[item];\n" "work3[lid] += srca_tail3[item] * srcb_tail[item];\n" "work4[lid] += srca_tail4[item] * srcb_tail[item];\n" "work5[lid] += srca_tail5[item] * srcb_tail[item];\n" "work6[lid] += srca_tail6[item] * srcb_tail[item];\n" "work7[lid] += srca_tail7[item] * srcb_tail[item];\n" "}\n" "}\n" "}\n" "for(int stride = get_local_size(0) >> 1; stride > 0 ; stride >>= 1) {\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if(lid < stride) {\n" "work0[lid] += work0[lid+stride];\n" "work1[lid] += work1[lid+stride];\n" "work2[lid] += work2[lid+stride];\n" "work3[lid] += work3[lid+stride];\n" "work4[lid] += work4[lid+stride];\n" "work5[lid] += work5[lid+stride];\n" "work6[lid] += work6[lid+stride];\n" "work7[lid] += work7[lid+stride];\n" "}\n" "}\n" "if(lid == 0) {\n" "#ifdef ZERO_BETA\n" "dstc0[x_gid] = alpha * work0[0];\n" "dstc1[x_gid] = alpha * work1[0];\n" "dstc2[x_gid] = alpha * work2[0];\n" "dstc3[x_gid] = alpha * work3[0];\n" "dstc4[x_gid] = alpha * work4[0];\n" "dstc5[x_gid] = alpha * work5[0];\n" "dstc6[x_gid] = alpha * work6[0];\n" "dstc7[x_gid] = alpha * work7[0];\n" "#else\n" "dstc0[x_gid] = alpha * work0[0] + beta * dstc0[x_gid];\n" "dstc1[x_gid] = alpha * work1[0] + beta * dstc1[x_gid];\n" "dstc2[x_gid] = alpha * work2[0] + beta * dstc2[x_gid];\n" "dstc3[x_gid] = alpha * work3[0] + beta * dstc3[x_gid];\n" "dstc4[x_gid] = alpha * work4[0] + beta * dstc4[x_gid];\n" "dstc5[x_gid] = alpha * work5[0] + beta * dstc5[x_gid];\n" "dstc6[x_gid] = alpha * work6[0] + beta * dstc6[x_gid];\n" 
"dstc7[x_gid] = alpha * work7[0] + beta * dstc7[x_gid];\n" "#endif\n" "}\n" "}\n" "#undef SLM_SIZE\n" "#undef VEC_SIZE\n" "#undef LWG_HEIGHT\n" "#undef TILE_M\n" "#undef TILE_K\n" "#undef TILE_N\n" "#undef SIMD_SIZE_GEMM\n" "#undef SHUFFLE_TYPE2\n" "#undef SHUFFLE_TYPE8\n" , "11f94a50f6b8bb41e89301f396ba7921", NULL}; struct cv::ocl::internal::ProgramEntry gemm_image_oclsrc={moduleName, "gemm_image", "#if defined(cl_khr_fp16)\n" "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" "#endif\n" "#define CONCAT(A,B) A##_##B\n" "#define TEMPLATE(name,type) CONCAT(name,type)\n" "#define KERNEL_ARG_DTYPE float\n" "#define TYPE_FLOAT 1\n" "#define TYPE_HALF 2\n" "#if TYPE == TYPE_HALF\n" "#define Dtype half\n" "#define Dtype2 half2\n" "#define Dtype4 half4\n" "#define Dtype8 half8\n" "#define Dtype16 half16\n" "#define as_Dtype as_half\n" "#define as_Dtype2 as_half2\n" "#define as_Dtype4 as_half4\n" "#define as_Dtype8 as_half8\n" "#define as_Dtype16 as_half16\n" "#else\n" "#define Dtype float\n" "#define Dtype2 float2\n" "#define Dtype4 float4\n" "#define Dtype8 float8\n" "#define Dtype16 float16\n" "#define as_Dtype as_float\n" "#define as_Dtype2 as_float2\n" "#define as_Dtype4 as_float4\n" "#define as_Dtype8 as_float8\n" "#define as_Dtype16 as_float16\n" "#endif\n" "#if defined(cl_intel_subgroups)\n" "#pragma OPENCL EXTENSION cl_intel_subgroups : enable\n" "#endif\n" "#define TILE_M 32\n" "#define TILE_K 8\n" "#if TYPE == TYPE_HALF\n" "#define SUBGROUP_BLOCK_READ8( __image, __coord ) intel_sub_group_block_read_us8( __image, __coord )\n" "#define SHUFFLE_TYPE2(val) as_ushort2(val)\n" "#define SHUFFLE_TYPE8(val) as_ushort8(val)\n" "#define READ_IMAGE(__image, __coord) read_imageh(__image, sampler, __coord)\n" "#define SIZE_OF_ELEMENT sizeof(ushort)\n" "#define SIMD_SIZE_GEMM 16\n" "#define TILE_N 16\n" "#else\n" "#define SUBGROUP_BLOCK_READ8( __image, __coord ) intel_sub_group_block_read8( __image, __coord )\n" "#define SHUFFLE_TYPE2(val) val\n" "#define SHUFFLE_TYPE8(val) 
val\n" "#define READ_IMAGE(__image, __coord) read_imagef(__image, sampler, __coord)\n" "#define SIZE_OF_ELEMENT sizeof(uint)\n" "#define SIMD_SIZE_GEMM 8\n" "#define TILE_N 8\n" "#endif\n" "#ifdef USE_IMAGE_C\n" "#if TYPE == TYPE_HALF\n" "#define BLOCKC_READ8( _C, _coordC ) as_Dtype8( intel_sub_group_block_read_us8( _C, _coordC ) )\n" "#define BLOCKC_WRITE8( _C, _coordC, _val ) intel_sub_group_block_write_us8( _C, _coordC, as_ushort8( _val ) )\n" "#else\n" "#define BLOCKC_READ8( _C, _coordC ) as_Dtype8( intel_sub_group_block_read8( _C, _coordC ) )\n" "#define BLOCKC_WRITE8( _C, _coordC, _val ) intel_sub_group_block_write8( _C, _coordC, as_uint8( _val ) )\n" "#endif\n" "#define MATC_PARAMETER __read_only image2d_t C, __write_only image2d_t dst\n" "#define GEMM_OUTPUT(ALPHA1, BETA_NOT0) GEMM_OUTPUT_EXT(ALPHA1, BETA_NOT0, C, dst, sizeof(uint))\n" "#else\n" "#define BLOCKC_READ8( _C, _coordC ) \\\n" "(Dtype8) ( (_coordC.x + get_local_id(0) < N && _coordC.y < M) ? _C[ _coordC.y * ldc + _coordC.x + get_local_id(0) ] : 0, \\\n" "(_coordC.x + get_local_id(0) < N && _coordC.y + 1 < M) ? _C[ ( _coordC.y + 1 ) * ldc + _coordC.x + get_local_id(0) ] : 0, \\\n" "(_coordC.x + get_local_id(0) < N && _coordC.y + 2 < M) ? _C[ ( _coordC.y + 2 ) * ldc + _coordC.x + get_local_id(0) ] : 0, \\\n" "(_coordC.x + get_local_id(0) < N && _coordC.y + 3 < M) ? _C[ ( _coordC.y + 3 ) * ldc + _coordC.x + get_local_id(0) ] : 0, \\\n" "(_coordC.x + get_local_id(0) < N && _coordC.y + 4 < M) ? _C[ ( _coordC.y + 4 ) * ldc + _coordC.x + get_local_id(0) ] : 0, \\\n" "(_coordC.x + get_local_id(0) < N && _coordC.y + 5 < M) ? _C[ ( _coordC.y + 5 ) * ldc + _coordC.x + get_local_id(0) ] : 0, \\\n" "(_coordC.x + get_local_id(0) < N && _coordC.y + 6 < M) ? _C[ ( _coordC.y + 6 ) * ldc + _coordC.x + get_local_id(0) ] : 0, \\\n" "(_coordC.x + get_local_id(0) < N && _coordC.y + 7 < M) ? 
_C[ ( _coordC.y + 7 ) * ldc + _coordC.x + get_local_id(0) ] : 0)\n" "#define BLOCKC_WRITE8( _C, _coordC, _val) do {\\\n" "if (_coordC.x + get_local_id(0) < N) { \\\n" "if (_coordC.y < M) \\\n" "_C[ _coordC.y * ldc + _coordC.x + get_local_id(0) ] = _val.s0; \\\n" "if (_coordC.y + 1 < M) \\\n" "_C[ ( _coordC.y + 1 )* ldc + _coordC.x + get_local_id(0) ] = _val.s1; \\\n" "if (_coordC.y + 2 < M) \\\n" "_C[ ( _coordC.y + 2 )* ldc + _coordC.x + get_local_id(0) ] = _val.s2; \\\n" "if (_coordC.y + 3 < M) \\\n" "_C[ ( _coordC.y + 3 )* ldc + _coordC.x + get_local_id(0) ] = _val.s3; \\\n" "if (_coordC.y + 4 < M) \\\n" "_C[ ( _coordC.y + 4 )* ldc + _coordC.x + get_local_id(0) ] = _val.s4; \\\n" "if (_coordC.y + 5 < M) \\\n" "_C[ ( _coordC.y + 5 )* ldc + _coordC.x + get_local_id(0) ] = _val.s5; \\\n" "if (_coordC.y + 6 < M) \\\n" "_C[ ( _coordC.y + 6 )* ldc + _coordC.x + get_local_id(0) ] = _val.s6; \\\n" "if (_coordC.y + 7 < M) \\\n" "_C[ ( _coordC.y + 7 )* ldc + _coordC.x + get_local_id(0) ] = _val.s7; \\\n" "}} while(0)\n" "#define MATC_PARAMETER __global Dtype * C, const int offC, const int M, const int N, const int ldc\n" "#define GEMM_OUTPUT(ALPHA1, BETA_NOT0) GEMM_OUTPUT_EXT(ALPHA1, BETA_NOT0, (C + offC), (C + offC), 1)\n" "#endif\n" "#define GEMM_OUTPUT_EXT(ALPHA1, BETA_NOT0, _C, _dst, _C_step) \\\n" "int2 coordDst = (int2)( ( group_x * TILE_N ) * _C_step, ( group_y * TILE_M ) ); \\\n" "int2 coordC = coordDst; \\\n" "Dtype8 blockC00; \\\n" "Dtype8 blockC01; \\\n" "Dtype8 blockC02; \\\n" "Dtype8 blockC03; \\\n" "if (BETA_NOT0) { \\\n" "blockC00 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \\\n" "blockC01 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \\\n" "blockC02 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \\\n" "blockC03 = isFirstColBlock ? 
BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); \\\n" "if (!ALPHA1) { \\\n" "blockC00 = mad(blockAxB00, (Dtype8)alpha, blockC00); \\\n" "blockC01 = mad(blockAxB01, (Dtype8)alpha, blockC01); \\\n" "blockC02 = mad(blockAxB02, (Dtype8)alpha, blockC02); \\\n" "blockC03 = mad(blockAxB03, (Dtype8)alpha, blockC03); \\\n" "} else { \\\n" "blockC00 += blockAxB00; \\\n" "blockC01 += blockAxB01; \\\n" "blockC02 += blockAxB02; \\\n" "blockC03 += blockAxB03; \\\n" "} \\\n" "} else { \\\n" "blockC00 = isFirstColBlock ? (Dtype)0. : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \\\n" "blockC01 = isFirstColBlock ? (Dtype)0. : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \\\n" "blockC02 = isFirstColBlock ? (Dtype)0. : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \\\n" "blockC03 = isFirstColBlock ? (Dtype)0. : BLOCKC_READ8( _C, coordC ); \\\n" "if (!ALPHA1) { \\\n" "blockC00 = mad(blockAxB00, (Dtype8)alpha, blockC00); \\\n" "blockC01 = mad(blockAxB01, (Dtype8)alpha, blockC01); \\\n" "blockC02 = mad(blockAxB02, (Dtype8)alpha, blockC02); \\\n" "blockC03 = mad(blockAxB03, (Dtype8)alpha, blockC03); \\\n" "} else { \\\n" "blockC00 += blockAxB00; \\\n" "blockC01 += blockAxB01; \\\n" "blockC02 += blockAxB02; \\\n" "blockC03 += blockAxB03; \\\n" "} \\\n" "} \\\n" "BLOCKC_WRITE8( _dst, coordDst, blockC00 ); coordDst.y += 8; \\\n" "BLOCKC_WRITE8( _dst, coordDst, blockC01 ); coordDst.y += 8; \\\n" "BLOCKC_WRITE8( _dst, coordDst, blockC02 ); coordDst.y += 8; \\\n" "BLOCKC_WRITE8( _dst, coordDst, blockC03 );\n" "#define TRANSPOSE_BLOCK_8( _block, _col ) \\\n" "(Dtype8)( intel_sub_group_shuffle( _block.s0, _col ), \\\n" "intel_sub_group_shuffle( _block.s1, _col ), \\\n" "intel_sub_group_shuffle( _block.s2, _col ), \\\n" "intel_sub_group_shuffle( _block.s3, _col ), \\\n" "intel_sub_group_shuffle( _block.s4, _col ), \\\n" "intel_sub_group_shuffle( _block.s5, _col ), \\\n" "intel_sub_group_shuffle( _block.s6, _col ), \\\n" "intel_sub_group_shuffle( _block.s7, _col ) );\n" "#if TYPE == 
TYPE_HALF\n" "#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB00, _blockB01 ) \\\n" "{ \\\n" "const Dtype8 acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 ); \\\n" "const Dtype8 acol1 = TRANSPOSE_BLOCK_8( _blockA, 1 ); \\\n" "const Dtype8 acol2 = TRANSPOSE_BLOCK_8( _blockA, 2 ); \\\n" "const Dtype8 acol3 = TRANSPOSE_BLOCK_8( _blockA, 3 ); \\\n" "const Dtype8 acol4 = TRANSPOSE_BLOCK_8( _blockA, 4 ); \\\n" "const Dtype8 acol5 = TRANSPOSE_BLOCK_8( _blockA, 5 ); \\\n" "const Dtype8 acol6 = TRANSPOSE_BLOCK_8( _blockA, 6 ); \\\n" "const Dtype8 acol7 = TRANSPOSE_BLOCK_8( _blockA, 7 ); \\\n" "const Dtype8 acol8 = TRANSPOSE_BLOCK_8( _blockA, 8 ); \\\n" "const Dtype8 acol9 = TRANSPOSE_BLOCK_8( _blockA, 9 ); \\\n" "const Dtype8 acola = TRANSPOSE_BLOCK_8( _blockA, 10 ); \\\n" "const Dtype8 acolb = TRANSPOSE_BLOCK_8( _blockA, 11 ); \\\n" "const Dtype8 acolc = TRANSPOSE_BLOCK_8( _blockA, 12 ); \\\n" "const Dtype8 acold = TRANSPOSE_BLOCK_8( _blockA, 13 ); \\\n" "const Dtype8 acole = TRANSPOSE_BLOCK_8( _blockA, 14 ); \\\n" "const Dtype8 acolf = TRANSPOSE_BLOCK_8( _blockA, 15 ); \\\n" "_result = mad( (Dtype8)(_blockB00.s0), acol0, _result ); \\\n" "_result = mad( (Dtype8)(_blockB00.s1), acol1, _result ); \\\n" "_result = mad( (Dtype8)(_blockB00.s2), acol2, _result ); \\\n" "_result = mad( (Dtype8)(_blockB00.s3), acol3, _result ); \\\n" "_result = mad( (Dtype8)(_blockB00.s4), acol4, _result ); \\\n" "_result = mad( (Dtype8)(_blockB00.s5), acol5, _result ); \\\n" "_result = mad( (Dtype8)(_blockB00.s6), acol6, _result ); \\\n" "_result = mad( (Dtype8)(_blockB00.s7), acol7, _result ); \\\n" "_result = mad( (Dtype8)(_blockB01.s0), acol8, _result ); \\\n" "_result = mad( (Dtype8)(_blockB01.s1), acol9, _result ); \\\n" "_result = mad( (Dtype8)(_blockB01.s2), acola, _result ); \\\n" "_result = mad( (Dtype8)(_blockB01.s3), acolb, _result ); \\\n" "_result = mad( (Dtype8)(_blockB01.s4), acolc, _result ); \\\n" "_result = mad( (Dtype8)(_blockB01.s5), acold, _result ); \\\n" "_result = mad( 
(Dtype8)(_blockB01.s6), acole, _result ); \\\n" "_result = mad( (Dtype8)(_blockB01.s7), acolf, _result ); \\\n" "}\n" "#else\n" "#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB ) \\\n" "{ \\\n" "const Dtype8 acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 ); \\\n" "const Dtype8 acol1 = TRANSPOSE_BLOCK_8( _blockA, 1 ); \\\n" "const Dtype8 acol2 = TRANSPOSE_BLOCK_8( _blockA, 2 ); \\\n" "const Dtype8 acol3 = TRANSPOSE_BLOCK_8( _blockA, 3 ); \\\n" "const Dtype8 acol4 = TRANSPOSE_BLOCK_8( _blockA, 4 ); \\\n" "const Dtype8 acol5 = TRANSPOSE_BLOCK_8( _blockA, 5 ); \\\n" "const Dtype8 acol6 = TRANSPOSE_BLOCK_8( _blockA, 6 ); \\\n" "const Dtype8 acol7 = TRANSPOSE_BLOCK_8( _blockA, 7 ); \\\n" "_result = mad( (Dtype8)(_blockB.s0), acol0, _result ); \\\n" "_result = mad( (Dtype8)(_blockB.s1), acol1, _result ); \\\n" "_result = mad( (Dtype8)(_blockB.s2), acol2, _result ); \\\n" "_result = mad( (Dtype8)(_blockB.s3), acol3, _result ); \\\n" "_result = mad( (Dtype8)(_blockB.s4), acol4, _result ); \\\n" "_result = mad( (Dtype8)(_blockB.s5), acol5, _result ); \\\n" "_result = mad( (Dtype8)(_blockB.s6), acol6, _result ); \\\n" "_result = mad( (Dtype8)(_blockB.s7), acol7, _result ); \\\n" "}\n" "#endif\n" "#if TYPE == TYPE_HALF\n" "#define GEMM_NN(ALPHA1, BETA_NOT0) \\\n" "__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \\\n" "__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \\\n" "__kernel void TEMPLATE(gemm_32_1_NN_ ##ALPHA1 ##_ ##BETA_NOT0, Dtype)( \\\n" "__read_only image2d_t A, \\\n" "__read_only image2d_t B, \\\n" "MATC_PARAMETER, \\\n" "KERNEL_ARG_DTYPE alpha_in, \\\n" "KERNEL_ARG_DTYPE beta_in, \\\n" "int width0, \\\n" "int isFirstColBlock) \\\n" "{ \\\n" "const Dtype alpha = (Dtype)alpha_in; \\\n" "const Dtype beta = (Dtype)beta_in; \\\n" "const int group_x = get_group_id(0); \\\n" "const int group_y = get_group_id(1); \\\n" "Dtype8 blockAxB00 = 0; \\\n" "Dtype8 blockAxB01 = 0; \\\n" "Dtype8 blockAxB02 = 0; \\\n" "Dtype8 blockAxB03 = 0; \\\n" "int2 coordA = 
(int2)( 0, group_y * TILE_M ); \\\n" "int2 coordB = (int2)( ( group_x * TILE_N ) * SIZE_OF_ELEMENT, 0 ); \\\n" "do \\\n" "{ \\\n" "int2 coordBTemp = coordB; \\\n" "Dtype8 blockB00 = as_Dtype8( SUBGROUP_BLOCK_READ8( B, coordBTemp ) ); coordB.y += TILE_K; \\\n" "Dtype8 blockB01 = as_Dtype8( SUBGROUP_BLOCK_READ8( B, coordBTemp ) ); coordB.y += TILE_K; \\\n" "int2 coordATemp = coordA; \\\n" "Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \\\n" "Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \\\n" "Dtype8 blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \\\n" "Dtype8 blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.x += TILE_K * SIZE_OF_ELEMENT * 2; \\\n" "MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00, blockB01 ); \\\n" "MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01, blockB00, blockB01 ); \\\n" "MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02, blockB00, blockB01 ); \\\n" "MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03, blockB00, blockB01 ); \\\n" "} \\\n" "while( coordB.y < width0 ); \\\n" "GEMM_OUTPUT(ALPHA1, BETA_NOT0); \\\n" "}\n" "#else\n" "#define GEMM_NN(ALPHA1, BETA_NOT0) \\\n" "__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \\\n" "__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \\\n" "__kernel void TEMPLATE(gemm_32_1_NN_ ##ALPHA1 ##_ ##BETA_NOT0, Dtype)( \\\n" "__read_only image2d_t A, \\\n" "__read_only image2d_t B, \\\n" "MATC_PARAMETER, \\\n" "KERNEL_ARG_DTYPE alpha_in, \\\n" "KERNEL_ARG_DTYPE beta_in, \\\n" "int width0, \\\n" "int isFirstColBlock) \\\n" "{ \\\n" "const Dtype alpha = (Dtype)alpha_in; \\\n" "const Dtype beta = (Dtype)beta_in; \\\n" "const int group_x = get_group_id(0); \\\n" "const int group_y = get_group_id(1); \\\n" "Dtype8 blockAxB00 = 0.0f; \\\n" "Dtype8 blockAxB01 = 0.0f; \\\n" "Dtype8 blockAxB02 = 0.0f; \\\n" "Dtype8 blockAxB03 = 0.0f; \\\n" "int2 coordA = (int2)( 0, group_y * TILE_M ); 
\\\n" "int2 coordB = (int2)( ( group_x * TILE_N ) * SIZE_OF_ELEMENT, 0 ); \\\n" "do \\\n" "{ \\\n" "int2 coordBTemp = coordB; \\\n" "Dtype8 blockB00 = as_Dtype8( SUBGROUP_BLOCK_READ8( B, coordBTemp ) ); coordB.y += TILE_K; \\\n" "int2 coordATemp = coordA; \\\n" "Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \\\n" "Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \\\n" "Dtype8 blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \\\n" "Dtype8 blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.x += TILE_K * SIZE_OF_ELEMENT; \\\n" "MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00 ); \\\n" "MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01, blockB00 ); \\\n" "MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02, blockB00 ); \\\n" "MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03, blockB00 ); \\\n" "} \\\n" "while( coordB.y < width0 ); \\\n" "GEMM_OUTPUT(ALPHA1, BETA_NOT0); \\\n" "}\n" "#endif\n" "GEMM_NN(1, 0)\n" "GEMM_NN(1, 1)\n" "GEMM_NN(0, 0)\n" "GEMM_NN(0, 1)\n" "#undef TRANSPOSE_BLOCK_8\n" "#undef MULTIPLY_BLOCKS_8x8\n" "#undef GEMM_NN\n" "#define TRANSPOSE_BLOCK_8(_vec, _col) \\\n" "(Dtype8)( intel_sub_group_shuffle(_vec, _col + 0), \\\n" "intel_sub_group_shuffle(_vec, _col + 1), \\\n" "intel_sub_group_shuffle(_vec, _col + 2), \\\n" "intel_sub_group_shuffle(_vec, _col + 3), \\\n" "intel_sub_group_shuffle(_vec, _col + 4), \\\n" "intel_sub_group_shuffle(_vec, _col + 5), \\\n" "intel_sub_group_shuffle(_vec, _col + 6), \\\n" "intel_sub_group_shuffle(_vec, _col + 7) )\n" "#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB, _col ) \\\n" "{ \\\n" "_result = mad( (Dtype8)(_blockB.s0), TRANSPOSE_BLOCK_8(_blockA.s0, _col), _result ); \\\n" "_result = mad( (Dtype8)(_blockB.s1), TRANSPOSE_BLOCK_8(_blockA.s1, _col), _result ); \\\n" "_result = mad( (Dtype8)(_blockB.s2), TRANSPOSE_BLOCK_8(_blockA.s2, _col), _result ); \\\n" "_result = mad( (Dtype8)(_blockB.s3), 
TRANSPOSE_BLOCK_8(_blockA.s3, _col), _result ); \\\n" "_result = mad( (Dtype8)(_blockB.s4), TRANSPOSE_BLOCK_8(_blockA.s4, _col), _result ); \\\n" "_result = mad( (Dtype8)(_blockB.s5), TRANSPOSE_BLOCK_8(_blockA.s5, _col), _result ); \\\n" "_result = mad( (Dtype8)(_blockB.s6), TRANSPOSE_BLOCK_8(_blockA.s6, _col), _result ); \\\n" "_result = mad( (Dtype8)(_blockB.s7), TRANSPOSE_BLOCK_8(_blockA.s7, _col), _result ); \\\n" "}\n" "#if TYPE == TYPE_HALF\n" "#define GEMM_TN(ALPHA1, BETA_NOT0) \\\n" "__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \\\n" "__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \\\n" "__kernel void TEMPLATE(gemm_32_1_TN_ ##ALPHA1 ##_ ##BETA_NOT0,Dtype)( \\\n" "__read_only image2d_t A, \\\n" "__read_only image2d_t B, \\\n" "MATC_PARAMETER, \\\n" "KERNEL_ARG_DTYPE alpha_in, \\\n" "KERNEL_ARG_DTYPE beta_in, \\\n" "int width0, \\\n" "int isFirstColBlock) \\\n" "{ \\\n" "const Dtype alpha = (Dtype)alpha_in; \\\n" "const Dtype beta = (Dtype)beta_in; \\\n" "const int group_x = get_group_id(0);\\\n" "const int group_y = get_group_id(1);\\\n" "Dtype8 blockAxB00 = 0;\\\n" "Dtype8 blockAxB01 = 0;\\\n" "Dtype8 blockAxB02 = 0;\\\n" "Dtype8 blockAxB03 = 0;\\\n" "int2 coordA = (int2)( group_y * TILE_M * SIZE_OF_ELEMENT, 0 );\\\n" "int2 coordB = (int2)( ( group_x * TILE_N ) * SIZE_OF_ELEMENT, 0 );\\\n" "do\\\n" "{\\\n" "int2 coordBTemp = coordB;\\\n" "Dtype8 blockB00 = as_Dtype8( SUBGROUP_BLOCK_READ8( B, coordBTemp ) ); coordB.y += TILE_K;\\\n" "int2 coordATemp = coordA;\\\n" "Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.x += 16 * SIZE_OF_ELEMENT;\\\n" "Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.y += TILE_K;\\\n" "MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00, 0); \\\n" "MULTIPLY_BLOCKS_8x8( blockAxB01, blockA00, blockB00, 8); \\\n" "MULTIPLY_BLOCKS_8x8( blockAxB02, blockA01, blockB00, 0); \\\n" "MULTIPLY_BLOCKS_8x8( blockAxB03, blockA01, blockB00, 8); \\\n" "} \\\n" 
"while( coordB.y < width0 ); \\\n" "GEMM_OUTPUT(ALPHA1, BETA_NOT0); \\\n" "}\n" "#else\n" "#define GEMM_TN(ALPHA1, BETA_NOT0) \\\n" "__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \\\n" "__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \\\n" "__kernel void TEMPLATE(gemm_32_1_TN_ ##ALPHA1 ##_ ##BETA_NOT0,Dtype)( \\\n" "__read_only image2d_t A, \\\n" "__read_only image2d_t B, \\\n" "MATC_PARAMETER, \\\n" "KERNEL_ARG_DTYPE alpha_in, \\\n" "KERNEL_ARG_DTYPE beta_in, \\\n" "int width0, \\\n" "int isFirstColBlock) \\\n" "{ \\\n" "const Dtype alpha = (Dtype)alpha_in; \\\n" "const Dtype beta = (Dtype)beta_in; \\\n" "const int group_x = get_group_id(0);\\\n" "const int group_y = get_group_id(1);\\\n" "Dtype8 blockAxB00 = 0.0f;\\\n" "Dtype8 blockAxB01 = 0.0f;\\\n" "Dtype8 blockAxB02 = 0.0f;\\\n" "Dtype8 blockAxB03 = 0.0f;\\\n" "int2 coordA = (int2)( group_y * TILE_M * SIZE_OF_ELEMENT, 0 );\\\n" "int2 coordB = (int2)( ( group_x * TILE_N ) * SIZE_OF_ELEMENT, 0 );\\\n" "do\\\n" "{\\\n" "int2 coordBTemp = coordB;\\\n" "Dtype8 blockB00 = as_Dtype8( SUBGROUP_BLOCK_READ8( B, coordBTemp ) ); coordB.y += TILE_K;\\\n" "int2 coordATemp = coordA;\\\n" "Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.x += 8 * SIZE_OF_ELEMENT;\\\n" "Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.x += 8 * SIZE_OF_ELEMENT;\\\n" "Dtype8 blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.x += 8 * SIZE_OF_ELEMENT;\\\n" "Dtype8 blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.y += TILE_K;\\\n" "MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00, 0 ); \\\n" "MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01, blockB00, 0 ); \\\n" "MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02, blockB00, 0 ); \\\n" "MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03, blockB00, 0 ); \\\n" "} \\\n" "while( coordB.y < width0 ); \\\n" "GEMM_OUTPUT(ALPHA1, BETA_NOT0); \\\n" "}\n" "#endif\n" "GEMM_TN(1, 0)\n" "GEMM_TN(1, 
1)\n" "GEMM_TN(0, 0)\n" "GEMM_TN(0, 1)\n" "#undef MULTIPLY_BLOCKS_8x8\n" "#undef TRANSPOSE_BLOCK_8\n" "#undef GEMM_TN\n" "#define TRANSPOSE_BLOCK_8( _block, _col ) \\\n" "(Dtype8)( intel_sub_group_shuffle( _block.s0, _col), \\\n" "intel_sub_group_shuffle( _block.s1, _col), \\\n" "intel_sub_group_shuffle( _block.s2, _col), \\\n" "intel_sub_group_shuffle( _block.s3, _col), \\\n" "intel_sub_group_shuffle( _block.s4, _col), \\\n" "intel_sub_group_shuffle( _block.s5, _col), \\\n" "intel_sub_group_shuffle( _block.s6, _col), \\\n" "intel_sub_group_shuffle( _block.s7, _col) )\n" "#if TYPE == TYPE_HALF\n" "#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB ) \\\n" "{ \\\n" "const Dtype8 acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 ); \\\n" "const Dtype8 acol1 = TRANSPOSE_BLOCK_8( _blockA, 1 ); \\\n" "const Dtype8 acol2 = TRANSPOSE_BLOCK_8( _blockA, 2 ); \\\n" "const Dtype8 acol3 = TRANSPOSE_BLOCK_8( _blockA, 3 ); \\\n" "const Dtype8 acol4 = TRANSPOSE_BLOCK_8( _blockA, 4 ); \\\n" "const Dtype8 acol5 = TRANSPOSE_BLOCK_8( _blockA, 5 ); \\\n" "const Dtype8 acol6 = TRANSPOSE_BLOCK_8( _blockA, 6 ); \\\n" "const Dtype8 acol7 = TRANSPOSE_BLOCK_8( _blockA, 7 ); \\\n" "const Dtype8 acol8 = TRANSPOSE_BLOCK_8( _blockA, 8 ); \\\n" "const Dtype8 acol9 = TRANSPOSE_BLOCK_8( _blockA, 9 ); \\\n" "const Dtype8 acola = TRANSPOSE_BLOCK_8( _blockA, 10 ); \\\n" "const Dtype8 acolb = TRANSPOSE_BLOCK_8( _blockA, 11 ); \\\n" "const Dtype8 acolc = TRANSPOSE_BLOCK_8( _blockA, 12 ); \\\n" "const Dtype8 acold = TRANSPOSE_BLOCK_8( _blockA, 13 ); \\\n" "const Dtype8 acole = TRANSPOSE_BLOCK_8( _blockA, 14 ); \\\n" "const Dtype8 acolf = TRANSPOSE_BLOCK_8( _blockA, 15 ); \\\n" "_result = mad( (Dtype8)_blockB.s0, acol0, _result ); \\\n" "_result = mad( (Dtype8)_blockB.s1, acol1, _result ); \\\n" "_result = mad( (Dtype8)_blockB.s2, acol2, _result ); \\\n" "_result = mad( (Dtype8)_blockB.s3, acol3, _result ); \\\n" "_result = mad( (Dtype8)_blockB.s4, acol4, _result ); \\\n" "_result = mad( (Dtype8)_blockB.s5, 
acol5, _result ); \\\n" "_result = mad( (Dtype8)_blockB.s6, acol6, _result ); \\\n" "_result = mad( (Dtype8)_blockB.s7, acol7, _result ); \\\n" "_result = mad( (Dtype8)_blockB.s8, acol8, _result ); \\\n" "_result = mad( (Dtype8)_blockB.s9, acol9, _result ); \\\n" "_result = mad( (Dtype8)_blockB.sa, acola, _result ); \\\n" "_result = mad( (Dtype8)_blockB.sb, acolb, _result ); \\\n" "_result = mad( (Dtype8)_blockB.sc, acolc, _result ); \\\n" "_result = mad( (Dtype8)_blockB.sd, acold, _result ); \\\n" "_result = mad( (Dtype8)_blockB.se, acole, _result ); \\\n" "_result = mad( (Dtype8)_blockB.sf, acolf, _result ); \\\n" "}\n" "#else\n" "#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB ) \\\n" "{ \\\n" "const Dtype8 acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 ); \\\n" "const Dtype8 acol1 = TRANSPOSE_BLOCK_8( _blockA, 1 ); \\\n" "const Dtype8 acol2 = TRANSPOSE_BLOCK_8( _blockA, 2 ); \\\n" "const Dtype8 acol3 = TRANSPOSE_BLOCK_8( _blockA, 3 ); \\\n" "const Dtype8 acol4 = TRANSPOSE_BLOCK_8( _blockA, 4 ); \\\n" "const Dtype8 acol5 = TRANSPOSE_BLOCK_8( _blockA, 5 ); \\\n" "const Dtype8 acol6 = TRANSPOSE_BLOCK_8( _blockA, 6 ); \\\n" "const Dtype8 acol7 = TRANSPOSE_BLOCK_8( _blockA, 7 ); \\\n" "_result = mad( (Dtype8)_blockB.s0, acol0, _result ); \\\n" "_result = mad( (Dtype8)_blockB.s1, acol1, _result ); \\\n" "_result = mad( (Dtype8)_blockB.s2, acol2, _result ); \\\n" "_result = mad( (Dtype8)_blockB.s3, acol3, _result ); \\\n" "_result = mad( (Dtype8)_blockB.s4, acol4, _result ); \\\n" "_result = mad( (Dtype8)_blockB.s5, acol5, _result ); \\\n" "_result = mad( (Dtype8)_blockB.s6, acol6, _result ); \\\n" "_result = mad( (Dtype8)_blockB.s7, acol7, _result ); \\\n" "}\n" "#endif\n" "#if TYPE == TYPE_HALF\n" "#define GEMM_NT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \\\n" "__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \\\n" "__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \\\n" "__kernel void TEMPLATE(gemm_32_1_NT_ ##VECSCALAR ##_ ##ALPHA1 ##_ 
##BETA_NOT0,Dtype)( \\\n" "__read_only image2d_t A, \\\n" "MATB_PARAMETER, \\\n" "MATC_PARAMETER, \\\n" "KERNEL_ARG_DTYPE alpha_in, \\\n" "KERNEL_ARG_DTYPE beta_in, \\\n" "int padded_k, \\\n" "int k, \\\n" "int isFirstColBlock) \\\n" "{ \\\n" "const Dtype alpha = (Dtype)alpha_in; \\\n" "const Dtype beta = (Dtype)beta_in; \\\n" "const int group_x = get_group_id(0); \\\n" "const int group_y = get_group_id(1); \\\n" "Dtype8 blockAxB00 = 0; \\\n" "Dtype8 blockAxB01 = 0; \\\n" "Dtype8 blockAxB02 = 0; \\\n" "Dtype8 blockAxB03 = 0; \\\n" "int2 coordA = (int2)( 0, group_y * TILE_M ); \\\n" "int2 coordB = (int2)( 0, ( group_x * TILE_N )); \\\n" "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; \\\n" "do \\\n" "{ \\\n" "Dtype16 blockB00; \\\n" "BLOCKB_READ8(blockB00, B, coordB); \\\n" "int2 coordATemp = coordA; \\\n" "Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \\\n" "Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \\\n" "Dtype8 blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \\\n" "Dtype8 blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.x += TILE_K * SIZE_OF_ELEMENT * 2; \\\n" "MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00 ); \\\n" "MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01, blockB00 ); \\\n" "MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02, blockB00 ); \\\n" "MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03, blockB00 ); \\\n" "} \\\n" "while( coordB.x < padded_k / VECSIZE ); \\\n" "GEMM_OUTPUT(ALPHA1, BETA_NOT0); \\\n" "}\n" "#else\n" "#define GEMM_NT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \\\n" "__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \\\n" "__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \\\n" "__kernel void TEMPLATE(gemm_32_1_NT_ ##VECSCALAR ##_ ##ALPHA1 ##_ ##BETA_NOT0,Dtype)( \\\n" "__read_only image2d_t A, \\\n" "MATB_PARAMETER, \\\n" "MATC_PARAMETER, \\\n" 
"KERNEL_ARG_DTYPE alpha_in, \\\n" "KERNEL_ARG_DTYPE beta_in, \\\n" "int padded_k, \\\n" "int k, \\\n" "int isFirstColBlock) \\\n" "{ \\\n" "const Dtype alpha = (Dtype)alpha_in; \\\n" "const Dtype beta = (Dtype)beta_in; \\\n" "const int group_x = get_group_id(0); \\\n" "const int group_y = get_group_id(1); \\\n" "Dtype8 blockAxB00 = 0.0f; \\\n" "Dtype8 blockAxB01 = 0.0f; \\\n" "Dtype8 blockAxB02 = 0.0f; \\\n" "Dtype8 blockAxB03 = 0.0f; \\\n" "int2 coordA = (int2)( 0, group_y * TILE_M ); \\\n" "int2 coordB = (int2)( 0, ( group_x * TILE_N )); \\\n" "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; \\\n" "do \\\n" "{ \\\n" "Dtype8 blockB00; \\\n" "BLOCKB_READ8(blockB00, B, coordB); \\\n" "int2 coordATemp = coordA; \\\n" "Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \\\n" "Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \\\n" "Dtype8 blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \\\n" "Dtype8 blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.x += TILE_K * SIZE_OF_ELEMENT; \\\n" "MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00 ); \\\n" "MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01, blockB00 ); \\\n" "MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02, blockB00 ); \\\n" "MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03, blockB00 ); \\\n" "} \\\n" "while( coordB.x < padded_k / VECSIZE ); \\\n" "GEMM_OUTPUT(ALPHA1, BETA_NOT0); \\\n" "}\n" "#endif\n" "#if TYPE == TYPE_HALF\n" "#define BLOCKB_READ8(_blockb, _B, _coordB) \\\n" "int2 _coordBTemp = _coordB; \\\n" "_coordBTemp.y += get_local_id(0); \\\n" "_blockb.s0123 = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n" "_blockb.s4567 = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n" "_blockb.s89ab = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n" "_blockb.scdef = READ_IMAGE(_B, _coordBTemp); _coordB.x += 4;\n" "#else\n" "#define 
BLOCKB_READ8(_blockb, _B, _coordB) \\\n" "int2 _coordBTemp = _coordB; \\\n" "_coordBTemp.y += get_local_id(0); \\\n" "_blockb.s0123 = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n" "_blockb.s4567 = READ_IMAGE(_B, _coordBTemp); _coordB.x += 2;\n" "#endif\n" "#define MATB_PARAMETER __read_only image2d_t B\n" "GEMM_NT(1, 0, VEC4, 4)\n" "GEMM_NT(1, 1, VEC4, 4)\n" "GEMM_NT(0, 0, VEC4, 4)\n" "GEMM_NT(0, 1, VEC4, 4)\n" "#undef BLOCKB_READ8\n" "#undef MATB_PARAMETER\n" "#if TYPE == TYPE_HALF\n" "#define BLOCKB_READ8(_blockb, _B, _coordB) \\\n" "int2 _coordBTemp = _coordB; \\\n" "_coordBTemp.y += get_local_id(0); \\\n" "const __global float *B_read = (__global float *)(_B + (_coordBTemp.y * ldb) + _coordBTemp.x + offB); \\\n" "_blockb = as_Dtype16(as_ushort16(vload8(0, B_read))); \\\n" "_coordB.x += TILE_K * 2;\n" "#else\n" "#define BLOCKB_READ8(_blockb, _B, _coordB) \\\n" "int2 _coordBTemp = _coordB; \\\n" "_coordBTemp.y += get_local_id(0); \\\n" "const __global Dtype *B_read = (__global Dtype *)(_B + (_coordBTemp.y * ldb) + _coordBTemp.x + offB); \\\n" "_blockb = vload8(0, B_read); \\\n" "_coordB.x += TILE_K;\n" "#endif\n" "#define MATB_PARAMETER __global Dtype *B, int offB, int ldb\n" "GEMM_NT(1, 0, BUFFER, 1)\n" "GEMM_NT(1, 1, BUFFER, 1)\n" "GEMM_NT(0, 0, BUFFER, 1)\n" "GEMM_NT(0, 1, BUFFER, 1)\n" "#undef BLOCKB_READ8\n" "#undef MATB_PARAMETER\n" "#if TYPE == TYPE_HALF\n" "#define BLOCKB_READ8(_blockb, _B, _coordB) \\\n" "int2 _coordBTemp = _coordB; \\\n" "_coordBTemp.y += get_local_id(0); \\\n" "Dtype4 temp; \\\n" "temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n" "_blockb.s0 = temp.s0; \\\n" "temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n" "_blockb.s1 = temp.s0; \\\n" "temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n" "_blockb.s2 = temp.s0; \\\n" "temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n" "_blockb.s3 = temp.s0; \\\n" "temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n" "_blockb.s4 = temp.s0; 
\\\n" "temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n" "_blockb.s5 = temp.s0; \\\n" "temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n" "_blockb.s6 = temp.s0; \\\n" "temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n" "_blockb.s7 = temp.s0; \\\n" "temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n" "_blockb.s8 = temp.s0; \\\n" "temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n" "_blockb.s9 = temp.s0; \\\n" "temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n" "_blockb.sa = temp.s0; \\\n" "temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n" "_blockb.sb = temp.s0; \\\n" "temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n" "_blockb.sc = temp.s0; \\\n" "temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n" "_blockb.sd = temp.s0; \\\n" "temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n" "_blockb.se = temp.s0; \\\n" "temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n" "_blockb.sf = temp.s0; \\\n" "_coordB.x += 16;\n" "#else\n" "#define BLOCKB_READ8(_blockb, _B, _coordB) \\\n" "int2 _coordBTemp = _coordB; \\\n" "_coordBTemp.y += get_local_id(0); \\\n" "Dtype4 temp; \\\n" "temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n" "_blockb.s0 = temp.s0; \\\n" "temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n" "_blockb.s1 = temp.s0; \\\n" "temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n" "_blockb.s2 = temp.s0; \\\n" "temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n" "_blockb.s3 = temp.s0; \\\n" "temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n" "_blockb.s4 = temp.s0; \\\n" "temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n" "_blockb.s5 = temp.s0; \\\n" "temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n" "_blockb.s6 = temp.s0; \\\n" "temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n" "_blockb.s7 = temp.s0; \\\n" "_coordB.x += 8;\n" "#endif\n" "#define MATB_PARAMETER __read_only 
image2d_t B\n" "GEMM_NT(1, 0, SCALAR, 1)\n" "GEMM_NT(1, 1, SCALAR, 1)\n" "GEMM_NT(0, 0, SCALAR, 1)\n" "GEMM_NT(0, 1, SCALAR, 1)\n" "#undef BLOCKB_READ8\n" "#undef MATB_PARAMETER\n" "#undef MULTIPLY_BLOCKS_8x8\n" "#undef TRANSPOSE_BLOCK_8\n" "#undef GEMM_NT\n" "#define TRANSPOSE_BLOCK_8(_vec, _col) \\\n" "(Dtype8)( intel_sub_group_shuffle(_vec, _col + 0), \\\n" "intel_sub_group_shuffle(_vec, _col + 1), \\\n" "intel_sub_group_shuffle(_vec, _col + 2), \\\n" "intel_sub_group_shuffle(_vec, _col + 3), \\\n" "intel_sub_group_shuffle(_vec, _col + 4), \\\n" "intel_sub_group_shuffle(_vec, _col + 5), \\\n" "intel_sub_group_shuffle(_vec, _col + 6), \\\n" "intel_sub_group_shuffle(_vec, _col + 7) );\n" "#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB, _col ) \\\n" "{ \\\n" "const Dtype8 acol0 = TRANSPOSE_BLOCK_8( _blockA.s0, _col ); \\\n" "const Dtype8 acol1 = TRANSPOSE_BLOCK_8( _blockA.s1, _col ); \\\n" "const Dtype8 acol2 = TRANSPOSE_BLOCK_8( _blockA.s2, _col ); \\\n" "const Dtype8 acol3 = TRANSPOSE_BLOCK_8( _blockA.s3, _col ); \\\n" "const Dtype8 acol4 = TRANSPOSE_BLOCK_8( _blockA.s4, _col ); \\\n" "const Dtype8 acol5 = TRANSPOSE_BLOCK_8( _blockA.s5, _col ); \\\n" "const Dtype8 acol6 = TRANSPOSE_BLOCK_8( _blockA.s6, _col ); \\\n" "const Dtype8 acol7 = TRANSPOSE_BLOCK_8( _blockA.s7, _col ); \\\n" "_result = mad( (Dtype8)_blockB.s0, acol0, _result ); \\\n" "_result = mad( (Dtype8)_blockB.s1, acol1, _result ); \\\n" "_result = mad( (Dtype8)_blockB.s2, acol2, _result ); \\\n" "_result = mad( (Dtype8)_blockB.s3, acol3, _result ); \\\n" "_result = mad( (Dtype8)_blockB.s4, acol4, _result ); \\\n" "_result = mad( (Dtype8)_blockB.s5, acol5, _result ); \\\n" "_result = mad( (Dtype8)_blockB.s6, acol6, _result ); \\\n" "_result = mad( (Dtype8)_blockB.s7, acol7, _result ); \\\n" "}\n" "#if TYPE == TYPE_HALF\n" "#define GEMM_TT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \\\n" "__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \\\n" 
"__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \\\n" "__kernel void TEMPLATE(gemm_32_1_TT_ ##VECSCALAR ##_ ##ALPHA1 ##_ ##BETA_NOT0, Dtype)( \\\n" "__read_only image2d_t A, \\\n" "MATB_PARAMETER, \\\n" "MATC_PARAMETER, \\\n" "KERNEL_ARG_DTYPE alpha_in, \\\n" "KERNEL_ARG_DTYPE beta_in, \\\n" "int padded_k, \\\n" "int k, \\\n" "int isFirstColBlock) \\\n" "{ \\\n" "const Dtype alpha = (Dtype)alpha_in; \\\n" "const Dtype beta = (Dtype)beta_in; \\\n" "const int group_x = get_group_id(0); \\\n" "const int group_y = get_group_id(1); \\\n" "Dtype8 blockAxB00 = 0; \\\n" "Dtype8 blockAxB01 = 0; \\\n" "Dtype8 blockAxB02 = 0; \\\n" "Dtype8 blockAxB03 = 0; \\\n" "int2 coordA = (int2)( group_y * TILE_M * SIZE_OF_ELEMENT, 0 ); \\\n" "int2 coordB = (int2)( 0, ( group_x * TILE_N )); \\\n" "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; \\\n" "do \\\n" "{ \\\n" "Dtype8 blockB00; \\\n" "BLOCKB_READ8(blockB00, B, coordB); \\\n" "int2 coordATemp = coordA; \\\n" "Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.x += 16 * SIZE_OF_ELEMENT;\\\n" "Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.y += TILE_K;\\\n" "MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00, 0); \\\n" "MULTIPLY_BLOCKS_8x8( blockAxB01, blockA00, blockB00, 8); \\\n" "MULTIPLY_BLOCKS_8x8( blockAxB02, blockA01, blockB00, 0); \\\n" "MULTIPLY_BLOCKS_8x8( blockAxB03, blockA01, blockB00, 8); \\\n" "} \\\n" "while( coordB.x < padded_k / VECSIZE ); \\\n" "GEMM_OUTPUT(ALPHA1, BETA_NOT0);\\\n" "}\n" "#else\n" "#define GEMM_TT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \\\n" "__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \\\n" "__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \\\n" "__kernel void TEMPLATE(gemm_32_1_TT_ ##VECSCALAR ##_ ##ALPHA1 ##_ ##BETA_NOT0, Dtype)( \\\n" "__read_only image2d_t A, \\\n" "MATB_PARAMETER, \\\n" "MATC_PARAMETER, \\\n" "KERNEL_ARG_DTYPE alpha_in, \\\n" 
"KERNEL_ARG_DTYPE beta_in, \\\n" "int padded_k, \\\n" "int k, \\\n" "int isFirstColBlock) \\\n" "{ \\\n" "const Dtype alpha = (Dtype)alpha_in; \\\n" "const Dtype beta = (Dtype)beta_in; \\\n" "const int group_x = get_group_id(0); \\\n" "const int group_y = get_group_id(1); \\\n" "Dtype8 blockAxB00 = 0.0f; \\\n" "Dtype8 blockAxB01 = 0.0f; \\\n" "Dtype8 blockAxB02 = 0.0f; \\\n" "Dtype8 blockAxB03 = 0.0f; \\\n" "int2 coordA = (int2)( group_y * TILE_M * SIZE_OF_ELEMENT, 0 ); \\\n" "int2 coordB = (int2)( 0, ( group_x * TILE_N )); \\\n" "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; \\\n" "do \\\n" "{ \\\n" "Dtype8 blockB00; \\\n" "BLOCKB_READ8(blockB00, B, coordB); \\\n" "int2 coordATemp = coordA; \\\n" "Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.x += 8 * SIZE_OF_ELEMENT; \\\n" "Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.x += 8 * SIZE_OF_ELEMENT; \\\n" "Dtype8 blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.x += 8 * SIZE_OF_ELEMENT; \\\n" "Dtype8 blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.y += TILE_K; \\\n" "MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00 , blockB00, 0 ); \\\n" "MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01 , blockB00, 0 ); \\\n" "MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02 , blockB00, 0 ); \\\n" "MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03 , blockB00, 0 ); \\\n" "} \\\n" "while( coordB.x < padded_k / VECSIZE ); \\\n" "GEMM_OUTPUT(ALPHA1, BETA_NOT0);\\\n" "}\n" "#endif\n" "#define BLOCKB_READ8(_blockb, _B, _coordB) \\\n" "int2 _coordBTemp = _coordB; \\\n" "_coordBTemp.y += get_local_id(0); \\\n" "_blockb.s0123 = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n" "_blockb.s4567 = READ_IMAGE(_B, _coordBTemp); _coordB.x += 2;\n" "#define MATB_PARAMETER __read_only image2d_t B\n" "GEMM_TT(1, 0, VEC4, 4)\n" "GEMM_TT(1, 1, VEC4, 4)\n" "GEMM_TT(0, 0, VEC4, 4)\n" "GEMM_TT(0, 1, VEC4, 4)\n" 
"#undef BLOCKB_READ8\n" "#undef MATB_PARAMETER\n" "#if TYPE == TYPE_HALF\n" "#define BLOCKB_READ8(_blockb, _B, _coordB) \\\n" "int2 _coordBTemp = _coordB; \\\n" "_coordBTemp.y += get_local_id(0); \\\n" "const __global float *B_read = (__global float *)(_B + (_coordBTemp.y * k) + _coordBTemp.x + offB); \\\n" "_blockb = as_Dtype8(as_ushort8(vload4(0, B_read))); \\\n" "_coordB.x += TILE_K;\n" "#else\n" "#define BLOCKB_READ8(_blockb, _B, _coordB) \\\n" "int2 _coordBTemp = _coordB; \\\n" "_coordBTemp.y += get_local_id(0); \\\n" "const __global Dtype *B_read = (__global Dtype *)(_B + (_coordBTemp.y * k) + _coordBTemp.x + offB); \\\n" "_blockb = vload8(0, B_read); \\\n" "_coordB.x += TILE_K;\n" "#endif\n" "#define MATB_PARAMETER __global Dtype *B, int offB, int ldb\n" "GEMM_TT(1, 0, BUFFER, 1)\n" "GEMM_TT(1, 1, BUFFER, 1)\n" "GEMM_TT(0, 0, BUFFER, 1)\n" "GEMM_TT(0, 1, BUFFER, 1)\n" "#undef BLOCKB_READ8\n" "#undef MATB_PARAMETER\n" "#define BLOCKB_READ8(_blockb, _B, _coordB) \\\n" "int2 _coordBTemp = _coordB; \\\n" "_coordBTemp.y += get_local_id(0); \\\n" "Dtype4 temp; \\\n" "temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \\\n" "_blockb.s0 = temp.s0; \\\n" "temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \\\n" "_blockb.s1 = temp.s0; \\\n" "temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \\\n" "_blockb.s2 = temp.s0; \\\n" "temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \\\n" "_blockb.s3 = temp.s0; \\\n" "temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \\\n" "_blockb.s4 = temp.s0; \\\n" "temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \\\n" "_blockb.s5 = temp.s0; \\\n" "temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \\\n" "_blockb.s6 = temp.s0; \\\n" "temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \\\n" "_blockb.s7 = temp.s0; \\\n" "_coordB.x += 8;\n" "#define MATB_PARAMETER __read_only image2d_t B\n" "GEMM_TT(1, 0, SCALAR, 1)\n" "GEMM_TT(1, 1, SCALAR, 1)\n" "GEMM_TT(0, 0, SCALAR, 1)\n" "GEMM_TT(0, 1, SCALAR, 1)\n" 
"#undef BLOCKB_READ8\n" "#undef MATB_PARAMETER\n" "#undef MULTIPLY_BLOCKS_8x8\n" "#undef TRANSPOSE_BLOCK_8\n" "#undef GEMM_TT\n" "#undef TILE_M\n" "#undef TILE_K\n" "#undef TILE_N\n" "#undef SUBGROUP_BLOCK_READ8\n" "#undef READ_IMAGE\n" "#undef SIZE_OF_ELEMENT\n" "__kernel void TEMPLATE(gemm_buffer_copy_image_transpose, Dtype)(\n" "__global Dtype* A,\n" "__write_only image2d_t ImA,\n" "int offA,\n" "int width,\n" "int height,\n" "int ldA)\n" "{\n" "const int gidx = get_global_id(0);\n" "const int gidy = get_global_id(1);\n" "if (gidx >= width || gidy >= height)\n" "return;\n" "int2 coord_dst = (int2)(gidx, gidy);\n" "__global Dtype* A_off = A + offA;\n" "Dtype srcA = A_off[gidy * ldA + gidx];\n" "#if TYPE == TYPE_HALF\n" "write_imageh(ImA, coord_dst, (Dtype4)srcA);\n" "#else\n" "write_imagef(ImA, coord_dst, (Dtype4)srcA);\n" "#endif\n" "}\n" "__kernel void TEMPLATE(gemm_buffer_copy_image_no_transpose, Dtype)(\n" "__global Dtype* A,\n" "__write_only image2d_t ImA,\n" "int offA,\n" "int padded_width,\n" "int padded_height,\n" "int width,\n" "int height,\n" "int ldA)\n" "{\n" "const int gidx = get_global_id(0);\n" "const int gidy = get_global_id(1);\n" "if (gidx >= padded_width || gidy >= padded_height)\n" "return;\n" "int2 coord_dst = (int2)(gidx, gidy);\n" "#if TYPE == TYPE_HALF\n" "if (gidx >= width || gidy >= height) {\n" "write_imageh(ImA, coord_dst, 0);\n" "return;\n" "}\n" "__global Dtype* A_off = A + offA;\n" "write_imageh(ImA, coord_dst, A_off[gidy * ldA + gidx]);\n" "#else\n" "if (gidx >= width || gidy >= height) {\n" "write_imageui(ImA, coord_dst, (uint4)0);\n" "return;\n" "}\n" "__global Dtype* A_off = A + offA;\n" "uint4 srcA = convert_uint4(as_uchar4(A_off[gidy * ldA + gidx]));\n" "write_imageui(ImA, coord_dst, srcA);\n" "#endif\n" "}\n" , "3b788ad998ad54977a6af81e04e13c15", NULL}; struct cv::ocl::internal::ProgramEntry im2col_oclsrc={moduleName, "im2col", "__kernel void im2col(__global const T *im_src, int im_src_offset,\n" "int channels, int 
height_inp, int width_inp,\n" "int kernel_h, int kernel_w, int pad_h, int pad_w, int stride_h, int stride_w,\n" "int height_out, int width_out,\n" "__global T *im_col, int im_col_offset\n" ")\n" "{\n" "int index = get_global_id(0);\n" "if (index >= height_out * width_out * channels)\n" "return;\n" /* Decompose the flat index into output column, output row and input channel; c_out is the first im_col row that belongs to this channel. */ "int j_out = index % width_out;\n" "int i_out = (index / width_out) % height_out;\n" "int c_inp = (index / width_out) / height_out;\n" "int c_out = c_inp * kernel_h * kernel_w;\n" "int i_inp = i_out * stride_h - pad_h;\n" "int j_inp = j_out * stride_w - pad_w;\n" "im_src += (c_inp * height_inp + i_inp) * width_inp + j_inp + im_src_offset;\n" "im_col += (c_out * height_out + i_out) * width_out + j_out + im_col_offset;\n" /* Walk the kernel window; the bounds test selects 0 for taps that fall outside the input image. */ "for (int ki = 0; ki < kernel_h; ++ki)\n" "for (int kj = 0; kj < kernel_w; ++kj) {\n" "int i = i_inp + ki;\n" "int j = j_inp + kj;\n" "*im_col = (i >= 0 && j >= 0 && i < height_inp && j < width_inp) ?\n" "im_src[ki * width_inp + kj] : 0;\n" "im_col += height_out * width_out;\n" "}\n" "}\n" , "609f199a321eef4535e1eff3ab281090", NULL}; /* Embedded OpenCL program lrn of the dnn module; its source begins with an AMD license header. Kernel LRNComputeOutput writes out = in * pow(scale, negative_beta) per element. Kernel LRNFillScale fills scale with k + alpha_over_size * (sum of squared inputs over a window of size entries along the channel axis, stepping height * width elements per channel). */ struct cv::ocl::internal::ProgramEntry lrn_oclsrc={moduleName, "lrn", "/*************************************************************************************\n" "* Copyright (c) 2015, Advanced Micro Devices, Inc.\n" "* All rights reserved.\n" "*\n" "* Redistribution and use in source and binary forms, with or without modification,\n" "* are permitted provided that the following conditions are met:\n" "*\n" "* 1. Redistributions of source code must retain the above copyright notice, this\n" "* list of conditions and the following disclaimer.\n" "*\n" "* 2. 
Redistributions in binary form must reproduce the above copyright notice,\n" "* this list of conditions and the following disclaimer in the documentation and/or\n" "* other materials provided with the distribution.\n" "*\n" "* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND\n" "* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED\n" "* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.\n" "* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,\n" "* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,\n" "* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,\n" "* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,\n" "* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)\n" "* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE\n" "* POSSIBILITY OF SUCH DAMAGE.\n" "**************************************************************************************/\n" "__kernel void LRNComputeOutput(const int nthreads, __global T* in, __global T* scale, const T negative_beta, __global T* out) {\n" "int index = get_global_id(0);\n" "int tmp = get_global_size(0);\n" "for(index; index < nthreads; index += tmp)\n" "out[index] = in[index] * pow(scale[index], negative_beta);\n" "}\n" "__kernel void LRNFillScale(const int nthreads, __global T* in, const int num, const int channels, const int height, const int width, const int size, const T alpha_over_size, const T k, __global T* scale) {\n" "int index = get_global_id(0);\n" "int tmp = get_global_size(0);\n" "for(index; index < nthreads; index += tmp) {\n" "const int w = index % width;\n" "const int h = (index / width) % height;\n" "const int n = index / width / height;\n" "const int offset = (n * channels * height + h) * width + w;\n" "const int step = height * width;\n" 
"in = in + offset;\n" "scale = scale + offset;\n" "int head = 0;\n" "const int pre_pad = (size - 1) / 2;\n" "const int post_pad = size - pre_pad - 1;\n" "T accum_scale = 0;\n" "while (head < post_pad && head < channels) {\n" "accum_scale += in[head * step] * in[head * step];\n" "++head;\n" "}\n" "while (head < channels) {\n" "accum_scale += in[head * step] * in[head * step];\n" "if (head - size >= 0) {\n" "accum_scale -= in[(head - size) * step]\n" "* in[(head - size) * step];\n" "}\n" "scale[(head - post_pad) * step] = k + accum_scale * alpha_over_size;\n" "++head;\n" "}\n" "while (head < channels + post_pad) {\n" "if (head - size >= 0) {\n" "accum_scale -= in[(head - size) * step]\n" "* in[(head - size) * step];\n" "}\n" "scale[(head - post_pad) * step] = k + accum_scale * alpha_over_size;\n" "++head;\n" "}\n" "}\n" "}\n" , "0c65eb40713b6261f88bfa6731e32733", NULL}; struct cv::ocl::internal::ProgramEntry math_oclsrc={moduleName, "math", "#if defined(cl_khr_fp16)\n" "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" "#endif\n" "#define CONCAT(A,B) A##_##B\n" "#define TEMPLATE(name,type) CONCAT(name,type)\n" "#define KERNEL_ARG_DTYPE float\n" "__kernel void TEMPLATE(axpy,Dtype)(const int n, const KERNEL_ARG_DTYPE alpha, __global const Dtype* x,\n" "const int offx, __global Dtype* y,\n" "const int offy) {\n" "for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n" "Dtype src = x[offx + index];\n" "Dtype dst = y[offy + index];\n" "y[offy + index] = convert_Dtype(alpha) * src + dst;\n" "}\n" "}\n" , "a76839299bc739767433b6d55915e1b7", NULL}; struct cv::ocl::internal::ProgramEntry matvec_mul_oclsrc={moduleName, "matvec_mul", "#if defined(cl_khr_fp16)\n" "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" "#endif\n" "#define CONCAT(A,B) A##_##B\n" "#define TEMPLATE(name,type) CONCAT(name,type)\n" "#define KERNEL_ARG_DTYPE float\n" "__kernel void TEMPLATE(matvec_mul4,Dtype)(\n" "__global const Dtype * A,\n" "int offA,\n" "unsigned int A_col_size,\n" 
"unsigned int trail_item,\n" "__global const Dtype * v,\n" "int offv,\n" "KERNEL_ARG_DTYPE alpha,\n" "KERNEL_ARG_DTYPE beta,\n" "__global Dtype4* result,\n" "int offr,\n" "__local Dtype4* work)\n" "{\n" "unsigned int row_gid = get_group_id(0);\n" "unsigned int lid = get_local_id(0);\n" "const __global Dtype *src0_read = A + row_gid * 4 * A_col_size + offA;\n" "const __global Dtype *src1_read = v + offv;\n" "result = (__global Dtype4*)((__global Dtype*)result + offr);\n" "Dtype4 dot0 = (Dtype4)(0.f);\n" "Dtype4 dot1 = (Dtype4)(0.f);\n" "Dtype4 dot2 = (Dtype4)(0.f);\n" "Dtype4 dot3 = (Dtype4)(0.f);\n" "unsigned int i = lid;\n" "while( i < A_col_size / 4) {\n" "const Dtype4 a0 = vload4(i, src0_read);\n" "const Dtype4 a1 = vload4(i, src0_read + A_col_size);\n" "const Dtype4 a2 = vload4(i, src0_read + 2 * A_col_size);\n" "const Dtype4 a3 = vload4(i, src0_read + 3 * A_col_size);\n" "const Dtype4 b0 = vload4(i, src1_read);\n" "dot0 += a0 * b0;\n" "dot1 += a1 * b0;\n" "dot2 += a2 * b0;\n" "dot3 += a3 * b0;\n" "i += get_local_size(0);\n" "}\n" "work[lid].s0 = dot0.x + dot0.y + dot0.z + dot0.w;\n" "work[lid].s1 = dot1.x + dot1.y + dot1.z + dot1.w;\n" "work[lid].s2 = dot2.x + dot2.y + dot2.z + dot2.w;\n" "work[lid].s3 = dot3.x + dot3.y + dot3.z + dot3.w;\n" "if(i == A_col_size / 4)\n" "{\n" "if(trail_item != 0)\n" "{\n" "const __global Dtype *src0_trail = src0_read + i * 4;\n" "const __global Dtype *src1_trail = src1_read + i * 4;\n" "for(unsigned int i = 0; i < trail_item; ++i) {\n" "const Dtype at0 = src0_trail[i];\n" "const Dtype at1 = src0_trail[i + A_col_size];\n" "const Dtype at2 = src0_trail[i + 2 * A_col_size];\n" "const Dtype at3 = src0_trail[i + 3 * A_col_size];\n" "const Dtype bt = src1_trail[i];\n" "work[lid].s0 += at0 * bt;\n" "work[lid].s1 += at1 * bt;\n" "work[lid].s2 += at2 * bt;\n" "work[lid].s3 += at3 * bt;\n" "}\n" "}\n" "}\n" "for(unsigned int stride=get_local_size(0)/2 ; stride>0 ; stride>>=1) {\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if(lid < stride)\n" 
"work[lid] += work[lid+stride];\n" "}\n" "if(lid == 0) {\n" "if(beta == (Dtype)0)\n" "result[row_gid] = convert_Dtype(alpha) * work[0];\n" "else\n" "result[row_gid] = convert_Dtype(alpha) * work[0] + convert_Dtype(beta) * result[row_gid];\n" "}\n" "}\n" "__kernel void TEMPLATE(matvec_mul1,Dtype)(\n" "__global const Dtype * A,\n" "int offA,\n" "unsigned int A_col_size,\n" "unsigned int row_offset,\n" "unsigned int trail_item,\n" "__global const Dtype * v,\n" "int offv,\n" "KERNEL_ARG_DTYPE alpha,\n" "KERNEL_ARG_DTYPE beta,\n" "__global Dtype * result,\n" "int offr,\n" "__local Dtype * work)\n" "{\n" "unsigned int row_gid = get_group_id(0);\n" "unsigned int lid = get_local_id(0);\n" "const __global Dtype *src0_read = A + (row_offset + row_gid) * A_col_size + offA;\n" "const __global Dtype *src1_read = v + + offv;\n" "result = result + offr;\n" "Dtype4 dot0 = (Dtype4)(0.f);\n" "unsigned int i = lid;\n" "while( i < A_col_size / 4)\n" "{\n" "const Dtype4 a0 = vload4(i, src0_read);\n" "const Dtype4 b0 = vload4(i, src1_read);\n" "dot0 += a0 * b0;\n" "i += get_local_size(0);\n" "}\n" "work[lid] = dot0.x + dot0.y + dot0.z + dot0.w;\n" "if(i == A_col_size / 4)\n" "{\n" "if(trail_item != 0)\n" "{\n" "const __global Dtype *src0_trail = src0_read + i * 4;\n" "const __global Dtype *src1_trail = src1_read + i * 4;\n" "for(unsigned int i = 0; i < trail_item; ++i) {\n" "const Dtype at0 = src0_trail[i];\n" "const Dtype bt = src1_trail[i];\n" "work[lid] += at0 * bt;\n" "}\n" "}\n" "}\n" "for(unsigned int stride=get_local_size(0)/2 ; stride>0 ; stride>>=1) {\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if(lid < stride)\n" "work[lid] += work[lid+stride];\n" "}\n" "if(lid == 0) {\n" "if(beta == (Dtype)0) {\n" "result[row_gid+row_offset] = convert_Dtype(alpha) * work[0];\n" "} else {\n" "result[row_gid+row_offset] *= convert_Dtype(beta);\n" "result[row_gid+row_offset] += convert_Dtype(alpha) * work[0];\n" "}\n" "}\n" "}\n" , "b1ea7917f8161740ee6102617a54cfe1", NULL}; struct 
cv::ocl::internal::ProgramEntry mvn_oclsrc={moduleName, "mvn", /* Embedded OpenCL program mvn of the dnn module. NUM (8, 4 or 1) selects the vector width through the load, store and vec_type macros and picks the suffixed kernel names. The first defined macro among KERNEL_MEAN, KERNEL_MVN, KERNEL_MEAN_FUSE and KERNEL_MVN_FUSE selects the single kernel that is compiled; with none defined the source reaches an #error directive. Dtype is fixed to float in this program. */ "#if defined(cl_khr_fp16)\n" "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" "#endif\n" "#define Dtype float\n" "#define Dtype4 float4\n" "#define Dtype8 float8\n" "#if NUM == 8\n" "#define load(src, index) vload8(0, src + index)\n" "#define store(vec, dst, index) vstore8(vec, 0, dst + index)\n" "#define vec_type Dtype8\n" "#define CALC_MEAN calc_mean8\n" "#define MVN mvn8\n" "#define MEAN_FUSE mean_fuse8\n" "#define MVN_FUSE mvn_fuse8\n" "#elif NUM == 4\n" "#define load(src, index) vload4(0, src + index)\n" "#define store(vec, dst, index) vstore4(vec, 0, dst + index)\n" "#define vec_type Dtype4\n" "#define CALC_MEAN calc_mean4\n" "#define MVN mvn4\n" "#define MEAN_FUSE mean_fuse4\n" "#define MVN_FUSE mvn_fuse4\n" "#elif NUM == 1\n" "#define load(src, index) src[index]\n" "#define store(vec, dst, index) dst[index] = vec\n" "#define vec_type Dtype\n" "#define CALC_MEAN calc_mean1\n" "#define MVN mvn1\n" "#define MEAN_FUSE mean_fuse1\n" "#define MVN_FUSE mvn_fuse1\n" "#endif\n" "#ifdef KERNEL_MEAN\n" /* CALC_MEAN: despite its name this kernel reads an already computed mean[x] and stores the squared deviation (src - mean) * (src - mean) for NUM consecutive elements of row x, starting at column get_global_id(1) * NUM. */ "__kernel void CALC_MEAN(__global const Dtype* src,\n" "const int rows,\n" "const int cols,\n" "__global Dtype* mean,\n" "__global Dtype* dst)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1) * NUM;\n" "int index = x * cols + y;\n" "if (x >= rows || y >= cols)\n" "return;\n" "Dtype mean_val = mean[x];\n" "vec_type src_vec = load(src, index);\n" "vec_type dst_vec = src_vec - (vec_type)mean_val;\n" "dst_vec = dst_vec * dst_vec;\n" "store(dst_vec, dst, index);\n" "}\n" "#elif defined KERNEL_MVN\n" /* MVN: subtracts mean[x] from NUM consecutive elements of row x, multiplies by 1 / sqrt(eps + dev[x]) when NORM_VARIANCE is defined, applies bnorm_weight and bnorm_bias indexed by x % channels when FUSE_BATCH_NORM is defined, and with FUSE_RELU replaces non-positive results by result * relu_slope. */ "__kernel void MVN(__global const Dtype* src,\n" "const int rows,\n" "const int cols,\n" "const Dtype eps,\n" "__global const Dtype* mean,\n" "__global const Dtype* dev,\n" "__global const Dtype* bnorm_weight,\n" "__global const Dtype* bnorm_bias,\n" "const int channels,\n" "const float relu_slope,\n" "__global Dtype* dst)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1) * NUM;\n" "int 
index = x * cols + y;\n" "if (x >= rows || y >= cols)\n" "return;\n" "Dtype mean_val = mean[x];\n" "Dtype dev_val = dev[x];\n" "Dtype alpha;\n" "#ifdef NORM_VARIANCE\n" "alpha = 1 / sqrt(eps + dev_val);\n" "#else\n" "alpha = 1;\n" "#endif\n" "Dtype w = 1.f, b = 0.f;\n" "#ifdef FUSE_BATCH_NORM\n" "w = bnorm_weight[x % channels];\n" "b = bnorm_bias[x % channels];\n" "#endif\n" "vec_type src_vec = load(src, index) - (vec_type)mean_val;\n" "vec_type dst_vec = src_vec * alpha;\n" "dst_vec = dst_vec * w + (vec_type)b;\n" "#ifdef FUSE_RELU\n" "vec_type new_val = dst_vec * relu_slope;\n" "dst_vec = select(new_val, dst_vec, dst_vec > (vec_type)0.f);\n" "#endif\n" "store(dst_vec, dst, index);\n" "}\n" "#elif defined KERNEL_MEAN_FUSE\n" "__kernel void MEAN_FUSE(__global const T * A,\n" "unsigned int A_col_size,\n" "float alpha,\n" "__global T4 * mean,\n" "__global Dtype * tmp)\n" "{\n" "unsigned int row_gid = get_group_id(0);\n" "unsigned int lid = get_local_id(0);\n" "const __global T *src0_read = A + row_gid * 4 * A_col_size;\n" "__global Dtype *dst0_read = tmp + row_gid * 4 * A_col_size;\n" "Dtype4 dot0, dot1, dot2, dot3;\n" "dot0 = dot1 = dot2 = dot3 = (Dtype4)(0.f);\n" "unsigned int i = lid;\n" "const Dtype4 b0 = (Dtype4)1.f;\n" "while( i < A_col_size / 4)\n" "{\n" "const T4 a0 = vload4(i, src0_read);\n" "const T4 a1 = vload4(i, src0_read + A_col_size);\n" "const T4 a2 = vload4(i, src0_read + 2 * A_col_size);\n" "const T4 a3 = vload4(i, src0_read + 3 * A_col_size);\n" "dot0 += convert_float4(a0);\n" "dot1 += convert_float4(a1);\n" "dot2 += convert_float4(a2);\n" "dot3 += convert_float4(a3);\n" "i += LOCAL_SIZE;\n" "}\n" "__local Dtype4 work[LOCAL_SIZE];\n" "work[lid].s0 = dot(dot0, b0);\n" "work[lid].s1 = dot(dot1, b0);\n" "work[lid].s2 = dot(dot2, b0);\n" "work[lid].s3 = dot(dot3, b0);\n" "for(unsigned int stride=LOCAL_SIZE/2 ; stride>0 ; stride>>=1)\n" "{\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if(lid < stride)\n" "work[lid] += work[lid+stride];\n" "}\n" 
"barrier(CLK_LOCAL_MEM_FENCE);\n" "if(lid == 0)\n" "{\n" "mean[row_gid] = convert_T(alpha * work[0]);\n" "}\n" "Dtype4 sum = work[0] * alpha;\n" "i = lid;\n" "while( i < A_col_size / 4)\n" "{\n" "const T4 a0 = vload4(i, src0_read);\n" "const T4 a1 = vload4(i, src0_read + A_col_size);\n" "const T4 a2 = vload4(i, src0_read + 2 * A_col_size);\n" "const T4 a3 = vload4(i, src0_read + 3 * A_col_size);\n" "dot0 = convert_float4(a0) - (Dtype4)sum.x;\n" "dot1 = convert_float4(a1) - (Dtype4)sum.y;\n" "dot2 = convert_float4(a2) - (Dtype4)sum.z;\n" "dot3 = convert_float4(a3) - (Dtype4)sum.w;\n" "dot0 = dot0 * dot0;\n" "dot1 = dot1 * dot1;\n" "dot2 = dot2 * dot2;\n" "dot3 = dot3 * dot3;\n" "vstore4(dot0, i, dst0_read);\n" "vstore4(dot1, i, dst0_read + A_col_size);\n" "vstore4(dot2, i, dst0_read + 2 * A_col_size);\n" "vstore4(dot3, i, dst0_read + 3 * A_col_size);\n" "i += LOCAL_SIZE;\n" "}\n" "}\n" "#elif defined KERNEL_MVN_FUSE\n" "__kernel void MVN_FUSE(__global const Dtype * tmp,\n" "__global const T * A,\n" "__global const T4 * mean,\n" "unsigned int A_col_size,\n" "const float alpha_val,\n" "const float eps,\n" "const float relu_slope,\n" "__global const Dtype4 * bnorm_weight,\n" "__global const Dtype4 * bnorm_bias,\n" "__global T * B)\n" "{\n" "unsigned int row_gid = get_group_id(0);\n" "unsigned int lid = get_local_id(0);\n" "const __global Dtype *src0_read = tmp + row_gid * 4 * A_col_size;\n" "const __global T *src1_read = A + row_gid * 4 * A_col_size;\n" "__global T *dst0_read = B + row_gid * 4 * A_col_size;\n" "Dtype4 dot0, dot1, dot2, dot3;\n" "dot0 = dot1 = dot2 = dot3 = (Dtype4)(0.f);\n" "unsigned int i = lid;\n" "const Dtype4 b0 = (Dtype4)1.f;\n" "while( i < A_col_size / 4)\n" "{\n" "const Dtype4 a0 = vload4(i, src0_read);\n" "const Dtype4 a1 = vload4(i, src0_read + A_col_size);\n" "const Dtype4 a2 = vload4(i, src0_read + 2 * A_col_size);\n" "const Dtype4 a3 = vload4(i, src0_read + 3 * A_col_size);\n" "dot0 += a0;\n" "dot1 += a1;\n" "dot2 += a2;\n" "dot3 += a3;\n" 
"i += LOCAL_SIZE;\n" "}\n" "__local Dtype4 work[LOCAL_SIZE];\n" "work[lid].s0 = dot(dot0, b0);\n" "work[lid].s1 = dot(dot1, b0);\n" "work[lid].s2 = dot(dot2, b0);\n" "work[lid].s3 = dot(dot3, b0);\n" "for(unsigned int stride=LOCAL_SIZE/2 ; stride>0 ; stride>>=1)\n" "{\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if(lid < stride)\n" "work[lid] += work[lid+stride];\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "Dtype4 mean_val = convert_float4(mean[row_gid]);\n" "Dtype4 dev_val = sqrt(work[0] * alpha_val + (Dtype4)eps);\n" "Dtype4 alpha = (Dtype4)1.f / dev_val;\n" "Dtype4 w = (Dtype4)1.f;\n" "Dtype4 b = (Dtype4)0.f;\n" "#ifdef FUSE_BATCH_NORM\n" "w = bnorm_weight[row_gid];\n" "b = bnorm_bias[row_gid];\n" "#endif\n" "i = lid;\n" "while( i < A_col_size / 4)\n" "{\n" "const T4 a0 = vload4(i, src1_read);\n" "const T4 a1 = vload4(i, src1_read + A_col_size);\n" "const T4 a2 = vload4(i, src1_read + 2 * A_col_size);\n" "const T4 a3 = vload4(i, src1_read + 3 * A_col_size);\n" "dot0 = (convert_float4(a0) - (Dtype4)mean_val.x) * alpha.x;\n" "dot1 = (convert_float4(a1) - (Dtype4)mean_val.y) * alpha.y;\n" "dot2 = (convert_float4(a2) - (Dtype4)mean_val.z) * alpha.z;\n" "dot3 = (convert_float4(a3) - (Dtype4)mean_val.w) * alpha.w;\n" "dot0 = dot0 * w.x + (Dtype4)b.x;\n" "dot1 = dot1 * w.y + (Dtype4)b.y;\n" "dot2 = dot2 * w.z + (Dtype4)b.z;\n" "dot3 = dot3 * w.w + (Dtype4)b.w;\n" "#ifdef FUSE_RELU\n" "Dtype4 new0 = dot0 * relu_slope;\n" "dot0 = select(new0, dot0, dot0 > (Dtype4)0.f);\n" "Dtype4 new1 = dot1 * relu_slope;\n" "dot1 = select(new1, dot1, dot1 > (Dtype4)0.f);\n" "Dtype4 new2 = dot2 * relu_slope;\n" "dot2 = select(new2, dot2, dot2 > (Dtype4)0.f);\n" "Dtype4 new3 = dot3 * relu_slope;\n" "dot3 = select(new3, dot3, dot3 > (Dtype4)0.f);\n" "#endif\n" "vstore4(convert_T(dot0), i, dst0_read);\n" "vstore4(convert_T(dot1), i, dst0_read + A_col_size);\n" "vstore4(convert_T(dot2), i, dst0_read + 2 * A_col_size);\n" "vstore4(convert_T(dot3), i, dst0_read + 3 * A_col_size);\n" "i += 
LOCAL_SIZE;\n" "}\n" "}\n" "#else\n" "#error \"Configuration error!\"\n" "#endif\n" , "d0e6334dcdc9ef67a14d01801722c035", NULL}; struct cv::ocl::internal::ProgramEntry ocl4dnn_lrn_oclsrc={moduleName, "ocl4dnn_lrn", "#define CONCAT(A,B) A##_##B\n" "#define TEMPLATE(name,type) CONCAT(name,type)\n" "#define KERNEL_ARG_DTYPE float\n" "#if defined(cl_khr_fp16)\n" "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" "#endif\n" "__kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int nthreads, __global const Dtype* in,\n" "const int num, const int channels,\n" "const int height, const int width, const int size,\n" "const KERNEL_ARG_DTYPE alpha_over_size, const KERNEL_ARG_DTYPE k,\n" "__global Dtype* const out,\n" "const KERNEL_ARG_DTYPE negative_beta) {\n" "for (int index = get_global_id(0); index < nthreads;\n" "index += get_global_size(0)) {\n" "const int w = index % width;\n" "const int h = (index / width) % height;\n" "const int n = index / width / height;\n" "const int offset = (n * channels * height + h) * width + w;\n" "const int step = height * width;\n" "__global const Dtype* in_off = in + offset;\n" "__global Dtype* out_off = out + offset;\n" "KERNEL_ARG_DTYPE scale_val;\n" "int head = 0;\n" "const int pre_pad = (size - 1) / 2;\n" "const int post_pad = size - pre_pad - 1;\n" "KERNEL_ARG_DTYPE accum_scale = 0;\n" "while (head < post_pad && head < channels) {\n" "accum_scale += in_off[head * step] * in_off[head * step];\n" "++head;\n" "}\n" "while (head < channels) {\n" "accum_scale += in_off[head * step] * in_off[head * step];\n" "if (head - size >= 0) {\n" "accum_scale -= in_off[(head - size) * step]\n" "* in_off[(head - size) * step];\n" "}\n" "scale_val = k + accum_scale * alpha_over_size;\n" "out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr(scale_val, negative_beta);\n" "++head;\n" "}\n" "while (head < channels + post_pad) {\n" "if (head - size >= 0) {\n" "accum_scale -= in_off[(head - size) * step]\n" "* in_off[(head - 
size) * step];\n" "}\n" "scale_val = k + accum_scale * alpha_over_size;\n" "out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr(scale_val, negative_beta);\n" "++head;\n" "}\n" "}\n" "}\n" , "5b3b0615ca2e06228fef74b23250379b", NULL}; struct cv::ocl::internal::ProgramEntry ocl4dnn_pooling_oclsrc={moduleName, "ocl4dnn_pooling", "#define CONCAT(A,B) A##_##B\n" "#define TEMPLATE(name,type) CONCAT(name,type)\n" "#if defined(cl_khr_fp16)\n" "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" "#endif\n" "#if defined KERNEL_MAX_POOL\n" "__kernel void\n" "#ifdef HAVE_MASK\n" "TEMPLATE(max_pool_forward_mask, Dtype)\n" "#else\n" "TEMPLATE(max_pool_forward, Dtype)\n" "#endif\n" "(\n" "const int nthreads, __global const Dtype* bottom_data,\n" "const int channels, const int height, const int width,\n" "const int pooled_height, const int pooled_width,\n" "__global Dtype* top_data\n" "#ifdef HAVE_MASK\n" ", __global Dtype* mask\n" "#endif\n" ")\n" "{\n" "int index = get_global_id(0);\n" "if (index >= nthreads)\n" "return;\n" "const int pw = index % pooled_width;\n" "const int xx = index / pooled_width;\n" "const int ph = xx % pooled_height;\n" "const int ch = xx / pooled_height;\n" "int hstart = ph * STRIDE_H - PAD_T;\n" "int wstart = pw * STRIDE_W - PAD_L;\n" "Dtype maxval = -FLT_MAX;\n" "int maxidx = -1;\n" "int in_offset = ch * height * width;\n" "for (int h = 0; h < KERNEL_H; ++h)\n" "{\n" "int off_y = hstart + h;\n" "if (off_y >= 0 && off_y < height)\n" "{\n" "for (int w = 0; w < KERNEL_W; ++w)\n" "{\n" "int off_x = wstart + w;\n" "if (off_x >= 0 && off_x < width)\n" "{\n" "Dtype val = bottom_data[in_offset + off_y * width + off_x];\n" "maxidx = (val > maxval) ? 
(off_y * width + off_x) : maxidx;\n" "maxval = fmax(val, maxval);\n" "}\n" "}\n" "}\n" "}\n" "top_data[index] = maxval;\n" "#ifdef HAVE_MASK\n" "mask[index] = maxidx;\n" "#endif\n" "}\n" "#elif defined KERNEL_AVE_POOL\n" "__kernel void TEMPLATE(ave_pool_forward, Dtype)(\n" "const int nthreads, __global const Dtype* bottom_data,\n" "const int channels, const int height, const int width,\n" "const int pooled_height, const int pooled_width,\n" "__global Dtype* top_data)\n" "{\n" "int index = get_global_id(0);\n" "if (index >= nthreads)\n" "return;\n" "const int pw = index % pooled_width;\n" "const int xx = index / pooled_width;\n" "const int ph = xx % pooled_height;\n" "const int ch = xx / pooled_height;\n" "int hstart = ph * STRIDE_H - PAD_T;\n" "int wstart = pw * STRIDE_W - PAD_L;\n" "int hend = min(hstart + KERNEL_H, height + PAD_B);\n" "int wend = min(wstart + KERNEL_W, width + PAD_R);\n" "int pool_size;\n" "#ifdef AVE_POOL_PADDING_AREA\n" "pool_size = (hend - hstart) * (wend - wstart);\n" "hstart = max(hstart, (int)0);\n" "wstart = max(wstart, (int)0);\n" "hend = min(hend, height);\n" "wend = min(wend, width);\n" "#else\n" "hstart = max(hstart, (int)0);\n" "wstart = max(wstart, (int)0);\n" "hend = min(hend, height);\n" "wend = min(wend, width);\n" "pool_size = (hend - hstart) * (wend - wstart);\n" "#endif\n" "Dtype aveval = 0;\n" "int in_offset = ch * height * width;\n" "for (int h = hstart; h < hend; ++h)\n" "{\n" "for (int w = wstart; w < wend; ++w)\n" "{\n" "aveval += bottom_data[in_offset + h * width + w];\n" "}\n" "}\n" "top_data[index] = aveval / pool_size;\n" "}\n" "#elif defined KERNEL_STO_POOL\n" "__kernel void TEMPLATE(sto_pool_forward_test,Dtype)(\n" "const int nthreads, __global const Dtype* bottom_data,\n" "const int channels, const int height, const int width,\n" "const int pooled_height, const int pooled_width,\n" "__global Dtype* top_data)\n" "{\n" "for (int index = get_global_id(0); index < nthreads;\n" "index += get_global_size(0))\n" "{\n" 
"const int pw = index % pooled_width;\n" "const int ph = (index / pooled_width) % pooled_height;\n" "const int c = (index / pooled_width / pooled_height) % channels;\n" "const int n = index / pooled_width / pooled_height / channels;\n" "const int hstart = ph * STRIDE_H;\n" "const int hend = min(hstart + KERNEL_H, height);\n" "const int wstart = pw * STRIDE_W;\n" "const int wend = min(wstart + KERNEL_W, width);\n" "Dtype cumsum = FLT_MIN;\n" "Dtype cumvalues = 0.;\n" "__global const Dtype* bottom_slice = bottom_data\n" "+ (n * channels + c) * height * width;\n" "for (int h = hstart; h < hend; ++h) {\n" "for (int w = wstart; w < wend; ++w) {\n" "Dtype v = bottom_slice[h * width + w];\n" "cumsum += v;\n" "cumvalues += v * v;\n" "}\n" "}\n" "top_data[index] = cumvalues / cumsum;\n" "}\n" "}\n" "#endif\n" , "323321c5f6f114f2693c552e81e87230", NULL}; /* OpenCL program entry named permute (module dnn). Its single kernel walks output indices i < nthreads, peels i apart axis by axis with newStride[j], accumulates the source offset from oldStride[permute_order[j]], and copies bottom_data[oldPosition] into top_data[i]. Dtype is not defined in this source, so it is presumably injected through build options - confirm at the call site. NOTE(review): the 32-hex-digit string that follows the source text looks like a digest of that text; presumably the two must be regenerated together - confirm in the generator. */ struct cv::ocl::internal::ProgramEntry permute_oclsrc={moduleName, "permute", "#if defined(cl_khr_fp16)\n" "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" "#endif\n" "__kernel void permute(const int nthreads,\n" "__global Dtype* bottom_data,\n" "global int* permute_order,\n" "global int* oldStride,\n" "global int* newStride,\n" "const int num_axes,\n" "__global Dtype* top_data)\n" "{\n" "for (int i = get_global_id(0); i < nthreads; i += get_global_size(0))\n" "{\n" "int oldPosition = 0;\n" "int newPosition = i;\n" "for (int j = 0; j < num_axes; ++j)\n" "{\n" "int order = permute_order[j];\n" "oldPosition += (newPosition / newStride[j]) * oldStride[order];\n" "newPosition %= newStride[j];\n" "}\n" "top_data[i] = bottom_data[oldPosition];\n" "}\n" "}\n" , "81803672217de8c6fc01de8a6e7f283a", NULL}; /* OpenCL program entry named pooling: kernels MaxPoolForward and AvePoolForward over element type T. T is not defined in this source; presumably a build option - confirm. */ struct cv::ocl::internal::ProgramEntry pooling_oclsrc={moduleName, "pooling", "/*************************************************************************************\n" "* Copyright (c) 2015, Advanced Micro Devices, Inc.\n" "* All rights reserved.\n" "*\n" "* Redistribution and use in source and binary forms, with or without 
modification,\n" "* are permitted provided that the following conditions are met:\n" "*\n" "* 1. Redistributions of source code must retain the above copyright notice, this\n" "* list of conditions and the following disclaimer.\n" "*\n" "* 2. Redistributions in binary form must reproduce the above copyright notice,\n" "* this list of conditions and the following disclaimer in the documentation and/or\n" "* other materials provided with the distribution.\n" "*\n" "* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND\n" "* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED\n" "* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.\n" "* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,\n" "* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,\n" "* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,\n" "* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,\n" "* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)\n" "* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE\n" "* POSSIBILITY OF SUCH DAMAGE.\n" "**************************************************************************************/\n" /* MaxPoolForward: each work-item starts at get_global_id(0) and advances by get_global_size(0). For every output index it scans the window [hstart,hend) x [wstart,wend) clipped to the input, writes the maximum to top_data and, when MASK is defined, the flat in-plane index of that maximum to mask. NOTE(review): bottom_data is reassigned inside the loop (bottom_data = bottom_data + offset), so the offsets accumulate if a work-item runs more than one iteration - confirm the launch always covers nthreads. */ "__kernel void MaxPoolForward(const int nthreads,\n" "__global T* bottom_data, const int num, const int channels, const int height, const int width,\n" "const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w,\n" "const int stride_h, const int stride_w, const int pad_t, const int pad_l, const int pad_b, const int pad_r,\n" "__global T* top_data\n" "#ifdef MASK\n" ", __global float* mask\n" "#endif\n" ")\n" "{\n" "int index = get_global_id(0);\n" "int tmp = get_global_size(0);\n" "for(index; index < nthreads; index += tmp) {\n" "int pw = index % pooled_width;\n" "int ph = (index / 
pooled_width) % pooled_height;\n" "int c = (index / pooled_width / pooled_height) % channels;\n" "int n = index / pooled_width / pooled_height / channels;\n" "int hstart = ph * stride_h - pad_t;\n" "int wstart = pw * stride_w - pad_l;\n" "const int hend = min(hstart + kernel_h, height);\n" "const int wend = min(wstart + kernel_w, width);\n" "hstart = max(hstart, 0);\n" "wstart = max(wstart, 0);\n" "T maxval = -FLT_MAX;\n" "int maxidx = -1;\n" "bottom_data =\n" "bottom_data + (n * channels + c) * height * width;\n" "for (int h = hstart; h < hend; ++h) {\n" "for (int w = wstart; w < wend; ++w) {\n" "if (bottom_data[h * width + w] > maxval) {\n" "maxidx = h * width + w;\n" "maxval = bottom_data[maxidx];\n" "}\n" "}\n" "}\n" "top_data[index] = maxval;\n" "#ifdef MASK\n" "mask[index] = maxidx;\n" "#endif\n" "}\n" "}\n" /* AvePoolForward: same index walk as MaxPoolForward. pool_size is taken from the window extent before the window is clipped to the input, so padded cells count toward the divisor; the clipped window is then summed and divided by pool_size. The same in-loop reassignment of bottom_data applies here. */ "__kernel void AvePoolForward(const int nthreads,\n" "__global T* bottom_data, const int num, const int channels, const int height, const int width,\n" "const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w,\n" "const int stride_h, const int stride_w, const int pad_t, const int pad_l, const int pad_b, const int pad_r,\n" "__global T* top_data\n" "#ifdef MASK\n" ", __global float* mask\n" "#endif\n" ")\n" "{\n" "int index = get_global_id(0);\n" "int tmp = get_global_size(0);\n" "for(index; index < nthreads; index+=tmp) {\n" "int pw = index % pooled_width;\n" "int ph = (index / pooled_width) % pooled_height;\n" "int c = (index / pooled_width / pooled_height) % channels;\n" "int n = index / pooled_width / pooled_height / channels; int hstart = ph * stride_h - pad_t; int wstart = pw * stride_w - pad_l;\n" "int hend = min(hstart + kernel_h, height + pad_b);\n" "int wend = min(wstart + kernel_w, width + pad_r);\n" "const int pool_size = (hend - hstart) * (wend - wstart);\n" "hstart = max(hstart, 0);\n" "wstart = max(wstart, 0);\n" "hend = min(hend, height);\n" "wend = min(wend, width);\n" "T aveval = 0;\n" "bottom_data =\n" 
"bottom_data + (n * channels + c) * height * width;\n" "for (int h = hstart; h < hend; ++h) {\n" "for (int w = wstart; w < wend; ++w) {\n" "aveval += bottom_data[h * width + w];\n" "}\n" "}\n" "top_data[index] = aveval / pool_size;\n" "}\n" "}\n" , "d2fa86ff9f1a4b51458f3caf054ff85f", NULL}; /* OpenCL program entry named prior_box: kernels prior_box, set_variance and clip. The prior_box kernel maps each index to a cell (w = index % _layerWidth, h = index / _layerWidth) and writes widths_size * offsetsX_size boxes of four values each, starting at dst + index * 4 * offsetsX_size * widths_size; every box is (center -/+ half box size) divided by imgWidth or imgHeight. Dtype, Dtype4 and convert_T are not defined in this source; presumably build options - confirm. */ struct cv::ocl::internal::ProgramEntry prior_box_oclsrc={moduleName, "prior_box", "#if defined(cl_khr_fp16)\n" "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" "#endif\n" "__kernel void prior_box(const int nthreads,\n" "const float stepX,\n" "const float stepY,\n" "__global const float* _offsetsX,\n" "__global const float* _offsetsY,\n" "const int offsetsX_size,\n" "__global const float* _widths,\n" "__global const float* _heights,\n" "const int widths_size,\n" "__global Dtype* dst,\n" "const int _layerHeight,\n" "const int _layerWidth,\n" "const int imgHeight,\n" "const int imgWidth)\n" "{\n" "for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))\n" "{\n" "int w = index % _layerWidth;\n" "int h = index / _layerWidth;\n" "__global Dtype* outputPtr;\n" "outputPtr = dst + index * 4 * offsetsX_size * widths_size;\n" "float _boxWidth, _boxHeight;\n" "Dtype4 vec;\n" "for (int i = 0; i < widths_size; ++i)\n" "{\n" "_boxWidth = _widths[i];\n" "_boxHeight = _heights[i];\n" "for (int j = 0; j < offsetsX_size; ++j)\n" "{\n" "Dtype center_x = (w + _offsetsX[j]) * (Dtype)stepX;\n" "Dtype center_y = (h + _offsetsY[j]) * (Dtype)stepY;\n" "vec.x = (center_x - _boxWidth * 0.5f) / imgWidth;\n" "vec.y = (center_y - _boxHeight * 0.5f) / imgHeight;\n" "vec.z = (center_x + _boxWidth * 0.5f) / imgWidth;\n" "vec.w = (center_y + _boxHeight * 0.5f) / imgHeight;\n" "vstore4(vec, 0, outputPtr);\n" "outputPtr += 4;\n" "}\n" "}\n" "}\n" "}\n" /* set_variance: stores one 4-vector per index at dst + offset + index * 4; the vector is variance[0] broadcast when variance_size == 1, otherwise the first four variance values converted with convert_T. */ "__kernel void set_variance(const int nthreads,\n" "const int offset,\n" "const int variance_size,\n" "__global const float* variance,\n" "__global Dtype* dst)\n" "{\n" "for (int index = get_global_id(0); index < nthreads; index += 
get_global_size(0))\n" "{\n" "Dtype4 var_vec;\n" "if (variance_size == 1)\n" "var_vec = (Dtype4)(variance[0]);\n" "else\n" "var_vec = convert_T(vload4(0, variance));\n" "vstore4(var_vec, 0, dst + offset + index * 4);\n" "}\n" "}\n" /* clip: clamps every 4-vector of dst to the range 0..1 in place. */ "__kernel void clip(const int nthreads,\n" "__global Dtype* dst)\n" "{\n" "for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))\n" "{\n" "Dtype4 vec = vload4(index, dst);\n" "vstore4(clamp(vec, (Dtype)0.0f, (Dtype)1.0f), index, dst);\n" "}\n" "}\n" , "1675591e0cb26d5b10d9a5a7e111007e", NULL}; /* OpenCL program entry named region. Dtype is fixed to float by the #define on the first source line. logistic_activ applies 1 / (1 + exp(-x)) to element 4 of every cell_size-strided cell, reading src and writing dst. */ struct cv::ocl::internal::ProgramEntry region_oclsrc={moduleName, "region", "#define Dtype float\n" "__kernel void logistic_activ(const int count,\n" "__global const Dtype* src,\n" "const int cell_size,\n" "__global Dtype* dst)\n" "{\n" "for (int i = get_global_id(0); i < count; i += get_global_size(0))\n" "{\n" "int index = cell_size * i;\n" "Dtype x = src[index + 4];\n" "dst[index + 4] = 1.f / (1.f + exp(-x));\n" "}\n" "}\n" /* softmax_activ: per box, exponentiates the class scores that start at offset 5 after subtracting their maximum, rewrites box elements 0..3 of dst from src using the logistic function, exp and biasData, then multiplies each normalized class value by scale = dst[box_index + 4] (forced to 0 when classfix == -1 and scale < .5) and zeroes results that are not above thresh. */ "__kernel void softmax_activ(const int count,\n" "__global const Dtype* src,\n" "__global const Dtype* biasData,\n" "const int cell_size,\n" "const int classes,\n" "const int classfix,\n" "const int rows,\n" "const int cols,\n" "const int anchors,\n" "const float thresh,\n" "__global Dtype* dst)\n" "{\n" "for (int index = get_global_id(0); index < count; index += get_global_size(0))\n" "{\n" "int box_index = index * cell_size;\n" "float largest = -FLT_MAX;\n" "__global const Dtype *input = src + box_index + 5;\n" "__global Dtype *output = dst + box_index + 5;\n" "for (int i = 0; i < classes; ++i)\n" "largest = fmax(largest, input[i]);\n" "float sum = 0;\n" "for (int i = 0; i < classes; ++i)\n" "{\n" "float e = exp((input[i] - largest));\n" "sum += e;\n" "output[i] = e;\n" "}\n" "int y = (index / (anchors * cols)) % rows;\n" "int x = (index / anchors) % cols;\n" "int a = index % anchors;\n" "float scale = dst[box_index + 4];\n" "if (classfix == -1 && scale < .5) scale = 0;\n" "float v1 = 
src[box_index + 0];\n" "float v2 = src[box_index + 1];\n" "float l1 = 1.f / (1.f + exp(-v1));\n" "float l2 = 1.f / (1.f + exp(-v2));\n" "dst[box_index + 0] = (x + l1) / cols;\n" "dst[box_index + 1] = (y + l2) / rows;\n" "dst[box_index + 2] = exp(src[box_index + 2]) * biasData[2 * a] / cols;\n" "dst[box_index + 3] = exp(src[box_index + 3]) * biasData[2 * a + 1] / rows;\n" "for (int i = 0; i < classes; ++i)\n" "{\n" "float prob = scale * output[i] / sum;\n" "output[i] = (prob > thresh) ? prob : 0;\n" "}\n" "}\n" "}\n" , "974d8f1dbe16bfbf98914ab49bd55d11", NULL}; /* OpenCL program entry named slice: a byte-wise block copy kernel whose name is formed by pasting slice_ and SLICE_KERNEL_SUFFIX. The source offset is built per dimension from idx_n + SRC_START_n times SRC_STEP_n; the destination offset is block_id * BLOCK_SIZE. DIMS, BLOCK_DIMS, BLOCK_SIZE, BLOCK_COLS, WSZ and the SRC_START_n / SRC_STEP_n / DST_STEP_n values are not defined in this source; presumably they come from build options - confirm. */ struct cv::ocl::internal::ProgramEntry slice_oclsrc={moduleName, "slice", "#define CONCAT_(A, B) A##B\n" "#define CONCAT(A, B) CONCAT_(A, B)\n" "#define BLOCK_COLS_X4 (BLOCK_COLS / 4)\n" "#define BLOCK_COLS_X16 (BLOCK_COLS / 16)\n" "__attribute__((reqd_work_group_size(WSZ, 1, 1)))\n" "__kernel void\n" "CONCAT(slice_, SLICE_KERNEL_SUFFIX)(\n" "__global const uchar* src0,\n" "__global uchar* dst0\n" ")\n" "{\n" "uint block_id = get_global_id(1);\n" "uint dst_offset0 = block_id * BLOCK_SIZE;\n" "uint src_offset0 = 0;\n" "{\n" "#define CALC_SRC_INDEX(dim) \\\n" "{ \\\n" "uint plane_sz = CONCAT(DST_STEP_, dim) / BLOCK_SIZE; \\\n" "CONCAT(idx_, dim) = block_id / plane_sz; \\\n" "block_id = block_id - CONCAT(idx_, dim) * plane_sz; \\\n" "}\n" "#define UPDATE_SRC_OFFSET(dim) \\\n" "src_offset0 = mad24((uint)(CONCAT(idx_, dim) + CONCAT(SRC_START_, dim)), (uint)CONCAT(SRC_STEP_, dim), (uint)src_offset0);\n" "#if DIMS > 5\n" "#error \"invalid configuration\"\n" "#endif\n" "#if DIMS > 4\n" "uint idx_4 = 0;\n" "#if BLOCK_DIMS <= 4\n" "CALC_SRC_INDEX(4)\n" "#endif\n" "UPDATE_SRC_OFFSET(4)\n" "#endif\n" "#if DIMS > 3\n" "uint idx_3 = 0;\n" "#if BLOCK_DIMS <= 3\n" "CALC_SRC_INDEX(3)\n" "#endif\n" "UPDATE_SRC_OFFSET(3)\n" "#endif\n" "#if DIMS > 2\n" "uint idx_2 = 0;\n" "#if BLOCK_DIMS <= 2\n" "CALC_SRC_INDEX(2)\n" "#endif\n" "UPDATE_SRC_OFFSET(2)\n" "#endif\n" "#if DIMS > 1\n" "uint idx_1 = 0;\n" "#if BLOCK_DIMS <= 1\n" 
"CALC_SRC_INDEX(1)\n" "#endif\n" "UPDATE_SRC_OFFSET(1)\n" "#endif\n" "#if DIMS > 0\n" "uint idx_0 = 0;\n" "UPDATE_SRC_OFFSET(0)\n" "#endif\n" "}\n" /* USE_COPY_1D path: copies BLOCK_COLS bytes in up to three conditionally compiled passes - 16-byte vectors, then 4-byte vectors, then single bytes for the remainder. The variable processed carries the byte count already handled into the next pass. */ "#ifdef USE_COPY_1D\n" "{\n" "__global const uchar* src = src0 + src_offset0;\n" "__global uchar* dst = dst0 + dst_offset0;\n" "uint processed = 0;\n" "#if BLOCK_COLS_X16 >= 4\n" "{\n" "uint i = get_local_id(0) * 16;\n" "while (i < BLOCK_COLS_X16 * 16)\n" "{\n" "uint4 idx0 = (uint4)i;\n" "uint4 idx = idx0 + (uint4)(0, 16 * WSZ, 32 * WSZ, 48 * WSZ);\n" "idx = select(idx0, idx, idx < (BLOCK_COLS_X16 * 16));\n" "uchar16 a0 = vload16(0, src + idx.s0);\n" "uchar16 a1 = vload16(0, src + idx.s1);\n" "uchar16 a2 = vload16(0, src + idx.s2);\n" "uchar16 a3 = vload16(0, src + idx.s3);\n" "vstore16(a0, 0, dst + idx.s0);\n" "vstore16(a1, 0, dst + idx.s1);\n" "vstore16(a2, 0, dst + idx.s2);\n" "vstore16(a3, 0, dst + idx.s3);\n" "i += WSZ * 16 * 4;\n" "}\n" "processed = BLOCK_COLS_X16 * 16;\n" "}\n" "#else\n" "#define SKIP_1D_BLOCK_COLS_X16 1\n" "#endif\n" "#if BLOCK_COLS_X4 > 0 && (defined(SKIP_1D_BLOCK_COLS_X16) || (BLOCK_COLS_X16 * 16 != BLOCK_COLS_X4 * 4))\n" "{\n" "uint i = get_local_id(0) * 4 + processed;\n" "while (i < BLOCK_COLS_X4 * 4)\n" "{\n" "uint4 idx0 = (uint4)i;\n" "uint4 idx = idx0 + (uint4)(0, 4 * WSZ, 8 * WSZ, 12 * WSZ);\n" "idx = select(idx0, idx, idx < (BLOCK_COLS_X4 * 4));\n" "uchar4 a0 = vload4(0, src + idx.s0);\n" "uchar4 a1 = vload4(0, src + idx.s1);\n" "uchar4 a2 = vload4(0, src + idx.s2);\n" "uchar4 a3 = vload4(0, src + idx.s3);\n" "vstore4(a0, 0, dst + idx.s0);\n" "vstore4(a1, 0, dst + idx.s1);\n" "vstore4(a2, 0, dst + idx.s2);\n" "vstore4(a3, 0, dst + idx.s3);\n" "i += WSZ * 4 * 4;\n" "}\n" "processed = BLOCK_COLS_X4 * 4;\n" "}\n" "#else\n" "#define SKIP_1D_BLOCK_COLS_X4 1\n" "#endif\n" "#if (defined(SKIP_1D_BLOCK_COLS_X16) && defined(SKIP_1D_BLOCK_COLS_X4)) || BLOCK_COLS_X4 * 4 != BLOCK_COLS\n" "{\n" "uint i = get_local_id(0) + processed;\n" "while (i < BLOCK_COLS)\n" "{\n" "uchar a0 = src[i];\n" "dst[i] = 
a0;\n" "i += WSZ;\n" "}\n" "}\n" "#endif\n" "}\n" /* Non-1D path: walks the block in 4-byte steps over rows padded up to a multiple of 4 columns (BLOCK_COLS_FILL_X4). Full groups use vload4 / vstore4; when padding exists, the ragged tail of a row is copied byte by byte instead. */ "#else\n" "{\n" "__global const uchar* src = src0 + src_offset0;\n" "__global uchar* dst = dst0 + dst_offset0;\n" "uint i = get_local_id(0) * 4;\n" "#define BLOCK_COLS_FILL_X4 (((BLOCK_COLS + 3) / 4) * 4)\n" "#define BLOCK_SIZE_FILL_X4 (BLOCK_COLS_FILL_X4 * BLOCK_ROWS)\n" "while (i < BLOCK_SIZE_FILL_X4)\n" "{\n" "int row = i / BLOCK_COLS_FILL_X4;\n" "int col = i % BLOCK_COLS_FILL_X4;\n" "uint src_offset = row * BLOCK_SRC_STRIDE + col;\n" "#if BLOCK_COLS_FILL_X4 == BLOCK_COLS\n" "uint dst_offset = i;\n" "#else\n" "uint dst_offset = row * BLOCK_COLS + col;\n" "#endif\n" "#if BLOCK_COLS_FILL_X4 != BLOCK_COLS\n" "if (col <= BLOCK_COLS - 4)\n" "#endif\n" "{\n" "uchar4 a = vload4(0, src + src_offset);\n" "vstore4(a, 0, dst + dst_offset);\n" "}\n" "#if BLOCK_COLS_FILL_X4 != BLOCK_COLS\n" "else\n" "{\n" "uint4 shift = (uint4)(0, 1, 2, 3);\n" "shift = select((uint4)0, shift, col + shift < BLOCK_COLS);\n" "dst[dst_offset + shift.s0] = src[src_offset + shift.s0];\n" "#if BLOCK_COLS_FILL_X4 - BLOCK_COLS <= 2\n" "dst[dst_offset + shift.s1] = src[src_offset + shift.s1];\n" "#endif\n" "#if BLOCK_COLS_FILL_X4 - BLOCK_COLS <= 1\n" "dst[dst_offset + shift.s2] = src[src_offset + shift.s2];\n" "#endif\n" "}\n" "#endif\n" "i += WSZ * 4;\n" "}\n" "}\n" "#endif\n" "}\n" , "94dad8ab7b7b1da7fd128d8ba047b152", NULL}; /* OpenCL program entry named softmax: four kernels over element type T - kernel_channel_max (maximum over channels for each (n, s)), kernel_channel_subtract (data = exp(src - channel_max)), kernel_channel_sum (sum over channels for each (n, s)) and kernel_channel_div (divide by channel_sum, then take log when LOG_SOFTMAX is defined). T is not defined in this source; presumably a build option - confirm. */ struct cv::ocl::internal::ProgramEntry softmax_oclsrc={moduleName, "softmax", "/*************************************************************************************\n" "* Copyright (c) 2015, Advanced Micro Devices, Inc.\n" "* All rights reserved.\n" "*\n" "* Redistribution and use in source and binary forms, with or without modification,\n" "* are permitted provided that the following conditions are met:\n" "*\n" "* 1. Redistributions of source code must retain the above copyright notice, this\n" "* list of conditions and the following disclaimer.\n" "*\n" "* 2. 
Redistributions in binary form must reproduce the above copyright notice,\n" "* this list of conditions and the following disclaimer in the documentation and/or\n" "* other materials provided with the distribution.\n" "*\n" "* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND\n" "* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED\n" "* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.\n" "* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,\n" "* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,\n" "* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,\n" "* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,\n" "* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)\n" "* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE\n" "* POSSIBILITY OF SUCH DAMAGE.\n" "**************************************************************************************/\n" "#if defined(cl_khr_fp16)\n" "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" "#endif\n" "__kernel void kernel_channel_max(const int num, const int channels,\n" "const int spatial_dim, __global const T* data, __global T* out) {\n" "int index = get_global_id(0);\n" "if(index < num * spatial_dim) {\n" "int n = index / spatial_dim;\n" "int s = index % spatial_dim;\n" "T maxval = -FLT_MAX;\n" "for (int c = 0; c < channels; ++c) {\n" "maxval = max(data[(n * channels + c) * spatial_dim + s], maxval);\n" "}\n" "out[index] = maxval;\n" "}\n" "}\n" "__kernel void kernel_channel_subtract(const int count,\n" "const int num, const int channels,\n" "const int spatial_dim, __global const T* channel_max, __global const T* src, __global T* data) {\n" "int index = get_global_id(0);\n" "if(index < count) {\n" "int n = index / channels / spatial_dim;\n" "int s = index % 
spatial_dim;\n" "data[index] = exp(src[index] - channel_max[n * spatial_dim + s]);\n" "}\n" "}\n" "__kernel void kernel_channel_sum(const int num, const int channels,\n" "const int spatial_dim, __global const T* data, __global T* channel_sum) {\n" "int index = get_global_id(0);\n" "if(index < num * spatial_dim) {\n" "int n = index / spatial_dim;\n" "int s = index % spatial_dim;\n" "T sum = 0;\n" "for (int c = 0; c < channels; ++c) {\n" "sum += data[(n * channels + c) * spatial_dim + s];\n" "}\n" "channel_sum[index] = sum;\n" "}\n" "}\n" "__kernel void kernel_channel_div(const int count,\n" "const int num, const int channels,\n" "const int spatial_dim, __global const T* channel_sum, __global T* data) {\n" "int index = get_global_id(0);\n" "if(index < count) {\n" "int n = index / channels / spatial_dim;\n" "int s = index % spatial_dim;\n" "T v = data[index] / channel_sum[n * spatial_dim + s];\n" "#ifdef LOG_SOFTMAX\n" "v = log(v);\n" "#endif\n" "data[index] = v;\n" "}\n" "}\n" , "db5bfbbe4215a169392800a28b6834c4", NULL}; /* OpenCL program entry named softmax_loss: kernels softmax_forward_slm and softmax_forward, both named through TEMPLATE(name, Dtype) and both built on sub_group_reduce_max / sub_group_reduce_add. softmax_forward_slm keeps its intermediates in the __local buffers passed as arguments (out_tmp, scale_tmp, group_tmp) and separates its phases with CLK_LOCAL_MEM_FENCE barriers. Dtype and DTYPE_MAX are not defined in this source; presumably build options - confirm. */ struct cv::ocl::internal::ProgramEntry softmax_loss_oclsrc={moduleName, "softmax_loss", "#define CONCAT(A,B) A##_##B\n" "#define TEMPLATE(name,type) CONCAT(name,type)\n" "#if defined(cl_intel_subgroups)\n" "#pragma OPENCL EXTENSION cl_intel_subgroups : enable\n" "#endif\n" "#if defined(cl_khr_fp16)\n" "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" "#endif\n" "__kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int channels,\n" "const int spatial_dim,\n" "__global Dtype* scale,\n" "__global const Dtype* data,\n" "__global Dtype* out,\n" "__local Dtype *out_tmp,\n" "__local Dtype *scale_tmp,\n" "__local Dtype *group_tmp) {\n" "int n = get_global_id(1);\n" "for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index +=\n" "get_global_size(0), ++s) {\n" "Dtype maxval = -DTYPE_MAX;\n" "for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {\n" "Dtype tmp = data[(n * channels + c) * 
spatial_dim + s];\n" "maxval = max((Dtype)tmp, (Dtype)maxval);\n" "}\n" "maxval = sub_group_reduce_max(maxval);\n" "group_tmp[get_sub_group_id() * spatial_dim + s] = maxval;\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "for (int index = get_global_id(0); index < spatial_dim * get_max_sub_group_size(); index +=\n" "get_global_size(0)) {\n" "int s = index / get_max_sub_group_size();\n" "Dtype maxval = sub_group_reduce_max(group_tmp[get_sub_group_local_id() * spatial_dim + s]);\n" "scale_tmp[s] = maxval;\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "for (int index = get_global_id(0); index < channels * spatial_dim;\n" "index += get_global_size(0)) {\n" "int s = index % spatial_dim;\n" "out_tmp[index] = exp(data[n * channels * spatial_dim + index] - scale_tmp[s]);\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index +=\n" "get_global_size(0), ++s) {\n" "Dtype sum = 0;\n" "for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {\n" "sum += out_tmp[c * spatial_dim + s];\n" "}\n" "sum = sub_group_reduce_add(sum);\n" "group_tmp[get_sub_group_id() * spatial_dim + s] = sum;\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "for (int index = get_global_id(0); index < spatial_dim * get_max_sub_group_size(); index +=\n" "get_global_size(0)) {\n" "int s = index / get_max_sub_group_size();\n" "Dtype sum = sub_group_reduce_add(group_tmp[get_sub_group_local_id() * spatial_dim + s]);\n" "scale_tmp[s] = sum;\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "for (int index = get_global_id(0); index < channels * spatial_dim;\n" "index += get_global_size(0)) {\n" "int s = index % spatial_dim;\n" "Dtype v = out_tmp[index] / scale_tmp[s];\n" "#ifdef LOG_SOFTMAX\n" "v = log(v);\n" "#endif\n" "out[n * channels * spatial_dim + index] = v;\n" "}\n" "}\n" /* softmax_forward: same sequence of phases as softmax_forward_slm, but the intermediates live in __global memory - group_tmp points into scale past its first spatial_dim * num elements, exponentials are staged in out - and the phases are separated by CLK_GLOBAL_MEM_FENCE barriers. */ "__kernel void TEMPLATE(softmax_forward,Dtype)(const int num, const int channels,\n" "const int spatial_dim,\n" "__global Dtype* scale,\n" "__global const Dtype* 
data,\n" "__global Dtype* out) {\n" "int n = get_global_id(1);\n" "__global Dtype *group_tmp = scale + spatial_dim * num + n * get_max_sub_group_size() * spatial_dim;\n" "for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index +=\n" "get_global_size(0), ++s) {\n" "Dtype maxval = -DTYPE_MAX;\n" "for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {\n" "Dtype tmp = data[(n * channels + c) * spatial_dim + s];\n" "maxval = max((Dtype)tmp, (Dtype)maxval);\n" "}\n" "maxval = sub_group_reduce_max(maxval);\n" "group_tmp[get_sub_group_id() * spatial_dim + s] = maxval;\n" "}\n" "barrier(CLK_GLOBAL_MEM_FENCE);\n" "for (int index = get_global_id(0); index < spatial_dim * get_max_sub_group_size(); index +=\n" "get_global_size(0)) {\n" "int s = index / get_max_sub_group_size();\n" "Dtype maxval = sub_group_reduce_max(group_tmp[get_sub_group_local_id() * spatial_dim + s]);\n" "scale[n * spatial_dim + s] = maxval;\n" "}\n" "barrier(CLK_GLOBAL_MEM_FENCE);\n" "for (int index = get_global_id(0); index < channels * spatial_dim;\n" "index += get_global_size(0)) {\n" "int s = index % spatial_dim;\n" "out[n * channels * spatial_dim + index] = exp(data[n * channels * spatial_dim + index] - scale[n * spatial_dim + s]);\n" "}\n" "barrier(CLK_GLOBAL_MEM_FENCE);\n" "for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index +=\n" "get_global_size(0), ++s) {\n" "Dtype sum = 0;\n" "for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {\n" "sum += out[n * channels * spatial_dim + c * spatial_dim + s];\n" "}\n" "sum = sub_group_reduce_add(sum);\n" "group_tmp[get_sub_group_id() * spatial_dim + s] = sum;\n" "}\n" "barrier(CLK_GLOBAL_MEM_FENCE);\n" "for (int index = get_global_id(0); index < spatial_dim * get_max_sub_group_size(); index +=\n" "get_global_size(0)) {\n" "int s = index / get_max_sub_group_size();\n" "Dtype sum = sub_group_reduce_add(group_tmp[get_sub_group_local_id() * spatial_dim + 
s]);\n" "scale[n * spatial_dim + s] = sum;\n" "}\n" "barrier(CLK_GLOBAL_MEM_FENCE);\n" "for (int index = get_global_id(0); index < channels * spatial_dim;\n" "index += get_global_size(0)) {\n" "int s = index % spatial_dim;\n" "Dtype v = out[n * channels * spatial_dim + index] / scale[n * spatial_dim + s];\n" "#ifdef LOG_SOFTMAX\n" "v = log(v);\n" "#endif\n" "out[n * channels * spatial_dim + index] = v;\n" "}\n" "}\n" , "9b1ebb425bf1e67c1294d8bf60783216", NULL}; }}} #endif