// This file is auto-generated. Do not edit!

#include "opencv2/core.hpp"
#include "cvconfig.h"
#include "opencl_kernels_dnn.hpp"

#ifdef HAVE_OPENCL

namespace cv
{
namespace ocl
{
namespace dnn
{

// Name of the owning module; passed as the first field of every ProgramEntry in this file.
static char const* const moduleName = "dnn";

// Embedded OpenCL program named activations for the dnn module.
// It holds one element-wise kernel per activation: ReLU, ReLU6, PReLU, TanH,
// Sigmoid, Swish, Mish, BNLL, AbsVal, Pow, ELU and Exp. Every kernel reads one
// element per work-item (index = get_global_id(0)) and writes one element,
// guarded by a bounds check against the element count.
// The kernel text uses T and convert_T without defining them here; presumably
// they are injected through build options (TODO confirm against the host code).
// The literal following the source looks like a digest of the kernel text;
// presumably it must be regenerated whenever the text changes (TODO confirm).
struct cv::ocl::internal::ProgramEntry activations_oclsrc={moduleName, "activations",
// Token-pasting helpers; TEMPLATE is not used by any kernel in this program text.
"#define CONCAT(A,B) A##_##B\n"
"#define TEMPLATE(name,type) CONCAT(name,type)\n"
// Scalar kernel arguments are always passed as float, independent of T.
"#define KERNEL_ARG_DTYPE float\n"
// Enable half-precision support when the device advertises the extension.
"#if defined(cl_khr_fp16)\n"
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
"#endif\n"
// ReLUForward: keeps positive inputs; negative inputs are scaled by negative_slope,
// or replaced by 0 when RELU_NO_SLOPE is defined (the slope argument is then absent).
"__kernel void ReLUForward(const int count, __global const T* in, __global T* out\n"
"#ifndef RELU_NO_SLOPE\n"
", KERNEL_ARG_DTYPE negative_slope\n"
"#endif\n"
") {\n"
"int index = get_global_id(0);\n"
"if(index < count)\n"
"#ifndef RELU_NO_SLOPE\n"
"out[index] = in[index] > 0 ? in[index] : in[index] * negative_slope;\n"
"#else\n"
"out[index] = in[index] > 0 ? in[index] : 0;\n"
"#endif\n"
"}\n"
// ReLU6Forward: clamps each input to [minValue, maxValue] after converting the
// float bounds with convert_T.
"__kernel void ReLU6Forward(const int count, __global const T* in, __global T* out,\n"
"const KERNEL_ARG_DTYPE minValue, const KERNEL_ARG_DTYPE maxValue)\n"
"{\n"
"int index = get_global_id(0);\n"
"if(index < count)\n"
"{\n"
"T x = in[index];\n"
"out[index] = clamp(x, convert_T(minValue), convert_T(maxValue));\n"
"}\n"
"}\n"
// PReLUForward: like ReLU with a slope looked up per channel; the channel index is
// (index / plane_size) % channels and is computed before the bounds check.
"__kernel void PReLUForward(const int count, const int channels, const int plane_size,\n"
"__global const T* in, __global T* out,\n"
"__global const KERNEL_ARG_DTYPE* slope_data)\n"
"{\n"
"int index = get_global_id(0);\n"
"int c = (index / plane_size) % channels;\n"
"if(index < count)\n"
"out[index] = in[index] > 0 ? in[index] : in[index] * slope_data[c];\n"
"}\n"
// TanHForward: out = tanh(in). Unlike the other kernels, the input pointer is not
// declared const here.
"__kernel void TanHForward(const int count, __global T* in, __global T* out) {\n"
"int index = get_global_id(0);\n"
"if(index < count)\n"
"out[index] = tanh(in[index]);\n"
"}\n"
// SigmoidForward: out = 1 / (1 + exp(-in)).
"__kernel void SigmoidForward(const int count, __global const T* in, __global T* out) {\n"
"int index = get_global_id(0);\n"
"if(index < count)\n"
"out[index] = 1.0f / (1.0f + exp(-in[index]));\n"
"}\n"
// SwishForward: out = in / (1 + exp(-in)).
"__kernel void SwishForward(const int count, __global const T* in, __global T* out) {\n"
"int index = get_global_id(0);\n"
"if(index < count)\n"
"out[index] = in[index] / (1.0f + exp(-in[index]));\n"
"}\n"
// MishForward: out = in * tanh(log(1 + exp(in))).
"__kernel void MishForward(const int count, __global const T* in, __global T* out) {\n"
"int index = get_global_id(0);\n"
"if(index < count)\n"
"out[index] = in[index] * tanh(log(1.0f + exp(in[index])));\n"
"}\n"
// BNLLForward: out = log(1 + exp(x)), written as x + log(1 + exp(-x)) for x > 0 so
// that exp is only ever called on a non-positive argument.
"__kernel void BNLLForward(const int n, __global const T* in, __global T* out) {\n"
"int index = get_global_id(0);\n"
"if (index < n) {\n"
"T x = in[index];\n"
"out[index] = x > 0 ? x + log(1.0f + exp(-x)) : log(1.0f + exp(x));\n"
"}\n"
"}\n"
// AbsValForward: out = fabs(in).
"__kernel void AbsValForward(const int n, __global const T* in, __global T* out) {\n"
"int index = get_global_id(0);\n"
"if (index < n)\n"
"out[index] = fabs(in[index]);\n"
"}\n"
// PowForward: out = pow(shift + scale * in, power).
"__kernel void PowForward(const int n, __global const T* in, __global T* out,\n"
"const KERNEL_ARG_DTYPE power,\n"
"const KERNEL_ARG_DTYPE scale,\n"
"const KERNEL_ARG_DTYPE shift)\n"
"{\n"
"int index = get_global_id(0);\n"
"if (index < n)\n"
"out[index] = pow(shift + scale * in[index], power);\n"
"}\n"
// ELUForward: non-negative inputs pass through; negative inputs become exp(src) - 1.
"__kernel void ELUForward(const int n, __global const T* in, __global T* out)\n"
"{\n"
"int index = get_global_id(0);\n"
"if (index < n)\n"
"{\n"
"T src = in[index];\n"
"out[index] = (src >= 0.f) ? src : exp(src) - 1;\n"
"}\n"
"}\n"
// ExpForward: out = exp(normShift + normScale * in).
"__kernel void ExpForward(const int n, __global const T* in, __global T* out,\n"
"const KERNEL_ARG_DTYPE normScale,\n"
"const KERNEL_ARG_DTYPE normShift)\n"
"{\n"
"int index = get_global_id(0);\n"
"if (index < n)\n"
"{\n"
"out[index] = exp(normShift + normScale * in[index]);\n"
"}\n"
"}\n"
, "69e28bd964980d395339a63e2aabfe86", NULL};
// Embedded OpenCL program named batchnorm for the dnn module.
// A single kernel applies dst = src * weight + bias, where weight and bias are
// picked per row via (row % channels). NUM selects how many consecutive columns a
// work-item handles (8, 4 or 1) and also the kernel name (batch_norm8/4/1).
// NUM, Dtype and convert_T are not defined in this text; presumably they come from
// build options (TODO confirm against the host code).
// The literal following the source looks like a digest of the kernel text;
// presumably it must be regenerated whenever the text changes (TODO confirm).
struct cv::ocl::internal::ProgramEntry batchnorm_oclsrc={moduleName, "batchnorm",
// Enable half-precision support when the device advertises the extension.
"#if defined(cl_khr_fp16)\n"
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
"#endif\n"
// Width-specific load/store helpers, float working type and kernel name.
// No fallback exists for other NUM values, so the macros stay undefined then.
"#if NUM == 8\n"
"#define load(src, index) vload8(0, src + index)\n"
"#define store(vec, dst, index) vstore8(vec, 0, dst + index)\n"
"#define float_type float8\n"
"#define convert_f convert_float8\n"
"#define BATCH_NORM batch_norm8\n"
"#elif NUM == 4\n"
"#define load(src, index) vload4(0, src + index)\n"
"#define store(vec, dst, index) vstore4(vec, 0, dst + index)\n"
"#define float_type float4\n"
"#define convert_f convert_float4\n"
"#define BATCH_NORM batch_norm4\n"
"#elif NUM == 1\n"
"#define load(src, index) src[index]\n"
"#define store(vec, dst, index) dst[index] = vec\n"
"#define float_type float\n"
"#define convert_f convert_float\n"
"#define BATCH_NORM batch_norm1\n"
"#endif\n"
"__kernel void BATCH_NORM(__global const Dtype* src,\n"
"const int rows,\n"
"const int cols,\n"
"const int channels,\n"
"__global const float* weight,\n"
"__global const float* bias,\n"
"__global Dtype* dst)\n"
"{\n"
// x is the row; y is the first of NUM consecutive columns handled by this work-item.
"int x = get_global_id(0);\n"
"int y = get_global_id(1) * NUM;\n"
"int index = x * cols + y;\n"
// NOTE(review): only the first column y is range-checked; for NUM > 1 the vector
// access covers y .. y + NUM - 1, so this presumably relies on cols being a
// multiple of NUM - confirm against the host-side kernel selection.
"if (x >= rows || y >= cols)\n"
"return;\n"
"float w = weight[x % channels];\n"
"float b = bias[x % channels];\n"
// Arithmetic is done in float (scalar or vector) and converted back with convert_T.
"float_type src_vec = convert_f(load(src, index));\n"
"float_type dst_vec = src_vec * w + (float_type)b;\n"
"store(convert_T(dst_vec), dst, index);\n"
"}\n"
, "c84913b518980a1dc7a4f1f41f7f95fc", NULL};
// Embedded OpenCL program named col2im for the dnn module.
// Each work-item with index < n produces one element of data_im by summing the
// contributing entries of data_col and adding biasvec[c] for its channel c.
// T, PAD_W, PAD_H, KERNEL_H, KERNEL_W, STRIDE_H and STRIDE_W are not defined in
// this text; presumably they come from build options (TODO confirm).
// The literal following the source looks like a digest of the kernel text;
// presumably it must be regenerated whenever the text changes (TODO confirm).
struct cv::ocl::internal::ProgramEntry col2im_oclsrc={moduleName, "col2im",
"__kernel void col2im(const int n, __global const T* data_col,\n"
"const int data_col_offset,\n"
"const int channels,\n"
"const int height, const int width,\n"
"const int height_col, const int width_col,\n"
"const int coeff_h, const int coeff_w,\n"
"__global const T* biasvec,\n"
"const int bias_offset,\n"
"__global T* data_im,\n"
"const int data_im_offset)\n"
"{\n"
// Apply the element offsets to the three buffers before any indexing.
"data_col = data_col + data_col_offset;\n"
"biasvec = biasvec + bias_offset;\n"
"data_im = data_im + data_im_offset;\n"
"int index = get_global_id(0);\n"
"if(index < n)\n"
"{\n"
"T val = 0.f;\n"
// Split the flat index into padded column w, padded row h and channel c.
"int w = index % width + PAD_W;\n"
"int h = (index / width) % height + PAD_H;\n"
"int c = index / (width * height);\n"
// Range of column-buffer rows and columns that are accumulated for (h, w).
"int h_col_start = (h < KERNEL_H) ? 0 : (h - KERNEL_H) / STRIDE_H + 1;\n"
"int h_col_end = min(h / STRIDE_H + 1, height_col);\n"
"int plane_size_col = height_col * width_col;\n"
"int offset = (c * KERNEL_H * KERNEL_W + h * KERNEL_W + w) * plane_size_col;\n"
"int w_col_start = (w < KERNEL_W) ? 0 : (w - KERNEL_W) / STRIDE_W + 1;\n"
"int w_col_end = min(w / STRIDE_W + 1, width_col);\n"
// coeff_h and coeff_w are caller-supplied strides applied to h_col and w_col.
"for (int h_col = h_col_start; h_col < h_col_end; ++h_col)\n"
"for (int w_col = w_col_start; w_col < w_col_end; ++w_col)\n"
"val += data_col[offset + h_col * coeff_h + w_col * coeff_w];\n"
"data_im[index] = val + biasvec[c];\n"
"}\n"
"}\n"
, "ce817c6699c25771483253b686f98562", NULL};
// Embedded OpenCL program named concat for the dnn module.
// The kernel copies every element of in_data to its position in out_data, shifted
// by offset_concat_axis along the concatenation axis. The kernel name is built as
// concat_<Dtype> through the TEMPLATE macro; Dtype is not defined in this text and
// presumably comes from build options (TODO confirm).
// The literal following the source looks like a digest of the kernel text;
// presumably it must be regenerated whenever the text changes (TODO confirm).
struct cv::ocl::internal::ProgramEntry concat_oclsrc={moduleName, "concat",
// Enable half-precision support when the device advertises the extension.
"#if defined(cl_khr_fp16)\n"
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
"#endif\n"
"#define CONCAT(A,B) A##_##B\n"
"#define TEMPLATE(name,type) CONCAT(name,type)\n"
"__kernel void TEMPLATE(concat, Dtype)(const int nthreads,\n"
"__global const Dtype* in_data,\n"
"const int num_concats,\n"
"const int concat_size,\n"
"const int top_concat_axis,\n"
"const int bottom_concat_axis,\n"
"const int offset_concat_axis,\n"
"__global Dtype* out_data)\n"
"{\n"
// Each work-item starts at its global id and advances by the global size, so all
// nthreads elements are covered even when fewer work-items are launched.
"for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))\n"
"{\n"
// Split the input index into the outer slice number and the position inside it,
// then rebuild the output index with the output axis length and the axis offset.
"const int total_concat_size = concat_size * bottom_concat_axis;\n"
"const int concat_num = index / total_concat_size;\n"
"const int concat_index = index % total_concat_size;\n"
"const int top_index = concat_index +\n"
"(concat_num * top_concat_axis + offset_concat_axis) * concat_size;\n"
"out_data[top_index] = in_data[index];\n"
"}\n"
"}\n"
, "504946fb5e8e715dcede68425e93486a", NULL};
struct cv::ocl::internal::ProgramEntry conv_layer_spatial_oclsrc={moduleName, "conv_layer_spatial",
"#if defined(cl_khr_fp16)\n"
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
"#endif\n"
"#define KERNEL_ARG_DTYPE float\n"
"#define TYPE_FLOAT  1\n"
"#define TYPE_HALF   2\n"
"#if defined(FUSED_CONV_RELU)\n"
"#define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (negative_slope)))\n"
"#define FUSED_ARG KERNEL_ARG_DTYPE negative_slope,\n"
"#elif defined(FUSED_CONV_PRELU)\n"
"#define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (negative_slope[c])))\n"
"#define FUSED_ARG __global const KERNEL_ARG_DTYPE* negative_slope,\n"
"#elif defined(FUSED_CONV_POWER)\n"
"#define ACTIVATION_RELU_FUNCTION(x, c) pow(x, (Dtype)power)\n"
"#define FUSED_ARG KERNEL_ARG_DTYPE power,\n"
"#elif defined(FUSED_CONV_TANH)\n"
"#define ACTIVATION_RELU_FUNCTION(x, c) tanh(x)\n"
"#define FUSED_ARG\n"
"#elif defined(FUSED_CONV_RELU6)\n"
"#define ACTIVATION_RELU_FUNCTION(x, c) (clamp((Dtype)(x), (Dtype)min_value, (Dtype)max_value))\n"
"#define FUSED_ARG KERNEL_ARG_DTYPE min_value, KERNEL_ARG_DTYPE max_value,\n"
"#else\n"
"#define ACTIVATION_RELU_FUNCTION(x, c) (x)\n"
"#define FUSED_ARG\n"
"#endif\n"
"#ifdef FUSED_CONV_ELTWISE\n"
"#define ACTIVATION_FUNCTION(_dst_, _offset_, _data_, _channel_) do { \\\n"
"const Dtype _x_ = eltwise_data[(_offset_)] + (_data_); \\\n"
"(_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_x_, _channel_); \\\n"
"} while(0)\n"
"#define ELTWISE_DATA_ARG __global Dtype* eltwise_data,\n"
"#define ELTWISE_DATA_ARG_WITH_OFFSET __global Dtype* eltwise_ptr, int eltwise_offset,\n"
"#else\n"
"#define ACTIVATION_FUNCTION(_dst_, _offset_, _data_, _channel_) do { \\\n"
"const Dtype _x_ = (_data_); \\\n"
"(_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_x_, _channel_); \\\n"
"} while(0)\n"
"#define ELTWISE_DATA_ARG\n"
"#define ELTWISE_DATA_ARG_WITH_OFFSET\n"
"#endif\n"
"#if APPLY_BIAS\n"
"#define BIAS_KERNEL_ARG __global Dtype * biases_base,\n"
"#define BIAS_KERNEL_ARG_WITH_OFFSET __global Dtype * biases_base_ptr, int biases_base_offset,\n"
"#else\n"
"#define BIAS_KERNEL_ARG\n"
"#define BIAS_KERNEL_ARG_WITH_OFFSET\n"
"#endif\n"
"#define __CAT(x, y) x##y\n"
"#define CAT(x, y) __CAT(x, y)\n"
"#define LOOP0(VAR, STMT)\n"
"#define LOOP1(VAR, STMT) (STMT); (VAR)++;\n"
"#define LOOP2(VAR, STMT) LOOP1(VAR, STMT); (STMT); (VAR)++;\n"
"#define LOOP3(VAR, STMT) LOOP2(VAR, STMT); (STMT); (VAR)++;\n"
"#define LOOP4(VAR, STMT) LOOP3(VAR, STMT); (STMT); (VAR)++;\n"
"#define LOOP5(VAR, STMT) LOOP4(VAR, STMT); (STMT); (VAR)++;\n"
"#define LOOP6(VAR, STMT) LOOP5(VAR, STMT); (STMT); (VAR)++;\n"
"#define LOOP7(VAR, STMT) LOOP6(VAR, STMT); (STMT); (VAR)++;\n"
"#define LOOP8(VAR, STMT) LOOP7(VAR, STMT); (STMT); (VAR)++;\n"
"#define LOOP9(VAR, STMT) LOOP8(VAR, STMT); (STMT); (VAR)++;\n"
"#define LOOP10(VAR, STMT) LOOP9(VAR, STMT); (STMT); (VAR)++;\n"
"#define LOOP11(VAR, STMT) LOOP10(VAR, STMT); (STMT); (VAR)++;\n"
"#define LOOP12(VAR, STMT) LOOP11(VAR, STMT); (STMT); (VAR)++;\n"
"#define LOOP13(VAR, STMT) LOOP12(VAR, STMT); (STMT); (VAR)++;\n"
"#define LOOP14(VAR, STMT) LOOP13(VAR, STMT); (STMT); (VAR)++;\n"
"#define LOOP15(VAR, STMT) LOOP14(VAR, STMT); (STMT); (VAR)++;\n"
"#define LOOP16(VAR, STMT) LOOP15(VAR, STMT); (STMT); (VAR)++;\n"
"#define LOOP(N, VAR, STMT) CAT(LOOP, N)((VAR), (STMT))\n"
"#if defined(convolve_simd) || defined(Conv_Interleaved)\n"
"#if TYPE == TYPE_HALF\n"
"#define INT_TYPE ushort\n"
"#define INT_TYPE2 ushort2\n"
"#define INT_TYPE4 ushort4\n"
"#define INT_TYPE8 ushort8\n"
"#define SUB_GROUP_BLOCK_READ2 intel_sub_group_block_read_us2\n"
"#define SUB_GROUP_BLOCK_READ4 intel_sub_group_block_read_us4\n"
"#define SUB_GROUP_BLOCK_READ8 intel_sub_group_block_read_us8\n"
"#define SUB_GROUP_BLOCK_READ intel_sub_group_block_read_us\n"
"#else\n"
"#define INT_TYPE uint\n"
"#define INT_TYPE2 uint2\n"
"#define INT_TYPE4 uint4\n"
"#define INT_TYPE8 uint8\n"
"#define SUB_GROUP_BLOCK_READ2 intel_sub_group_block_read2\n"
"#define SUB_GROUP_BLOCK_READ4 intel_sub_group_block_read4\n"
"#define SUB_GROUP_BLOCK_READ8 intel_sub_group_block_read8\n"
"#define SUB_GROUP_BLOCK_READ intel_sub_group_block_read\n"
"#endif\n"
"#endif\n"
"#ifdef KERNEL_BASIC\n"
"__kernel void ConvolveBasic(\n"
"ELTWISE_DATA_ARG\n"
"FUSED_ARG\n"
"__global Dtype* image_data,\n"
"int image_offset,\n"
"__global Dtype* kernel_data,\n"
"int kernel_offset,\n"
"__global Dtype* bias,\n"
"const int bias_offset,\n"
"__global Dtype* convolved_image_base,\n"
"const int convolved_image_base_offset,\n"
"const int convolved_image_offset,\n"
"const ushort input_width,\n"
"const ushort input_height,\n"
"const ushort output_width,\n"
"const ushort output_height,\n"
"const ushort pad_w,\n"
"const ushort pad_h\n"
")\n"
"{\n"
"__global Dtype* convolved_image = convolved_image_base + convolved_image_base_offset;\n"
"const int out_idx = get_global_id(0);\n"
"const int plane_size = output_width * output_height;\n"
"const int out_plane_idx = out_idx % plane_size;\n"
"const int outputZ = out_idx / plane_size;\n"
"const int outputY = out_plane_idx / output_width;\n"
"const int outputX = out_plane_idx % output_width;\n"
"if (outputZ < OUTPUT_Z)\n"
"{\n"
"Dtype sum = 0.0f;\n"
"const int org_y = outputY * STRIDE_Y - pad_h;\n"
"const int org_x = outputX * STRIDE_X - pad_w;\n"
"const int currentKernelOffset = kernel_offset + outputZ*KERNEL_HEIGHT*KERNEL_WIDTH*CHANNELS;\n"
"const int local_image_offset = org_y * input_width + org_x;\n"
"const int imageSize = input_width * input_height;\n"
"__global Dtype* image_dataPtr = (image_data + (image_offset + local_image_offset));\n"
"__global Dtype* kernel_dataPtr = (kernel_data + (currentKernelOffset));\n"
"for (int c = 0; c < CHANNELS; c++)\n"
"{\n"
"for (int y = 0; y < KERNEL_HEIGHT; y++)\n"
"{\n"
"int y_ = org_y + y * DILATION_Y;\n"
"for (int x = 0; x < KERNEL_WIDTH; x++)\n"
"{\n"
"int x_ = org_x + x * DILATION_X;\n"
"if (y_ >= 0 && y_ < input_height && x_ >= 0 && x_ < input_width)\n"
"{\n"
"sum = mad(image_dataPtr[x * DILATION_X], kernel_dataPtr[x], sum);\n"
"}\n"
"}\n"
"image_dataPtr += input_width * DILATION_Y;\n"
"kernel_dataPtr += KERNEL_WIDTH;\n"
"}\n"
"image_dataPtr += imageSize - input_width*KERNEL_HEIGHT*DILATION_Y;\n"
"}\n"
"int offset = convolved_image_offset + out_idx;\n"
"#if APPLY_BIAS\n"
"int biasIndex = bias_offset + outputZ;\n"
"ACTIVATION_FUNCTION(convolved_image, offset, sum + bias[biasIndex], biasIndex);\n"
"#else\n"
"ACTIVATION_FUNCTION(convolved_image, offset, sum, outputZ);\n"
"#endif\n"
"}\n"
"}\n"
"#elif defined KERNEL_IDLF\n"
"__attribute__((reqd_work_group_size(1, 1, SIMD_SIZE)))\n"
"__attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))\n"
"__kernel void\n"
"convolve_simd(\n"
"ELTWISE_DATA_ARG_WITH_OFFSET\n"
"FUSED_ARG\n"
"__global Dtype* inputs_ptr, const int inputs_offset,\n"
"__global Dtype* weights_ptr, const int weights_offset,\n"
"BIAS_KERNEL_ARG_WITH_OFFSET\n"
"__global Dtype* outputs_base, const int outputs_offset,\n"
"const ushort input_width,\n"
"const ushort input_height,\n"
"const ushort output_width,\n"
"const ushort output_height)\n"
"{\n"
"__global Dtype* inputs = inputs_ptr + inputs_offset;\n"
"__global Dtype* weights = weights_ptr + weights_offset;\n"
"#if APPLY_BIAS\n"
"__global Dtype* biases_base = biases_base_ptr + biases_base_offset;\n"
"#endif\n"
"__global Dtype* outputs = outputs_base + outputs_offset;\n"
"#ifdef FUSED_CONV_ELTWISE\n"
"__global Dtype* eltwise_data = eltwise_ptr + eltwise_offset;\n"
"#endif\n"
"unsigned int oc = get_global_id(0) * OUT_BLOCK_WIDTH;\n"
"unsigned int or = get_global_id(1) * OUT_BLOCK_HEIGHT;\n"
"unsigned int fm = get_global_id(2);\n"
"unsigned int fmg = get_group_id(2);\n"
"unsigned int lid = get_local_id(2);\n"
"Dtype out[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT] = { 0.0f };\n"
"unsigned int weight_addr = (fmg % FILTERS_IN_GROUP) *\n"
"INPUT_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT * SIMD_SIZE + lid;\n"
"unsigned int num_in_batch = fm / ALIGNED_NUM_FILTERS;\n"
"unsigned int input_batch_offset = num_in_batch * INPUT_PITCH * TOTAL_INPUT_DEPTH_SIZE;\n"
"int curr_y = or * STRIDE_Y;\n"
"int curr_x = oc * STRIDE_X + lid;\n"
"int in_addr = input_batch_offset\n"
"+  (curr_y - INPUT_PAD_H) * INPUT_WIDTH\n"
"+   curr_x - INPUT_PAD_W;\n"
"const int in_limit = (get_global_size(2) / ALIGNED_NUM_FILTERS) * TOTAL_INPUT_DEPTH_SIZE * INPUT_PITCH - 1;\n"
"Dtype in_buf[INVEC_SIZE];\n"
"for(int kd = 0; kd < INPUT_DEPTH; kd++)\n"
"{\n"
"#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0\n"
"const bool cx_out_of_range = !(curr_x >= INPUT_PAD_W && curr_x < INPUT_WIDTH + INPUT_PAD_W);\n"
"int in_offset = in_addr;\n"
"__attribute__((opencl_unroll_hint(INVEC_SIZE)))\n"
"for (int reg = 0; reg < INVEC_SIZE; reg++, in_offset += INPUT_WIDTH)\n"
"{\n"
"Dtype input = inputs[clamp(in_offset, 0, in_limit)];\n"
"int cy = curr_y + reg;\n"
"in_buf[reg] = (cx_out_of_range || cy < INPUT_PAD_H || cy >= INPUT_HEIGHT + INPUT_PAD_H) ? 0 : input;\n"
"}\n"
"#else\n"
"int in_offset = in_addr;\n"
"__attribute__((opencl_unroll_hint(INVEC_SIZE)))\n"
"for (int reg = 0; reg < INVEC_SIZE; reg++, in_offset += INPUT_WIDTH)\n"
"{\n"
"in_buf[reg] = inputs[min(in_offset, in_limit)];\n"
"}\n"
"#endif\n"
"in_addr += INPUT_PITCH;\n"
"#define BLOCK_IN(n, c) intel_sub_group_shuffle(in_buf[n], (c))\n"
"int kr = 0;\n"
"LOOP(KERNEL_HEIGHT, kr,\n"
"{\n"
"int kc = 0;\n"
"LOOP(KERNEL_WIDTH, kc,\n"
"{\n"
"Dtype weight_value = weights[weight_addr];\n"
"weight_addr += SIMD_SIZE;\n"
"for (int br=0; br < OUT_BLOCK_HEIGHT; br++)\n"
"{\n"
"for(int bc=0; bc < OUT_BLOCK_WIDTH; bc++)\n"
"{\n"
"Dtype input = BLOCK_IN((br * STRIDE_Y + kr * DILATION_Y), bc * STRIDE_X + kc * DILATION_X);\n"
"out[br * OUT_BLOCK_WIDTH + bc] = mad(weight_value, input, out[br * OUT_BLOCK_WIDTH + bc]);\n"
"}\n"
"}\n"
"});\n"
"});\n"
"}\n"
"fm = fm % ALIGNED_NUM_FILTERS;\n"
"#if LEFT_FILTERS > 0\n"
"if (fm < NUM_FILTERS)\n"
"#endif\n"
"{\n"
"unsigned int out_addr = (num_in_batch * TOTAL_OUTPUT_DEPTH + fm) * OUTPUT_PITCH;\n"
"out_addr += or * output_width + oc;\n"
"#if APPLY_BIAS\n"
"Dtype bias = biases_base[fm];\n"
"#else\n"
"Dtype bias = 0;\n"
"#endif\n"
"for(unsigned int r = 0; r < OUT_BLOCK_HEIGHT; r++)\n"
"{\n"
"if (r + or >= output_height) break;\n"
"for(unsigned int c = 0; c < OUT_BLOCK_WIDTH; c++)\n"
"{\n"
"if (c + oc >= output_width) break;\n"
"ACTIVATION_FUNCTION(outputs, out_addr + r * output_width + c, bias + out[r * OUT_BLOCK_WIDTH + c], fm);\n"
"}\n"
"}\n"
"}\n"
"}\n"
"#elif defined KERNEL_GEMM_LIKE\n"
"#if APPLY_BIAS\n"
"#define SUBGROUP_GET_BIAS(k, i) intel_sub_group_shuffle(bias[k], i)\n"
"#else\n"
"#define SUBGROUP_GET_BIAS(k, i) ((Dtype)0)\n"
"#endif\n"
"#ifdef Conv_Interleaved\n"
"typedef struct float1 { float s0; } float1;\n"
"typedef struct float5 { float s0; float s1; float s2; float s3; float s4; } float5;\n"
"typedef struct float6 { float s0; float s1; float s2; float s3; float s4; float s5; } float6;\n"
"typedef struct float7 { float s0; float s1; float s2; float s3; float s4; float s5; float s6; } float7;\n"
"typedef struct float9 { float s0; float s1; float s2; float s3; float s4; float s5; float s6; float s7; float s8; } float9;\n"
"typedef struct float10 { float s0; float s1; float s2; float s3; float s4; float s5;\n"
"float s6; float s7; float s8; float s9;} float10;\n"
"typedef struct float11 { float s0; float s1; float s2; float s3; float s4; float s5;\n"
"float s6; float s7; float s8; float s9; float sa;} float11;\n"
"typedef struct float12 { float s0; float s1; float s2; float s3; float s4; float s5;\n"
"float s6; float s7; float s8; float s9; float sa; float sb; } float12;\n"
"typedef struct float13 { float s0; float s1; float s2; float s3; float s4; float s5;\n"
"float s6; float s7; float s8; float s9; float sa; float sb; float sc;} float13;\n"
"typedef struct float14 { float s0; float s1; float s2; float s3; float s4; float s5;\n"
"float s6; float s7; float s8; float s9; float sa; float sb; float sc; float sd; } float14;\n"
"typedef struct float15 { float s0; float s1; float s2; float s3; float s4; float s5;\n"
"float s6; float s7; float s8; float s9; float sa; float sb; float sc; float sd; float se; } float15;\n"
"typedef struct float0 { float s0; } float0;\n"
"typedef struct half1 { half s0; } half1;\n"
"typedef struct half5 { half s0; half s1; half s2; half s3; half s4; } half5;\n"
"typedef struct half6 { half s0; half s1; half s2; half s3; half s4; half s5; } half6;\n"
"typedef struct half7 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; } half7;\n"
"typedef struct half9 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; half s7; half s8; } half9;\n"
"typedef struct half10 { half s0; half s1; half s2; half s3; half s4; half s5;\n"
"half s6; half s7; half s8; half s9; } half10;\n"
"typedef struct half11 { half s0; half s1; half s2; half s3; half s4; half s5;\n"
"half s6; half s7; half s8; half s9; half sa; } half11;\n"
"typedef struct half12 { half s0; half s1; half s2; half s3; half s4; half s5;\n"
"half s6; half s7; half s8; half s9; half sa; half sb; } half12;\n"
"typedef struct half13 { half s0; half s1; half s2; half s3; half s4; half s5;\n"
"half s6; half s7; half s8; half s9; half sa; half sb; half sc; } half13;\n"
"typedef struct half14 { half s0; half s1; half s2; half s3; half s4; half s5;\n"
"half s6; half s7; half s8; half s9; half sa; half sb; half sc; half sd; } half14;\n"
"typedef struct half15 { half s0; half s1; half s2; half s3; half s4; half s5;\n"
"half s6; half s7; half s8; half s9; half sa; half sb; half sc; half sd; half se; } half15;\n"
"typedef struct half0 { half s0; } half0;\n"
"#define OUT_PITCH_X output_width\n"
"#define ROW_PITCH input_width\n"
"#define GEMM_LIKE_KERNEL_ARGS     \\\n"
"ELTWISE_DATA_ARG_WITH_OFFSET  \\\n"
"FUSED_ARG                     \\\n"
"const __global Dtype *src0_ptr, const unsigned int src0_offset, const unsigned int src0_limit, \\\n"
"const __global Dtype *src1_ptr, const unsigned int src1_offset, const unsigned int src1_limit, \\\n"
"BIAS_KERNEL_ARG_WITH_OFFSET   \\\n"
"__global Dtype *dst_base, const unsigned int dst_offset, const unsigned int dst_limit, \\\n"
"const ushort input_width,     \\\n"
"const ushort input_height,    \\\n"
"const ushort output_width,    \\\n"
"const ushort output_height,   \\\n"
"const int out_pitch_y,     \\\n"
"const int out_pitch_z,     \\\n"
"const int aligned_input_size, \\\n"
"const int slice_pitch\n"
"#endif\n"
"#ifdef GEMM_LIKE_CONV_32_1\n"
"#define TILE_M          1\n"
"#define TILE_K          KERNEL_WIDTH\n"
"#define TILE_N          32\n"
"__attribute__((intel_reqd_sub_group_size(8)))\n"
"__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)\n"
"{\n"
"const __global Dtype *src0 = src0_ptr + src0_offset;\n"
"const __global Dtype *src1 = src1_ptr + src1_offset;\n"
"#if APPLY_BIAS\n"
"__global Dtype* biases_base = biases_base_ptr + biases_base_offset;\n"
"#endif\n"
"__global Dtype *dst = dst_base + dst_offset;\n"
"#ifdef FUSED_CONV_ELTWISE\n"
"__global Dtype* eltwise_data = eltwise_ptr + eltwise_offset;\n"
"#endif\n"
"const int group_x = get_group_id(0);\n"
"const int group_y = get_group_id(1);\n"
"const int global_x = get_global_id(0);\n"
"const int global_y = get_global_id(1);\n"
"const int global_z = get_global_id(2);\n"
"int interleaved_y;\n"
"int kernel_y;\n"
"int kernel_idx;\n"
"#define DOT_PRODUCT_8( _result, _rowA, colB )    \\\n"
"{   \\\n"
"_result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 );  \\\n"
"_result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 );  \\\n"
"_result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 );  \\\n"
"_result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 );  \\\n"
"_result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 );  \\\n"
"_result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 );  \\\n"
"_result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 );  \\\n"
"_result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 );  \\\n"
"}\n"
"typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;\n"
"#if 0\n"
"#define OPTIMIZE_READ 1\n"
"#else\n"
"#define OPTIMIZE_READ 0\n"
"#endif\n"
"if( TILE_N_LAST == 0 || global_x < WIDTH1 / TILE_N )\n"
"{\n"
"Dtype8  blockC00 = 0.f;\n"
"Dtype8  blockC10 = 0.f;\n"
"Dtype8  blockC20 = 0.f;\n"
"Dtype8  blockC30 = 0.f;\n"
"int curr_x = ( global_y % output_width ) * STRIDE_X;\n"
"int curr_y = ( global_y / output_width ) * STRIDE_Y;\n"
"#if !OPTIMIZE_READ\n"
"int saved_y = curr_y;\n"
"#endif\n"
"const __global Dtype *src0_read = src0\n"
"+ aligned_input_size * global_z\n"
"+ (curr_y - INPUT_PAD_H) * ROW_PITCH\n"
"+ (curr_x - INPUT_PAD_W);\n"
"const __global Dtype *src1_read = src1 + ( global_x * TILE_N  * 2);\n"
"int patch_depth = 0;\n"
"do\n"
"{\n"
"int patch_row = 0;\n"
"#if !OPTIMIZE_READ\n"
"curr_y = saved_y;\n"
"#endif\n"
"do\n"
"{\n"
"const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;\n"
"#if OPTIMIZE_READ\n"
"#if KERNEL_WIDTH == 3\n"
"Dtype_t blockA00 = vload3(0, src0_read);\n"
"Dtype*  pblockA00 = (Dtype*)(&blockA00);\n"
"#else\n"
"#if 0\n"
"if ((int)(src0_read - src0) >= src0_limit - KERNEL_WIDTH)\n"
"{\n"
"printf(\"CATCH: src0_read-src0: %d   limit=%d   curr_y,curr_x=%d,%d\\n\", (int)(src0_read - src0), src0_limit, curr_y, curr_x);\n"
"}\n"
"#endif\n"
"Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[  0  ];\n"
"Dtype*  pblockA00 = (Dtype*)(&blockA00);\n"
"#endif\n"
"#else\n"
"Dtype_t blockA00;\n"
"Dtype*  pblockA00 = (Dtype*)(&blockA00);\n"
"int pos = 0;\n"
"LOOP(KERNEL_WIDTH, pos,\n"
"{\n"
"if (curr_y >= INPUT_PAD_H &&\n"
"curr_y < input_height + INPUT_PAD_H &&\n"
"curr_x + pos * DILATION_X >= INPUT_PAD_W &&\n"
"curr_x + pos * DILATION_X < input_width + INPUT_PAD_W)\n"
"pblockA00[pos] = src0_read[pos * DILATION_X];\n"
"else\n"
"pblockA00[pos] = 0;\n"
"})\n"
"curr_y += DILATION_Y;\n"
"#endif\n"
"src0_read += (ROW_PITCH * DILATION_Y);\n"
"Dtype blockB00[KERNEL_WIDTH*4];\n"
"Dtype8* p8BlockB00 = (Dtype8*)blockB00;\n"
"Dtype4* p4BlockB00 = (Dtype4*)blockB00;\n"
"Dtype*  pBlockB00 =  (Dtype* )blockB00;\n"
"interleaved_y = 0;\n"
"LOOP(KERNEL_WIDTH_DIV2, interleaved_y,\n"
"{\n"
"p8BlockB00[interleaved_y] = as_Dtype8( SUB_GROUP_BLOCK_READ8( (const __global INT_TYPE *)src1_read ) );\n"
"src1_read += WIDTH1 * 2;\n"
"} )\n"
"if ( kernel_width_is_odd )\n"
"{\n"
"p4BlockB00[KERNEL_WIDTH - 1] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE *)src1_read ) );\n"
"src1_read += WIDTH1 * 2;\n"
"}\n"
"kernel_idx = 0;\n"
"interleaved_y = 0;\n"
"LOOP(KERNEL_WIDTH_DIV2, interleaved_y,\n"
"{\n"
"kernel_y = interleaved_y * 2;\n"
"DOT_PRODUCT_8( blockC00, pblockA00[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;\n"
"DOT_PRODUCT_8( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;\n"
"DOT_PRODUCT_8( blockC10, pblockA00[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;\n"
"DOT_PRODUCT_8( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;\n"
"DOT_PRODUCT_8( blockC20, pblockA00[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;\n"
"DOT_PRODUCT_8( blockC20, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;\n"
"DOT_PRODUCT_8( blockC30, pblockA00[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;\n"
"DOT_PRODUCT_8( blockC30, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;\n"
"} )\n"
"kernel_y = interleaved_y * 2;\n"
"if ( kernel_width_is_odd )\n"
"{\n"
"DOT_PRODUCT_8( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;\n"
"DOT_PRODUCT_8( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;\n"
"DOT_PRODUCT_8( blockC20, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;\n"
"DOT_PRODUCT_8( blockC30, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;\n"
"}\n"
"}\n"
"while( ++patch_row < KERNEL_HEIGHT );\n"
"src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y);\n"
"}\n"
"while ( ++patch_depth < INPUT_DEPTH );\n"
"int out_offset = global_z * out_pitch_z\n"
"+ ( group_x * TILE_N ) * out_pitch_y\n"
"+ ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X\n"
"+ ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT;\n"
"__global Dtype *out = dst + out_offset;\n"
"#if APPLY_BIAS\n"
"Dtype bias[4];\n"
"Dtype4 *bias_vec;\n"
"bias_vec = (Dtype4*)bias;\n"
"*bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));\n"
"if (group_x > 0xFFFFFFFEul) {\n"
"dst[0] = bias[0] + bias[1] + bias[2] + bias[3];\n"
"}\n"
"#else\n"
"const Dtype bias[4] = {0, 0, 0, 0};\n"
"#endif\n"
"if (global_y * TILE_M < output_width * output_height )\n"
"{\n"
"for (int i = 0; i < 8; i++)\n"
"{\n"
"ACTIVATION_FUNCTION(dst, out_offset + ( 0 + i ) * out_pitch_y, blockC00[i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i);\n"
"ACTIVATION_FUNCTION(dst, out_offset + ( 8 + i ) * out_pitch_y, blockC10[i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + 8 + i);\n"
"ACTIVATION_FUNCTION(dst, out_offset + ( 16 + i ) * out_pitch_y, blockC20[i] + SUBGROUP_GET_BIAS(2, i), group_x * TILE_N + 16 + i);\n"
"ACTIVATION_FUNCTION(dst, out_offset + ( 24 + i ) * out_pitch_y, blockC30[i] + SUBGROUP_GET_BIAS(3, i), group_x * TILE_N + 24 + i);\n"
"}\n"
"}\n"
"}\n"
"#if TILE_N_LAST > 0\n"
"else\n"
"{\n"
"int i = 0;\n"
"Dtype8  blockC[TILE_N_LAST_DIV8];\n"
"LOOP(TILE_N_LAST_DIV8, i,\n"
"{\n"
"blockC[i] = 0.f;\n"
"} )\n"
"int curr_x = ( global_y % output_width ) * STRIDE_X;\n"
"int curr_y = ( global_y / output_width ) * STRIDE_Y;\n"
"#if !OPTIMIZE_READ\n"
"int saved_y = curr_y;\n"
"#endif\n"
"const __global Dtype *src0_read = src0\n"
"+ aligned_input_size * global_z\n"
"+ (curr_y - INPUT_PAD_H) * ROW_PITCH\n"
"+ (curr_x - INPUT_PAD_W);\n"
"const __global Dtype *src1_read = src1 + ( global_x * TILE_N  * 2);\n"
"int patch_depth = 0;\n"
"do\n"
"{\n"
"int patch_row = 0;\n"
"#if !OPTIMIZE_READ\n"
"curr_y = saved_y;\n"
"#endif\n"
"do\n"
"{\n"
"const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;\n"
"#if OPTIMIZE_READ\n"
"Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[  0  ];\n"
"Dtype*  pblockA00 = (Dtype*)(&blockA00);\n"
"#else\n"
"Dtype_t blockA00;\n"
"Dtype*  pblockA00 = (Dtype*)(&blockA00);\n"
"int pos = 0;\n"
"LOOP(KERNEL_WIDTH, pos,\n"
"{\n"
"if (curr_y >= INPUT_PAD_H &&\n"
"curr_y < input_height + INPUT_PAD_H &&\n"
"curr_x + pos * DILATION_X >= INPUT_PAD_W &&\n"
"curr_x + pos * DILATION_X < input_width + INPUT_PAD_W)\n"
"pblockA00[pos] = src0_read[pos * DILATION_X];\n"
"else\n"
"pblockA00[pos] = 0;\n"
"})\n"
"curr_y += DILATION_Y;\n"
"#endif\n"
"src0_read += (ROW_PITCH * DILATION_Y);\n"
"Dtype blockB[KERNEL_WIDTH * TILE_N_LAST_DIV8];\n"
"interleaved_y = 0;\n"
"LOOP(KERNEL_WIDTH_DIV2, interleaved_y,\n"
"{\n"
"#if TILE_N_LAST_DIV8 == 1\n"
"Dtype2* p2BlockB = (Dtype2* )blockB;\n"
"p2BlockB[interleaved_y] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );\n"
"#elif TILE_N_LAST_DIV8 == 2\n"
"Dtype4* p4BlockB = (Dtype4* )blockB;\n"
"p4BlockB[interleaved_y] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );\n"
"#elif TILE_N_LAST_DIV8 == 3\n"
"Dtype6* p6BlockB = (Dtype6* )blockB;\n"
"(*((Dtype8*)(&p6BlockB[interleaved_y]))).s0123 = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );\n"
"(*((Dtype8*)(&p6BlockB[interleaved_y]))).s45 = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)(src1_read + 4 * 8) ) );\n"
"#endif\n"
"src1_read += WIDTH1 * 2;\n"
"} )\n"
"if ( kernel_width_is_odd )\n"
"{\n"
"#if TILE_N_LAST_DIV8 == 1\n"
"Dtype* pBlockB = (Dtype* )blockB;\n"
"pBlockB[KERNEL_WIDTH - 1] = as_Dtype( SUB_GROUP_BLOCK_READ( (const __global INT_TYPE*)src1_read ) );\n"
"#elif TILE_N_LAST_DIV8 == 2\n"
"Dtype2* p2BlockB = (Dtype2* )blockB;\n"
"p2BlockB[KERNEL_WIDTH - 1] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );\n"
"#elif TILE_N_LAST_DIV8 == 3\n"
"Dtype3* p3BlockB = (Dtype3* )blockB;\n"
"p3BlockB[KERNEL_WIDTH - 1].s01 = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );\n"
"p3BlockB[KERNEL_WIDTH - 1].s2 = as_Dtype( SUB_GROUP_BLOCK_READ( (const __global INT_TYPE*) (src1_read + 2 * 8) ) );\n"
"#endif\n"
"src1_read += WIDTH1 * 2;\n"
"}\n"
"Dtype* pBlockB = (Dtype*)blockB;\n"
"kernel_idx = 0;\n"
"interleaved_y = 0;\n"
"LOOP(KERNEL_WIDTH_DIV2, interleaved_y,\n"
"{\n"
"kernel_y = interleaved_y * 2;\n"
"DOT_PRODUCT_8( blockC[0], pblockA00[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;\n"
"DOT_PRODUCT_8( blockC[0], pblockA00[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;\n"
"#if TILE_N_LAST_DIV8 >= 2\n"
"DOT_PRODUCT_8( blockC[1], pblockA00[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;\n"
"DOT_PRODUCT_8( blockC[1], pblockA00[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;\n"
"#if TILE_N_LAST_DIV8 >= 3\n"
"DOT_PRODUCT_8( blockC[2], pblockA00[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;\n"
"DOT_PRODUCT_8( blockC[2], pblockA00[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;\n"
"#endif\n"
"#endif\n"
"} )\n"
"kernel_y = interleaved_y * 2;\n"
"if ( kernel_width_is_odd )\n"
"{\n"
"DOT_PRODUCT_8( blockC[0], pblockA00[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;\n"
"#if TILE_N_LAST_DIV8 >= 2\n"
"DOT_PRODUCT_8( blockC[1], pblockA00[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;\n"
"#if TILE_N_LAST_DIV8 >= 3\n"
"DOT_PRODUCT_8( blockC[2], pblockA00[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;\n"
"#endif\n"
"#endif\n"
"}\n"
"}\n"
"while( ++patch_row < KERNEL_HEIGHT );\n"
"src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );\n"
"}\n"
"while ( ++patch_depth < INPUT_DEPTH );\n"
"int out_offset = global_z * out_pitch_z\n"
"+ ( group_x * TILE_N ) * out_pitch_y\n"
"+ ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X\n"
"+ ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT;\n"
"__global Dtype *out = dst + out_offset;\n"
"#if APPLY_BIAS\n"
"Dtype bias[4];\n"
"Dtype4 *bias_vec;\n"
"bias_vec = (Dtype4*)bias;\n"
"*bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));\n"
"if (group_x > 0xFFFFFFFEul) {\n"
"dst[0] = bias[0] + bias[1] + bias[2] + bias[3];\n"
"}\n"
"#else\n"
"const Dtype bias[4] = {0, 0, 0, 0};\n"
"#endif\n"
"if (global_y * TILE_M < output_width * output_height )\n"
"{\n"
"for (int i = 0; i < 8; i++)\n"
"{\n"
"if ( TILE_N_LAST_DIV8 > 0 )\n"
"{\n"
"ACTIVATION_FUNCTION(dst, out_offset + ( 0+i) * out_pitch_y, blockC[0][i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i);\n"
"}\n"
"if ( TILE_N_LAST_DIV8 > 1 )\n"
"{\n"
"ACTIVATION_FUNCTION(dst, out_offset + ( 8+i) * out_pitch_y, blockC[1][i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 8);\n"
"}\n"
"if ( TILE_N_LAST_DIV8 > 2 )\n"
"{\n"
"ACTIVATION_FUNCTION(dst, out_offset + (16+i) * out_pitch_y, blockC[2][i] + SUBGROUP_GET_BIAS(2, i), group_x * TILE_N + i + 16);\n"
"}\n"
"if ( TILE_N_LAST_DIV8 > 3 )\n"
"{\n"
"ACTIVATION_FUNCTION(dst, out_offset + (24+i) * out_pitch_y, blockC[3][i] + SUBGROUP_GET_BIAS(3, i), group_x * TILE_N + i + 24);\n"
"}\n"
"}\n"
"}\n"
"}\n"
"#endif\n"
"}\n"
"#endif\n"
// ---------------------------------------------------------------------------
// GEMM_LIKE_CONV_32_2 variant of Conv_Interleaved (OpenCL C source text).
//
// Compiled only when GEMM_LIKE_CONV_32_2 is defined. Uses TILE_M = 2 and
// TILE_N = 32 and requests an Intel sub-group size of 8. Each work-item
// accumulates two output positions (global_y * TILE_M + 0 and + 1); every
// position is held in Dtype8 accumulators that are written out at channel
// offsets 0, 8, 16 and 24 (times out_pitch_y).
//
// Two branches exist:
//   * the full-tile branch, taken when TILE_N_LAST == 0 or
//     global_x < WIDTH1 / TILE_N, with four accumulators per position;
//   * the remainder branch (compiled only when TILE_N_LAST > 0), with
//     TILE_N_LAST_DIV8 accumulators per position.
//
// These lines are adjacent string literals; the comments live outside the
// literals and do not become part of the kernel source.
// ---------------------------------------------------------------------------
"#ifdef GEMM_LIKE_CONV_32_2\n"
"#define TILE_M          2\n"
"#define TILE_K          KERNEL_WIDTH\n"
"#define TILE_N          32\n"
"__attribute__((intel_reqd_sub_group_size(8)))\n"
"__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)\n"
"{\n"
// Apply the per-buffer element offsets to the raw kernel arguments.
"const __global Dtype *src0 = src0_ptr + src0_offset;\n"
"const __global Dtype *src1 = src1_ptr + src1_offset;\n"
"#if APPLY_BIAS\n"
"__global Dtype* biases_base = biases_base_ptr + biases_base_offset;\n"
"#endif\n"
"__global Dtype *dst = dst_base + dst_offset;\n"
"#ifdef FUSED_CONV_ELTWISE\n"
"__global Dtype* eltwise_data = eltwise_ptr + eltwise_offset;\n"
"#endif\n"
"const int group_x = get_group_id(0);\n"
"const int group_y = get_group_id(1);\n"
"const int global_x = get_global_id(0);\n"
"const int global_y = get_global_id(1);\n"
"const int global_z = get_global_id(2);\n"
"int interleaved_y;\n"
"int kernel_y;\n"
"int kernel_idx;\n"
// DOT_PRODUCT_8: for each of the 8 sub-group lanes, broadcast that lane's
// colB value and multiply-accumulate it with _rowA into the matching
// component of _result.
"#define DOT_PRODUCT_8( _result, _rowA, colB )    \\\n"
"{   \\\n"
"_result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 );  \\\n"
"_result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 );  \\\n"
"_result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 );  \\\n"
"_result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 );  \\\n"
"_result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 );  \\\n"
"_result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 );  \\\n"
"_result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 );  \\\n"
"_result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 );  \\\n"
"}\n"
"typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;\n"
// ---- Full-tile branch: 4 x Dtype8 accumulators per output position. ----
"if( TILE_N_LAST == 0 || global_x < WIDTH1 / TILE_N )\n"
"{\n"
// blockC?0 belongs to output position 0, blockC?1 to output position 1.
"Dtype8  blockC00 = 0.f;\n"
"Dtype8  blockC10 = 0.f;\n"
"Dtype8  blockC20 = 0.f;\n"
"Dtype8  blockC30 = 0.f;\n"
"Dtype8  blockC01 = 0.f;\n"
"Dtype8  blockC11 = 0.f;\n"
"Dtype8  blockC21 = 0.f;\n"
"Dtype8  blockC31 = 0.f;\n"
// Top-left input coordinate (before padding is subtracted) of each of the
// two output positions handled by this work-item.
"int curr_x0 = ( ( global_y * TILE_M + 0 ) % output_width ) * STRIDE_X;\n"
"int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X;\n"
"int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y;\n"
"int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y;\n"
"#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0\n"
// curr_y* is advanced per kernel row below and restored per input slice.
"int saved_y0 = curr_y0;\n"
"int saved_y1 = curr_y1;\n"
"#endif\n"
"const __global Dtype *src0_read0 = src0\n"
"+ aligned_input_size * global_z\n"
"+ (curr_y0 - INPUT_PAD_H) * ROW_PITCH\n"
"+ curr_x0 - INPUT_PAD_W;\n"
"const __global Dtype *src0_read1 = src0\n"
"+ aligned_input_size * global_z\n"
"+ (curr_y1 - INPUT_PAD_H) * ROW_PITCH\n"
"+ curr_x1 - INPUT_PAD_W;\n"
"const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2);\n"
// Outer loop: input slices (INPUT_DEPTH); inner loop: kernel rows.
"int patch_depth = 0;\n"
"do\n"
"{\n"
"int patch_row = 0;\n"
"do\n"
"{\n"
"const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;\n"
// --- Load one kernel-width row of input for both positions. ---
// No padding / dilation: direct loads. Width 3 uses vload3; other widths
// start from a zero-initialised first element and copy only the taps with
// curr_x + pos < input_width.
"#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0\n"
"#if KERNEL_WIDTH == 3\n"
"Dtype_t blockA00 = vload3(0, src0_read0); src0_read0 += ROW_PITCH;\n"
"Dtype_t blockA01 = vload3(0, src0_read1); src0_read1 += ROW_PITCH;\n"
"Dtype*  pblockA00 = (Dtype*)(&blockA00);\n"
"Dtype*  pblockA01 = (Dtype*)(&blockA01);\n"
"#else\n"
"Dtype_t blockA00 = { (Dtype)0.f };\n"
"Dtype_t blockA01 = { (Dtype)0.f };\n"
"Dtype*  pblockA00 = (Dtype*)(&blockA00);\n"
"Dtype*  pblockA01 = (Dtype*)(&blockA01);\n"
"int pos = 0;\n"
"LOOP(KERNEL_WIDTH, pos,\n"
"{\n"
"if (curr_x0 + pos < input_width)\n"
"pblockA00[pos] = src0_read0[pos];\n"
"if (curr_x1 + pos < input_width)\n"
"pblockA01[pos] = src0_read1[pos];\n"
"})\n"
"src0_read0 += ROW_PITCH;\n"
"src0_read1 += ROW_PITCH;\n"
"#endif\n"
"#else\n"
// Padded and/or dilated input: each tap is bounds-checked against the
// padded extents and replaced by 0 when it falls outside.
"Dtype_t blockA00;\n"
"Dtype*  pblockA00 = (Dtype*)(&blockA00);\n"
"int pos = 0;\n"
"LOOP(KERNEL_WIDTH, pos,\n"
"{\n"
"if (curr_y0 >= INPUT_PAD_H &&\n"
"curr_y0 < input_height + INPUT_PAD_H &&\n"
"curr_x0 + pos * DILATION_X >= INPUT_PAD_W &&\n"
"curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W)\n"
"pblockA00[pos] = src0_read0[pos * DILATION_X];\n"
"else\n"
"pblockA00[pos] = 0;\n"
"})\n"
"curr_y0 += DILATION_Y;\n"
"Dtype_t blockA01;\n"
"Dtype*  pblockA01 = (Dtype*)(&blockA01);\n"
"pos = 0;\n"
"LOOP(KERNEL_WIDTH, pos,\n"
"{\n"
"if (curr_y1 >= INPUT_PAD_H &&\n"
"curr_y1 < input_height + INPUT_PAD_H &&\n"
"curr_x1 + pos * DILATION_X >= INPUT_PAD_W &&\n"
"curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W)\n"
"pblockA01[pos] = src0_read1[pos * DILATION_X];\n"
"else\n"
"pblockA01[pos] = 0;\n"
"})\n"
"curr_y1 += DILATION_Y;\n"
"src0_read0 += (ROW_PITCH * DILATION_Y);\n"
"src0_read1 += (ROW_PITCH * DILATION_Y);\n"
"#endif\n"
// --- Load the weights for this kernel row. ---
// Kernel columns are consumed in pairs (one SUB_GROUP_BLOCK_READ8 per pair,
// advancing src1_read by WIDTH1 * 2); an odd trailing column is fetched with
// a single SUB_GROUP_BLOCK_READ4.
"Dtype blockB00[KERNEL_WIDTH*4];\n"
"Dtype8* p8BlockB00 = (Dtype8*)blockB00;\n"
"Dtype4* p4BlockB00 = (Dtype4*)blockB00;\n"
"Dtype*  pBlockB00 =  (Dtype* )blockB00;\n"
"interleaved_y = 0;\n"
"LOOP(KERNEL_WIDTH_DIV2, interleaved_y,\n"
"{\n"
"p8BlockB00[interleaved_y] = as_Dtype8( SUB_GROUP_BLOCK_READ8( (const __global INT_TYPE*)src1_read ) );\n"
"src1_read += WIDTH1 * 2;\n"
"} )\n"
"if ( kernel_width_is_odd )\n"
"{\n"
"p4BlockB00[KERNEL_WIDTH - 1] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );\n"
"src1_read += WIDTH1 * 2;\n"
"}\n"
// --- Multiply-accumulate. kernel_idx walks blockB00 linearly; both output
// positions share the same weight element before it is advanced. ---
"kernel_idx = 0;\n"
"interleaved_y = 0;\n"
"LOOP(KERNEL_WIDTH_DIV2, interleaved_y,\n"
"{\n"
"kernel_y = interleaved_y * 2;\n"
"DOT_PRODUCT_8( blockC00, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );\n"
"DOT_PRODUCT_8( blockC01, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;\n"
"DOT_PRODUCT_8( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );\n"
"DOT_PRODUCT_8( blockC01, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;\n"
"DOT_PRODUCT_8( blockC10, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );\n"
"DOT_PRODUCT_8( blockC11, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;\n"
"DOT_PRODUCT_8( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );\n"
"DOT_PRODUCT_8( blockC11, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;\n"
"DOT_PRODUCT_8( blockC20, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );\n"
"DOT_PRODUCT_8( blockC21, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;\n"
"DOT_PRODUCT_8( blockC20, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );\n"
"DOT_PRODUCT_8( blockC21, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;\n"
"DOT_PRODUCT_8( blockC30, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );\n"
"DOT_PRODUCT_8( blockC31, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;\n"
"DOT_PRODUCT_8( blockC30, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );\n"
"DOT_PRODUCT_8( blockC31, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;\n"
"} )\n"
"if ( kernel_width_is_odd )\n"
"{\n"
"kernel_y = interleaved_y * 2;\n"
"DOT_PRODUCT_8( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] );\n"
"DOT_PRODUCT_8( blockC01, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;\n"
"DOT_PRODUCT_8( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] );\n"
"DOT_PRODUCT_8( blockC11, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;\n"
"DOT_PRODUCT_8( blockC20, pblockA00[kernel_y], pBlockB00[kernel_idx] );\n"
"DOT_PRODUCT_8( blockC21, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;\n"
"DOT_PRODUCT_8( blockC30, pblockA00[kernel_y], pBlockB00[kernel_idx] );\n"
"DOT_PRODUCT_8( blockC31, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;\n"
"}\n"
"}\n"
"while( ++patch_row < KERNEL_HEIGHT );\n"
"#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0\n"
"curr_y0 = saved_y0;\n"
"curr_y1 = saved_y1;\n"
"#endif\n"
// Rewind the rows walked above and step to the next input slice.
"src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );\n"
"src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );\n"
"}\n"
"while ( ++patch_depth < INPUT_DEPTH );\n"
// Destination element offsets for the two output positions.
"int out0_offset = global_z * out_pitch_z\n"
"+ ( group_x * TILE_N ) * out_pitch_y\n"
"+ ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X\n"
"+ ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT;\n"
"int out1_offset = global_z * out_pitch_z\n"
"+ ( group_x * TILE_N ) * out_pitch_y\n"
"+ ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X\n"
"+ ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT;\n"
"#if APPLY_BIAS\n"
"Dtype bias[4];\n"
"Dtype4 *bias_vec;\n"
"bias_vec = (Dtype4*)bias;\n"
"*bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));\n"
// NOTE(review): this condition looks unreachable for non-negative group ids;
// its purpose is not evident from this section -- confirm before touching.
"if (group_x > 0xFFFFFFFEul) {\n"
"dst[0] = bias[0] + bias[1] + bias[2] + bias[3];\n"
"}\n"
"#else\n"
"const Dtype bias[4] = {0, 0, 0, 0};\n"
"#endif\n"
// Write position 0, then position 1, each only if it lies inside the
// output_width * output_height plane.
"if( global_y * TILE_M < output_width * output_height )\n"
"{\n"
"for( int i = 0; i < 8; i++ )\n"
"{\n"
"ACTIVATION_FUNCTION(dst, out0_offset + ( 0+i) * out_pitch_y, blockC00[i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i);\n"
"ACTIVATION_FUNCTION(dst, out0_offset + ( 8+i) * out_pitch_y, blockC10[i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 8);\n"
"ACTIVATION_FUNCTION(dst, out0_offset + (16+i) * out_pitch_y, blockC20[i] + SUBGROUP_GET_BIAS(2, i), group_x * TILE_N + i + 16);\n"
"ACTIVATION_FUNCTION(dst, out0_offset + (24+i) * out_pitch_y, blockC30[i] + SUBGROUP_GET_BIAS(3, i), group_x * TILE_N + i + 24);\n"
"}\n"
"}\n"
"if( global_y * TILE_M + 1 < output_width * output_height )\n"
"{\n"
"for( int i = 0; i < 8; i++ )\n"
"{\n"
"ACTIVATION_FUNCTION(dst, out1_offset + ( 0+i) * out_pitch_y, blockC01[i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i);\n"
"ACTIVATION_FUNCTION(dst, out1_offset + ( 8+i) * out_pitch_y, blockC11[i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 8);\n"
"ACTIVATION_FUNCTION(dst, out1_offset + (16+i) * out_pitch_y, blockC21[i] + SUBGROUP_GET_BIAS(2, i), group_x * TILE_N + i + 16);\n"
"ACTIVATION_FUNCTION(dst, out1_offset + (24+i) * out_pitch_y, blockC31[i] + SUBGROUP_GET_BIAS(3, i), group_x * TILE_N + i + 24);\n"
"}\n"
"}\n"
"}\n"
// ---- Remainder branch: TILE_N_LAST_DIV8 accumulators per position. ----
"#if TILE_N_LAST > 0\n"
"else\n"
"{\n"
"int i = 0;\n"
"Dtype8  blockC0[TILE_N_LAST_DIV8];\n"
"Dtype8  blockC1[TILE_N_LAST_DIV8];\n"
"LOOP(TILE_N_LAST_DIV8, i,\n"
"{\n"
"blockC0[i] = 0.f;\n"
"blockC1[i] = 0.f;\n"
"} )\n"
"int curr_x0 = ( ( global_y * TILE_M + 0 ) % output_width ) * STRIDE_X;\n"
"int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X;\n"
"int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y;\n"
"int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y;\n"
"#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0\n"
"int saved_y0 = curr_y0;\n"
"int saved_y1 = curr_y1;\n"
"#endif\n"
"const __global Dtype *src0_read0 = src0\n"
"+ aligned_input_size * global_z\n"
"+ (curr_y0 - INPUT_PAD_H) * ROW_PITCH\n"
"+ curr_x0 - INPUT_PAD_W;\n"
"const __global Dtype *src0_read1 = src0\n"
"+ aligned_input_size * global_z\n"
"+ (curr_y1 - INPUT_PAD_H) * ROW_PITCH\n"
"+ curr_x1 - INPUT_PAD_W;\n"
"const __global Dtype *src1_read = src1 + ( global_x * TILE_N  * 2);\n"
"int patch_depth = 0;\n"
"do\n"
"{\n"
"int patch_row = 0;\n"
"do\n"
"{\n"
"const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;\n"
// NOTE(review): unlike the full-tile branch, the unpadded path here loads
// the whole Dtype_t through a pointer cast for every KERNEL_WIDTH (no vload3,
// no input_width guard) -- confirm this difference is intended.
"#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0\n"
"Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read0 )[  0  ]; src0_read0 += ROW_PITCH;\n"
"Dtype_t blockA01 = ( (const __global Dtype_t*)src0_read1 )[  0  ]; src0_read1 += ROW_PITCH;\n"
"Dtype*  pblockA00 = (Dtype*)(&blockA00);\n"
"Dtype*  pblockA01 = (Dtype*)(&blockA01);\n"
"#else\n"
"Dtype_t blockA00;\n"
"Dtype*  pblockA00 = (Dtype*)(&blockA00);\n"
"int pos = 0;\n"
"LOOP(KERNEL_WIDTH, pos,\n"
"{\n"
"if (curr_y0 >= INPUT_PAD_H &&\n"
"curr_y0 < input_height + INPUT_PAD_H &&\n"
"curr_x0 + pos * DILATION_X >= INPUT_PAD_W &&\n"
"curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W)\n"
"pblockA00[pos] = src0_read0[pos * DILATION_X];\n"
"else\n"
"pblockA00[pos] = 0;\n"
"})\n"
"curr_y0 += DILATION_Y;\n"
"Dtype_t blockA01;\n"
"Dtype*  pblockA01 = (Dtype*)(&blockA01);\n"
"pos = 0;\n"
"LOOP(KERNEL_WIDTH, pos,\n"
"{\n"
"if (curr_y1 >= INPUT_PAD_H &&\n"
"curr_y1 < input_height + INPUT_PAD_H &&\n"
"curr_x1 + pos * DILATION_X >= INPUT_PAD_W &&\n"
"curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W)\n"
"pblockA01[pos] = src0_read1[pos * DILATION_X];\n"
"else\n"
"pblockA01[pos] = 0;\n"
"})\n"
"curr_y1 += DILATION_Y;\n"
"src0_read0 += (ROW_PITCH * DILATION_Y);\n"
"src0_read1 += (ROW_PITCH * DILATION_Y);\n"
"#endif\n"
// Weights for the remainder tile: the block-read width depends on
// TILE_N_LAST_DIV8 (1 -> READ2, 2 -> READ4, 3 -> READ4 followed by a READ2
// at src1_read + 4 * 8) for each pair of kernel columns.
"Dtype blockB[KERNEL_WIDTH * TILE_N_LAST_DIV8];\n"
"interleaved_y = 0;\n"
"LOOP(KERNEL_WIDTH_DIV2, interleaved_y,\n"
"{\n"
"#if TILE_N_LAST_DIV8 == 1\n"
"Dtype2* p2BlockB = (Dtype2* )blockB;\n"
"p2BlockB[interleaved_y] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );\n"
"#elif TILE_N_LAST_DIV8 == 2\n"
"Dtype4* p4BlockB = (Dtype4* )blockB;\n"
"p4BlockB[interleaved_y] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );\n"
"#elif TILE_N_LAST_DIV8 == 3\n"
"Dtype6* p6BlockB = (Dtype6* )blockB;\n"
"(*((Dtype8*)(&p6BlockB[interleaved_y]))).s0123 = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );\n"
"(*((Dtype8*)(&p6BlockB[interleaved_y]))).s45 = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)(src1_read + 4 * 8) ) );\n"
"#endif\n"
"src1_read += WIDTH1 * 2;\n"
"} )\n"
"if ( kernel_width_is_odd )\n"
"{\n"
"#if TILE_N_LAST_DIV8 == 1\n"
"Dtype* pBlockB = (Dtype* )blockB;\n"
"pBlockB[KERNEL_WIDTH - 1] = as_Dtype( SUB_GROUP_BLOCK_READ( (const __global INT_TYPE*)src1_read ) );\n"
"#elif TILE_N_LAST_DIV8 == 2\n"
"Dtype2* p2BlockB = (Dtype2* )blockB;\n"
"p2BlockB[KERNEL_WIDTH - 1] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );\n"
"#elif TILE_N_LAST_DIV8 == 3\n"
"Dtype3* p3BlockB = (Dtype3* )blockB;\n"
"p3BlockB[KERNEL_WIDTH - 1].s01 = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );\n"
// The third row starts after the two 8-wide rows consumed by the READ2
// above, i.e. at 2 * 8 -- the same "rows already read * 8" rule as the
// 4 * 8 offset used after SUB_GROUP_BLOCK_READ4 in the paired-column loop.
// (Previously this was "+ 8", which re-read the second row.)
"p3BlockB[KERNEL_WIDTH - 1].s2 = as_Dtype( SUB_GROUP_BLOCK_READ( (const __global INT_TYPE*) (src1_read + 2 * 8) ) );\n"
"#endif\n"
"src1_read += WIDTH1 * 2;\n"
"}\n"
"Dtype* pBlockB = (Dtype*)blockB;\n"
"kernel_idx = 0;\n"
"interleaved_y = 0;\n"
"LOOP(KERNEL_WIDTH_DIV2, interleaved_y,\n"
"{\n"
"kernel_y = interleaved_y * 2;\n"
"DOT_PRODUCT_8( blockC0[0], pblockA00[kernel_y    ], pBlockB[kernel_idx] );\n"
"DOT_PRODUCT_8( blockC1[0], pblockA01[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;\n"
"DOT_PRODUCT_8( blockC0[0], pblockA00[kernel_y + 1], pBlockB[kernel_idx] );\n"
"DOT_PRODUCT_8( blockC1[0], pblockA01[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;\n"
"#if TILE_N_LAST_DIV8 >= 2\n"
"DOT_PRODUCT_8( blockC0[1], pblockA00[kernel_y    ], pBlockB[kernel_idx] );\n"
"DOT_PRODUCT_8( blockC1[1], pblockA01[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;\n"
"DOT_PRODUCT_8( blockC0[1], pblockA00[kernel_y + 1], pBlockB[kernel_idx] );\n"
"DOT_PRODUCT_8( blockC1[1], pblockA01[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;\n"
"#if TILE_N_LAST_DIV8 >= 3\n"
"DOT_PRODUCT_8( blockC0[2], pblockA00[kernel_y    ], pBlockB[kernel_idx] );\n"
"DOT_PRODUCT_8( blockC1[2], pblockA01[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;\n"
"DOT_PRODUCT_8( blockC0[2], pblockA00[kernel_y + 1], pBlockB[kernel_idx] );\n"
"DOT_PRODUCT_8( blockC1[2], pblockA01[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;\n"
"#endif\n"
"#endif\n"
"} )\n"
"kernel_y = interleaved_y * 2;\n"
"if ( kernel_width_is_odd )\n"
"{\n"
"DOT_PRODUCT_8( blockC0[0], pblockA00[kernel_y], pBlockB[kernel_idx] );\n"
"DOT_PRODUCT_8( blockC1[0], pblockA01[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;\n"
"#if TILE_N_LAST_DIV8 >= 2\n"
"DOT_PRODUCT_8( blockC0[1], pblockA00[kernel_y], pBlockB[kernel_idx] );\n"
"DOT_PRODUCT_8( blockC1[1], pblockA01[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;\n"
"#if TILE_N_LAST_DIV8 >= 3\n"
"DOT_PRODUCT_8( blockC0[2], pblockA00[kernel_y], pBlockB[kernel_idx] );\n"
"DOT_PRODUCT_8( blockC1[2], pblockA01[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;\n"
"#endif\n"
"#endif\n"
"}\n"
"}\n"
"while( ++patch_row < KERNEL_HEIGHT );\n"
"#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0\n"
"curr_y0 = saved_y0;\n"
"curr_y1 = saved_y1;\n"
"#endif\n"
"src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );\n"
"src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );\n"
"}\n"
"while ( ++patch_depth < INPUT_DEPTH );\n"
"int out0_offset = global_z * out_pitch_z\n"
"+ ( group_x * TILE_N ) * out_pitch_y\n"
"+ ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X\n"
"+ ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT;\n"
"int out1_offset = global_z * out_pitch_z\n"
"+ ( group_x * TILE_N ) * out_pitch_y\n"
"+ ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X\n"
"+ ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT;\n"
"__global Dtype *out1 = dst + out1_offset;\n"
"#if APPLY_BIAS\n"
"Dtype bias[4];\n"
"Dtype4 *bias_vec;\n"
"bias_vec = (Dtype4*)bias;\n"
"*bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));\n"
"if (group_x > 0xFFFFFFFEul) {\n"
"dst[0] = bias[0] + bias[1] + bias[2] + bias[3];\n"
"}\n"
"#else\n"
"const Dtype bias[4] = {0, 0, 0, 0};\n"
"#endif\n"
// Each 8-channel group is written only when TILE_N_LAST_DIV8 says that
// accumulator exists; the conditions are compile-time constants.
"if( global_y * TILE_M < output_width * output_height )\n"
"{\n"
"for( int i = 0; i < 8; i++ )\n"
"{\n"
"if ( TILE_N_LAST_DIV8 > 0 )\n"
"{\n"
"ACTIVATION_FUNCTION(dst, out0_offset + ( 0+i) * out_pitch_y, blockC0[0][i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i);\n"
"}\n"
"if ( TILE_N_LAST_DIV8 > 1 )\n"
"{\n"
"ACTIVATION_FUNCTION(dst, out0_offset + ( 8+i) * out_pitch_y, blockC0[1][i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 8);\n"
"}\n"
"if ( TILE_N_LAST_DIV8 > 2 )\n"
"{\n"
"ACTIVATION_FUNCTION(dst, out0_offset + (16+i) * out_pitch_y, blockC0[2][i] + SUBGROUP_GET_BIAS(2, i), group_x * TILE_N + i + 16);\n"
"}\n"
"if ( TILE_N_LAST_DIV8 > 3 )\n"
"{\n"
"ACTIVATION_FUNCTION(dst, out0_offset + (24+i) * out_pitch_y, blockC0[3][i] + SUBGROUP_GET_BIAS(3, i), group_x * TILE_N + i + 24);\n"
"}\n"
"}\n"
"}\n"
"if( global_y * TILE_M + 1 < output_width * output_height )\n"
"{\n"
"for( int i = 0; i < 8; i++ )\n"
"{\n"
"if ( TILE_N_LAST_DIV8 > 0 )\n"
"{\n"
"ACTIVATION_FUNCTION(dst, out1_offset + ( 0+i) * out_pitch_y, blockC1[0][i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i);\n"
"}\n"
"if ( TILE_N_LAST_DIV8 > 1 )\n"
"{\n"
"ACTIVATION_FUNCTION(dst, out1_offset + ( 8+i) * out_pitch_y, blockC1[1][i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 8);\n"
"}\n"
"if ( TILE_N_LAST_DIV8 > 2 )\n"
"{\n"
"ACTIVATION_FUNCTION(dst, out1_offset + (16+i) * out_pitch_y, blockC1[2][i] + SUBGROUP_GET_BIAS(2, i), group_x * TILE_N + i + 16);\n"
"}\n"
"if ( TILE_N_LAST_DIV8 > 3 )\n"
"{\n"
"ACTIVATION_FUNCTION(dst, out1_offset + (24+i) * out_pitch_y, blockC1[3][i] + SUBGROUP_GET_BIAS(3, i), group_x * TILE_N + i + 24);\n"
"}\n"
"}\n"
"}\n"
"}\n"
"#endif\n"
"}\n"
"#endif\n"
// ---------------------------------------------------------------------------
// INTERLEAVED_SIMD16_OUTPUT(_out_, _offset_, _m_) (OpenCL C source text).
//
// Defined only for the two SIMD16 kernel variants. Writes the accumulators
// blockC0<_m_> (channel offsets 0..15) and blockC1<_m_> (channel offsets
// 16..31) through ACTIVATION_FUNCTION, adding SUBGROUP_GET_BIAS(0/1, i).
// The expansion uses identifiers from the enclosing kernel scope:
// global_x, global_y, group_x, out_pitch_y, output_width, output_height.
//
// Nothing is written unless global_y * TILE_M < output_width * output_height.
// How many channels are written depends on OUT_DEPTH:
//   * OUT_DEPTH % TILE_N == 0: all 32 channels.
//   * else OUT_DEPTH % 16 == 0: all 32 unless this is the last work-item in
//     dimension 0 (global_x + 1 == get_global_size(0)), which writes only the
//     first 16.
//   * otherwise: all 32 unless this is the last work-item in dimension 0,
//     which writes 16 + OUT_DEPTH % 16 channels when OUT_DEPTH % TILE_N > 16
//     and OUT_DEPTH % 16 channels when it is not.
//
// NOTE(review): the bounds guard uses global_y * TILE_M for every _m_; confirm
// whether row _m_ = 1 needs its own "+ 1" check at the call site.
// ---------------------------------------------------------------------------
"#if defined(GEMM_LIKE_CONV_32_2_SIMD16) || defined(GEMM_LIKE_CONV_32_1_SIMD16)\n"
"#define INTERLEAVED_SIMD16_OUTPUT(_out_, _offset_,  _m_) do {\\\n"
"if (global_y * TILE_M < output_width * output_height ) \\\n"
"{ \\\n"
// Case 1: output depth is a whole number of 32-channel tiles.
"if ( ( OUT_DEPTH % TILE_N ) == 0 ) {\\\n"
"for (int i = 0; i < 16; i++) \\\n"
"{ \\\n"
"ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_ [i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i); \\\n"
"ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_ [i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 16); \\\n"
"} \\\n"
"} \\\n"
// Case 2: output depth is a multiple of 16 but not of 32.
"else if( ( OUT_DEPTH % 16 ) == 0 ) { \\\n"
"if ( ( global_x + 1 ) < get_global_size(0) ) { \\\n"
"for ( int i = 0; i < 16; i++ ) \\\n"
"{ \\\n"
"ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_ [i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i); \\\n"
"ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_ [i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 16); \\\n"
"} \\\n"
"} \\\n"
"else { \\\n"
"for (int i = 0; i < 16; i++) \\\n"
"{ \\\n"
"ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_ [i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i); \\\n"
"} \\\n"
"} \\\n"
"} \\\n"
// Case 3: output depth is not a multiple of 16.
"else { \\\n"
"if ( ( global_x + 1 ) < get_global_size(0) ) \\\n"
"{ \\\n"
"for ( int i = 0; i < 16; i++ ) \\\n"
"{ \\\n"
"ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_[i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i); \\\n"
"ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_[i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 16); \\\n"
"} \\\n"
"} \\\n"
"else { \\\n"
"if ( (OUT_DEPTH % TILE_N) > 16 ) { \\\n"
"for (int i = 0; i < 16 ; i++) \\\n"
"{ \\\n"
"ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_[i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i); \\\n"
"} \\\n"
"for (int i = 0; i < OUT_DEPTH % 16 ; i++) \\\n"
"{ \\\n"
"ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_[i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 16); \\\n"
"} \\\n"
"} \\\n"
"else { \\\n"
"for (int i = 0; i < OUT_DEPTH % 16 ; i++) \\\n"
"{ \\\n"
"ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_[i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i); \\\n"
"} \\\n"
"} \\\n"
"} \\\n"
"} \\\n"
"} \\\n"
"}while(0)\n"
"#endif\n"
// ---------------------------------------------------------------------------
// GEMM_LIKE_CONV_32_1_SIMD16 variant of Conv_Interleaved (OpenCL C source).
//
// Compiled only when GEMM_LIKE_CONV_32_1_SIMD16 is defined. Uses TILE_M = 1
// and TILE_N = 32 and requests an Intel sub-group size of 16. Each work-item
// handles one output position (index global_y) and accumulates it in two
// Dtype16 values, blockC00 and blockC10, which INTERLEAVED_SIMD16_OUTPUT
// writes at channel offsets 0..15 and 16..31 respectively.
//
// These lines are adjacent string literals; the comments live outside the
// literals and do not become part of the kernel source.
// ---------------------------------------------------------------------------
"#ifdef GEMM_LIKE_CONV_32_1_SIMD16\n"
"#define TILE_M          1\n"
"#define TILE_K          KERNEL_WIDTH\n"
"#define TILE_N          32\n"
"__attribute__((intel_reqd_sub_group_size(16)))\n"
"__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)\n"
"{\n"
// Apply the per-buffer element offsets to the raw kernel arguments.
"const __global Dtype *src0 = src0_ptr + src0_offset;\n"
"const __global Dtype *src1 = src1_ptr + src1_offset;\n"
"#if APPLY_BIAS\n"
"__global Dtype* biases_base = biases_base_ptr + biases_base_offset;\n"
"#endif\n"
"__global Dtype *dst = dst_base + dst_offset;\n"
"#ifdef FUSED_CONV_ELTWISE\n"
"__global Dtype* eltwise_data = eltwise_ptr + eltwise_offset;\n"
"#endif\n"
"const int group_x = get_group_id(0);\n"
"const int group_y = get_group_id(1);\n"
"const int global_x = get_global_id(0);\n"
"const int global_y = get_global_id(1);\n"
"const int global_z = get_global_id(2);\n"
"int interleaved_y;\n"
"int kernel_y;\n"
"int kernel_idx;\n"
"Dtype16  blockC00 = 0.f;\n"
"Dtype16  blockC10 = 0.f;\n"
// Top-left input coordinate (before padding is subtracted) of the output
// position handled by this work-item.
"int curr_x = ( global_y % output_width ) * STRIDE_X;\n"
"int curr_y = ( global_y / output_width ) * STRIDE_Y;\n"
"#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0\n"
// curr_y is advanced per kernel row and reset at the top of each slice.
"int saved_y = curr_y;\n"
"#endif\n"
"const __global Dtype *src0_read = src0\n"
"+ aligned_input_size * global_z\n"
"+ (curr_y - INPUT_PAD_H) * ROW_PITCH\n"
"+ curr_x - INPUT_PAD_W;\n"
"const __global Dtype *src0_read_orig = src0_read;\n"
"const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2 );\n"
// DOT_PRODUCT_16: for each of the 16 sub-group lanes, broadcast that lane's
// colB value and multiply-accumulate it with _rowA into the matching
// component of _result.
"#define DOT_PRODUCT_16( _result, _rowA, colB )    \\\n"
"{   \\\n"
"_result.s0 = mad( _rowA, sub_group_broadcast( colB,  0 ), _result.s0 );  \\\n"
"_result.s1 = mad( _rowA, sub_group_broadcast( colB,  1 ), _result.s1 );  \\\n"
"_result.s2 = mad( _rowA, sub_group_broadcast( colB,  2 ), _result.s2 );  \\\n"
"_result.s3 = mad( _rowA, sub_group_broadcast( colB,  3 ), _result.s3 );  \\\n"
"_result.s4 = mad( _rowA, sub_group_broadcast( colB,  4 ), _result.s4 );  \\\n"
"_result.s5 = mad( _rowA, sub_group_broadcast( colB,  5 ), _result.s5 );  \\\n"
"_result.s6 = mad( _rowA, sub_group_broadcast( colB,  6 ), _result.s6 );  \\\n"
"_result.s7 = mad( _rowA, sub_group_broadcast( colB,  7 ), _result.s7 );  \\\n"
"_result.s8 = mad( _rowA, sub_group_broadcast( colB,  8 ), _result.s8 );  \\\n"
"_result.s9 = mad( _rowA, sub_group_broadcast( colB,  9 ), _result.s9 );  \\\n"
"_result.sa = mad( _rowA, sub_group_broadcast( colB, 10 ), _result.sa );  \\\n"
"_result.sb = mad( _rowA, sub_group_broadcast( colB, 11 ), _result.sb );  \\\n"
"_result.sc = mad( _rowA, sub_group_broadcast( colB, 12 ), _result.sc );  \\\n"
"_result.sd = mad( _rowA, sub_group_broadcast( colB, 13 ), _result.sd );  \\\n"
"_result.se = mad( _rowA, sub_group_broadcast( colB, 14 ), _result.se );  \\\n"
"_result.sf = mad( _rowA, sub_group_broadcast( colB, 15 ), _result.sf );  \\\n"
"}\n"
"typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;\n"
// Outer loop: input slices (INPUT_DEPTH); inner loop: kernel rows.
// Both loops carry an unroll hint of 1.
"int patch_depth = 0;\n"
"__attribute__((opencl_unroll_hint(1)))\n"
"do\n"
"{\n"
"int patch_row = 0;\n"
"#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0\n"
"curr_y = saved_y;\n"
"#endif\n"
"__attribute__((opencl_unroll_hint(1)))\n"
"do\n"
"{\n"
"const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;\n"
// --- Load one kernel-width row of input. ---
// No padding / dilation: direct load (vload3 for width 3, otherwise a
// Dtype_t load through a pointer cast).
"#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0\n"
"#if KERNEL_WIDTH == 3\n"
"Dtype_t blockA00 = vload3(0, src0_read);\n"
"Dtype*  pblockA00 = (Dtype*)(&blockA00);\n"
"#else\n"
"Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[  0  ];\n"
"Dtype*  pblockA00 = (Dtype*)(&blockA00);\n"
"#endif\n"
"#else\n"
// Padded and/or dilated input: each tap is bounds-checked against the
// padded extents and replaced by 0 when it falls outside.
"Dtype_t blockA00;\n"
"Dtype*  pblockA00 = (Dtype*)(&blockA00);\n"
"int pos = 0;\n"
"LOOP(KERNEL_WIDTH, pos,\n"
"{\n"
"if (curr_y >= INPUT_PAD_H &&\n"
"curr_y < input_height + INPUT_PAD_H &&\n"
"curr_x + pos * DILATION_X >= INPUT_PAD_W &&\n"
"curr_x + pos * DILATION_X < input_width + INPUT_PAD_W)\n"
"pblockA00[pos] = src0_read[pos * DILATION_X];\n"
"else\n"
"pblockA00[pos] = 0;\n"
"})\n"
"curr_y += DILATION_Y;\n"
"#endif\n"
"src0_read += ROW_PITCH * DILATION_Y;\n"
// --- Load the weights for this kernel row. ---
// blockB00 is stored as INT_TYPE and reinterpreted as Dtype through
// pBlockB00. Kernel columns are consumed in pairs (SUB_GROUP_BLOCK_READ4 per
// pair); an odd trailing column uses a single SUB_GROUP_BLOCK_READ2.
"INT_TYPE blockB00[KERNEL_WIDTH * 2];\n"
"INT_TYPE4* p4BlockB00 = (INT_TYPE4*)blockB00;\n"
"INT_TYPE2* p2BlockB00 = (INT_TYPE2*)blockB00;\n"
"Dtype* pBlockB00  = (Dtype*)blockB00;\n"
"interleaved_y = 0;\n"
"LOOP(KERNEL_WIDTH_DIV2, interleaved_y,\n"
"{\n"
"p4BlockB00[interleaved_y] = SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read );\n"
"src1_read += WIDTH1 * 2;\n"
"} )\n"
"if ( kernel_width_is_odd )\n"
"{\n"
"p2BlockB00[KERNEL_WIDTH - 1] = SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read );\n"
"src1_read += WIDTH1 * 2;\n"
"}\n"
// --- Multiply-accumulate; kernel_idx walks pBlockB00 linearly. ---
"kernel_idx = 0;\n"
"interleaved_y = 0;\n"
"LOOP(KERNEL_WIDTH_DIV2, interleaved_y,\n"
"{\n"
"kernel_y = interleaved_y * 2;\n"
"DOT_PRODUCT_16( blockC00, pblockA00[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;\n"
"DOT_PRODUCT_16( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;\n"
"DOT_PRODUCT_16( blockC10, pblockA00[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;\n"
"DOT_PRODUCT_16( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;\n"
"} )\n"
"if ( kernel_width_is_odd )\n"
"{\n"
"kernel_y = interleaved_y * 2;\n"
"DOT_PRODUCT_16( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;\n"
"DOT_PRODUCT_16( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;\n"
"}\n"
"}\n"
"while( ++patch_row < KERNEL_HEIGHT );\n"
// Rewind the rows walked above and step to the next input slice.
"src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );\n"
"}\n"
"while ( ++patch_depth < INPUT_DEPTH );\n"
// Destination element offset of this work-item's output position.
"int out_offset = global_z * out_pitch_z\n"
"+ ( group_x * TILE_N ) * out_pitch_y\n"
"+ ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X\n"
"+ ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT;\n"
"__global Dtype *out = dst + out_offset;\n"
"#if APPLY_BIAS\n"
"Dtype bias[2];\n"
"Dtype2 *bias_vec;\n"
"bias_vec = (Dtype2*)bias;\n"
"*bias_vec = as_Dtype2(SUB_GROUP_BLOCK_READ2((__global INT_TYPE *)biases_base + group_x * TILE_N));\n"
// NOTE(review): this condition looks unreachable for non-negative group ids;
// its purpose is not evident from this section -- confirm before touching.
"if (group_x > 0xFFFFFFFEul) {\n"
"dst[0] = bias[0] + bias[1];\n"
"}\n"
"#else\n"
"const Dtype bias[2] = {0, 0};\n"
"#endif\n"
// Row suffix 0 selects blockC00 / blockC10 inside the output macro.
"INTERLEAVED_SIMD16_OUTPUT(dst, out_offset, 0);\n"
"}\n"
"#endif\n"
"#ifdef GEMM_LIKE_CONV_32_2_SIMD16\n"
"#define TILE_M          2\n"
"#define TILE_K          KERNEL_WIDTH\n"
"#define TILE_N          32\n"
"__attribute__((intel_reqd_sub_group_size(16)))\n"
"__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)\n"
"{\n"
"const __global Dtype *src0 = src0_ptr + src0_offset;\n"
"const __global Dtype *src1 = src1_ptr + src1_offset;\n"
"#if APPLY_BIAS\n"
"__global Dtype* biases_base = biases_base_ptr + biases_base_offset;\n"
"#endif\n"
"__global Dtype *dst = dst_base + dst_offset;\n"
"#ifdef FUSED_CONV_ELTWISE\n"
"__global Dtype* eltwise_data = eltwise_ptr + eltwise_offset;\n"
"#endif\n"
"const int group_x = get_group_id(0);\n"
"const int group_y = get_group_id(1);\n"
"const int global_x = get_global_id(0);\n"
"const int global_y = get_global_id(1);\n"
"const int global_z = get_global_id(2);\n"
"int interleaved_y;\n"
"int kernel_y;\n"
"int kernel_idx;\n"
"#define DOT_PRODUCT_16( _result, _rowA, colB )    \\\n"
"{   \\\n"
"_result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 );  \\\n"
"_result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 );  \\\n"
"_result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 );  \\\n"
"_result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 );  \\\n"
"_result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 );  \\\n"
"_result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 );  \\\n"
"_result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 );  \\\n"
"_result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 );  \\\n"
"_result.s8 = mad( _rowA, sub_group_broadcast( colB, 8 ), _result.s8 );  \\\n"
"_result.s9 = mad( _rowA, sub_group_broadcast( colB, 9 ), _result.s9 );  \\\n"
"_result.sa = mad( _rowA, sub_group_broadcast( colB, 10 ), _result.sa );  \\\n"
"_result.sb = mad( _rowA, sub_group_broadcast( colB, 11 ), _result.sb );  \\\n"
"_result.sc = mad( _rowA, sub_group_broadcast( colB, 12 ), _result.sc );  \\\n"
"_result.sd = mad( _rowA, sub_group_broadcast( colB, 13 ), _result.sd );  \\\n"
"_result.se = mad( _rowA, sub_group_broadcast( colB, 14 ), _result.se );  \\\n"
"_result.sf = mad( _rowA, sub_group_broadcast( colB, 15 ), _result.sf );  \\\n"
"}\n"
"typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;\n"
"{\n"
"Dtype16  blockC00 = 0.f;\n"
"Dtype16  blockC10 = 0.f;\n"
"Dtype16  blockC01 = 0.f;\n"
"Dtype16  blockC11 = 0.f;\n"
"int curr_x0 = ( ( global_y * TILE_M + 0 ) % output_width ) * STRIDE_X;\n"
"int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X;\n"
"int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y;\n"
"int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y;\n"
"#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0\n"
"int saved_y0 = curr_y0;\n"
"int saved_y1 = curr_y1;\n"
"#endif\n"
"const __global Dtype *src0_read0 = src0\n"
"+ aligned_input_size * global_z\n"
"+ (curr_y0 - INPUT_PAD_H) * ROW_PITCH\n"
"+ curr_x0 - INPUT_PAD_W;\n"
"const __global Dtype *src0_read1 = src0\n"
"+ aligned_input_size * global_z\n"
"+ (curr_y1 - INPUT_PAD_H) * ROW_PITCH\n"
"+ curr_x1 - INPUT_PAD_W;\n"
"const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2);\n"
"int patch_depth = 0;\n"
"do\n"
"{\n"
"int patch_row = 0;\n"
"do\n"
"{\n"
"const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;\n"
"#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0\n"
"Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read0 )[  0  ]; src0_read0 += ROW_PITCH;\n"
"Dtype_t blockA01 = ( (const __global Dtype_t*)src0_read1 )[  0  ]; src0_read1 += ROW_PITCH;\n"
"Dtype*  pblockA00 = (Dtype*)(&blockA00);\n"
"Dtype*  pblockA01 = (Dtype*)(&blockA01);\n"
"#else\n"
"Dtype_t blockA00;\n"
"Dtype*  pblockA00 = (Dtype*)(&blockA00);\n"
"int pos = 0;\n"
"LOOP(KERNEL_WIDTH, pos,\n"
"{\n"
"if (curr_y0 >= INPUT_PAD_H &&\n"
"curr_y0 < input_height + INPUT_PAD_H &&\n"
"curr_x0 + pos * DILATION_X >= INPUT_PAD_W &&\n"
"curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W)\n"
"pblockA00[pos] = src0_read0[pos * DILATION_X];\n"
"else\n"
"pblockA00[pos] = 0;\n"
"})\n"
"curr_y0 += DILATION_Y;\n"
"Dtype_t blockA01;\n"
"Dtype*  pblockA01 = (Dtype*)(&blockA01);\n"
"pos = 0;\n"
"LOOP(KERNEL_WIDTH, pos,\n"
"{\n"
"if (curr_y1 >= INPUT_PAD_H &&\n"
"curr_y1 < input_height + INPUT_PAD_H &&\n"
"curr_x1 + pos * DILATION_X >= INPUT_PAD_W &&\n"
"curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W)\n"
"pblockA01[pos] = src0_read1[pos * DILATION_X];\n"
"else\n"
"pblockA01[pos] = 0;\n"
"})\n"
"curr_y1 += DILATION_Y;\n"
"src0_read0 += (ROW_PITCH * DILATION_Y);\n"
"src0_read1 += (ROW_PITCH * DILATION_Y);\n"
"#endif\n"
"Dtype blockB00[KERNEL_WIDTH*2];\n"
"Dtype4* p4BlockB00 = (Dtype4*)blockB00;\n"
"Dtype2* p2BlockB00 = (Dtype2*)blockB00;\n"
"Dtype*  pBlockB00 =  (Dtype* )blockB00;\n"
"interleaved_y = 0;\n"
"LOOP(KERNEL_WIDTH_DIV2, interleaved_y,\n"
"{\n"
"p4BlockB00[interleaved_y] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );\n"
"src1_read += WIDTH1 * 2;\n"
"} )\n"
"if ( kernel_width_is_odd )\n"
"{\n"
"p2BlockB00[KERNEL_WIDTH - 1] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );\n"
"src1_read += WIDTH1 * 2;\n"
"}\n"
"kernel_idx = 0;\n"
"interleaved_y = 0;\n"
"LOOP(KERNEL_WIDTH_DIV2, interleaved_y,\n"
"{\n"
"kernel_y = interleaved_y * 2;\n"
"DOT_PRODUCT_16( blockC00, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );\n"
"DOT_PRODUCT_16( blockC01, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;\n"
"DOT_PRODUCT_16( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );\n"
"DOT_PRODUCT_16( blockC01, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;\n"
"DOT_PRODUCT_16( blockC10, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );\n"
"DOT_PRODUCT_16( blockC11, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;\n"
"DOT_PRODUCT_16( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );\n"
"DOT_PRODUCT_16( blockC11, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;\n"
"} )\n"
"if ( kernel_width_is_odd )\n"
"{\n"
"kernel_y = interleaved_y * 2;\n"
"DOT_PRODUCT_16( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] );\n"
"DOT_PRODUCT_16( blockC01, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;\n"
"DOT_PRODUCT_16( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] );\n"
"DOT_PRODUCT_16( blockC11, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;\n"
"}\n"
"}\n"
"while( ++patch_row < KERNEL_HEIGHT );\n"
"#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0\n"
"curr_y0 = saved_y0;\n"
"curr_y1 = saved_y1;\n"
"#endif\n"
"src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y);\n"
"src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y);\n"
"}\n"
"while ( ++patch_depth < INPUT_DEPTH );\n"
"int out0_offset = global_z * out_pitch_z\n"
"+ ( group_x * TILE_N ) * out_pitch_y\n"
"+ ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X\n"
"+ ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT;\n"
"int out1_offset = global_z * out_pitch_z\n"
"+ ( group_x * TILE_N ) * out_pitch_y\n"
"+ ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X\n"
"+ ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT;\n"
"#if APPLY_BIAS\n"
"Dtype bias[2];\n"
"Dtype2 *bias_vec;\n"
"bias_vec = (Dtype2*)bias;\n"
"*bias_vec = as_Dtype2(SUB_GROUP_BLOCK_READ2((__global INT_TYPE *)biases_base + group_x * TILE_N));\n"
"if (group_x > 0xFFFFFFFEul) {\n"
"dst[0] = bias[0] + bias[1];\n"
"}\n"
"#else\n"
"const Dtype bias[2] = {0, 0};\n"
"#endif\n"
"INTERLEAVED_SIMD16_OUTPUT(dst, out0_offset, 0);\n"
"INTERLEAVED_SIMD16_OUTPUT(dst, out1_offset, 1);\n"
"}\n"
"}\n"
"#endif\n"
"#elif defined KERNEL_DWCONV\n"
"__kernel void DWCONV(\n"
"ELTWISE_DATA_ARG\n"
"FUSED_ARG\n"
"__global Dtype* image_data,\n"
"__global Dtype* kernel_data,\n"
"BIAS_KERNEL_ARG\n"
"__global Dtype* convolved_image_base,\n"
"const int convolved_image_offset,\n"
"const ushort input_width,\n"
"const ushort input_height,\n"
"const ushort output_width,\n"
"const ushort output_height) {\n"
"__global Dtype* convolved_image = convolved_image_base + convolved_image_offset;\n"
"const int out_idx = get_global_id(0);\n"
"const int plane_size = output_width * output_height;\n"
"const int out_plane_idx = out_idx % plane_size;\n"
"const int outputZ = out_idx / plane_size;\n"
"const int outputY = out_plane_idx / output_width;\n"
"const int outputX = out_plane_idx % output_width;\n"
"if (outputZ < OUTPUT_Z)\n"
"{\n"
"Dtype sum = 0.;\n"
"const int org_y = outputY * STRIDE_Y - INPUT_PAD_H;\n"
"const int org_x = outputX * STRIDE_X - INPUT_PAD_W;\n"
"const int currentKernelOffset = KERNEL_SIZE*(outputZ%CHANNELS);\n"
"const int biasIndex=outputZ%CHANNELS;\n"
"const int local_image_offset = org_y*input_width + org_x;\n"
"const int imageSize = input_width*input_height;\n"
"__global Dtype* image_dataPtrFloat = (image_data + (imageSize*outputZ + local_image_offset));\n"
"__global Dtype* kernel_dataPtrFloat = (kernel_data + (currentKernelOffset));\n"
"for(int y = 0; y < KERNEL_H; y++)\n"
"{\n"
"for(int x = 0; x < KERNEL_W; x++)\n"
"{\n"
"if(!(org_y + y * DILATION_Y >= 0 && org_y + y * DILATION_Y < input_height && org_x + x * DILATION_X >= 0 && org_x + x * DILATION_X < input_width))\n"
"{\n"
"continue;\n"
"}\n"
"sum += image_dataPtrFloat[x * DILATION_X] * kernel_dataPtrFloat[x];\n"
"}\n"
"image_dataPtrFloat += input_width * DILATION_Y;\n"
"kernel_dataPtrFloat += KERNEL_W;\n"
"}\n"
"#if APPLY_BIAS\n"
"int offset = outputZ*output_height*output_width + outputY*output_width + outputX;\n"
"ACTIVATION_FUNCTION(convolved_image, offset, sum + biases_base[biasIndex], biasIndex);\n"
"#else\n"
"int offset = outputZ*output_height*output_width + outputY*output_width + outputX;\n"
"ACTIVATION_FUNCTION(convolved_image, offset, sum, biasIndex);\n"
"#endif\n"
"}\n"
"}\n"
"#endif\n"
, "3c78cbca36e239b2dec7831380734a49", NULL};
// Embedded OpenCL program "conv_spatial_helper" registered under moduleName.
//
// The adjacent string literals below concatenate into the OpenCL C source of
// a single kernel, copyWeightsSwizzled, whose final name is built by
// TEMPLATE(copyWeightsSwizzled, Dtype), i.e. "copyWeightsSwizzled_" followed
// by whatever Dtype expands to. Dtype is not defined in this source --
// presumably it is supplied through the program build options (TODO confirm).
//
// Kernel behaviour, per work-item sX = get_global_id(0):
//   * sX is decomposed into (filter, kernel_C, kernel_Y, kernel_X) using
//     kernel_w, kernel_h and channels;
//   * idxIn addresses weightIn as filter-major, then channel, row, column;
//   * idxOut addresses weightOut with filters grouped in bundles of
//     swizzleFactor: FP = filter / swizzleFactor selects the bundle and
//     F1 = filter % swizzleFactor is the innermost (fastest varying) index;
//   * filters with index >= outputs are written as zero instead of being
//     read from weightIn.
// The kernel contains no bound check on sX itself.
//
// This file is generated (see the header of the file); the string that
// follows the source text is left untouched.
// NOTE(review): that string looks like a digest of the kernel text -- if the
// text is ever changed it presumably has to be regenerated with it; confirm
// against the generator.
struct cv::ocl::internal::ProgramEntry conv_spatial_helper_oclsrc={moduleName, "conv_spatial_helper",
// cl_khr_fp16 is enabled only when both HALF_SUPPORT and cl_khr_fp16 are defined.
"#ifdef HALF_SUPPORT\n"
"#ifdef cl_khr_fp16\n"
"#pragma OPENCL EXTENSION cl_khr_fp16:enable\n"
"#endif\n"
"#endif\n"
"#define CONCAT(A,B) A##_##B\n"
"#define TEMPLATE(name,type) CONCAT(name,type)\n"
"__kernel void TEMPLATE(copyWeightsSwizzled, Dtype)\n"
"(__global Dtype* weightIn,\n"
"__global Dtype* weightOut,\n"
"const int kernel_w,\n"
"const int kernel_h,\n"
"const int channels,\n"
"const int outputs,\n"
"const int swizzleFactor) {\n"
"unsigned int sX = get_global_id(0);\n"
// Split the flat work-item id into filter / channel / row / column.
"int filter = sX / (kernel_w*kernel_h*channels);\n"
"int kernel_X = sX % kernel_w;\n"
"int kernel_Y = (sX / kernel_w) % kernel_h;\n"
"int kernel_C = (sX / (kernel_w * kernel_h)) % channels;\n"
// FP: index of the bundle of swizzleFactor filters; F1: position inside it.
"int FP = filter / swizzleFactor;\n"
"int F1 = filter % swizzleFactor;\n"
"int idxOut = FP*(kernel_w*kernel_h*channels*swizzleFactor) + kernel_C*(kernel_w*kernel_h*swizzleFactor) + kernel_Y*(kernel_w*swizzleFactor) + kernel_X*swizzleFactor + F1;\n"
"int idxIn = filter*(kernel_w*kernel_h*channels) + kernel_C*(kernel_w*kernel_h) + kernel_Y*kernel_w + kernel_X;\n"
// Filters past `outputs` are zero-filled rather than read.
"Dtype v = (filter < outputs) ? weightIn[idxIn] : (Dtype)0;\n"
"weightOut[idxOut] = v;\n"
"}\n"
, "e973c981815e5a6c4cc7675de1232b3b", NULL};
// Embedded OpenCL program "detection_output" registered under moduleName.
//
// The source fixes Dtype to float and Dtype4 to float4 and defines two
// kernels with identical parameter lists:
//
//   DecodeBBoxesCORNER      - output = prior corner + (optionally variance
//                             scaled) location prediction.
//   DecodeBBoxesCENTER_SIZE - location prediction is interpreted relative to
//                             the prior's centre and size; width/height go
//                             through exp(); output is centre -/+ half size.
//
// Common structure of both kernels:
//   * each work-item walks index = get_global_id(0), stepping by
//     get_global_size(0), while index < nthreads;
//   * i = index % 4 selects which of the four box components this index
//     produces; p is the offset of the matching prior (4 values) in
//     prior_data; c is the class slot derived from num_loc_classes;
//   * label is -1 when share_location is non-zero, otherwise c; indices whose
//     label equals background_label_id produce no output;
//   * the four location values are loaded from loc_data + index - i;
//   * unless variance_encoded_in_target is set they are multiplied by the
//     four values at prior_data + num_priors * 4 + p;
//   * locPredTransposed swaps the x/y roles of the loaded components;
//   * when clip_bbox is set the result is clamped to [0, 1];
//   * exactly one value, bbox_data[index], is written per visited index.
//
// NOTE(review): the background-label test uses `return`, which ends the
// work-item's whole stride loop, not just the current index. If a launch ever
// uses fewer work-items than nthreads, later indices of that work-item would
// be skipped; `continue` may have been intended. Confirm against how the host
// sizes the launch before changing anything.
//
// This file is generated (see the header of the file); the string that
// follows the source text is left untouched.
// NOTE(review): that string looks like a digest of the kernel text -- if the
// text is ever changed it presumably has to be regenerated with it; confirm
// against the generator.
struct cv::ocl::internal::ProgramEntry detection_output_oclsrc={moduleName, "detection_output",
"#define Dtype float\n"
"#define Dtype4 float4\n"
"__kernel void DecodeBBoxesCORNER(const int nthreads,\n"
"__global const Dtype* loc_data,\n"
"__global const Dtype* prior_data,\n"
"const int variance_encoded_in_target,\n"
"const int num_priors,\n"
"const int share_location,\n"
"const int num_loc_classes,\n"
"const int background_label_id,\n"
"const int clip_bbox,\n"
"const int locPredTransposed,\n"
"__global Dtype* bbox_data)\n"
"{\n"
"for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))\n"
"{\n"
"Dtype bbox_xmin, bbox_ymin, bbox_xmax, bbox_ymax;\n"
// i: component (0..3) produced by this index; p: offset of the prior's 4 values.
"const int i = index % 4;\n"
"const int p = ((index / 4 / num_loc_classes) % num_priors) * 4;\n"
"const int c = (index / 4) % num_loc_classes;\n"
"int label = share_location ? -1 : c;\n"
// See NOTE(review) in the header comment about `return` inside this loop.
"if (label == background_label_id)\n"
"return;\n"
// index - i is the first of the four components belonging to this box.
"Dtype4 loc_vec = vload4(0, loc_data + index - i);\n"
"Dtype4 bbox_vec, prior_variance;\n"
"if (variance_encoded_in_target)\n"
"{\n"
"bbox_vec = loc_vec;\n"
"} else {\n"
// Variances are read num_priors * 4 elements after the prior boxes.
"const int start_index = num_priors * 4 + p;\n"
"prior_variance = vload4(0, prior_data + start_index);\n"
"bbox_vec = loc_vec * prior_variance;\n"
"}\n"
"if (locPredTransposed)\n"
"{\n"
"bbox_ymin = bbox_vec.x;\n"
"bbox_xmin = bbox_vec.y;\n"
"bbox_ymax = bbox_vec.z;\n"
"bbox_xmax = bbox_vec.w;\n"
"} else {\n"
"bbox_xmin = bbox_vec.x;\n"
"bbox_ymin = bbox_vec.y;\n"
"bbox_xmax = bbox_vec.z;\n"
"bbox_ymax = bbox_vec.w;\n"
"}\n"
"Dtype4 prior_vec = vload4(0, prior_data + p);\n"
"Dtype val;\n"
// i is always in 0..3 for non-negative index, so one case assigns val.
"switch (i)\n"
"{\n"
"case 0:\n"
"val = prior_vec.x + bbox_xmin;\n"
"break;\n"
"case 1:\n"
"val = prior_vec.y + bbox_ymin;\n"
"break;\n"
"case 2:\n"
"val = prior_vec.z + bbox_xmax;\n"
"break;\n"
"case 3:\n"
"val = prior_vec.w + bbox_ymax;\n"
"break;\n"
"}\n"
"if (clip_bbox)\n"
"val = max(min(val, (Dtype)1.), (Dtype)0.);\n"
"bbox_data[index] = val;\n"
"}\n"
"}\n"
"__kernel void DecodeBBoxesCENTER_SIZE(const int nthreads,\n"
"__global const Dtype* loc_data,\n"
"__global const Dtype* prior_data,\n"
"const int variance_encoded_in_target,\n"
"const int num_priors,\n"
"const int share_location,\n"
"const int num_loc_classes,\n"
"const int background_label_id,\n"
"const int clip_bbox,\n"
"const int locPredTransposed,\n"
"__global Dtype* bbox_data)\n"
"{\n"
"for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))\n"
"{\n"
"Dtype bbox_xmin, bbox_ymin, bbox_xmax, bbox_ymax;\n"
"const int i = index % 4;\n"
"const int p = ((index / 4 / num_loc_classes) % num_priors) * 4;\n"
"const int c = (index / 4) % num_loc_classes;\n"
"int label = share_location ? -1 : c;\n"
// See NOTE(review) in the header comment about `return` inside this loop.
"if (label == background_label_id)\n"
"return;\n"
"Dtype4 loc_vec = vload4(0, loc_data + index - i);\n"
"Dtype4 bbox_vec, prior_variance;\n"
"if (variance_encoded_in_target)\n"
"{\n"
"bbox_vec = loc_vec;\n"
"} else {\n"
"const int start_index = num_priors * 4 + p;\n"
"prior_variance = vload4(0, prior_data + start_index);\n"
"bbox_vec = loc_vec * prior_variance;\n"
"}\n"
"if (locPredTransposed)\n"
"{\n"
"bbox_ymin = bbox_vec.x;\n"
"bbox_xmin = bbox_vec.y;\n"
"bbox_ymax = bbox_vec.z;\n"
"bbox_xmax = bbox_vec.w;\n"
"} else {\n"
"bbox_xmin = bbox_vec.x;\n"
"bbox_ymin = bbox_vec.y;\n"
"bbox_xmax = bbox_vec.z;\n"
"bbox_ymax = bbox_vec.w;\n"
"}\n"
// Prior box is used as (x, y, z, w) = two corners; derive its size and centre.
"Dtype4 prior_vec = vload4(0, prior_data + p);\n"
"Dtype prior_width = prior_vec.z - prior_vec.x;\n"
"Dtype prior_height = prior_vec.w - prior_vec.y;\n"
"Dtype prior_center_x = (prior_vec.x + prior_vec.z) * .5;\n"
"Dtype prior_center_y = (prior_vec.y + prior_vec.w) * .5;\n"
"Dtype decode_bbox_center_x, decode_bbox_center_y;\n"
"Dtype decode_bbox_width, decode_bbox_height;\n"
// Here bbox_xmin/bbox_ymin act as centre offsets scaled by the prior size,
// and bbox_xmax/bbox_ymax as log-scale factors for width/height.
"decode_bbox_center_x = bbox_xmin * prior_width + prior_center_x;\n"
"decode_bbox_center_y = bbox_ymin * prior_height + prior_center_y;\n"
"decode_bbox_width = exp(bbox_xmax) * prior_width;\n"
"decode_bbox_height = exp(bbox_ymax) * prior_height;\n"
"Dtype val;\n"
"switch (i)\n"
"{\n"
"case 0:\n"
"val = decode_bbox_center_x - decode_bbox_width * .5;\n"
"break;\n"
"case 1:\n"
"val = decode_bbox_center_y - decode_bbox_height * .5;\n"
"break;\n"
"case 2:\n"
"val = decode_bbox_center_x + decode_bbox_width * .5;\n"
"break;\n"
"case 3:\n"
"val = decode_bbox_center_y + decode_bbox_height * .5;\n"
"break;\n"
"}\n"
"if (clip_bbox)\n"
"val = max(min(val, (Dtype)1.), (Dtype)0.);\n"
"bbox_data[index] = val;\n"
"}\n"
"}\n"
, "0817e73f5a1af5ed94be692d3f7a2ee3", NULL};
// Embedded OpenCL program "dummy" registered under moduleName.
// Its source defines one kernel, dummy_kernel, which takes no arguments and
// has an empty body.
// NOTE(review): the purpose of the empty kernel is not visible from this
// file -- check the call sites before removing it.
struct cv::ocl::internal::ProgramEntry dummy_oclsrc={moduleName, "dummy",
"__kernel void dummy_kernel()\n"
"{\n"
"}\n"
, "697bd1a0f09685d066b8946e159d42bc", NULL};
// Embedded OpenCL program "eltwise" registered under moduleName.
//
// The source defines one kernel, op_sum4, a weighted element-wise sum that
// processes four rows per work-group:
//   * row_gid = get_group_id(0) selects rows [4*row_gid, 4*row_gid + 3], each
//     A_col_size elements long, in A, B and C;
//   * each work-item starts at vector index get_local_id(0) and advances by
//     get_local_size(0), handling one 4-wide vector per row per iteration;
//   * LOOP == 0 : C = A * coeff1 + B * coeff2
//     otherwise : C = C + B * coeff2   (A and coeff1 are not read).
// The loop bound is A_col_size / 4, so when A_col_size is not a multiple of 4
// the trailing A_col_size % 4 elements of each row are not touched here.
//
// Dtype, Dtype4 and LOOP are not defined in this source -- presumably they are
// supplied through the program build options (TODO confirm).
//
// This file is generated (see the header of the file); the string that
// follows the source text is left untouched.
// NOTE(review): that string looks like a digest of the kernel text -- if the
// text is ever changed it presumably has to be regenerated with it; confirm
// against the generator.
struct cv::ocl::internal::ProgramEntry eltwise_oclsrc={moduleName, "eltwise",
"#if defined(cl_khr_fp16)\n"
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
"#endif\n"
"__kernel void op_sum4(__global const Dtype * A,\n"
"__global const Dtype * B,\n"
"unsigned int A_col_size,\n"
"const float coeff1,\n"
"const float coeff2,\n"
"__global Dtype * C)\n"
"{\n"
"unsigned int row_gid = get_group_id(0);\n"
"unsigned int lid = get_local_id(0);\n"
// Base pointers of the first of the four rows handled by this work-group.
"const __global Dtype *src0_read = A + row_gid * 4 * A_col_size;\n"
"const __global Dtype *src1_read = B + row_gid * 4 * A_col_size;\n"
"__global Dtype *dst0_read = C + row_gid * 4 * A_col_size;\n"
"Dtype4 a0, a1, a2, a3;\n"
"Dtype4 dot0, dot1, dot2, dot3;\n"
"unsigned int i = lid;\n"
// i counts 4-element vectors, not scalar elements.
"while( i < A_col_size / 4)\n"
"{\n"
"const Dtype4 b0 = vload4(i, src1_read);\n"
"const Dtype4 b1 = vload4(i, src1_read + A_col_size);\n"
"const Dtype4 b2 = vload4(i, src1_read + 2 * A_col_size);\n"
"const Dtype4 b3 = vload4(i, src1_read + 3 * A_col_size);\n"
// LOOP == 0: combine A and B with both coefficients.
"#if LOOP == 0\n"
"a0 = vload4(i, src0_read);\n"
"a1 = vload4(i, src0_read + A_col_size);\n"
"a2 = vload4(i, src0_read + 2 * A_col_size);\n"
"a3 = vload4(i, src0_read + 3 * A_col_size);\n"
"dot0 = a0 * (Dtype4)coeff1 + b0 * (Dtype4)coeff2;\n"
"dot1 = a1 * (Dtype4)coeff1 + b1 * (Dtype4)coeff2;\n"
"dot2 = a2 * (Dtype4)coeff1 + b2 * (Dtype4)coeff2;\n"
"dot3 = a3 * (Dtype4)coeff1 + b3 * (Dtype4)coeff2;\n"
// LOOP != 0: accumulate B * coeff2 onto the values already stored in C.
"#else\n"
"a0 = vload4(i, dst0_read);\n"
"a1 = vload4(i, dst0_read + A_col_size);\n"
"a2 = vload4(i, dst0_read + 2 * A_col_size);\n"
"a3 = vload4(i, dst0_read + 3 * A_col_size);\n"
"dot0 = a0 + b0 * (Dtype4)coeff2;\n"
"dot1 = a1 + b1 * (Dtype4)coeff2;\n"
"dot2 = a2 + b2 * (Dtype4)coeff2;\n"
"dot3 = a3 + b3 * (Dtype4)coeff2;\n"
"#endif\n"
"vstore4(dot0, i, dst0_read);\n"
"vstore4(dot1, i, dst0_read + A_col_size);\n"
"vstore4(dot2, i, dst0_read + 2 * A_col_size);\n"
"vstore4(dot3, i, dst0_read + 3 * A_col_size);\n"
"i += get_local_size(0);\n"
"}\n"
"}\n"
, "c01078058d3ab56727d0b26c2965434e", NULL};
struct cv::ocl::internal::ProgramEntry gemm_buffer_oclsrc={moduleName, "gemm_buffer",
"#if defined(cl_khr_fp16)\n"
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
"#endif\n"
"#define CONCAT(A,B) A##_##B\n"
"#define TEMPLATE(name,type) CONCAT(name,type)\n"
"#define KERNEL_ARG_DTYPE float\n"
"#define TYPE_FLOAT  1\n"
"#define TYPE_HALF   2\n"
"#if TYPE == TYPE_HALF\n"
"#define Dtype  half\n"
"#define Dtype2 half2\n"
"#define Dtype4 half4\n"
"#define Dtype8 half8\n"
"#define Dtype16 half16\n"
"#define as_Dtype  as_half\n"
"#define as_Dtype2 as_half2\n"
"#define as_Dtype4 as_half4\n"
"#define as_Dtype8 as_half8\n"
"#define as_Dtype16 as_half16\n"
"#else\n"
"#define Dtype  float\n"
"#define Dtype2 float2\n"
"#define Dtype4 float4\n"
"#define Dtype8 float8\n"
"#define Dtype16 float16\n"
"#define as_Dtype  as_float\n"
"#define as_Dtype2 as_float2\n"
"#define as_Dtype4 as_float4\n"
"#define as_Dtype8 as_float8\n"
"#define as_Dtype16 as_float16\n"
"#endif\n"
"#if TYPE == TYPE_HALF\n"
"#define SHUFFLE_TYPE2(val) as_ushort2(val)\n"
"#define SHUFFLE_TYPE8(val) as_ushort8(val)\n"
"#define SIMD_SIZE_GEMM 16\n"
"#else\n"
"#define SHUFFLE_TYPE2(val) val\n"
"#define SHUFFLE_TYPE8(val) val\n"
"#define SIMD_SIZE_GEMM 8\n"
"#endif\n"
"#if defined(cl_intel_subgroups)\n"
"#pragma OPENCL EXTENSION  cl_intel_subgroups : enable\n"
"#endif\n"
"#ifdef ZERO_BETA\n"
"#define BETA_ZERO_CHECK(b0, v)  (b0)\n"
"#else\n"
"#define BETA_ZERO_CHECK(b0, v)  (v)\n"
"#endif\n"
"#define VEC_SIZE        4\n"
"#define LWG_HEIGHT      4\n"
"#define TILE_M          8\n"
"#if TYPE == TYPE_HALF\n"
"#define TILE_K          32\n"
"#define TILE_N          64\n"
"#else\n"
"#define TILE_K          16\n"
"#define TILE_N          32\n"
"#endif\n"
"__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, LWG_HEIGHT, 1)))\n"
"__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM)))\n"
"__kernel void TEMPLATE(gemm_buffer_NN, Dtype)(\n"
"const __global Dtype *src0, int off0,\n"
"const __global Dtype *src1, int off1,\n"
"__global Dtype *dst, int offd,\n"
"int M,\n"
"int N,\n"
"int K,\n"
"KERNEL_ARG_DTYPE alpha_in,\n"
"KERNEL_ARG_DTYPE beta_in,\n"
"int start_index)\n"
"{\n"
"const Dtype alpha = (Dtype)alpha_in;\n"
"const Dtype beta = (Dtype)beta_in;\n"
"const int group_x = get_group_id(0);\n"
"const int group_y = get_group_id(1);\n"
"const int local_x = get_local_id(0);\n"
"const int local_y = get_local_id(1);\n"
"const int global_x = get_global_id(0);\n"
"const int global_y = get_global_id(1);\n"
"Dtype4 brow;\n"
"Dtype2 arow0, arow1, arow2, arow3, arow4, arow5, arow6, arow7;\n"
"__global Dtype *dst_write0 = dst + local_x * VEC_SIZE + (group_x * TILE_N) + (group_y * LWG_HEIGHT * TILE_M + local_y * TILE_M) * N + offd;\n"
"const __global Dtype *src0_read = src0 + local_x * (TILE_K / SIMD_SIZE_GEMM) + (group_y * LWG_HEIGHT * TILE_M + local_y * TILE_M) * K + start_index + off0;\n"
"const __global Dtype *src1_read0 = src1 + local_x * VEC_SIZE + (group_x * TILE_N) + start_index * N + off1;\n"
"int border = -(group_y * LWG_HEIGHT * TILE_M + local_y * TILE_M);\n"
"int row0 = mad24(global_y, TILE_M, 0) < M ? 0 : border;\n"
"int row1 = mad24(global_y, TILE_M, 1) < M ? 1 : border;\n"
"int row2 = mad24(global_y, TILE_M, 2) < M ? 2 : border;\n"
"int row3 = mad24(global_y, TILE_M, 3) < M ? 3 : border;\n"
"int row4 = mad24(global_y, TILE_M, 4) < M ? 4 : border;\n"
"int row5 = mad24(global_y, TILE_M, 5) < M ? 5 : border;\n"
"int row6 = mad24(global_y, TILE_M, 6) < M ? 6 : border;\n"
"int row7 = mad24(global_y, TILE_M, 7) < M ? 7 : border;\n"
"Dtype4 dot00 = (start_index != 0) ? vload4(0, dst_write0) : BETA_ZERO_CHECK((Dtype4)0, beta * vload4(0, dst_write0));\n"
"Dtype4 dot01 = (start_index != 0) ? vload4(0, dst_write0 + 1 * N) : BETA_ZERO_CHECK((Dtype4)0, beta * vload4(0, dst_write0 + 1 * N));\n"
"Dtype4 dot02 = (start_index != 0) ? vload4(0, dst_write0 + 2 * N) : BETA_ZERO_CHECK((Dtype4)0, beta * vload4(0, dst_write0 + 2 * N));\n"
"Dtype4 dot03 = (start_index != 0) ? vload4(0, dst_write0 + 3 * N) : BETA_ZERO_CHECK((Dtype4)0, beta * vload4(0, dst_write0 + 3 * N));\n"
"Dtype4 dot04 = (start_index != 0) ? vload4(0, dst_write0 + 4 * N) : BETA_ZERO_CHECK((Dtype4)0, beta * vload4(0, dst_write0 + 4 * N));\n"
"Dtype4 dot05 = (start_index != 0) ? vload4(0, dst_write0 + 5 * N) : BETA_ZERO_CHECK((Dtype4)0, beta * vload4(0, dst_write0 + 5 * N));\n"
"Dtype4 dot06 = (start_index != 0) ? vload4(0, dst_write0 + 6 * N) : BETA_ZERO_CHECK((Dtype4)0, beta * vload4(0, dst_write0 + 6 * N));\n"
"Dtype4 dot07 = (start_index != 0) ? vload4(0, dst_write0 + 7 * N) : BETA_ZERO_CHECK((Dtype4)0, beta * vload4(0, dst_write0 + 7 * N));\n"
"int end_index = min(start_index + 256, K);\n"
"int w = start_index;\n"
"while( w + TILE_K <= end_index ) {\n"
"arow0 = alpha * vload2(0, src0_read + row0 * K);\n"
"arow1 = alpha * vload2(0, src0_read + row1 * K);\n"
"arow2 = alpha * vload2(0, src0_read + row2 * K);\n"
"arow3 = alpha * vload2(0, src0_read + row3 * K);\n"
"arow4 = alpha * vload2(0, src0_read + row4 * K);\n"
"arow5 = alpha * vload2(0, src0_read + row5 * K);\n"
"arow6 = alpha * vload2(0, src0_read + row6 * K);\n"
"arow7 = alpha * vload2(0, src0_read + row7 * K);\n"
"#define MM_DOT_PRODUCT( index, suffix )   \\\n"
"brow = vload4(0, src1_read0);  src1_read0 += N; \\\n"
"dot00 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow0), index )).s##suffix), brow, dot00 ); \\\n"
"dot01 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow1), index )).s##suffix), brow, dot01 ); \\\n"
"dot02 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow2), index )).s##suffix), brow, dot02 ); \\\n"
"dot03 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow3), index )).s##suffix), brow, dot03 ); \\\n"
"dot04 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow4), index )).s##suffix), brow, dot04 ); \\\n"
"dot05 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow5), index )).s##suffix), brow, dot05 ); \\\n"
"dot06 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow6), index )).s##suffix), brow, dot06 ); \\\n"
"dot07 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow7), index )).s##suffix), brow, dot07 );\n"
"MM_DOT_PRODUCT(0, 0);\n"
"MM_DOT_PRODUCT(0, 1);\n"
"MM_DOT_PRODUCT(1, 0);\n"
"MM_DOT_PRODUCT(1, 1);\n"
"MM_DOT_PRODUCT(2, 0);\n"
"MM_DOT_PRODUCT(2, 1);\n"
"MM_DOT_PRODUCT(3, 0);\n"
"MM_DOT_PRODUCT(3, 1);\n"
"MM_DOT_PRODUCT(4, 0);\n"
"MM_DOT_PRODUCT(4, 1);\n"
"MM_DOT_PRODUCT(5, 0);\n"
"MM_DOT_PRODUCT(5, 1);\n"
"MM_DOT_PRODUCT(6, 0);\n"
"MM_DOT_PRODUCT(6, 1);\n"
"MM_DOT_PRODUCT(7, 0);\n"
"MM_DOT_PRODUCT(7, 1);\n"
"#if TYPE == TYPE_HALF\n"
"MM_DOT_PRODUCT(8, 0);\n"
"MM_DOT_PRODUCT(8, 1);\n"
"MM_DOT_PRODUCT(9, 0);\n"
"MM_DOT_PRODUCT(9, 1);\n"
"MM_DOT_PRODUCT(10, 0);\n"
"MM_DOT_PRODUCT(10, 1);\n"
"MM_DOT_PRODUCT(11, 0);\n"
"MM_DOT_PRODUCT(11, 1);\n"
"MM_DOT_PRODUCT(12, 0);\n"
"MM_DOT_PRODUCT(12, 1);\n"
"MM_DOT_PRODUCT(13, 0);\n"
"MM_DOT_PRODUCT(13, 1);\n"
"MM_DOT_PRODUCT(14, 0);\n"
"MM_DOT_PRODUCT(14, 1);\n"
"MM_DOT_PRODUCT(15, 0);\n"
"MM_DOT_PRODUCT(15, 1);\n"
"#endif\n"
"#undef MM_DOT_PRODUCT\n"
"src0_read += TILE_K;\n"
"w += TILE_K;\n"
"}\n"
"if(w < end_index) {\n"
"arow0.x = ((w + local_x * 2) < K) ? alpha * (src0_read + row0 * K)[0] : 0.0f;\n"
"arow0.y = ((w + local_x * 2 + 1) < K) ? alpha * (src0_read + row0 * K)[1] : 0.0f;\n"
"arow1.x = ((w + local_x * 2) < K) ? alpha * (src0_read + row1 * K)[0] : 0.0f;\n"
"arow1.y = ((w + local_x * 2 + 1) < K) ? alpha * (src0_read + row1 * K)[1] : 0.0f;\n"
"arow2.x = ((w + local_x * 2) < K) ? alpha * (src0_read + row2 * K)[0] : 0.0f;\n"
"arow2.y = ((w + local_x * 2 + 1) < K) ? alpha * (src0_read + row2 * K)[1] : 0.0f;\n"
"arow3.x = ((w + local_x * 2) < K) ? alpha * (src0_read + row3 * K)[0] : 0.0f;\n"
"arow3.y = ((w + local_x * 2 + 1) < K) ? alpha * (src0_read + row3 * K)[1] : 0.0f;\n"
"arow4.x = ((w + local_x * 2) < K) ? alpha * (src0_read + row4 * K)[0] : 0.0f;\n"
"arow4.y = ((w + local_x * 2 + 1) < K) ? alpha * (src0_read + row4 * K)[1] : 0.0f;\n"
"arow5.x = ((w + local_x * 2) < K) ? alpha * (src0_read + row5 * K)[0] : 0.0f;\n"
"arow5.y = ((w + local_x * 2 + 1) < K) ? alpha * (src0_read + row5 * K)[1] : 0.0f;\n"
"arow6.x = ((w + local_x * 2) < K) ? alpha * (src0_read + row6 * K)[0] : 0.0f;\n"
"arow6.y = ((w + local_x * 2 + 1) < K) ? alpha * (src0_read + row6 * K)[1] : 0.0f;\n"
"arow7.x = ((w + local_x * 2) < K) ? alpha * (src0_read + row7 * K)[0] : 0.0f;\n"
"arow7.y = ((w + local_x * 2 + 1) < K) ? alpha * (src0_read + row7 * K)[1] : 0.0f;\n"
"#define MM_DOT_PRODUCT( index, suffix )   \\\n"
"brow = (w < K) ? vload4(0, src1_read0) : (Dtype4)0.0f;  src1_read0 += N; w++; \\\n"
"dot00 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow0), index )).s##suffix), brow, dot00 ); \\\n"
"dot01 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow1), index )).s##suffix), brow, dot01 ); \\\n"
"dot02 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow2), index )).s##suffix), brow, dot02 ); \\\n"
"dot03 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow3), index )).s##suffix), brow, dot03 ); \\\n"
"dot04 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow4), index )).s##suffix), brow, dot04 ); \\\n"
"dot05 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow5), index )).s##suffix), brow, dot05 ); \\\n"
"dot06 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow6), index )).s##suffix), brow, dot06 ); \\\n"
"dot07 = mad( (Dtype4)(as_Dtype2(intel_sub_group_shuffle( SHUFFLE_TYPE2(arow7), index )).s##suffix), brow, dot07 );\n"
"MM_DOT_PRODUCT(0, 0);\n"
"MM_DOT_PRODUCT(0, 1);\n"
"MM_DOT_PRODUCT(1, 0);\n"
"MM_DOT_PRODUCT(1, 1);\n"
"MM_DOT_PRODUCT(2, 0);\n"
"MM_DOT_PRODUCT(2, 1);\n"
"MM_DOT_PRODUCT(3, 0);\n"
"MM_DOT_PRODUCT(3, 1);\n"
"MM_DOT_PRODUCT(4, 0);\n"
"MM_DOT_PRODUCT(4, 1);\n"
"MM_DOT_PRODUCT(5, 0);\n"
"MM_DOT_PRODUCT(5, 1);\n"
"MM_DOT_PRODUCT(6, 0);\n"
"MM_DOT_PRODUCT(6, 1);\n"
"MM_DOT_PRODUCT(7, 0);\n"
"MM_DOT_PRODUCT(7, 1);\n"
"#if TYPE == TYPE_HALF\n"
"MM_DOT_PRODUCT(8, 0);\n"
"MM_DOT_PRODUCT(8, 1);\n"
"MM_DOT_PRODUCT(9, 0);\n"
"MM_DOT_PRODUCT(9, 1);\n"
"MM_DOT_PRODUCT(10, 0);\n"
"MM_DOT_PRODUCT(10, 1);\n"
"MM_DOT_PRODUCT(11, 0);\n"
"MM_DOT_PRODUCT(11, 1);\n"
"MM_DOT_PRODUCT(12, 0);\n"
"MM_DOT_PRODUCT(12, 1);\n"
"MM_DOT_PRODUCT(13, 0);\n"
"MM_DOT_PRODUCT(13, 1);\n"
"MM_DOT_PRODUCT(14, 0);\n"
"MM_DOT_PRODUCT(14, 1);\n"
"MM_DOT_PRODUCT(15, 0);\n"
"MM_DOT_PRODUCT(15, 1);\n"
"#endif\n"
"#undef MM_DOT_PRODUCT\n"
"}\n"
"if(global_x * 4 < N && global_y * 8 < M) {\n"
"if(mad24(global_x, 4, 3) < N) {\n"
"vstore4(dot00, 0, dst_write0); dst_write0 += N;\n"
"if(mad24(global_y, 8, 1) < M) { vstore4(dot01, 0, dst_write0); dst_write0 += N; }\n"
"else return;\n"
"if(mad24(global_y, 8, 2) < M) { vstore4(dot02, 0, dst_write0); dst_write0 += N; }\n"
"else return;\n"
"if(mad24(global_y, 8, 3) < M) { vstore4(dot03, 0, dst_write0); dst_write0 += N; }\n"
"else return;\n"
"if(mad24(global_y, 8, 4) < M) { vstore4(dot04, 0, dst_write0); dst_write0 += N; }\n"
"else return;\n"
"if(mad24(global_y, 8, 5) < M) { vstore4(dot05, 0, dst_write0); dst_write0 += N; }\n"
"else return;\n"
"if(mad24(global_y, 8, 6) < M) { vstore4(dot06, 0, dst_write0); dst_write0 += N; }\n"
"else return;\n"
"if(mad24(global_y, 8, 7) < M) { vstore4(dot07, 0, dst_write0); }\n"
"} else if(mad24(global_x, 4, 2) < N) {\n"
"vstore2(dot00.xy, 0, dst_write0);\n"
"dst_write0[2] = dot00.z;\n"
"dst_write0 += N;\n"
"if(mad24(global_y, 8, 1) < M) {\n"
"vstore2(dot01.xy, 0, dst_write0);\n"
"dst_write0[2] = dot01.z;\n"
"dst_write0 += N;\n"
"} else\n"
"return;\n"
"if(mad24(global_y, 8, 2) < M) {\n"
"vstore2(dot02.xy, 0, dst_write0);\n"
"dst_write0[2] = dot02.z;\n"
"dst_write0 += N;\n"
"} else\n"
"return;\n"
"if(mad24(global_y, 8, 3) < M) {\n"
"vstore2(dot03.xy, 0, dst_write0);\n"
"dst_write0[2] = dot03.z;\n"
"dst_write0 += N;\n"
"} else\n"
"return;\n"
"if(mad24(global_y, 8, 4) < M) {\n"
"vstore2(dot04.xy, 0, dst_write0);\n"
"dst_write0[2] = dot04.z;\n"
"dst_write0 += N;\n"
"} else\n"
"return;\n"
"if(mad24(global_y, 8, 5) < M) {\n"
"vstore2(dot05.xy, 0, dst_write0);\n"
"dst_write0[2] = dot05.z;\n"
"dst_write0 += N;\n"
"} else\n"
"return;\n"
"if(mad24(global_y, 8, 6) < M) {\n"
"vstore2(dot06.xy, 0, dst_write0);\n"
"dst_write0[2] = dot06.z;\n"
"dst_write0 += N;\n"
"} else\n"
"return;\n"
"if(mad24(global_y, 8, 7) < M) {\n"
"vstore2(dot07.xy, 0, dst_write0);\n"
"dst_write0[2] = dot07.z;\n"
"}\n"
"} else if(mad24(global_x, 4, 1) < N) {\n"
"vstore2(dot00.xy, 0, dst_write0); dst_write0 += N;\n"
"if(mad24(global_y, 8, 1) < M) { vstore2(dot01.xy, 0, dst_write0); dst_write0 += N; }\n"
"else return;\n"
"if(mad24(global_y, 8, 2) < M) { vstore2(dot02.xy, 0, dst_write0); dst_write0 += N; }\n"
"else return;\n"
"if(mad24(global_y, 8, 3) < M) { vstore2(dot03.xy, 0, dst_write0); dst_write0 += N; }\n"
"else return;\n"
"if(mad24(global_y, 8, 4) < M) { vstore2(dot04.xy, 0, dst_write0); dst_write0 += N; }\n"
"else return;\n"
"if(mad24(global_y, 8, 5) < M) { vstore2(dot05.xy, 0, dst_write0); dst_write0 += N; }\n"
"else return;\n"
"if(mad24(global_y, 8, 6) < M) { vstore2(dot06.xy, 0, dst_write0); dst_write0 += N; }\n"
"else return;\n"
"if(mad24(global_y, 8, 7) < M) { vstore2(dot07.xy, 0, dst_write0); }\n"
"} else {\n"
"dst_write0[0] = dot00.x; dst_write0 += N;\n"
"if(mad24(global_y, 8, 1) < M) { dst_write0[0] = dot01.x; dst_write0 += N; }\n"
"else return;\n"
"if(mad24(global_y, 8, 2) < M) { dst_write0[0] = dot02.x; dst_write0 += N; }\n"
"else return;\n"
"if(mad24(global_y, 8, 3) < M) { dst_write0[0] = dot03.x; dst_write0 += N; }\n"
"else return;\n"
"if(mad24(global_y, 8, 4) < M) { dst_write0[0] = dot04.x; dst_write0 += N; }\n"
"else return;\n"
"if(mad24(global_y, 8, 5) < M) { dst_write0[0] = dot05.x; dst_write0 += N; }\n"
"else return;\n"
"if(mad24(global_y, 8, 6) < M) { dst_write0[0] = dot06.x; dst_write0 += N; }\n"
"else return;\n"
"if(mad24(global_y, 8, 7) < M) { dst_write0[0] = dot07.x; }\n"
"}\n"
"}\n"
"}\n"
"#undef VEC_SIZE\n"
"#undef LWG_HEIGHT\n"
"#undef TILE_M\n"
"#undef TILE_K\n"
"#undef TILE_N\n"
"#define VEC_SIZE        1\n"
"#define TILE_M          8\n"
"#define TILE_N          8\n"
"#define SLM_BLOCK       128\n"
"#if TYPE == TYPE_HALF\n"
"#define LWG_HEIGHT      2\n"
"#define TILE_K          64\n"
"#else\n"
"#define LWG_HEIGHT      4\n"
"#define TILE_K          32\n"
"#endif\n"
// gemm_buffer_NT, half-precision build (compiled when TYPE == TYPE_HALF).
//
// Indexing in the body shows the layout: src0 rows have stride K, src1 rows
// have stride K, dst rows have stride N. Each work-item accumulates, for 8
// consecutive src0 rows, the products against 8 consecutive src1 rows
// (group_x * TILE_N + 0..7) over its own slice of K, so that after the
// sub-group reduction
//   dst[row * N + col] gets alpha * sum_k src0[row * K + k] * src1[col * K + k]
// combined with beta * dst through BETA_ZERO_CHECK.
// NOTE(review): BETA_ZERO_CHECK, SHUFFLE_TYPE8, as_Dtype8, Dtype and Dtype8
// are defined outside this chunk; presumably BETA_ZERO_CHECK picks its first
// argument when beta is zero - confirm against the macro definition.
//
// Arguments: src0/off0, src1/off1, dst/offd are buffers with element offsets;
// M, N, K are the bounds used in the guards below; alpha_in / beta_in are
// converted to Dtype on entry.
"#if TYPE == TYPE_HALF\n"
"__attribute__((reqd_work_group_size(8, LWG_HEIGHT, 1)))\n"
"__attribute__((intel_reqd_sub_group_size(8)))\n"
"__kernel void TEMPLATE(gemm_buffer_NT, Dtype)(\n"
"const __global Dtype *src0, int off0,\n"
"const __global Dtype *src1, int off1,\n"
"__global Dtype *dst, int offd,\n"
"int M,\n"
"int N,\n"
"int K,\n"
"KERNEL_ARG_DTYPE alpha_in,\n"
"KERNEL_ARG_DTYPE beta_in)\n"
"{\n"
"const Dtype alpha = (Dtype)alpha_in;\n"
"const Dtype beta = (Dtype)beta_in;\n"
"const int group_x = get_group_id(0);\n"
"const int group_y = get_group_id(1);\n"
"const int local_x = get_local_id(0);\n"
"const int local_y = get_local_id(1);\n"
"const int global_x = get_global_id(0);\n"
"const int global_y = get_global_id(1);\n"
// dot0r accumulates src0 row r of this work-item; lane c of each vector
// belongs to src1 row c of the current group_x tile.
"Dtype8 dot00 = 0.f;\n"
"Dtype8 dot01 = 0.f;\n"
"Dtype8 dot02 = 0.f;\n"
"Dtype8 dot03 = 0.f;\n"
"Dtype8 dot04 = 0.f;\n"
"Dtype8 dot05 = 0.f;\n"
"Dtype8 dot06 = 0.f;\n"
"Dtype8 dot07 = 0.f;\n"
// browN holds 8 consecutive k-values of src1 row N, read back from slm_brow.
"Dtype8 brow0;\n"
"Dtype8 brow1;\n"
"Dtype8 brow2;\n"
"Dtype8 brow3;\n"
"Dtype8 brow4;\n"
"Dtype8 brow5;\n"
"Dtype8 brow6;\n"
"Dtype8 brow7;\n"
// Output column is local_x + group_x * TILE_N; the first output row is
// (group_y * LWG_HEIGHT + local_y) * TILE_M.
"__global Dtype *dst_write0 = dst + local_x * VEC_SIZE + (group_x * TILE_N) + (group_y * LWG_HEIGHT * TILE_M + local_y * TILE_M) * N + offd;\n"
// Each of the 8 work-items along x starts TILE_K / 8 elements further into K.
"const __global Dtype *src0_read = src0 + local_x * (TILE_K / 8) + (group_y * LWG_HEIGHT * TILE_M + local_y * TILE_M) * K + off0;\n"
"const __global Dtype *src1_read0 = src1 + (group_x * TILE_N) * K + off1;\n"
// Local staging buffer: 8 src1 rows of SLM_BLOCK elements each.
"__local Dtype slm_brow[8 * SLM_BLOCK];\n"
"__local Dtype* slm_brow0;\n"
// Element offset of the 8-half chunk this work-item copies in every row.
"int local_index = mad24(local_y, 8, local_x) * 8;\n"
"int w;\n"
"for(int b_tile = 0; b_tile < K; b_tile += SLM_BLOCK) {\n"
// First barrier: nobody overwrites slm_brow while the previous block is
// still being read. The float4 load/store moves 8 halves per row per item.
// NOTE(review): these copies are not guarded against K or N - confirm the
// host guarantees the source range is readable.
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"vstore4(vload4(0, (__global float *)(src1_read0 + mad24(0, K, local_index))), 0, (__local float *)(slm_brow + mad24(0, SLM_BLOCK, local_index)));\n"
"vstore4(vload4(0, (__global float *)(src1_read0 + mad24(1, K, local_index))), 0, (__local float *)(slm_brow + mad24(1, SLM_BLOCK, local_index)));\n"
"vstore4(vload4(0, (__global float *)(src1_read0 + mad24(2, K, local_index))), 0, (__local float *)(slm_brow + mad24(2, SLM_BLOCK, local_index)));\n"
"vstore4(vload4(0, (__global float *)(src1_read0 + mad24(3, K, local_index))), 0, (__local float *)(slm_brow + mad24(3, SLM_BLOCK, local_index)));\n"
"vstore4(vload4(0, (__global float *)(src1_read0 + mad24(4, K, local_index))), 0, (__local float *)(slm_brow + mad24(4, SLM_BLOCK, local_index)));\n"
"vstore4(vload4(0, (__global float *)(src1_read0 + mad24(5, K, local_index))), 0, (__local float *)(slm_brow + mad24(5, SLM_BLOCK, local_index)));\n"
"vstore4(vload4(0, (__global float *)(src1_read0 + mad24(6, K, local_index))), 0, (__local float *)(slm_brow + mad24(6, SLM_BLOCK, local_index)));\n"
"vstore4(vload4(0, (__global float *)(src1_read0 + mad24(7, K, local_index))), 0, (__local float *)(slm_brow + mad24(7, SLM_BLOCK, local_index)));\n"
// Second barrier: the staged block is complete before anyone reads it.
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"slm_brow0 = slm_brow + local_x * (TILE_K / 8);\n"
"w = b_tile;\n"
"int end_w = min(b_tile + SLM_BLOCK, K);\n"
// Full TILE_K steps inside the staged block; no per-element K check needed.
"while( w + TILE_K <= end_w ) {\n"
"Dtype8 arow;\n"
"brow0 = as_half8(vload4(0, (__local float *)(slm_brow0 + 0 * SLM_BLOCK)));\n"
"brow1 = as_half8(vload4(0, (__local float *)(slm_brow0 + 1 * SLM_BLOCK)));\n"
"brow2 = as_half8(vload4(0, (__local float *)(slm_brow0 + 2 * SLM_BLOCK)));\n"
"brow3 = as_half8(vload4(0, (__local float *)(slm_brow0 + 3 * SLM_BLOCK)));\n"
"brow4 = as_half8(vload4(0, (__local float *)(slm_brow0 + 4 * SLM_BLOCK)));\n"
"brow5 = as_half8(vload4(0, (__local float *)(slm_brow0 + 5 * SLM_BLOCK)));\n"
"brow6 = as_half8(vload4(0, (__local float *)(slm_brow0 + 6 * SLM_BLOCK)));\n"
"brow7 = as_half8(vload4(0, (__local float *)(slm_brow0 + 7 * SLM_BLOCK)));\n"
// MM_DOT_PRODUCT(_row, _dot): load 8 k-values of src0 row _row and, for each
// of them, multiply-add against the matching k-value of all 8 staged rows.
"#define MM_DOT_PRODUCT( _row, _dot )   \\\n"
"arow = as_half8(vload4(0, (__global float *)(src0_read + _row * K)));                           \\\n"
"_dot = mad( (Dtype8)(arow.s0), (Dtype8)(brow0.s0, brow1.s0, brow2.s0, brow3.s0, brow4.s0, brow5.s0, brow6.s0, brow7.s0), _dot ); \\\n"
"_dot = mad( (Dtype8)(arow.s1), (Dtype8)(brow0.s1, brow1.s1, brow2.s1, brow3.s1, brow4.s1, brow5.s1, brow6.s1, brow7.s1), _dot ); \\\n"
"_dot = mad( (Dtype8)(arow.s2), (Dtype8)(brow0.s2, brow1.s2, brow2.s2, brow3.s2, brow4.s2, brow5.s2, brow6.s2, brow7.s2), _dot ); \\\n"
"_dot = mad( (Dtype8)(arow.s3), (Dtype8)(brow0.s3, brow1.s3, brow2.s3, brow3.s3, brow4.s3, brow5.s3, brow6.s3, brow7.s3), _dot ); \\\n"
"_dot = mad( (Dtype8)(arow.s4), (Dtype8)(brow0.s4, brow1.s4, brow2.s4, brow3.s4, brow4.s4, brow5.s4, brow6.s4, brow7.s4), _dot ); \\\n"
"_dot = mad( (Dtype8)(arow.s5), (Dtype8)(brow0.s5, brow1.s5, brow2.s5, brow3.s5, brow4.s5, brow5.s5, brow6.s5, brow7.s5), _dot ); \\\n"
"_dot = mad( (Dtype8)(arow.s6), (Dtype8)(brow0.s6, brow1.s6, brow2.s6, brow3.s6, brow4.s6, brow5.s6, brow6.s6, brow7.s6), _dot ); \\\n"
"_dot = mad( (Dtype8)(arow.s7), (Dtype8)(brow0.s7, brow1.s7, brow2.s7, brow3.s7, brow4.s7, brow5.s7, brow6.s7, brow7.s7), _dot );\n"
"MM_DOT_PRODUCT( 0, dot00 );\n"
"MM_DOT_PRODUCT( 1, dot01 );\n"
"MM_DOT_PRODUCT( 2, dot02 );\n"
"MM_DOT_PRODUCT( 3, dot03 );\n"
"MM_DOT_PRODUCT( 4, dot04 );\n"
"MM_DOT_PRODUCT( 5, dot05 );\n"
"MM_DOT_PRODUCT( 6, dot06 );\n"
"MM_DOT_PRODUCT( 7, dot07 );\n"
"#undef MM_DOT_PRODUCT\n"
"src0_read += TILE_K;\n"
"slm_brow0 += TILE_K;\n"
"w += TILE_K;\n"
"}\n"
"src1_read0 += SLM_BLOCK;\n"
"}\n"
// Remainder of K that did not fill a whole TILE_K step: same computation,
// but every element whose k index mad24(local_x, 8, w + e) is not below K is
// replaced by zero in both operands.
"if(w < K) {\n"
"Dtype8 arow;\n"
"#define READ_BROW(_brow, _row) \\\n"
"_brow = as_half8(vload4(0, (__local float *)(slm_brow0 + _row * SLM_BLOCK))); \\\n"
"_brow.s0 = (mad24(local_x, 8, w) < K) ? _brow.s0 : 0.0f; \\\n"
"_brow.s1 = (mad24(local_x, 8, w + 1) < K) ? _brow.s1 : 0.0f; \\\n"
"_brow.s2 = (mad24(local_x, 8, w + 2) < K) ? _brow.s2 : 0.0f; \\\n"
"_brow.s3 = (mad24(local_x, 8, w + 3) < K) ? _brow.s3 : 0.0f; \\\n"
"_brow.s4 = (mad24(local_x, 8, w + 4) < K) ? _brow.s4 : 0.0f; \\\n"
"_brow.s5 = (mad24(local_x, 8, w + 5) < K) ? _brow.s5 : 0.0f; \\\n"
"_brow.s6 = (mad24(local_x, 8, w + 6) < K) ? _brow.s6 : 0.0f; \\\n"
"_brow.s7 = (mad24(local_x, 8, w + 7) < K) ? _brow.s7 : 0.0f;\n"
"READ_BROW(brow0, 0);\n"
"READ_BROW(brow1, 1);\n"
"READ_BROW(brow2, 2);\n"
"READ_BROW(brow3, 3);\n"
"READ_BROW(brow4, 4);\n"
"READ_BROW(brow5, 5);\n"
"READ_BROW(brow6, 6);\n"
"READ_BROW(brow7, 7);\n"
"#undef READ_BROW\n"
"#define MM_DOT_PRODUCT( _row, _dot )   \\\n"
"arow = as_half8(vload4(0, (__global float *)(src0_read + _row * K)));                           \\\n"
"arow.s0 = (mad24(local_x, 8, w) < K) ? arow.s0 : 0.0f; \\\n"
"arow.s1 = (mad24(local_x, 8, w + 1) < K) ? arow.s1 : 0.0f; \\\n"
"arow.s2 = (mad24(local_x, 8, w + 2) < K) ? arow.s2 : 0.0f; \\\n"
"arow.s3 = (mad24(local_x, 8, w + 3) < K) ? arow.s3 : 0.0f; \\\n"
"arow.s4 = (mad24(local_x, 8, w + 4) < K) ? arow.s4 : 0.0f; \\\n"
"arow.s5 = (mad24(local_x, 8, w + 5) < K) ? arow.s5 : 0.0f; \\\n"
"arow.s6 = (mad24(local_x, 8, w + 6) < K) ? arow.s6 : 0.0f; \\\n"
"arow.s7 = (mad24(local_x, 8, w + 7) < K) ? arow.s7 : 0.0f; \\\n"
"_dot = mad( (Dtype8)(arow.s0), (Dtype8)(brow0.s0, brow1.s0, brow2.s0, brow3.s0, brow4.s0, brow5.s0, brow6.s0, brow7.s0), _dot ); \\\n"
"_dot = mad( (Dtype8)(arow.s1), (Dtype8)(brow0.s1, brow1.s1, brow2.s1, brow3.s1, brow4.s1, brow5.s1, brow6.s1, brow7.s1), _dot ); \\\n"
"_dot = mad( (Dtype8)(arow.s2), (Dtype8)(brow0.s2, brow1.s2, brow2.s2, brow3.s2, brow4.s2, brow5.s2, brow6.s2, brow7.s2), _dot ); \\\n"
"_dot = mad( (Dtype8)(arow.s3), (Dtype8)(brow0.s3, brow1.s3, brow2.s3, brow3.s3, brow4.s3, brow5.s3, brow6.s3, brow7.s3), _dot ); \\\n"
"_dot = mad( (Dtype8)(arow.s4), (Dtype8)(brow0.s4, brow1.s4, brow2.s4, brow3.s4, brow4.s4, brow5.s4, brow6.s4, brow7.s4), _dot ); \\\n"
"_dot = mad( (Dtype8)(arow.s5), (Dtype8)(brow0.s5, brow1.s5, brow2.s5, brow3.s5, brow4.s5, brow5.s5, brow6.s5, brow7.s5), _dot ); \\\n"
"_dot = mad( (Dtype8)(arow.s6), (Dtype8)(brow0.s6, brow1.s6, brow2.s6, brow3.s6, brow4.s6, brow5.s6, brow6.s6, brow7.s6), _dot ); \\\n"
"_dot = mad( (Dtype8)(arow.s7), (Dtype8)(brow0.s7, brow1.s7, brow2.s7, brow3.s7, brow4.s7, brow5.s7, brow6.s7, brow7.s7), _dot );\n"
"MM_DOT_PRODUCT( 0, dot00 );\n"
"MM_DOT_PRODUCT( 1, dot01 );\n"
"MM_DOT_PRODUCT( 2, dot02 );\n"
"MM_DOT_PRODUCT( 3, dot03 );\n"
"MM_DOT_PRODUCT( 4, dot04 );\n"
"MM_DOT_PRODUCT( 5, dot05 );\n"
"MM_DOT_PRODUCT( 6, dot06 );\n"
"MM_DOT_PRODUCT( 7, dot07 );\n"
"#undef MM_DOT_PRODUCT\n"
"}\n"
// REDUCE: replace _dot by the sum of the _dot values held by sub-group
// lanes 0..7, so every lane ends up with the total over the K slices.
"#define REDUCE(_dot) \\\n"
"_dot = as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 0)) + as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 1)) + as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 2)) + as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 3)) +  \\\n"
"as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 4)) + as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 5)) + as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 6)) + as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 7));\n"
"REDUCE(dot00);\n"
"REDUCE(dot01);\n"
"REDUCE(dot02);\n"
"REDUCE(dot03);\n"
"REDUCE(dot04);\n"
"REDUCE(dot05);\n"
"REDUCE(dot06);\n"
"REDUCE(dot07);\n"
"#undef REDUCE\n"
// OUTPUT: pick component local_x of the reduced vector (the column this
// work-item owns), store it through BETA_ZERO_CHECK, then step dst_write0
// to the next output row. The macro expands to several statements, so it
// is only used inside braces below.
"Dtype output = 0.0f;\n"
"#define OUTPUT( _dot) \\\n"
"output = (local_x == 0) ? _dot.s0 : output; \\\n"
"output = (local_x == 1) ? _dot.s1 : output; \\\n"
"output = (local_x == 2) ? _dot.s2 : output; \\\n"
"output = (local_x == 3) ? _dot.s3 : output; \\\n"
"output = (local_x == 4) ? _dot.s4 : output; \\\n"
"output = (local_x == 5) ? _dot.s5 : output; \\\n"
"output = (local_x == 6) ? _dot.s6 : output; \\\n"
"output = (local_x == 7) ? _dot.s7 : output; \\\n"
"dst_write0[0] = BETA_ZERO_CHECK(alpha * output, mad(output, alpha, beta * dst_write0[0])); \\\n"
"dst_write0 += N;\n"
// Writes are guarded: column below N, and each of the 8 rows below M.
"if(global_x < N && global_y * 8 < M) {\n"
"OUTPUT(dot00);\n"
"if(mad24(global_y, 8, 1) < M) { OUTPUT(dot01); }\n"
"if(mad24(global_y, 8, 2) < M) { OUTPUT(dot02); }\n"
"if(mad24(global_y, 8, 3) < M) { OUTPUT(dot03); }\n"
"if(mad24(global_y, 8, 4) < M) { OUTPUT(dot04); }\n"
"if(mad24(global_y, 8, 5) < M) { OUTPUT(dot05); }\n"
"if(mad24(global_y, 8, 6) < M) { OUTPUT(dot06); }\n"
"if(mad24(global_y, 8, 7) < M) { OUTPUT(dot07); }\n"
"}\n"
"#undef OUTPUT\n"
"}\n"
// gemm_buffer_NT, build used when TYPE is not TYPE_HALF.
//
// Same scheme as the half-precision branch above the #else, but each
// work-item handles 4 consecutive k-values per step (Dtype4 operands, plain
// vload4 / vstore4 on Dtype pointers) instead of 8.
// Indexing shows src0 and src1 with row stride K and dst with row stride N;
// after the sub-group reduction
//   dst[row * N + col] gets alpha * sum_k src0[row * K + k] * src1[col * K + k]
// combined with beta * dst through BETA_ZERO_CHECK.
// NOTE(review): BETA_ZERO_CHECK, SHUFFLE_TYPE8, as_Dtype8, Dtype, Dtype4 and
// Dtype8 are defined outside this chunk; presumably BETA_ZERO_CHECK picks its
// first argument when beta is zero - confirm against the macro definition.
"#else\n"
"__attribute__((reqd_work_group_size(8, LWG_HEIGHT, 1)))\n"
"__attribute__((intel_reqd_sub_group_size(8)))\n"
"__kernel void TEMPLATE(gemm_buffer_NT, Dtype)(\n"
"const __global Dtype *src0, int off0,\n"
"const __global Dtype *src1, int off1,\n"
"__global Dtype *dst, int offd,\n"
"int M,\n"
"int N,\n"
"int K,\n"
"KERNEL_ARG_DTYPE alpha_in,\n"
"KERNEL_ARG_DTYPE beta_in)\n"
"{\n"
"const Dtype alpha = (Dtype)alpha_in;\n"
"const Dtype beta = (Dtype)beta_in;\n"
"const int group_x = get_group_id(0);\n"
"const int group_y = get_group_id(1);\n"
"const int local_x = get_local_id(0);\n"
"const int local_y = get_local_id(1);\n"
"const int global_x = get_global_id(0);\n"
"const int global_y = get_global_id(1);\n"
// dot0r accumulates src0 row r of this work-item; lane c of each vector
// belongs to src1 row c of the current group_x tile.
"Dtype8 dot00 = 0.f;\n"
"Dtype8 dot01 = 0.f;\n"
"Dtype8 dot02 = 0.f;\n"
"Dtype8 dot03 = 0.f;\n"
"Dtype8 dot04 = 0.f;\n"
"Dtype8 dot05 = 0.f;\n"
"Dtype8 dot06 = 0.f;\n"
"Dtype8 dot07 = 0.f;\n"
// browN holds 4 consecutive k-values of src1 row N, read back from slm_brow.
"Dtype4 brow0;\n"
"Dtype4 brow1;\n"
"Dtype4 brow2;\n"
"Dtype4 brow3;\n"
"Dtype4 brow4;\n"
"Dtype4 brow5;\n"
"Dtype4 brow6;\n"
"Dtype4 brow7;\n"
// Output column is local_x + group_x * TILE_N; the first output row is
// (group_y * LWG_HEIGHT + local_y) * TILE_M.
"__global Dtype *dst_write0 = dst + local_x * VEC_SIZE + (group_x * TILE_N) + (group_y * LWG_HEIGHT * TILE_M + local_y * TILE_M) * N + offd;\n"
// Each of the 8 work-items along x starts TILE_K / 8 elements further into K.
"const __global Dtype *src0_read = src0 + local_x * (TILE_K / 8) + (group_y * LWG_HEIGHT * TILE_M + local_y * TILE_M) * K + off0;\n"
"const __global Dtype *src1_read0 = src1 + (group_x * TILE_N) * K + off1;\n"
// Local staging buffer: 8 src1 rows of SLM_BLOCK elements each.
"__local Dtype slm_brow[8 * SLM_BLOCK];\n"
"__local Dtype* slm_brow0;\n"
// Element offset of the 4-element chunk this work-item copies in every row.
"int local_index = mad24(local_y, 8, local_x) * 4;\n"
"int w;\n"
"for(int b_tile = 0; b_tile < K; b_tile += SLM_BLOCK) {\n"
// First barrier: nobody overwrites slm_brow while the previous block is
// still being read.
// NOTE(review): these copies are not guarded against K or N - confirm the
// host guarantees the source range is readable.
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"vstore4(vload4(0, src1_read0 + mad24(0, K, local_index)), 0, slm_brow + mad24(0, SLM_BLOCK, local_index));\n"
"vstore4(vload4(0, src1_read0 + mad24(1, K, local_index)), 0, slm_brow + mad24(1, SLM_BLOCK, local_index));\n"
"vstore4(vload4(0, src1_read0 + mad24(2, K, local_index)), 0, slm_brow + mad24(2, SLM_BLOCK, local_index));\n"
"vstore4(vload4(0, src1_read0 + mad24(3, K, local_index)), 0, slm_brow + mad24(3, SLM_BLOCK, local_index));\n"
"vstore4(vload4(0, src1_read0 + mad24(4, K, local_index)), 0, slm_brow + mad24(4, SLM_BLOCK, local_index));\n"
"vstore4(vload4(0, src1_read0 + mad24(5, K, local_index)), 0, slm_brow + mad24(5, SLM_BLOCK, local_index));\n"
"vstore4(vload4(0, src1_read0 + mad24(6, K, local_index)), 0, slm_brow + mad24(6, SLM_BLOCK, local_index));\n"
"vstore4(vload4(0, src1_read0 + mad24(7, K, local_index)), 0, slm_brow + mad24(7, SLM_BLOCK, local_index));\n"
// Second barrier: the staged block is complete before anyone reads it.
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"slm_brow0 = slm_brow + local_x * (TILE_K / 8);\n"
"w = b_tile;\n"
"int end_w = min(b_tile + SLM_BLOCK, K);\n"
// Full TILE_K steps inside the staged block; no per-element K check needed.
"while( w + TILE_K <= end_w ) {\n"
"Dtype4 arow;\n"
"brow0 = vload4(0, slm_brow0 + 0 * SLM_BLOCK);\n"
"brow1 = vload4(0, slm_brow0 + 1 * SLM_BLOCK);\n"
"brow2 = vload4(0, slm_brow0 + 2 * SLM_BLOCK);\n"
"brow3 = vload4(0, slm_brow0 + 3 * SLM_BLOCK);\n"
"brow4 = vload4(0, slm_brow0 + 4 * SLM_BLOCK);\n"
"brow5 = vload4(0, slm_brow0 + 5 * SLM_BLOCK);\n"
"brow6 = vload4(0, slm_brow0 + 6 * SLM_BLOCK);\n"
"brow7 = vload4(0, slm_brow0 + 7 * SLM_BLOCK);\n"
// MM_DOT_PRODUCT(_row, _dot): load 4 k-values of src0 row _row and, for each
// of them, multiply-add against the matching k-value of all 8 staged rows.
"#define MM_DOT_PRODUCT( _row, _dot )   \\\n"
"arow = vload4(0, src0_read + _row * K);                           \\\n"
"_dot = mad( (Dtype8)(arow.x), (Dtype8)(brow0.x, brow1.x, brow2.x, brow3.x, brow4.x, brow5.x, brow6.x, brow7.x), _dot ); \\\n"
"_dot = mad( (Dtype8)(arow.y), (Dtype8)(brow0.y, brow1.y, brow2.y, brow3.y, brow4.y, brow5.y, brow6.y, brow7.y), _dot ); \\\n"
"_dot = mad( (Dtype8)(arow.z), (Dtype8)(brow0.z, brow1.z, brow2.z, brow3.z, brow4.z, brow5.z, brow6.z, brow7.z), _dot ); \\\n"
"_dot = mad( (Dtype8)(arow.w), (Dtype8)(brow0.w, brow1.w, brow2.w, brow3.w, brow4.w, brow5.w, brow6.w, brow7.w), _dot );\n"
"MM_DOT_PRODUCT( 0, dot00 );\n"
"MM_DOT_PRODUCT( 1, dot01 );\n"
"MM_DOT_PRODUCT( 2, dot02 );\n"
"MM_DOT_PRODUCT( 3, dot03 );\n"
"MM_DOT_PRODUCT( 4, dot04 );\n"
"MM_DOT_PRODUCT( 5, dot05 );\n"
"MM_DOT_PRODUCT( 6, dot06 );\n"
"MM_DOT_PRODUCT( 7, dot07 );\n"
"#undef MM_DOT_PRODUCT\n"
"src0_read += TILE_K;\n"
"slm_brow0 += TILE_K;\n"
"w += TILE_K;\n"
"}\n"
"src1_read0 += SLM_BLOCK;\n"
"}\n"
// Remainder of K that did not fill a whole TILE_K step: same computation,
// but every element whose k index mad24(local_x, 4, w + e) is not below K is
// replaced by zero in both operands.
"if(w < K) {\n"
"Dtype4 arow;\n"
"#define READ_BROW(_brow, _row) \\\n"
"_brow = vload4(0, slm_brow0 + _row * SLM_BLOCK); \\\n"
"_brow.x = (mad24(local_x, 4, w) < K) ? _brow.x : 0.0f; \\\n"
"_brow.y = (mad24(local_x, 4, w + 1) < K) ? _brow.y : 0.0f; \\\n"
"_brow.z = (mad24(local_x, 4, w + 2) < K) ? _brow.z : 0.0f; \\\n"
"_brow.w = (mad24(local_x, 4, w + 3) < K) ? _brow.w : 0.0f;\n"
"READ_BROW(brow0, 0);\n"
"READ_BROW(brow1, 1);\n"
"READ_BROW(brow2, 2);\n"
"READ_BROW(brow3, 3);\n"
"READ_BROW(brow4, 4);\n"
"READ_BROW(brow5, 5);\n"
"READ_BROW(brow6, 6);\n"
"READ_BROW(brow7, 7);\n"
"#undef READ_BROW\n"
"#define MM_DOT_PRODUCT( _row, _dot )   \\\n"
"arow = vload4(0, src0_read + _row * K);                           \\\n"
"arow.x = (mad24(local_x, 4, w) < K) ? arow.x : 0.0f; \\\n"
"arow.y = (mad24(local_x, 4, w + 1) < K) ? arow.y : 0.0f; \\\n"
"arow.z = (mad24(local_x, 4, w + 2) < K) ? arow.z : 0.0f; \\\n"
"arow.w = (mad24(local_x, 4, w + 3) < K) ? arow.w : 0.0f; \\\n"
"_dot = mad( (Dtype8)(arow.x), (Dtype8)(brow0.x, brow1.x, brow2.x, brow3.x, brow4.x, brow5.x, brow6.x, brow7.x), _dot ); \\\n"
"_dot = mad( (Dtype8)(arow.y), (Dtype8)(brow0.y, brow1.y, brow2.y, brow3.y, brow4.y, brow5.y, brow6.y, brow7.y), _dot ); \\\n"
"_dot = mad( (Dtype8)(arow.z), (Dtype8)(brow0.z, brow1.z, brow2.z, brow3.z, brow4.z, brow5.z, brow6.z, brow7.z), _dot ); \\\n"
"_dot = mad( (Dtype8)(arow.w), (Dtype8)(brow0.w, brow1.w, brow2.w, brow3.w, brow4.w, brow5.w, brow6.w, brow7.w), _dot );\n"
"MM_DOT_PRODUCT( 0, dot00 );\n"
"MM_DOT_PRODUCT( 1, dot01 );\n"
"MM_DOT_PRODUCT( 2, dot02 );\n"
"MM_DOT_PRODUCT( 3, dot03 );\n"
"MM_DOT_PRODUCT( 4, dot04 );\n"
"MM_DOT_PRODUCT( 5, dot05 );\n"
"MM_DOT_PRODUCT( 6, dot06 );\n"
"MM_DOT_PRODUCT( 7, dot07 );\n"
"#undef MM_DOT_PRODUCT\n"
"}\n"
// REDUCE: replace _dot by the sum of the _dot values held by sub-group
// lanes 0..7, so every lane ends up with the total over the K slices.
"#define REDUCE(_dot) \\\n"
"_dot = as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 0)) + as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 1)) + as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 2)) + as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 3)) +  \\\n"
"as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 4)) + as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 5)) + as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 6)) + as_Dtype8(intel_sub_group_shuffle(SHUFFLE_TYPE8(_dot), 7));\n"
"REDUCE(dot00);\n"
"REDUCE(dot01);\n"
"REDUCE(dot02);\n"
"REDUCE(dot03);\n"
"REDUCE(dot04);\n"
"REDUCE(dot05);\n"
"REDUCE(dot06);\n"
"REDUCE(dot07);\n"
"#undef REDUCE\n"
// OUTPUT: pick component local_x of the reduced vector (the column this
// work-item owns), store it through BETA_ZERO_CHECK, then step dst_write0
// to the next output row. The macro expands to several statements, so it
// is only used inside braces below.
"Dtype output = 0.0f;\n"
"#define OUTPUT( _dot) \\\n"
"output = (local_x == 0) ? _dot.s0 : output; \\\n"
"output = (local_x == 1) ? _dot.s1 : output; \\\n"
"output = (local_x == 2) ? _dot.s2 : output; \\\n"
"output = (local_x == 3) ? _dot.s3 : output; \\\n"
"output = (local_x == 4) ? _dot.s4 : output; \\\n"
"output = (local_x == 5) ? _dot.s5 : output; \\\n"
"output = (local_x == 6) ? _dot.s6 : output; \\\n"
"output = (local_x == 7) ? _dot.s7 : output; \\\n"
"dst_write0[0] = BETA_ZERO_CHECK(alpha * output, mad(output, alpha, beta * dst_write0[0])); \\\n"
"dst_write0 += N;\n"
// Writes are guarded: column below N, and each of the 8 rows below M.
"if(global_x < N && global_y * 8 < M) {\n"
"OUTPUT(dot00);\n"
"if(mad24(global_y, 8, 1) < M) { OUTPUT(dot01); }\n"
"if(mad24(global_y, 8, 2) < M) { OUTPUT(dot02); }\n"
"if(mad24(global_y, 8, 3) < M) { OUTPUT(dot03); }\n"
"if(mad24(global_y, 8, 4) < M) { OUTPUT(dot04); }\n"
"if(mad24(global_y, 8, 5) < M) { OUTPUT(dot05); }\n"
"if(mad24(global_y, 8, 6) < M) { OUTPUT(dot06); }\n"
"if(mad24(global_y, 8, 7) < M) { OUTPUT(dot07); }\n"
"}\n"
"#undef OUTPUT\n"
"}\n"
"#endif\n"
// Drop the gemm_buffer_NT tile macros so later kernels cannot pick them up.
"#undef VEC_SIZE\n"
"#undef LWG_HEIGHT\n"
"#undef TILE_M\n"
"#undef TILE_K\n"
"#undef TILE_N\n"
"#undef SLM_BLOCK\n"
// Length of the __local Dtype4 work arrays declared in gemm_buffer_NT_M_2.
"#define SLM_SIZE 64\n"
// gemm_buffer_NT_M_2_edgerows: helper for gemm_buffer_NT_M_2 that handles the
// last, partial group of output columns (fewer than 4).
//
// srca_read0 / srca_read1 : the two A rows (K elements each).
// srcb_read               : first B row of this group; rows are K apart.
// work0 / work1           : local scratch, one Dtype4 per work-item.
// N, K, x_gid, lid        : column count, depth, group id, local id.
// alpha, beta             : scaling factors.
// dstc0 / dstc1           : the two output rows, addressed per element.
//
// rows = N - x_gid * 4 is the number of columns actually computed. The dot
// arrays have 3 entries, so rows must not exceed 3; the only caller visible
// here invokes this with x_gid == N / 4, which gives rows == N % 4.
// The function contains barriers, so every work-item of the group has to
// call it.
"void TEMPLATE(gemm_buffer_NT_M_2_edgerows,Dtype)(\n"
"const __global Dtype* srca_read0,\n"
"const __global Dtype* srca_read1,\n"
"const __global Dtype* srcb_read,\n"
"__local Dtype4* work0,\n"
"__local Dtype4* work1,\n"
"int N,\n"
"int K,\n"
"int x_gid,\n"
"int lid,\n"
"Dtype alpha,\n"
"Dtype beta,\n"
"__global Dtype* dstc0,\n"
"__global Dtype* dstc1)\n"
"{\n"
// Scalar view of the scratch arrays; element lid * 4 + j is component j of
// entry lid.
"__local Dtype* work_each0 = (__local Dtype*)work0;\n"
"__local Dtype* work_each1 = (__local Dtype*)work1;\n"
"int rows = N - x_gid * 4;\n"
"Dtype4 dot0[3] = {(Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.)};\n"
"Dtype4 dot1[3] = {(Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.)};\n"
// Each work-item walks the K / 4 four-element chunks with a stride of the
// local size, starting at its own lid.
"int i = lid;\n"
"while( i < K / 4) {\n"
// A is read element by element here rather than with vload4.
"const Dtype4 b0 = {srca_read0[i*4], srca_read0[(i*4+1)], srca_read0[(i*4+2)], srca_read0[(i*4+3)]};\n"
"const Dtype4 b1 = {srca_read1[i*4], srca_read1[(i*4+1)], srca_read1[(i*4+2)], srca_read1[(i*4+3)]};\n"
"#pragma unroll\n"
"for(int j = 0; j < rows; ++j) {\n"
"Dtype4 a = vload4(i, srcb_read + j * K);\n"
"dot0[j] += b0 * a;\n"
"dot1[j] += b1 * a;\n"
"}\n"
"i += get_local_size(0);\n"
"}\n"
// Collapse each 4-wide partial product into one scalar per column.
"#pragma unroll\n"
"for(int j = 0; j < rows; ++j) {\n"
"work_each0[lid * 4 + j] = dot0[j].x + dot0[j].y + dot0[j].z + dot0[j].w;\n"
"work_each1[lid * 4 + j] = dot1[j].x + dot1[j].y + dot1[j].z + dot1[j].w;\n"
"}\n"
// Only the work-item whose strided index stopped exactly at K / 4 adds the
// K % 4 trailing elements that the vec4 loop could not cover.
"if(i == K / 4) {\n"
"short tail_items = K % 4;\n"
"if(tail_items != 0) {\n"
"const __global Dtype *srcb_tail = srcb_read + i * 4;\n"
"const __global Dtype *srca_tail0 = srca_read0 + i * 4;\n"
"const __global Dtype *srca_tail1 = srca_read1 + i * 4;\n"
"#pragma unroll\n"
// The loop variable below shadows the outer i; the tail pointers were
// computed from the outer value before the loop starts.
"for(short i = 0; i < tail_items; ++i) {\n"
"const Dtype at0 = srca_tail0[i];\n"
"const Dtype at1 = srca_tail1[i];\n"
"#pragma unroll\n"
"for(int j = 0; j < rows; ++j) {\n"
"work_each0[lid * 4 + j] += at0 * srcb_tail[i + j * K];\n"
"work_each1[lid * 4 + j] += at1 * srcb_tail[i + j * K];\n"
"}\n"
"}\n"
"}\n"
"}\n"
// Pairwise reduction over the scratch entries, halving the stride each
// round. Components with index >= rows were never written and are summed
// too, but they are not read afterwards.
// NOTE(review): the halving only folds in every entry when the local size
// is a power of two - presumably guaranteed by the host; confirm.
"for(int stride = get_local_size(0) >> 1; stride > 0 ; stride >>= 1) {\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"if(lid < stride) {\n"
"work0[lid] += work0[lid+stride];\n"
"work1[lid] += work1[lid+stride];\n"
"}\n"
"}\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
// Work-item 0 holds the totals in entry 0 and writes the rows columns
// starting at x_gid * 4; beta * old value is added unless ZERO_BETA is set.
"if(lid == 0) {\n"
"#pragma unroll\n"
"for(int j = 0; j < rows; ++j) {\n"
"#ifdef ZERO_BETA\n"
"Dtype a0 = alpha * work_each0[j];\n"
"Dtype a1 = alpha * work_each1[j];\n"
"#else\n"
"Dtype a0 = alpha * work_each0[j] + beta * dstc0[(x_gid * 4 + j)];\n"
"Dtype a1 = alpha * work_each1[j] + beta * dstc1[(x_gid * 4 + j)];\n"
"#endif\n"
"dstc0[(x_gid * 4  + j)] = a0;\n"
"dstc1[(x_gid * 4  + j)] = a1;\n"
"}\n"
"}\n"
"}\n"
// gemm_buffer_NT_M_2: kernel for the case where two rows of A are used.
//
// Work-group x_gid computes four output columns (x_gid * 4 .. x_gid * 4 + 3)
// for both rows: it reads the two A rows at A + offA (K apart) and the four
// B rows starting at B + x_gid * 4 * K + offB, and writes one Dtype4 per row
// into C + offC (rows N apart). The argument M is not read by the body.
// When x_gid == N / 4 the partial last group is delegated to
// gemm_buffer_NT_M_2_edgerows.
// NOTE(review): dstc1 is a Dtype4 pointer formed N scalars after dstc0;
// presumably the host only selects this kernel when that address is suitably
// aligned for a Dtype4 store - confirm.
"__kernel void TEMPLATE(gemm_buffer_NT_M_2,Dtype)(\n"
"__global const Dtype * A,\n"
"int offA,\n"
"__global const Dtype * B,\n"
"int offB,\n"
"__global Dtype * C,\n"
"int offC,\n"
"int M,\n"
"int N,\n"
"int K,\n"
"KERNEL_ARG_DTYPE alpha_f,\n"
"KERNEL_ARG_DTYPE beta_f)\n"
"{\n"
"Dtype alpha = (Dtype)alpha_f;\n"
"Dtype beta = (Dtype)beta_f;\n"
"int x_gid = get_group_id(0);\n"
"int lid = get_local_id(0);\n"
"const __global Dtype *srca_read0 = A + offA;\n"
"const __global Dtype *srca_read1 = srca_read0 + K;\n"
"const __global Dtype *srcb_read = B + x_gid * 4 * K + offB;\n"
"__global Dtype4 *dstc0 = (__global Dtype4*)(C + offC);\n"
"__global Dtype4 *dstc1 = (__global Dtype4*)((__global Dtype*)(dstc0) + N);\n"
// One Dtype4 of scratch per work-item and output row; work_each is the
// scalar view (element lid * 4 + j is component j of entry lid).
// NOTE(review): the arrays hold SLM_SIZE entries, so the local size
// presumably must not exceed SLM_SIZE - confirm on the host side.
"__local Dtype4 work0[SLM_SIZE];\n"
"__local Dtype4 work1[SLM_SIZE];\n"
"__local Dtype* work_each0 = (__local Dtype*)work0;\n"
"__local Dtype* work_each1 = (__local Dtype*)work1;\n"
// x_gid is uniform across the work-group, so the barriers inside either
// branch are reached by all work-items of the group.
"if(x_gid == N / 4) {\n"
"TEMPLATE(gemm_buffer_NT_M_2_edgerows,Dtype) \\\n"
"(srca_read0, srca_read1, srcb_read, work0, work1, N, K, x_gid, lid, alpha, beta, (__global Dtype*)dstc0, (__global Dtype*)dstc1);\n"
"} else {\n"
"Dtype4 dot0[4] = {(Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.)};\n"
"Dtype4 dot1[4] = {(Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.)};\n"
// Each work-item walks the K / 4 four-element chunks with a stride of the
// local size, starting at its own lid.
"int i = lid;\n"
"while( i < K / 4) {\n"
"const Dtype4 b0 = vload4(i, srca_read0);\n"
"const Dtype4 b1 = vload4(i, srca_read1);\n"
"#pragma unroll\n"
"for(int j = 0; j < 4; ++j) {\n"
"Dtype4 a = vload4(i, srcb_read + j * K);\n"
"dot0[j] += b0 * a;\n"
"dot1[j] += b1 * a;\n"
"}\n"
"i += get_local_size(0);\n"
"}\n"
// Collapse each 4-wide partial product into one scalar per column.
"#pragma unroll\n"
"for(int j = 0; j < 4; ++j) {\n"
"work_each0[lid * 4 + j] = dot0[j].x + dot0[j].y + dot0[j].z + dot0[j].w;\n"
"work_each1[lid * 4 + j] = dot1[j].x + dot1[j].y + dot1[j].z + dot1[j].w;\n"
"}\n"
// Only the work-item whose strided index stopped exactly at K / 4 adds the
// K % 4 trailing elements.
"if(i == K / 4) {\n"
"short tail_items = K % 4;\n"
"if(tail_items != 0) {\n"
"const __global Dtype *srcb_tail = srcb_read + i * 4;\n"
"const __global Dtype *srca_tail0 = srca_read0 + i * 4;\n"
"const __global Dtype *srca_tail1 = srca_read1 + i * 4;\n"
"#pragma unroll\n"
// The loop variable below shadows the outer i; the tail pointers were
// computed from the outer value before the loop starts.
"for(short i = 0; i < tail_items; ++i) {\n"
"const Dtype at0 = srca_tail0[i];\n"
"const Dtype at1 = srca_tail1[i];\n"
"#pragma unroll\n"
"for(int j = 0; j < 4; ++j) {\n"
"work_each0[lid * 4 + j] += at0 * srcb_tail[i + j * K];\n"
"work_each1[lid * 4 + j] += at1 * srcb_tail[i + j * K];\n"
"}\n"
"}\n"
"}\n"
"}\n"
// Pairwise reduction over the scratch entries, halving the stride each round.
// NOTE(review): covers every entry only for a power-of-two local size.
"for(int stride = get_local_size(0) >> 1; stride > 0 ; stride >>= 1) {\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"if(lid < stride) {\n"
"work0[lid] += work0[lid+stride];\n"
"work1[lid] += work1[lid+stride];\n"
"}\n"
"}\n"
// Work-item 0 wrote entry 0 last, so it can read it without another barrier
// and store the four columns of each row as one Dtype4.
"if(lid == 0)\n"
"{\n"
"#ifdef ZERO_BETA\n"
"dstc0[x_gid] = alpha * work0[0];\n"
"dstc1[x_gid] = alpha * work1[0];\n"
"#else\n"
"dstc0[x_gid] = alpha * work0[0] + beta * dstc0[x_gid];\n"
"dstc1[x_gid] = alpha * work1[0] + beta * dstc1[x_gid];\n"
"#endif\n"
"}\n"
"}\n"
"}\n"
// Re-define SLM_SIZE for gemm_buffer_NT_M_4, which declares four __local
// Dtype4 work arrays of this length.
"#undef SLM_SIZE\n"
"#define SLM_SIZE 32\n"
// gemm_buffer_NT_M_4_edgerows: helper for gemm_buffer_NT_M_4 that handles the
// last, partial group of output columns (fewer than 4).
//
// srca_read0..3    : the four A rows (K elements each).
// srcb_read        : first B row of this group; rows are K apart.
// work0..3         : local scratch, one Dtype4 per work-item and A row.
// N, K, x_gid, lid : column count, depth, group id, local id.
// alpha, beta      : scaling factors.
// dstc0..3         : the four output rows, addressed per element.
//
// rows = N - x_gid * 4 is the number of columns actually computed. The dot
// arrays have 3 entries, so rows must not exceed 3; the only caller visible
// here invokes this with x_gid == N / 4, which gives rows == N % 4.
// The function contains barriers, so every work-item of the group has to
// call it.
//
// Fix: the accumulation loop used the misspelled directive "#pragma unrol",
// which is not a recognised pragma; it now reads "#pragma unroll" like every
// other loop in this family of kernels.
// NOTE(review): this file is auto-generated - mirror the change in the
// generating .cl source and regenerate any checksum stored next to this
// program text, if one exists.
"void TEMPLATE(gemm_buffer_NT_M_4_edgerows,Dtype)(\n"
"const __global Dtype* srca_read0,\n"
"const __global Dtype* srca_read1,\n"
"const __global Dtype* srca_read2,\n"
"const __global Dtype* srca_read3,\n"
"const __global Dtype* srcb_read,\n"
"__local Dtype4* work0,\n"
"__local Dtype4* work1,\n"
"__local Dtype4* work2,\n"
"__local Dtype4* work3,\n"
"int N,\n"
"int K,\n"
"int x_gid,\n"
"int lid,\n"
"Dtype alpha,\n"
"Dtype beta,\n"
"__global Dtype* dstc0,\n"
"__global Dtype* dstc1,\n"
"__global Dtype* dstc2,\n"
"__global Dtype* dstc3)\n"
"{\n"
// Scalar view of this work-item's own scratch entry: work_eachR[j] is
// component j of workR[lid].
"__local Dtype* work_each0 = (__local Dtype*)(work0 + lid);\n"
"__local Dtype* work_each1 = (__local Dtype*)(work1 + lid);\n"
"__local Dtype* work_each2 = (__local Dtype*)(work2 + lid);\n"
"__local Dtype* work_each3 = (__local Dtype*)(work3 + lid);\n"
"int rows = N - x_gid * 4;\n"
"Dtype4 dot0[3] = {(Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.)};\n"
"Dtype4 dot1[3] = {(Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.)};\n"
"Dtype4 dot2[3] = {(Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.)};\n"
"Dtype4 dot3[3] = {(Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.)};\n"
// Each work-item walks the K / 4 four-element chunks with a stride of the
// local size, starting at its own lid.
"int i = lid;\n"
"while( i < K / 4) {\n"
// A is read element by element here rather than with vload4.
"const Dtype4 a0 = {srca_read0[i*4], srca_read0[(i*4+1)], srca_read0[(i*4+2)], srca_read0[(i*4+3)]};\n"
"const Dtype4 a1 = {srca_read1[i*4], srca_read1[(i*4+1)], srca_read1[(i*4+2)], srca_read1[(i*4+3)]};\n"
"const Dtype4 a2 = {srca_read2[i*4], srca_read2[(i*4+1)], srca_read2[(i*4+2)], srca_read2[(i*4+3)]};\n"
"const Dtype4 a3 = {srca_read3[i*4], srca_read3[(i*4+1)], srca_read3[(i*4+2)], srca_read3[(i*4+3)]};\n"
"#pragma unroll\n"
"for(int j = 0; j < rows; ++j) {\n"
"dot0[j] += a0 * vload4(i, srcb_read + j * K);\n"
"dot1[j] += a1 * vload4(i, srcb_read + j * K);\n"
"dot2[j] += a2 * vload4(i, srcb_read + j * K);\n"
"dot3[j] += a3 * vload4(i, srcb_read + j * K);\n"
"}\n"
"i += get_local_size(0);\n"
"}\n"
// Collapse each 4-wide partial product into one scalar per column.
"#pragma unroll\n"
"for(int j = 0; j < rows; ++j) {\n"
"work_each0[j] = dot0[j].x + dot0[j].y + dot0[j].z + dot0[j].w;\n"
"work_each1[j] = dot1[j].x + dot1[j].y + dot1[j].z + dot1[j].w;\n"
"work_each2[j] = dot2[j].x + dot2[j].y + dot2[j].z + dot2[j].w;\n"
"work_each3[j] = dot3[j].x + dot3[j].y + dot3[j].z + dot3[j].w;\n"
"}\n"
// Only the work-item whose strided index stopped exactly at K / 4 adds the
// K % 4 trailing elements that the vec4 loop could not cover.
"if(i == K / 4) {\n"
"short tail_items = K % 4;\n"
"if(tail_items != 0) {\n"
"const __global Dtype *srcb_tail = srcb_read + i * 4;\n"
"const __global Dtype *srca_tail0 = srca_read0 + i * 4;\n"
"const __global Dtype *srca_tail1 = srca_read1 + i * 4;\n"
"const __global Dtype *srca_tail2 = srca_read2 + i * 4;\n"
"const __global Dtype *srca_tail3 = srca_read3 + i * 4;\n"
"#pragma unroll\n"
// The loop variable below shadows the outer i; the tail pointers were
// computed from the outer value before the loop starts.
"for(short i = 0; i < tail_items; ++i) {\n"
"const Dtype at0 = srca_tail0[i];\n"
"const Dtype at1 = srca_tail1[i];\n"
"const Dtype at2 = srca_tail2[i];\n"
"const Dtype at3 = srca_tail3[i];\n"
"#pragma unroll\n"
"for(int j = 0; j < rows; ++j) {\n"
"work_each0[j] += at0 * srcb_tail[i + j * K];\n"
"work_each1[j] += at1 * srcb_tail[i + j * K];\n"
"work_each2[j] += at2 * srcb_tail[i + j * K];\n"
"work_each3[j] += at3 * srcb_tail[i + j * K];\n"
"}\n"
"}\n"
"}\n"
"}\n"
// Pairwise reduction over the scratch entries, halving the stride each
// round. Components with index >= rows were never written and are summed
// too, but they are not read afterwards.
// NOTE(review): the halving only folds in every entry when the local size
// is a power of two - presumably guaranteed by the host; confirm.
"for(int stride = get_local_size(0) >> 1; stride > 0 ; stride >>= 1) {\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"if(lid < stride) {\n"
"work0[lid] += work0[lid+stride];\n"
"work1[lid] += work1[lid+stride];\n"
"work2[lid] += work2[lid+stride];\n"
"work3[lid] += work3[lid+stride];\n"
"}\n"
"}\n"
// For lid == 0 the work_each pointers address entry 0, which this same
// work-item wrote in the last reduction round, so no extra barrier is
// needed before reading. beta * old value is added unless ZERO_BETA is set.
"if(lid == 0) {\n"
"#pragma unroll\n"
"for(int j = 0; j < rows; ++j) {\n"
"#ifdef ZERO_BETA\n"
"dstc0[(x_gid * 4  + j)] = alpha * work_each0[j];\n"
"dstc1[(x_gid * 4  + j)] = alpha * work_each1[j];\n"
"dstc2[(x_gid * 4  + j)] = alpha * work_each2[j];\n"
"dstc3[(x_gid * 4  + j)] = alpha * work_each3[j];\n"
"#else\n"
"dstc0[(x_gid * 4  + j)] = alpha * work_each0[j] + beta * dstc0[(x_gid * 4 + j)];\n"
"dstc1[(x_gid * 4  + j)] = alpha * work_each1[j] + beta * dstc1[(x_gid * 4 + j)];\n"
"dstc2[(x_gid * 4  + j)] = alpha * work_each2[j] + beta * dstc2[(x_gid * 4 + j)];\n"
"dstc3[(x_gid * 4  + j)] = alpha * work_each3[j] + beta * dstc3[(x_gid * 4 + j)];\n"
"#endif\n"
"}\n"
"}\n"
"}\n"
// gemm_buffer_NT_M_4: kernel for the case where four rows of A are used.
//
// Work-group x_gid computes four output columns (x_gid * 4 .. x_gid * 4 + 3)
// for all four rows: it reads the A rows at A + offA (K apart) and the four
// B rows starting at B + x_gid * 4 * K + offB, and writes one Dtype4 per row
// into C + offC (rows N apart). The argument M is not read by the body.
// When x_gid == N / 4 the partial last group is delegated to
// gemm_buffer_NT_M_4_edgerows.
// NOTE(review): dstc1..dstc3 are Dtype4 pointers formed N scalars apart;
// presumably the host only selects this kernel when those addresses are
// suitably aligned for a Dtype4 store - confirm.
"__kernel void TEMPLATE(gemm_buffer_NT_M_4,Dtype)(\n"
"__global const Dtype * A,\n"
"int offA,\n"
"__global const Dtype * B,\n"
"int offB,\n"
"__global Dtype * C,\n"
"int offC,\n"
"int M,\n"
"int N,\n"
"int K,\n"
"KERNEL_ARG_DTYPE alpha_f,\n"
"KERNEL_ARG_DTYPE beta_f)\n"
"{\n"
"Dtype alpha = (Dtype)alpha_f;\n"
"Dtype beta = (Dtype)beta_f;\n"
"int x_gid = get_group_id(0);\n"
"int lid = get_local_id(0);\n"
"int lsize = get_local_size(0);\n"
"const __global Dtype *srca_read0 = A + offA;\n"
"const __global Dtype *srca_read1 = srca_read0 + K;\n"
"const __global Dtype *srca_read2 = srca_read1 + K;\n"
"const __global Dtype *srca_read3 = srca_read2 + K;\n"
"const __global Dtype *srcb_read = B + x_gid * 4 * K + offB;\n"
"__global Dtype4 *dstc0 = (__global Dtype4*)(C + offC);\n"
"__global Dtype4 *dstc1 = (__global Dtype4*)((__global Dtype*)(dstc0) + N);\n"
"__global Dtype4 *dstc2 = (__global Dtype4*)((__global Dtype*)(dstc1) + N);\n"
"__global Dtype4 *dstc3 = (__global Dtype4*)((__global Dtype*)(dstc2) + N);\n"
// One Dtype4 of scratch per work-item and output row; work_eachR[j] is
// component j of this work-item's own entry workR[lid].
// NOTE(review): the arrays hold SLM_SIZE entries, so the local size
// presumably must not exceed SLM_SIZE - confirm on the host side.
"__local Dtype4 work0[SLM_SIZE];\n"
"__local Dtype4 work1[SLM_SIZE];\n"
"__local Dtype4 work2[SLM_SIZE];\n"
"__local Dtype4 work3[SLM_SIZE];\n"
"__local Dtype* work_each0 = (__local Dtype*)(work0 + lid);\n"
"__local Dtype* work_each1 = (__local Dtype*)(work1 + lid);\n"
"__local Dtype* work_each2 = (__local Dtype*)(work2 + lid);\n"
"__local Dtype* work_each3 = (__local Dtype*)(work3 + lid);\n"
// x_gid is uniform across the work-group, so the barriers inside either
// branch are reached by all work-items of the group.
"if(x_gid == N / 4) {\n"
"TEMPLATE(gemm_buffer_NT_M_4_edgerows,Dtype) \\\n"
"(srca_read0, srca_read1, srca_read2, srca_read3, srcb_read, \\\n"
"work0, work1, work2, work3, N, K, x_gid, lid, alpha, beta, \\\n"
"(__global Dtype*)dstc0, (__global Dtype*)dstc1, (__global Dtype*)dstc2, (__global Dtype*)dstc3);\n"
"} else {\n"
"Dtype4 dot0[4] = {(Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.)};\n"
"Dtype4 dot1[4] = {(Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.)};\n"
"Dtype4 dot2[4] = {(Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.)};\n"
"Dtype4 dot3[4] = {(Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.), (Dtype4)(0.)};\n"
// Each work-item walks the K / 4 four-element chunks with a stride of the
// local size, starting at its own lid.
"int kid = lid;\n"
"while( kid < K / 4) {\n"
"const Dtype4 b0 = vload4(kid, srca_read0);\n"
"const Dtype4 b1 = vload4(kid, srca_read1);\n"
"const Dtype4 b2 = vload4(kid, srca_read2);\n"
"const Dtype4 b3 = vload4(kid, srca_read3);\n"
"#pragma unroll\n"
"for(int j = 0; j < 4; ++j) {\n"
"Dtype4 a = vload4(kid, srcb_read + j * K);\n"
"dot0[j] += b0 * a;\n"
"dot1[j] += b1 * a;\n"
"dot2[j] += b2 * a;\n"
"dot3[j] += b3 * a;\n"
"}\n"
"kid += lsize;\n"
"}\n"
// Collapse each 4-wide partial product into one scalar per column.
"#pragma unroll\n"
"for(int j = 0; j < 4; ++j) {\n"
"work_each0[j] = dot0[j].x + dot0[j].y + dot0[j].z + dot0[j].w;\n"
"work_each1[j] = dot1[j].x + dot1[j].y + dot1[j].z + dot1[j].w;\n"
"work_each2[j] = dot2[j].x + dot2[j].y + dot2[j].z + dot2[j].w;\n"
"work_each3[j] = dot3[j].x + dot3[j].y + dot3[j].z + dot3[j].w;\n"
"}\n"
// Only the work-item whose strided index stopped exactly at K / 4 adds the
// K % 4 trailing elements; offset is the scalar index where they start.
"if(kid == (K >> 2)) {\n"
"short tail_items = K % 4;\n"
"if(tail_items != 0) {\n"
"int offset = kid << 2;\n"
"const __global Dtype *srcb_tail = srcb_read + offset;\n"
"const __global Dtype *srca_tail0 = srca_read0 + offset;\n"
"const __global Dtype *srca_tail1 = srca_read1 + offset;\n"
"const __global Dtype *srca_tail2 = srca_read2 + offset;\n"
"const __global Dtype *srca_tail3 = srca_read3 + offset;\n"
"#pragma unroll\n"
"for(short i = 0; i < tail_items; ++i) {\n"
"const Dtype at0 = srca_tail0[i];\n"
"const Dtype at1 = srca_tail1[i];\n"
"const Dtype at2 = srca_tail2[i];\n"
"const Dtype at3 = srca_tail3[i];\n"
"#pragma unroll\n"
"for(int j = 0; j < 4; ++j) {\n"
"work_each0[j] += at0 * srcb_tail[i + j * K];\n"
"work_each1[j] += at1 * srcb_tail[i + j * K];\n"
"work_each2[j] += at2 * srcb_tail[i + j * K];\n"
"work_each3[j] += at3 * srcb_tail[i + j * K];\n"
"}\n"
"}\n"
"}\n"
"}\n"
// Pairwise reduction over the scratch entries, halving the stride each round.
// NOTE(review): covers every entry only for a power-of-two local size.
"for(int stride = get_local_size(0) >> 1; stride > 0 ; stride >>= 1) {\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"if(lid < stride) {\n"
"work0[lid] += work0[lid+stride];\n"
"work1[lid] += work1[lid+stride];\n"
"work2[lid] += work2[lid+stride];\n"
"work3[lid] += work3[lid+stride];\n"
"}\n"
"}\n"
// Work-item 0 wrote entry 0 last, so it can read it without another barrier
// and store the four columns of each row as one Dtype4.
"if(lid == 0) {\n"
"#ifdef ZERO_BETA\n"
"dstc0[x_gid] = alpha * work0[0];\n"
"dstc1[x_gid] = alpha * work1[0];\n"
"dstc2[x_gid] = alpha * work2[0];\n"
"dstc3[x_gid] = alpha * work3[0];\n"
"#else\n"
"dstc0[x_gid] = alpha * work0[0] + beta * dstc0[x_gid];\n"
"dstc1[x_gid] = alpha * work1[0] + beta * dstc1[x_gid];\n"
"dstc2[x_gid] = alpha * work2[0] + beta * dstc2[x_gid];\n"
"dstc3[x_gid] = alpha * work3[0] + beta * dstc3[x_gid];\n"
"#endif\n"
"}\n"
"}\n"
"}\n"
// Re-define SLM_SIZE for gemm_buffer_NT_M_8, which declares eight __local
// Dtype work arrays of this length.
"#undef SLM_SIZE\n"
"#define SLM_SIZE 16\n"
"__kernel void TEMPLATE(gemm_buffer_NT_M_8,Dtype)(\n"
"__global const Dtype * A,\n"
"int offA,\n"
"__global const Dtype * B,\n"
"int offB,\n"
"__global Dtype * C,\n"
"int offC,\n"
"int M,\n"
"int N,\n"
"int K,\n"
"KERNEL_ARG_DTYPE alpha_f,\n"
"KERNEL_ARG_DTYPE beta_f)\n"
"{\n"
"Dtype alpha = (Dtype)alpha_f;\n"
"Dtype beta = (Dtype)beta_f;\n"
"int x_gid = get_group_id(0);\n"
"int lid = get_local_id(0);\n"
"int lsize = get_local_size(0);\n"
"const __global Dtype *srca_read0 = A + offA;\n"
"const __global Dtype *srca_read1 = srca_read0 + K;\n"
"const __global Dtype *srca_read2 = srca_read1 + K;\n"
"const __global Dtype *srca_read3 = srca_read2 + K;\n"
"const __global Dtype *srca_read4 = srca_read3 + K;\n"
"const __global Dtype *srca_read5 = srca_read4 + K;\n"
"const __global Dtype *srca_read6 = srca_read5 + K;\n"
"const __global Dtype *srca_read7 = srca_read6 + K;\n"
"const __global Dtype *srcb_read = B + x_gid * K + offB;\n"
"__global Dtype *dstc0 = C + offC;\n"
"__global Dtype *dstc1 = dstc0 + N;\n"
"__global Dtype *dstc2 = dstc1 + N;\n"
"__global Dtype *dstc3 = dstc2 + N;\n"
"__global Dtype *dstc4 = dstc3 + N;\n"
"__global Dtype *dstc5 = dstc4 + N;\n"
"__global Dtype *dstc6 = dstc5 + N;\n"
"__global Dtype *dstc7 = dstc6 + N;\n"
"__local Dtype work0[SLM_SIZE];\n"
"__local Dtype work1[SLM_SIZE];\n"
"__local Dtype work2[SLM_SIZE];\n"
"__local Dtype work3[SLM_SIZE];\n"
"__local Dtype work4[SLM_SIZE];\n"
"__local Dtype work5[SLM_SIZE];\n"
"__local Dtype work6[SLM_SIZE];\n"
"__local Dtype work7[SLM_SIZE];\n"
"Dtype4 dot0 = (Dtype4)(0.);\n"
"Dtype4 dot1 = (Dtype4)(0.);\n"
"Dtype4 dot2 = (Dtype4)(0.);\n"
"Dtype4 dot3 = (Dtype4)(0.);\n"
"Dtype4 dot4 = (Dtype4)(0.);\n"
"Dtype4 dot5 = (Dtype4)(0.);\n"
"Dtype4 dot6 = (Dtype4)(0.);\n"
"Dtype4 dot7 = (Dtype4)(0.);\n"
"int kid = lid;\n"
"while( kid < K / 4) {\n"
"const Dtype4 a0 = vload4(kid, srca_read0);\n"
"const Dtype4 a1 = vload4(kid, srca_read1);\n"
"const Dtype4 a2 = vload4(kid, srca_read2);\n"
"const Dtype4 a3 = vload4(kid, srca_read3);\n"
"const Dtype4 a4 = vload4(kid, srca_read4);\n"
"const Dtype4 a5 = vload4(kid, srca_read5);\n"
"const Dtype4 a6 = vload4(kid, srca_read6);\n"
"const Dtype4 a7 = vload4(kid, srca_read7);\n"
"Dtype4 b = vload4(kid, srcb_read);\n"
"dot0 += a0 * b;\n"
"dot1 += a1 * b;\n"
"dot2 += a2 * b;\n"
"dot3 += a3 * b;\n"
"dot4 += a4 * b;\n"
"dot5 += a5 * b;\n"
"dot6 += a6 * b;\n"
"dot7 += a7 * b;\n"
"kid += lsize;\n"
"}\n"
"work0[lid] = dot0.x + dot0.y + dot0.z + dot0.w;\n"
"work1[lid] = dot1.x + dot1.y + dot1.z + dot1.w;\n"
"work2[lid] = dot2.x + dot2.y + dot2.z + dot2.w;\n"
"work3[lid] = dot3.x + dot3.y + dot3.z + dot3.w;\n"
"work4[lid] = dot4.x + dot4.y + dot4.z + dot4.w;\n"
"work5[lid] = dot5.x + dot5.y + dot5.z + dot5.w;\n"
"work6[lid] = dot6.x + dot6.y + dot6.z + dot6.w;\n"
"work7[lid] = dot7.x + dot7.y + dot7.z + dot7.w;\n"
"if(kid == (K >> 2)) {\n"
"short tail_items = K % 4;\n"
"if(tail_items != 0) {\n"
"int offset = kid << 2;\n"
"const __global Dtype *srcb_tail = srcb_read + offset;\n"
"const __global Dtype *srca_tail0 = srca_read0 + offset;\n"
"const __global Dtype *srca_tail1 = srca_read1 + offset;\n"
"const __global Dtype *srca_tail2 = srca_read2 + offset;\n"
"const __global Dtype *srca_tail3 = srca_read3 + offset;\n"
"const __global Dtype *srca_tail4 = srca_read4 + offset;\n"
"const __global Dtype *srca_tail5 = srca_read5 + offset;\n"
"const __global Dtype *srca_tail6 = srca_read6 + offset;\n"
"const __global Dtype *srca_tail7 = srca_read7 + offset;\n"
"#pragma unroll\n"
"for(short item = 0; item < tail_items; ++item) {\n"
"work0[lid] += srca_tail0[item] * srcb_tail[item];\n"
"work1[lid] += srca_tail1[item] * srcb_tail[item];\n"
"work2[lid] += srca_tail2[item] * srcb_tail[item];\n"
"work3[lid] += srca_tail3[item] * srcb_tail[item];\n"
"work4[lid] += srca_tail4[item] * srcb_tail[item];\n"
"work5[lid] += srca_tail5[item] * srcb_tail[item];\n"
"work6[lid] += srca_tail6[item] * srcb_tail[item];\n"
"work7[lid] += srca_tail7[item] * srcb_tail[item];\n"
"}\n"
"}\n"
"}\n"
"for(int stride = get_local_size(0) >> 1; stride > 0 ; stride >>= 1) {\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"if(lid < stride) {\n"
"work0[lid] += work0[lid+stride];\n"
"work1[lid] += work1[lid+stride];\n"
"work2[lid] += work2[lid+stride];\n"
"work3[lid] += work3[lid+stride];\n"
"work4[lid] += work4[lid+stride];\n"
"work5[lid] += work5[lid+stride];\n"
"work6[lid] += work6[lid+stride];\n"
"work7[lid] += work7[lid+stride];\n"
"}\n"
"}\n"
"if(lid == 0) {\n"
"#ifdef ZERO_BETA\n"
"dstc0[x_gid] = alpha * work0[0];\n"
"dstc1[x_gid] = alpha * work1[0];\n"
"dstc2[x_gid] = alpha * work2[0];\n"
"dstc3[x_gid] = alpha * work3[0];\n"
"dstc4[x_gid] = alpha * work4[0];\n"
"dstc5[x_gid] = alpha * work5[0];\n"
"dstc6[x_gid] = alpha * work6[0];\n"
"dstc7[x_gid] = alpha * work7[0];\n"
"#else\n"
"dstc0[x_gid] = alpha * work0[0] + beta * dstc0[x_gid];\n"
"dstc1[x_gid] = alpha * work1[0] + beta * dstc1[x_gid];\n"
"dstc2[x_gid] = alpha * work2[0] + beta * dstc2[x_gid];\n"
"dstc3[x_gid] = alpha * work3[0] + beta * dstc3[x_gid];\n"
"dstc4[x_gid] = alpha * work4[0] + beta * dstc4[x_gid];\n"
"dstc5[x_gid] = alpha * work5[0] + beta * dstc5[x_gid];\n"
"dstc6[x_gid] = alpha * work6[0] + beta * dstc6[x_gid];\n"
"dstc7[x_gid] = alpha * work7[0] + beta * dstc7[x_gid];\n"
"#endif\n"
"}\n"
"}\n"
"#undef SLM_SIZE\n"
"#undef VEC_SIZE\n"
"#undef LWG_HEIGHT\n"
"#undef TILE_M\n"
"#undef TILE_K\n"
"#undef TILE_N\n"
"#undef SIMD_SIZE_GEMM\n"
"#undef SHUFFLE_TYPE2\n"
"#undef SHUFFLE_TYPE8\n"
, "11f94a50f6b8bb41e89301f396ba7921", NULL};
struct cv::ocl::internal::ProgramEntry gemm_image_oclsrc={moduleName, "gemm_image",
"#if defined(cl_khr_fp16)\n"
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
"#endif\n"
"#define CONCAT(A,B) A##_##B\n"
"#define TEMPLATE(name,type) CONCAT(name,type)\n"
"#define KERNEL_ARG_DTYPE float\n"
"#define TYPE_FLOAT  1\n"
"#define TYPE_HALF   2\n"
"#if TYPE == TYPE_HALF\n"
"#define Dtype  half\n"
"#define Dtype2 half2\n"
"#define Dtype4 half4\n"
"#define Dtype8 half8\n"
"#define Dtype16 half16\n"
"#define as_Dtype  as_half\n"
"#define as_Dtype2 as_half2\n"
"#define as_Dtype4 as_half4\n"
"#define as_Dtype8 as_half8\n"
"#define as_Dtype16 as_half16\n"
"#else\n"
"#define Dtype  float\n"
"#define Dtype2 float2\n"
"#define Dtype4 float4\n"
"#define Dtype8 float8\n"
"#define Dtype16 float16\n"
"#define as_Dtype  as_float\n"
"#define as_Dtype2 as_float2\n"
"#define as_Dtype4 as_float4\n"
"#define as_Dtype8 as_float8\n"
"#define as_Dtype16 as_float16\n"
"#endif\n"
"#if defined(cl_intel_subgroups)\n"
"#pragma OPENCL EXTENSION  cl_intel_subgroups : enable\n"
"#endif\n"
"#define TILE_M          32\n"
"#define TILE_K          8\n"
"#if TYPE == TYPE_HALF\n"
"#define SUBGROUP_BLOCK_READ8( __image, __coord ) intel_sub_group_block_read_us8( __image, __coord )\n"
"#define SHUFFLE_TYPE2(val) as_ushort2(val)\n"
"#define SHUFFLE_TYPE8(val) as_ushort8(val)\n"
"#define READ_IMAGE(__image, __coord) read_imageh(__image, sampler, __coord)\n"
"#define SIZE_OF_ELEMENT sizeof(ushort)\n"
"#define SIMD_SIZE_GEMM 16\n"
"#define TILE_N 16\n"
"#else\n"
"#define SUBGROUP_BLOCK_READ8( __image, __coord ) intel_sub_group_block_read8( __image, __coord )\n"
"#define SHUFFLE_TYPE2(val) val\n"
"#define SHUFFLE_TYPE8(val) val\n"
"#define READ_IMAGE(__image, __coord) read_imagef(__image, sampler, __coord)\n"
"#define SIZE_OF_ELEMENT sizeof(uint)\n"
"#define SIMD_SIZE_GEMM 8\n"
"#define TILE_N 8\n"
"#endif\n"
"#ifdef USE_IMAGE_C\n"
"#if TYPE == TYPE_HALF\n"
"#define BLOCKC_READ8( _C, _coordC ) as_Dtype8( intel_sub_group_block_read_us8( _C, _coordC ) )\n"
"#define BLOCKC_WRITE8( _C, _coordC, _val ) intel_sub_group_block_write_us8( _C, _coordC, as_ushort8( _val ) )\n"
"#else\n"
"#define BLOCKC_READ8( _C, _coordC ) as_Dtype8( intel_sub_group_block_read8( _C, _coordC ) )\n"
"#define BLOCKC_WRITE8( _C, _coordC, _val ) intel_sub_group_block_write8( _C, _coordC, as_uint8( _val ) )\n"
"#endif\n"
"#define MATC_PARAMETER __read_only image2d_t C, __write_only image2d_t dst\n"
"#define GEMM_OUTPUT(ALPHA1, BETA_NOT0) GEMM_OUTPUT_EXT(ALPHA1, BETA_NOT0, C, dst, sizeof(uint))\n"
"#else\n"
"#define BLOCKC_READ8( _C, _coordC ) \\\n"
"(Dtype8) ( (_coordC.x + get_local_id(0) < N && _coordC.y < M) ? _C[ _coordC.y * ldc + _coordC.x + get_local_id(0) ] : 0, \\\n"
"(_coordC.x + get_local_id(0) < N && _coordC.y + 1 < M) ? _C[ ( _coordC.y + 1 ) * ldc + _coordC.x + get_local_id(0) ] : 0, \\\n"
"(_coordC.x + get_local_id(0) < N && _coordC.y + 2 < M) ? _C[ ( _coordC.y + 2 ) * ldc + _coordC.x + get_local_id(0) ] : 0, \\\n"
"(_coordC.x + get_local_id(0) < N && _coordC.y + 3 < M) ? _C[ ( _coordC.y + 3 ) * ldc + _coordC.x + get_local_id(0) ] : 0, \\\n"
"(_coordC.x + get_local_id(0) < N && _coordC.y + 4 < M) ? _C[ ( _coordC.y + 4 ) * ldc + _coordC.x + get_local_id(0) ] : 0, \\\n"
"(_coordC.x + get_local_id(0) < N && _coordC.y + 5 < M) ? _C[ ( _coordC.y + 5 ) * ldc + _coordC.x + get_local_id(0) ] : 0, \\\n"
"(_coordC.x + get_local_id(0) < N && _coordC.y + 6 < M) ? _C[ ( _coordC.y + 6 ) * ldc + _coordC.x + get_local_id(0) ] : 0, \\\n"
"(_coordC.x + get_local_id(0) < N && _coordC.y + 7 < M) ? _C[ ( _coordC.y + 7 ) * ldc + _coordC.x + get_local_id(0) ] : 0)\n"
"#define BLOCKC_WRITE8( _C, _coordC, _val) do {\\\n"
"if (_coordC.x + get_local_id(0) < N) { \\\n"
"if (_coordC.y < M) \\\n"
"_C[ _coordC.y * ldc + _coordC.x + get_local_id(0) ] = _val.s0; \\\n"
"if (_coordC.y + 1 < M) \\\n"
"_C[ ( _coordC.y + 1 )* ldc + _coordC.x + get_local_id(0) ] = _val.s1; \\\n"
"if (_coordC.y + 2 < M) \\\n"
"_C[ ( _coordC.y + 2 )* ldc + _coordC.x + get_local_id(0) ] = _val.s2; \\\n"
"if (_coordC.y + 3 < M) \\\n"
"_C[ ( _coordC.y + 3 )* ldc + _coordC.x + get_local_id(0) ] = _val.s3; \\\n"
"if (_coordC.y + 4 < M) \\\n"
"_C[ ( _coordC.y + 4 )* ldc + _coordC.x + get_local_id(0) ] = _val.s4; \\\n"
"if (_coordC.y + 5 < M) \\\n"
"_C[ ( _coordC.y + 5 )* ldc + _coordC.x + get_local_id(0) ] = _val.s5; \\\n"
"if (_coordC.y + 6 < M) \\\n"
"_C[ ( _coordC.y + 6 )* ldc + _coordC.x + get_local_id(0) ] = _val.s6; \\\n"
"if (_coordC.y + 7 < M) \\\n"
"_C[ ( _coordC.y + 7 )* ldc + _coordC.x + get_local_id(0) ] = _val.s7; \\\n"
"}} while(0)\n"
"#define MATC_PARAMETER __global Dtype * C, const int offC, const int M, const int N, const int ldc\n"
"#define GEMM_OUTPUT(ALPHA1, BETA_NOT0) GEMM_OUTPUT_EXT(ALPHA1, BETA_NOT0, (C + offC), (C + offC), 1)\n"
"#endif\n"
"#define GEMM_OUTPUT_EXT(ALPHA1, BETA_NOT0, _C, _dst, _C_step) \\\n"
"int2    coordDst = (int2)( ( group_x * TILE_N ) * _C_step, ( group_y * TILE_M ) ); \\\n"
"int2    coordC = coordDst; \\\n"
"Dtype8 blockC00; \\\n"
"Dtype8 blockC01; \\\n"
"Dtype8 blockC02; \\\n"
"Dtype8 blockC03; \\\n"
"if (BETA_NOT0) { \\\n"
"blockC00 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC );    coordC.y += 8; \\\n"
"blockC01 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC );    coordC.y += 8; \\\n"
"blockC02 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC );    coordC.y += 8; \\\n"
"blockC03 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); \\\n"
"if (!ALPHA1) { \\\n"
"blockC00 = mad(blockAxB00, (Dtype8)alpha, blockC00); \\\n"
"blockC01 = mad(blockAxB01, (Dtype8)alpha, blockC01); \\\n"
"blockC02 = mad(blockAxB02, (Dtype8)alpha, blockC02); \\\n"
"blockC03 = mad(blockAxB03, (Dtype8)alpha, blockC03); \\\n"
"} else { \\\n"
"blockC00 += blockAxB00; \\\n"
"blockC01 += blockAxB01; \\\n"
"blockC02 += blockAxB02; \\\n"
"blockC03 += blockAxB03; \\\n"
"} \\\n"
"} else { \\\n"
"blockC00 = isFirstColBlock ? (Dtype)0. : BLOCKC_READ8( _C, coordC );    coordC.y += 8; \\\n"
"blockC01 = isFirstColBlock ? (Dtype)0. : BLOCKC_READ8( _C, coordC );    coordC.y += 8; \\\n"
"blockC02 = isFirstColBlock ? (Dtype)0. : BLOCKC_READ8( _C, coordC );    coordC.y += 8; \\\n"
"blockC03 = isFirstColBlock ? (Dtype)0. : BLOCKC_READ8( _C, coordC ); \\\n"
"if (!ALPHA1) { \\\n"
"blockC00 = mad(blockAxB00, (Dtype8)alpha, blockC00); \\\n"
"blockC01 = mad(blockAxB01, (Dtype8)alpha, blockC01); \\\n"
"blockC02 = mad(blockAxB02, (Dtype8)alpha, blockC02); \\\n"
"blockC03 = mad(blockAxB03, (Dtype8)alpha, blockC03); \\\n"
"} else { \\\n"
"blockC00 += blockAxB00; \\\n"
"blockC01 += blockAxB01; \\\n"
"blockC02 += blockAxB02; \\\n"
"blockC03 += blockAxB03; \\\n"
"} \\\n"
"} \\\n"
"BLOCKC_WRITE8( _dst, coordDst, blockC00 );    coordDst.y += 8; \\\n"
"BLOCKC_WRITE8( _dst, coordDst, blockC01 );    coordDst.y += 8; \\\n"
"BLOCKC_WRITE8( _dst, coordDst, blockC02 );    coordDst.y += 8; \\\n"
"BLOCKC_WRITE8( _dst, coordDst, blockC03 );\n"
"#define TRANSPOSE_BLOCK_8( _block, _col )   \\\n"
"(Dtype8)( intel_sub_group_shuffle( _block.s0, _col ),   \\\n"
"intel_sub_group_shuffle( _block.s1, _col ),   \\\n"
"intel_sub_group_shuffle( _block.s2, _col ),   \\\n"
"intel_sub_group_shuffle( _block.s3, _col ),   \\\n"
"intel_sub_group_shuffle( _block.s4, _col ),   \\\n"
"intel_sub_group_shuffle( _block.s5, _col ),   \\\n"
"intel_sub_group_shuffle( _block.s6, _col ),   \\\n"
"intel_sub_group_shuffle( _block.s7, _col ) );\n"
"#if TYPE == TYPE_HALF\n"
"#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB00, _blockB01 )    \\\n"
"{   \\\n"
"const Dtype8    acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 );    \\\n"
"const Dtype8    acol1 = TRANSPOSE_BLOCK_8( _blockA, 1 );    \\\n"
"const Dtype8    acol2 = TRANSPOSE_BLOCK_8( _blockA, 2 );    \\\n"
"const Dtype8    acol3 = TRANSPOSE_BLOCK_8( _blockA, 3 );    \\\n"
"const Dtype8    acol4 = TRANSPOSE_BLOCK_8( _blockA, 4 );    \\\n"
"const Dtype8    acol5 = TRANSPOSE_BLOCK_8( _blockA, 5 );    \\\n"
"const Dtype8    acol6 = TRANSPOSE_BLOCK_8( _blockA, 6 );    \\\n"
"const Dtype8    acol7 = TRANSPOSE_BLOCK_8( _blockA, 7 );    \\\n"
"const Dtype8    acol8 = TRANSPOSE_BLOCK_8( _blockA, 8 );    \\\n"
"const Dtype8    acol9 = TRANSPOSE_BLOCK_8( _blockA, 9 );    \\\n"
"const Dtype8    acola = TRANSPOSE_BLOCK_8( _blockA, 10 );    \\\n"
"const Dtype8    acolb = TRANSPOSE_BLOCK_8( _blockA, 11 );    \\\n"
"const Dtype8    acolc = TRANSPOSE_BLOCK_8( _blockA, 12 );    \\\n"
"const Dtype8    acold = TRANSPOSE_BLOCK_8( _blockA, 13 );    \\\n"
"const Dtype8    acole = TRANSPOSE_BLOCK_8( _blockA, 14 );    \\\n"
"const Dtype8    acolf = TRANSPOSE_BLOCK_8( _blockA, 15 );    \\\n"
"_result = mad( (Dtype8)(_blockB00.s0), acol0, _result );      \\\n"
"_result = mad( (Dtype8)(_blockB00.s1), acol1, _result );      \\\n"
"_result = mad( (Dtype8)(_blockB00.s2), acol2, _result );      \\\n"
"_result = mad( (Dtype8)(_blockB00.s3), acol3, _result );      \\\n"
"_result = mad( (Dtype8)(_blockB00.s4), acol4, _result );      \\\n"
"_result = mad( (Dtype8)(_blockB00.s5), acol5, _result );      \\\n"
"_result = mad( (Dtype8)(_blockB00.s6), acol6, _result );      \\\n"
"_result = mad( (Dtype8)(_blockB00.s7), acol7, _result );      \\\n"
"_result = mad( (Dtype8)(_blockB01.s0), acol8, _result );      \\\n"
"_result = mad( (Dtype8)(_blockB01.s1), acol9, _result );      \\\n"
"_result = mad( (Dtype8)(_blockB01.s2), acola, _result );      \\\n"
"_result = mad( (Dtype8)(_blockB01.s3), acolb, _result );      \\\n"
"_result = mad( (Dtype8)(_blockB01.s4), acolc, _result );      \\\n"
"_result = mad( (Dtype8)(_blockB01.s5), acold, _result );      \\\n"
"_result = mad( (Dtype8)(_blockB01.s6), acole, _result );      \\\n"
"_result = mad( (Dtype8)(_blockB01.s7), acolf, _result );      \\\n"
"}\n"
"#else\n"
"#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB )    \\\n"
"{   \\\n"
"const Dtype8    acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 );    \\\n"
"const Dtype8    acol1 = TRANSPOSE_BLOCK_8( _blockA, 1 );    \\\n"
"const Dtype8    acol2 = TRANSPOSE_BLOCK_8( _blockA, 2 );    \\\n"
"const Dtype8    acol3 = TRANSPOSE_BLOCK_8( _blockA, 3 );    \\\n"
"const Dtype8    acol4 = TRANSPOSE_BLOCK_8( _blockA, 4 );    \\\n"
"const Dtype8    acol5 = TRANSPOSE_BLOCK_8( _blockA, 5 );    \\\n"
"const Dtype8    acol6 = TRANSPOSE_BLOCK_8( _blockA, 6 );    \\\n"
"const Dtype8    acol7 = TRANSPOSE_BLOCK_8( _blockA, 7 );    \\\n"
"_result = mad( (Dtype8)(_blockB.s0), acol0, _result );      \\\n"
"_result = mad( (Dtype8)(_blockB.s1), acol1, _result );      \\\n"
"_result = mad( (Dtype8)(_blockB.s2), acol2, _result );      \\\n"
"_result = mad( (Dtype8)(_blockB.s3), acol3, _result );      \\\n"
"_result = mad( (Dtype8)(_blockB.s4), acol4, _result );      \\\n"
"_result = mad( (Dtype8)(_blockB.s5), acol5, _result );      \\\n"
"_result = mad( (Dtype8)(_blockB.s6), acol6, _result );      \\\n"
"_result = mad( (Dtype8)(_blockB.s7), acol7, _result );      \\\n"
"}\n"
"#endif\n"
"#if TYPE == TYPE_HALF\n"
"#define GEMM_NN(ALPHA1, BETA_NOT0) \\\n"
"__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \\\n"
"__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \\\n"
"__kernel void TEMPLATE(gemm_32_1_NN_ ##ALPHA1 ##_ ##BETA_NOT0, Dtype)( \\\n"
"__read_only image2d_t A, \\\n"
"__read_only image2d_t B, \\\n"
"MATC_PARAMETER, \\\n"
"KERNEL_ARG_DTYPE alpha_in, \\\n"
"KERNEL_ARG_DTYPE beta_in, \\\n"
"int width0, \\\n"
"int isFirstColBlock) \\\n"
"{ \\\n"
"const Dtype alpha = (Dtype)alpha_in; \\\n"
"const Dtype beta = (Dtype)beta_in; \\\n"
"const int group_x = get_group_id(0); \\\n"
"const int group_y = get_group_id(1); \\\n"
"Dtype8 blockAxB00 = 0; \\\n"
"Dtype8 blockAxB01 = 0; \\\n"
"Dtype8 blockAxB02 = 0; \\\n"
"Dtype8 blockAxB03 = 0; \\\n"
"int2    coordA = (int2)( 0, group_y * TILE_M ); \\\n"
"int2    coordB = (int2)( ( group_x * TILE_N ) * SIZE_OF_ELEMENT, 0 ); \\\n"
"do \\\n"
"{  \\\n"
"int2    coordBTemp = coordB; \\\n"
"Dtype8  blockB00 = as_Dtype8( SUBGROUP_BLOCK_READ8( B, coordBTemp ) );    coordB.y += TILE_K; \\\n"
"Dtype8  blockB01 = as_Dtype8( SUBGROUP_BLOCK_READ8( B, coordBTemp ) );    coordB.y += TILE_K; \\\n"
"int2    coordATemp = coordA; \\\n"
"Dtype8  blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordATemp.y += 8; \\\n"
"Dtype8  blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordATemp.y += 8; \\\n"
"Dtype8  blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordATemp.y += 8; \\\n"
"Dtype8  blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordA.x += TILE_K * SIZE_OF_ELEMENT * 2; \\\n"
"MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00, blockB01 ); \\\n"
"MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01, blockB00, blockB01 ); \\\n"
"MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02, blockB00, blockB01 ); \\\n"
"MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03, blockB00, blockB01 ); \\\n"
"} \\\n"
"while( coordB.y < width0 ); \\\n"
"GEMM_OUTPUT(ALPHA1, BETA_NOT0);  \\\n"
"}\n"
"#else\n"
"#define GEMM_NN(ALPHA1, BETA_NOT0) \\\n"
"__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \\\n"
"__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \\\n"
"__kernel void TEMPLATE(gemm_32_1_NN_ ##ALPHA1 ##_ ##BETA_NOT0, Dtype)( \\\n"
"__read_only image2d_t A, \\\n"
"__read_only image2d_t B, \\\n"
"MATC_PARAMETER, \\\n"
"KERNEL_ARG_DTYPE alpha_in, \\\n"
"KERNEL_ARG_DTYPE beta_in, \\\n"
"int width0, \\\n"
"int isFirstColBlock) \\\n"
"{ \\\n"
"const Dtype alpha = (Dtype)alpha_in; \\\n"
"const Dtype beta = (Dtype)beta_in; \\\n"
"const int group_x = get_group_id(0); \\\n"
"const int group_y = get_group_id(1); \\\n"
"Dtype8 blockAxB00 = 0.0f; \\\n"
"Dtype8 blockAxB01 = 0.0f; \\\n"
"Dtype8 blockAxB02 = 0.0f; \\\n"
"Dtype8 blockAxB03 = 0.0f; \\\n"
"int2    coordA = (int2)( 0, group_y * TILE_M ); \\\n"
"int2    coordB = (int2)( ( group_x * TILE_N ) * SIZE_OF_ELEMENT, 0 ); \\\n"
"do \\\n"
"{  \\\n"
"int2    coordBTemp = coordB; \\\n"
"Dtype8  blockB00 = as_Dtype8( SUBGROUP_BLOCK_READ8( B, coordBTemp ) );    coordB.y += TILE_K; \\\n"
"int2    coordATemp = coordA; \\\n"
"Dtype8  blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordATemp.y += 8; \\\n"
"Dtype8  blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordATemp.y += 8; \\\n"
"Dtype8  blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordATemp.y += 8; \\\n"
"Dtype8  blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordA.x += TILE_K * SIZE_OF_ELEMENT; \\\n"
"MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00 ); \\\n"
"MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01, blockB00 ); \\\n"
"MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02, blockB00 ); \\\n"
"MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03, blockB00 ); \\\n"
"} \\\n"
"while( coordB.y < width0 ); \\\n"
"GEMM_OUTPUT(ALPHA1, BETA_NOT0); \\\n"
"}\n"
"#endif\n"
"GEMM_NN(1, 0)\n"
"GEMM_NN(1, 1)\n"
"GEMM_NN(0, 0)\n"
"GEMM_NN(0, 1)\n"
"#undef TRANSPOSE_BLOCK_8\n"
"#undef MULTIPLY_BLOCKS_8x8\n"
"#undef GEMM_NN\n"
"#define TRANSPOSE_BLOCK_8(_vec, _col) \\\n"
"(Dtype8)( intel_sub_group_shuffle(_vec, _col + 0), \\\n"
"intel_sub_group_shuffle(_vec, _col + 1), \\\n"
"intel_sub_group_shuffle(_vec, _col + 2), \\\n"
"intel_sub_group_shuffle(_vec, _col + 3), \\\n"
"intel_sub_group_shuffle(_vec, _col + 4), \\\n"
"intel_sub_group_shuffle(_vec, _col + 5), \\\n"
"intel_sub_group_shuffle(_vec, _col + 6), \\\n"
"intel_sub_group_shuffle(_vec, _col + 7) )\n"
"#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB, _col )    \\\n"
"{   \\\n"
"_result = mad( (Dtype8)(_blockB.s0), TRANSPOSE_BLOCK_8(_blockA.s0, _col), _result );      \\\n"
"_result = mad( (Dtype8)(_blockB.s1), TRANSPOSE_BLOCK_8(_blockA.s1, _col), _result );      \\\n"
"_result = mad( (Dtype8)(_blockB.s2), TRANSPOSE_BLOCK_8(_blockA.s2, _col), _result );      \\\n"
"_result = mad( (Dtype8)(_blockB.s3), TRANSPOSE_BLOCK_8(_blockA.s3, _col), _result );      \\\n"
"_result = mad( (Dtype8)(_blockB.s4), TRANSPOSE_BLOCK_8(_blockA.s4, _col), _result );      \\\n"
"_result = mad( (Dtype8)(_blockB.s5), TRANSPOSE_BLOCK_8(_blockA.s5, _col), _result );      \\\n"
"_result = mad( (Dtype8)(_blockB.s6), TRANSPOSE_BLOCK_8(_blockA.s6, _col), _result );      \\\n"
"_result = mad( (Dtype8)(_blockB.s7), TRANSPOSE_BLOCK_8(_blockA.s7, _col), _result );      \\\n"
"}\n"
"#if TYPE == TYPE_HALF\n"
"#define GEMM_TN(ALPHA1, BETA_NOT0) \\\n"
"__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \\\n"
"__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \\\n"
"__kernel void TEMPLATE(gemm_32_1_TN_ ##ALPHA1 ##_ ##BETA_NOT0,Dtype)( \\\n"
"__read_only image2d_t A, \\\n"
"__read_only image2d_t B, \\\n"
"MATC_PARAMETER, \\\n"
"KERNEL_ARG_DTYPE alpha_in, \\\n"
"KERNEL_ARG_DTYPE beta_in, \\\n"
"int width0, \\\n"
"int isFirstColBlock) \\\n"
"{ \\\n"
"const Dtype alpha = (Dtype)alpha_in; \\\n"
"const Dtype beta = (Dtype)beta_in; \\\n"
"const int group_x = get_group_id(0);\\\n"
"const int group_y = get_group_id(1);\\\n"
"Dtype8 blockAxB00 = 0;\\\n"
"Dtype8 blockAxB01 = 0;\\\n"
"Dtype8 blockAxB02 = 0;\\\n"
"Dtype8 blockAxB03 = 0;\\\n"
"int2    coordA = (int2)( group_y * TILE_M * SIZE_OF_ELEMENT, 0 );\\\n"
"int2    coordB = (int2)( ( group_x * TILE_N ) * SIZE_OF_ELEMENT, 0 );\\\n"
"do\\\n"
"{\\\n"
"int2    coordBTemp = coordB;\\\n"
"Dtype8 blockB00 = as_Dtype8( SUBGROUP_BLOCK_READ8( B, coordBTemp ) );    coordB.y += TILE_K;\\\n"
"int2    coordATemp = coordA;\\\n"
"Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordATemp.x += 16 * SIZE_OF_ELEMENT;\\\n"
"Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordA.y += TILE_K;\\\n"
"MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00, 0); \\\n"
"MULTIPLY_BLOCKS_8x8( blockAxB01, blockA00, blockB00, 8); \\\n"
"MULTIPLY_BLOCKS_8x8( blockAxB02, blockA01, blockB00, 0); \\\n"
"MULTIPLY_BLOCKS_8x8( blockAxB03, blockA01, blockB00, 8); \\\n"
"} \\\n"
"while( coordB.y < width0 ); \\\n"
"GEMM_OUTPUT(ALPHA1, BETA_NOT0); \\\n"
"}\n"
"#else\n"
"#define GEMM_TN(ALPHA1, BETA_NOT0) \\\n"
"__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \\\n"
"__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \\\n"
"__kernel void TEMPLATE(gemm_32_1_TN_ ##ALPHA1 ##_ ##BETA_NOT0,Dtype)( \\\n"
"__read_only image2d_t A, \\\n"
"__read_only image2d_t B, \\\n"
"MATC_PARAMETER, \\\n"
"KERNEL_ARG_DTYPE alpha_in, \\\n"
"KERNEL_ARG_DTYPE beta_in, \\\n"
"int width0, \\\n"
"int isFirstColBlock) \\\n"
"{ \\\n"
"const Dtype alpha = (Dtype)alpha_in; \\\n"
"const Dtype beta = (Dtype)beta_in; \\\n"
"const int group_x = get_group_id(0);\\\n"
"const int group_y = get_group_id(1);\\\n"
"Dtype8 blockAxB00 = 0.0f;\\\n"
"Dtype8 blockAxB01 = 0.0f;\\\n"
"Dtype8 blockAxB02 = 0.0f;\\\n"
"Dtype8 blockAxB03 = 0.0f;\\\n"
"int2    coordA = (int2)( group_y * TILE_M * SIZE_OF_ELEMENT, 0 );\\\n"
"int2    coordB = (int2)( ( group_x * TILE_N ) * SIZE_OF_ELEMENT, 0 );\\\n"
"do\\\n"
"{\\\n"
"int2    coordBTemp = coordB;\\\n"
"Dtype8 blockB00 = as_Dtype8( SUBGROUP_BLOCK_READ8( B, coordBTemp ) );    coordB.y += TILE_K;\\\n"
"int2    coordATemp = coordA;\\\n"
"Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordATemp.x += 8 * SIZE_OF_ELEMENT;\\\n"
"Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordATemp.x += 8 * SIZE_OF_ELEMENT;\\\n"
"Dtype8 blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordATemp.x += 8 * SIZE_OF_ELEMENT;\\\n"
"Dtype8 blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordA.y += TILE_K;\\\n"
"MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00, 0 ); \\\n"
"MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01, blockB00, 0 ); \\\n"
"MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02, blockB00, 0 ); \\\n"
"MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03, blockB00, 0 ); \\\n"
"} \\\n"
"while( coordB.y < width0 ); \\\n"
"GEMM_OUTPUT(ALPHA1, BETA_NOT0); \\\n"
"}\n"
"#endif\n"
"GEMM_TN(1, 0)\n"
"GEMM_TN(1, 1)\n"
"GEMM_TN(0, 0)\n"
"GEMM_TN(0, 1)\n"
"#undef MULTIPLY_BLOCKS_8x8\n"
"#undef TRANSPOSE_BLOCK_8\n"
"#undef GEMM_TN\n"
"#define TRANSPOSE_BLOCK_8( _block, _col )   \\\n"
"(Dtype8)( intel_sub_group_shuffle( _block.s0, _col),   \\\n"
"intel_sub_group_shuffle( _block.s1, _col),   \\\n"
"intel_sub_group_shuffle( _block.s2, _col),   \\\n"
"intel_sub_group_shuffle( _block.s3, _col),   \\\n"
"intel_sub_group_shuffle( _block.s4, _col),   \\\n"
"intel_sub_group_shuffle( _block.s5, _col),   \\\n"
"intel_sub_group_shuffle( _block.s6, _col),   \\\n"
"intel_sub_group_shuffle( _block.s7, _col) )\n"
"#if TYPE == TYPE_HALF\n"
"#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB )    \\\n"
"{   \\\n"
"const Dtype8    acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 );    \\\n"
"const Dtype8    acol1 = TRANSPOSE_BLOCK_8( _blockA, 1 );    \\\n"
"const Dtype8    acol2 = TRANSPOSE_BLOCK_8( _blockA, 2 );    \\\n"
"const Dtype8    acol3 = TRANSPOSE_BLOCK_8( _blockA, 3 );    \\\n"
"const Dtype8    acol4 = TRANSPOSE_BLOCK_8( _blockA, 4 );    \\\n"
"const Dtype8    acol5 = TRANSPOSE_BLOCK_8( _blockA, 5 );    \\\n"
"const Dtype8    acol6 = TRANSPOSE_BLOCK_8( _blockA, 6 );    \\\n"
"const Dtype8    acol7 = TRANSPOSE_BLOCK_8( _blockA, 7 );    \\\n"
"const Dtype8    acol8 = TRANSPOSE_BLOCK_8( _blockA, 8 );    \\\n"
"const Dtype8    acol9 = TRANSPOSE_BLOCK_8( _blockA, 9 );    \\\n"
"const Dtype8    acola = TRANSPOSE_BLOCK_8( _blockA, 10 );    \\\n"
"const Dtype8    acolb = TRANSPOSE_BLOCK_8( _blockA, 11 );    \\\n"
"const Dtype8    acolc = TRANSPOSE_BLOCK_8( _blockA, 12 );    \\\n"
"const Dtype8    acold = TRANSPOSE_BLOCK_8( _blockA, 13 );    \\\n"
"const Dtype8    acole = TRANSPOSE_BLOCK_8( _blockA, 14 );    \\\n"
"const Dtype8    acolf = TRANSPOSE_BLOCK_8( _blockA, 15 );    \\\n"
"_result = mad( (Dtype8)_blockB.s0, acol0, _result );      \\\n"
"_result = mad( (Dtype8)_blockB.s1, acol1, _result );      \\\n"
"_result = mad( (Dtype8)_blockB.s2, acol2, _result );      \\\n"
"_result = mad( (Dtype8)_blockB.s3, acol3, _result );      \\\n"
"_result = mad( (Dtype8)_blockB.s4, acol4, _result );      \\\n"
"_result = mad( (Dtype8)_blockB.s5, acol5, _result );      \\\n"
"_result = mad( (Dtype8)_blockB.s6, acol6, _result );      \\\n"
"_result = mad( (Dtype8)_blockB.s7, acol7, _result );      \\\n"
"_result = mad( (Dtype8)_blockB.s8, acol8, _result );      \\\n"
"_result = mad( (Dtype8)_blockB.s9, acol9, _result );      \\\n"
"_result = mad( (Dtype8)_blockB.sa, acola, _result );      \\\n"
"_result = mad( (Dtype8)_blockB.sb, acolb, _result );      \\\n"
"_result = mad( (Dtype8)_blockB.sc, acolc, _result );      \\\n"
"_result = mad( (Dtype8)_blockB.sd, acold, _result );      \\\n"
"_result = mad( (Dtype8)_blockB.se, acole, _result );      \\\n"
"_result = mad( (Dtype8)_blockB.sf, acolf, _result );      \\\n"
"}\n"
"#else\n"
"#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB )    \\\n"
"{   \\\n"
"const Dtype8    acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 );    \\\n"
"const Dtype8    acol1 = TRANSPOSE_BLOCK_8( _blockA, 1 );    \\\n"
"const Dtype8    acol2 = TRANSPOSE_BLOCK_8( _blockA, 2 );    \\\n"
"const Dtype8    acol3 = TRANSPOSE_BLOCK_8( _blockA, 3 );    \\\n"
"const Dtype8    acol4 = TRANSPOSE_BLOCK_8( _blockA, 4 );    \\\n"
"const Dtype8    acol5 = TRANSPOSE_BLOCK_8( _blockA, 5 );    \\\n"
"const Dtype8    acol6 = TRANSPOSE_BLOCK_8( _blockA, 6 );    \\\n"
"const Dtype8    acol7 = TRANSPOSE_BLOCK_8( _blockA, 7 );    \\\n"
"_result = mad( (Dtype8)_blockB.s0, acol0, _result );      \\\n"
"_result = mad( (Dtype8)_blockB.s1, acol1, _result );      \\\n"
"_result = mad( (Dtype8)_blockB.s2, acol2, _result );      \\\n"
"_result = mad( (Dtype8)_blockB.s3, acol3, _result );      \\\n"
"_result = mad( (Dtype8)_blockB.s4, acol4, _result );      \\\n"
"_result = mad( (Dtype8)_blockB.s5, acol5, _result );      \\\n"
"_result = mad( (Dtype8)_blockB.s6, acol6, _result );      \\\n"
"_result = mad( (Dtype8)_blockB.s7, acol7, _result );      \\\n"
"}\n"
"#endif\n"
"#if TYPE == TYPE_HALF\n"
"#define GEMM_NT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \\\n"
"__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \\\n"
"__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \\\n"
"__kernel void TEMPLATE(gemm_32_1_NT_ ##VECSCALAR ##_ ##ALPHA1 ##_ ##BETA_NOT0,Dtype)( \\\n"
"__read_only image2d_t A, \\\n"
"MATB_PARAMETER, \\\n"
"MATC_PARAMETER, \\\n"
"KERNEL_ARG_DTYPE alpha_in, \\\n"
"KERNEL_ARG_DTYPE beta_in, \\\n"
"int padded_k, \\\n"
"int k, \\\n"
"int isFirstColBlock) \\\n"
"{ \\\n"
"const Dtype alpha = (Dtype)alpha_in; \\\n"
"const Dtype beta = (Dtype)beta_in; \\\n"
"const int group_x = get_group_id(0); \\\n"
"const int group_y = get_group_id(1); \\\n"
"Dtype8 blockAxB00 = 0; \\\n"
"Dtype8 blockAxB01 = 0; \\\n"
"Dtype8 blockAxB02 = 0; \\\n"
"Dtype8 blockAxB03 = 0; \\\n"
"int2    coordA = (int2)( 0, group_y * TILE_M ); \\\n"
"int2    coordB = (int2)( 0, ( group_x * TILE_N )); \\\n"
"const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; \\\n"
"do \\\n"
"{ \\\n"
"Dtype16 blockB00; \\\n"
"BLOCKB_READ8(blockB00, B, coordB); \\\n"
"int2    coordATemp = coordA; \\\n"
"Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordATemp.y += 8; \\\n"
"Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordATemp.y += 8; \\\n"
"Dtype8 blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordATemp.y += 8; \\\n"
"Dtype8 blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordA.x += TILE_K * SIZE_OF_ELEMENT * 2; \\\n"
"MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00 ); \\\n"
"MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01, blockB00 ); \\\n"
"MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02, blockB00 ); \\\n"
"MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03, blockB00 ); \\\n"
"} \\\n"
"while( coordB.x < padded_k / VECSIZE ); \\\n"
"GEMM_OUTPUT(ALPHA1, BETA_NOT0); \\\n"
"}\n"
"#else\n"
"#define GEMM_NT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \\\n"
"__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \\\n"
"__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \\\n"
"__kernel void TEMPLATE(gemm_32_1_NT_ ##VECSCALAR ##_ ##ALPHA1 ##_ ##BETA_NOT0,Dtype)( \\\n"
"__read_only image2d_t A, \\\n"
"MATB_PARAMETER, \\\n"
"MATC_PARAMETER, \\\n"
"KERNEL_ARG_DTYPE alpha_in, \\\n"
"KERNEL_ARG_DTYPE beta_in, \\\n"
"int padded_k, \\\n"
"int k, \\\n"
"int isFirstColBlock) \\\n"
"{ \\\n"
"const Dtype alpha = (Dtype)alpha_in; \\\n"
"const Dtype beta = (Dtype)beta_in; \\\n"
"const int group_x = get_group_id(0); \\\n"
"const int group_y = get_group_id(1); \\\n"
"Dtype8 blockAxB00 = 0.0f; \\\n"
"Dtype8 blockAxB01 = 0.0f; \\\n"
"Dtype8 blockAxB02 = 0.0f; \\\n"
"Dtype8 blockAxB03 = 0.0f; \\\n"
"int2    coordA = (int2)( 0, group_y * TILE_M ); \\\n"
"int2    coordB = (int2)( 0, ( group_x * TILE_N )); \\\n"
"const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; \\\n"
"do \\\n"
"{ \\\n"
"Dtype8 blockB00;  \\\n"
"BLOCKB_READ8(blockB00, B, coordB); \\\n"
"int2    coordATemp = coordA; \\\n"
"Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordATemp.y += 8; \\\n"
"Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordATemp.y += 8; \\\n"
"Dtype8 blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordATemp.y += 8; \\\n"
"Dtype8 blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordA.x += TILE_K * SIZE_OF_ELEMENT; \\\n"
"MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00 ); \\\n"
"MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01, blockB00 ); \\\n"
"MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02, blockB00 ); \\\n"
"MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03, blockB00 ); \\\n"
"} \\\n"
"while( coordB.x < padded_k / VECSIZE ); \\\n"
"GEMM_OUTPUT(ALPHA1, BETA_NOT0); \\\n"
"}\n"
"#endif\n"
"#if TYPE == TYPE_HALF\n"
"#define BLOCKB_READ8(_blockb, _B, _coordB) \\\n"
"int2 _coordBTemp = _coordB; \\\n"
"_coordBTemp.y += get_local_id(0); \\\n"
"_blockb.s0123 = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n"
"_blockb.s4567 = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n"
"_blockb.s89ab = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n"
"_blockb.scdef = READ_IMAGE(_B, _coordBTemp); _coordB.x += 4;\n"
"#else\n"
"#define BLOCKB_READ8(_blockb, _B, _coordB) \\\n"
"int2 _coordBTemp = _coordB; \\\n"
"_coordBTemp.y += get_local_id(0); \\\n"
"_blockb.s0123 = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n"
"_blockb.s4567 = READ_IMAGE(_B, _coordBTemp); _coordB.x += 2;\n"
"#endif\n"
"#define MATB_PARAMETER __read_only image2d_t B\n"
"GEMM_NT(1, 0, VEC4, 4)\n"
"GEMM_NT(1, 1, VEC4, 4)\n"
"GEMM_NT(0, 0, VEC4, 4)\n"
"GEMM_NT(0, 1, VEC4, 4)\n"
"#undef BLOCKB_READ8\n"
"#undef MATB_PARAMETER\n"
"#if TYPE == TYPE_HALF\n"
"#define BLOCKB_READ8(_blockb, _B, _coordB) \\\n"
"int2 _coordBTemp = _coordB; \\\n"
"_coordBTemp.y += get_local_id(0); \\\n"
"const __global float *B_read = (__global float *)(_B + (_coordBTemp.y * ldb) + _coordBTemp.x + offB); \\\n"
"_blockb = as_Dtype16(as_ushort16(vload8(0, B_read))); \\\n"
"_coordB.x += TILE_K * 2;\n"
"#else\n"
"#define BLOCKB_READ8(_blockb, _B, _coordB) \\\n"
"int2 _coordBTemp = _coordB; \\\n"
"_coordBTemp.y += get_local_id(0); \\\n"
"const __global Dtype *B_read = (__global Dtype *)(_B + (_coordBTemp.y * ldb) + _coordBTemp.x + offB); \\\n"
"_blockb = vload8(0, B_read); \\\n"
"_coordB.x += TILE_K;\n"
"#endif\n"
"#define MATB_PARAMETER __global Dtype *B, int offB, int ldb\n"
"GEMM_NT(1, 0, BUFFER, 1)\n"
"GEMM_NT(1, 1, BUFFER, 1)\n"
"GEMM_NT(0, 0, BUFFER, 1)\n"
"GEMM_NT(0, 1, BUFFER, 1)\n"
"#undef BLOCKB_READ8\n"
"#undef MATB_PARAMETER\n"
"#if TYPE == TYPE_HALF\n"
"#define BLOCKB_READ8(_blockb, _B, _coordB) \\\n"
"int2 _coordBTemp = _coordB; \\\n"
"_coordBTemp.y += get_local_id(0); \\\n"
"Dtype4 temp; \\\n"
"temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n"
"_blockb.s0 = temp.s0; \\\n"
"temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n"
"_blockb.s1 = temp.s0; \\\n"
"temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n"
"_blockb.s2 = temp.s0; \\\n"
"temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n"
"_blockb.s3 = temp.s0; \\\n"
"temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n"
"_blockb.s4 = temp.s0; \\\n"
"temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n"
"_blockb.s5 = temp.s0; \\\n"
"temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n"
"_blockb.s6 = temp.s0; \\\n"
"temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n"
"_blockb.s7 = temp.s0; \\\n"
"temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n"
"_blockb.s8 = temp.s0; \\\n"
"temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n"
"_blockb.s9 = temp.s0; \\\n"
"temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n"
"_blockb.sa = temp.s0; \\\n"
"temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n"
"_blockb.sb = temp.s0; \\\n"
"temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n"
"_blockb.sc = temp.s0; \\\n"
"temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n"
"_blockb.sd = temp.s0; \\\n"
"temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n"
"_blockb.se = temp.s0; \\\n"
"temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n"
"_blockb.sf = temp.s0; \\\n"
"_coordB.x += 16;\n"
"#else\n"
"#define BLOCKB_READ8(_blockb, _B, _coordB) \\\n"
"int2 _coordBTemp = _coordB; \\\n"
"_coordBTemp.y += get_local_id(0); \\\n"
"Dtype4 temp; \\\n"
"temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n"
"_blockb.s0 = temp.s0; \\\n"
"temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n"
"_blockb.s1 = temp.s0; \\\n"
"temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n"
"_blockb.s2 = temp.s0; \\\n"
"temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n"
"_blockb.s3 = temp.s0; \\\n"
"temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n"
"_blockb.s4 = temp.s0; \\\n"
"temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n"
"_blockb.s5 = temp.s0; \\\n"
"temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n"
"_blockb.s6 = temp.s0; \\\n"
"temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n"
"_blockb.s7 = temp.s0; \\\n"
"_coordB.x += 8;\n"
"#endif\n"
"#define MATB_PARAMETER __read_only image2d_t B\n"
"GEMM_NT(1, 0, SCALAR, 1)\n"
"GEMM_NT(1, 1, SCALAR, 1)\n"
"GEMM_NT(0, 0, SCALAR, 1)\n"
"GEMM_NT(0, 1, SCALAR, 1)\n"
"#undef BLOCKB_READ8\n"
"#undef MATB_PARAMETER\n"
"#undef MULTIPLY_BLOCKS_8x8\n"
"#undef TRANSPOSE_BLOCK_8\n"
"#undef GEMM_NT\n"
"#define TRANSPOSE_BLOCK_8(_vec, _col) \\\n"
"(Dtype8)( intel_sub_group_shuffle(_vec, _col + 0), \\\n"
"intel_sub_group_shuffle(_vec, _col + 1), \\\n"
"intel_sub_group_shuffle(_vec, _col + 2), \\\n"
"intel_sub_group_shuffle(_vec, _col + 3), \\\n"
"intel_sub_group_shuffle(_vec, _col + 4), \\\n"
"intel_sub_group_shuffle(_vec, _col + 5), \\\n"
"intel_sub_group_shuffle(_vec, _col + 6), \\\n"
"intel_sub_group_shuffle(_vec, _col + 7) );\n"
"#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB, _col )    \\\n"
"{   \\\n"
"const Dtype8    acol0 = TRANSPOSE_BLOCK_8( _blockA.s0, _col );    \\\n"
"const Dtype8    acol1 = TRANSPOSE_BLOCK_8( _blockA.s1, _col );    \\\n"
"const Dtype8    acol2 = TRANSPOSE_BLOCK_8( _blockA.s2, _col );    \\\n"
"const Dtype8    acol3 = TRANSPOSE_BLOCK_8( _blockA.s3, _col );    \\\n"
"const Dtype8    acol4 = TRANSPOSE_BLOCK_8( _blockA.s4, _col );    \\\n"
"const Dtype8    acol5 = TRANSPOSE_BLOCK_8( _blockA.s5, _col );    \\\n"
"const Dtype8    acol6 = TRANSPOSE_BLOCK_8( _blockA.s6, _col );    \\\n"
"const Dtype8    acol7 = TRANSPOSE_BLOCK_8( _blockA.s7, _col );    \\\n"
"_result = mad( (Dtype8)_blockB.s0, acol0, _result );      \\\n"
"_result = mad( (Dtype8)_blockB.s1, acol1, _result );      \\\n"
"_result = mad( (Dtype8)_blockB.s2, acol2, _result );      \\\n"
"_result = mad( (Dtype8)_blockB.s3, acol3, _result );      \\\n"
"_result = mad( (Dtype8)_blockB.s4, acol4, _result );      \\\n"
"_result = mad( (Dtype8)_blockB.s5, acol5, _result );      \\\n"
"_result = mad( (Dtype8)_blockB.s6, acol6, _result );      \\\n"
"_result = mad( (Dtype8)_blockB.s7, acol7, _result );      \\\n"
"}\n"
"#if TYPE == TYPE_HALF\n"
"#define GEMM_TT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \\\n"
"__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \\\n"
"__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \\\n"
"__kernel void TEMPLATE(gemm_32_1_TT_ ##VECSCALAR ##_ ##ALPHA1 ##_ ##BETA_NOT0, Dtype)( \\\n"
"__read_only image2d_t A, \\\n"
"MATB_PARAMETER, \\\n"
"MATC_PARAMETER, \\\n"
"KERNEL_ARG_DTYPE alpha_in, \\\n"
"KERNEL_ARG_DTYPE beta_in, \\\n"
"int padded_k, \\\n"
"int k, \\\n"
"int isFirstColBlock) \\\n"
"{ \\\n"
"const Dtype alpha = (Dtype)alpha_in; \\\n"
"const Dtype beta = (Dtype)beta_in; \\\n"
"const int group_x = get_group_id(0); \\\n"
"const int group_y = get_group_id(1); \\\n"
"Dtype8 blockAxB00 = 0; \\\n"
"Dtype8 blockAxB01 = 0; \\\n"
"Dtype8 blockAxB02 = 0; \\\n"
"Dtype8 blockAxB03 = 0; \\\n"
"int2    coordA = (int2)( group_y * TILE_M * SIZE_OF_ELEMENT, 0 ); \\\n"
"int2    coordB = (int2)( 0, ( group_x * TILE_N )); \\\n"
"const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; \\\n"
"do \\\n"
"{ \\\n"
"Dtype8 blockB00;             \\\n"
"BLOCKB_READ8(blockB00, B, coordB); \\\n"
"int2    coordATemp = coordA; \\\n"
"Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordATemp.x += 16 * SIZE_OF_ELEMENT;\\\n"
"Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordA.y += TILE_K;\\\n"
"MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00, 0); \\\n"
"MULTIPLY_BLOCKS_8x8( blockAxB01, blockA00, blockB00, 8); \\\n"
"MULTIPLY_BLOCKS_8x8( blockAxB02, blockA01, blockB00, 0); \\\n"
"MULTIPLY_BLOCKS_8x8( blockAxB03, blockA01, blockB00, 8); \\\n"
"} \\\n"
"while( coordB.x < padded_k / VECSIZE ); \\\n"
"GEMM_OUTPUT(ALPHA1, BETA_NOT0);\\\n"
"}\n"
"#else\n"
"#define GEMM_TT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \\\n"
"__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \\\n"
"__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \\\n"
"__kernel void TEMPLATE(gemm_32_1_TT_ ##VECSCALAR ##_ ##ALPHA1 ##_ ##BETA_NOT0, Dtype)( \\\n"
"__read_only image2d_t A, \\\n"
"MATB_PARAMETER, \\\n"
"MATC_PARAMETER, \\\n"
"KERNEL_ARG_DTYPE alpha_in, \\\n"
"KERNEL_ARG_DTYPE beta_in, \\\n"
"int padded_k, \\\n"
"int k, \\\n"
"int isFirstColBlock) \\\n"
"{ \\\n"
"const Dtype alpha = (Dtype)alpha_in; \\\n"
"const Dtype beta = (Dtype)beta_in; \\\n"
"const int group_x = get_group_id(0); \\\n"
"const int group_y = get_group_id(1); \\\n"
"Dtype8 blockAxB00 = 0.0f; \\\n"
"Dtype8 blockAxB01 = 0.0f; \\\n"
"Dtype8 blockAxB02 = 0.0f; \\\n"
"Dtype8 blockAxB03 = 0.0f; \\\n"
"int2    coordA = (int2)( group_y * TILE_M * SIZE_OF_ELEMENT, 0 ); \\\n"
"int2    coordB = (int2)( 0, ( group_x * TILE_N )); \\\n"
"const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; \\\n"
"do \\\n"
"{ \\\n"
"Dtype8 blockB00;             \\\n"
"BLOCKB_READ8(blockB00, B, coordB); \\\n"
"int2    coordATemp = coordA; \\\n"
"Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordATemp.x += 8 * SIZE_OF_ELEMENT; \\\n"
"Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordATemp.x += 8 * SIZE_OF_ELEMENT; \\\n"
"Dtype8 blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordATemp.x += 8 * SIZE_OF_ELEMENT; \\\n"
"Dtype8 blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordA.y += TILE_K; \\\n"
"MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00 , blockB00, 0 ); \\\n"
"MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01 , blockB00, 0 ); \\\n"
"MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02 , blockB00, 0 ); \\\n"
"MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03 , blockB00, 0 ); \\\n"
"} \\\n"
"while( coordB.x < padded_k / VECSIZE ); \\\n"
"GEMM_OUTPUT(ALPHA1, BETA_NOT0);\\\n"
"}\n"
"#endif\n"
"#define BLOCKB_READ8(_blockb, _B, _coordB) \\\n"
"int2 _coordBTemp = _coordB; \\\n"
"_coordBTemp.y += get_local_id(0); \\\n"
"_blockb.s0123 = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \\\n"
"_blockb.s4567 = READ_IMAGE(_B, _coordBTemp); _coordB.x += 2;\n"
"#define MATB_PARAMETER __read_only image2d_t B\n"
"GEMM_TT(1, 0, VEC4, 4)\n"
"GEMM_TT(1, 1, VEC4, 4)\n"
"GEMM_TT(0, 0, VEC4, 4)\n"
"GEMM_TT(0, 1, VEC4, 4)\n"
"#undef BLOCKB_READ8\n"
"#undef MATB_PARAMETER\n"
"#if TYPE == TYPE_HALF\n"
"#define BLOCKB_READ8(_blockb, _B, _coordB) \\\n"
"int2 _coordBTemp = _coordB; \\\n"
"_coordBTemp.y += get_local_id(0); \\\n"
"const __global float *B_read = (__global float *)(_B + (_coordBTemp.y * k) + _coordBTemp.x + offB); \\\n"
"_blockb = as_Dtype8(as_ushort8(vload4(0, B_read))); \\\n"
"_coordB.x += TILE_K;\n"
"#else\n"
"#define BLOCKB_READ8(_blockb, _B, _coordB) \\\n"
"int2 _coordBTemp = _coordB; \\\n"
"_coordBTemp.y += get_local_id(0); \\\n"
"const __global Dtype *B_read = (__global Dtype *)(_B + (_coordBTemp.y * k) + _coordBTemp.x + offB); \\\n"
"_blockb = vload8(0, B_read); \\\n"
"_coordB.x += TILE_K;\n"
"#endif\n"
"#define MATB_PARAMETER __global Dtype *B, int offB, int ldb\n"
"GEMM_TT(1, 0, BUFFER, 1)\n"
"GEMM_TT(1, 1, BUFFER, 1)\n"
"GEMM_TT(0, 0, BUFFER, 1)\n"
"GEMM_TT(0, 1, BUFFER, 1)\n"
"#undef BLOCKB_READ8\n"
"#undef MATB_PARAMETER\n"
"#define BLOCKB_READ8(_blockb, _B, _coordB) \\\n"
"int2 _coordBTemp = _coordB; \\\n"
"_coordBTemp.y += get_local_id(0); \\\n"
"Dtype4 temp; \\\n"
"temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \\\n"
"_blockb.s0 = temp.s0; \\\n"
"temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \\\n"
"_blockb.s1 = temp.s0; \\\n"
"temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \\\n"
"_blockb.s2 = temp.s0; \\\n"
"temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \\\n"
"_blockb.s3 = temp.s0; \\\n"
"temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \\\n"
"_blockb.s4 = temp.s0; \\\n"
"temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \\\n"
"_blockb.s5 = temp.s0; \\\n"
"temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \\\n"
"_blockb.s6 = temp.s0; \\\n"
"temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \\\n"
"_blockb.s7 = temp.s0; \\\n"
"_coordB.x += 8;\n"
"#define MATB_PARAMETER __read_only image2d_t B\n"
"GEMM_TT(1, 0, SCALAR, 1)\n"
"GEMM_TT(1, 1, SCALAR, 1)\n"
"GEMM_TT(0, 0, SCALAR, 1)\n"
"GEMM_TT(0, 1, SCALAR, 1)\n"
"#undef BLOCKB_READ8\n"
"#undef MATB_PARAMETER\n"
"#undef MULTIPLY_BLOCKS_8x8\n"
"#undef TRANSPOSE_BLOCK_8\n"
"#undef GEMM_TT\n"
"#undef TILE_M\n"
"#undef TILE_K\n"
"#undef TILE_N\n"
"#undef SUBGROUP_BLOCK_READ8\n"
"#undef READ_IMAGE\n"
"#undef SIZE_OF_ELEMENT\n"
"__kernel void TEMPLATE(gemm_buffer_copy_image_transpose, Dtype)(\n"
"__global Dtype* A,\n"
"__write_only image2d_t ImA,\n"
"int offA,\n"
"int width,\n"
"int height,\n"
"int ldA)\n"
"{\n"
"const int gidx = get_global_id(0);\n"
"const int gidy = get_global_id(1);\n"
"if (gidx >= width || gidy >= height)\n"
"return;\n"
"int2 coord_dst = (int2)(gidx, gidy);\n"
"__global Dtype* A_off = A + offA;\n"
"Dtype srcA = A_off[gidy * ldA + gidx];\n"
"#if TYPE == TYPE_HALF\n"
"write_imageh(ImA, coord_dst, (Dtype4)srcA);\n"
"#else\n"
"write_imagef(ImA, coord_dst, (Dtype4)srcA);\n"
"#endif\n"
"}\n"
"__kernel void TEMPLATE(gemm_buffer_copy_image_no_transpose, Dtype)(\n"
"__global Dtype* A,\n"
"__write_only image2d_t ImA,\n"
"int offA,\n"
"int padded_width,\n"
"int padded_height,\n"
"int width,\n"
"int height,\n"
"int ldA)\n"
"{\n"
"const int gidx = get_global_id(0);\n"
"const int gidy = get_global_id(1);\n"
"if (gidx >= padded_width || gidy >= padded_height)\n"
"return;\n"
"int2 coord_dst = (int2)(gidx, gidy);\n"
"#if TYPE == TYPE_HALF\n"
"if (gidx >= width || gidy >= height) {\n"
"write_imageh(ImA, coord_dst, 0);\n"
"return;\n"
"}\n"
"__global Dtype* A_off = A + offA;\n"
"write_imageh(ImA, coord_dst, A_off[gidy * ldA + gidx]);\n"
"#else\n"
"if (gidx >= width || gidy >= height) {\n"
"write_imageui(ImA, coord_dst, (uint4)0);\n"
"return;\n"
"}\n"
"__global Dtype* A_off = A + offA;\n"
"uint4 srcA = convert_uint4(as_uchar4(A_off[gidy * ldA + gidx]));\n"
"write_imageui(ImA, coord_dst, srcA);\n"
"#endif\n"
"}\n"
, "3b788ad998ad54977a6af81e04e13c15", NULL};
// Embedded OpenCL program im2col, registered under moduleName. The adjacent string
// literals below are concatenated into the kernel text, so they are data, not host code.
//
// Kernel im2col: each work-item takes one index below height_out * width_out * channels,
// splits it into (c_inp, i_out, j_out) and copies the kernel_h * kernel_w input samples of
// that window into im_col, one sample every height_out * width_out elements. Taps that
// fall outside the height_inp x width_inp image are written as 0.
//
// T is not defined in this text; presumably it is supplied through the build options
// - TODO confirm against the host code that compiles this program.
// NOTE(review): the 32-hex-digit string after the source looks like a digest of the text.
// Prefer editing the originating kernel file and regenerating over editing here - confirm.
struct cv::ocl::internal::ProgramEntry im2col_oclsrc={moduleName, "im2col",
"__kernel void im2col(__global const T *im_src, int im_src_offset,\n"
"int channels, int height_inp, int width_inp,\n"
"int kernel_h, int kernel_w, int pad_h, int pad_w, int stride_h, int stride_w,\n"
"int height_out, int width_out,\n"
"__global T *im_col, int im_col_offset\n"
")\n"
"{\n"
"int index = get_global_id(0);\n"
// Work-items past the last (channel, row, column) triple do nothing.
"if (index >= height_out * width_out * channels)\n"
"return;\n"
"int j_out = index % width_out;\n"
"int i_out = (index / width_out) % height_out;\n"
"int c_inp = (index / width_out) / height_out;\n"
"int c_out = c_inp * kernel_h * kernel_w;\n"
// Top-left tap of the window; both values go negative when the window starts in the padding.
"int i_inp = i_out * stride_h - pad_h;\n"
"int j_inp = j_out * stride_w - pad_w;\n"
// im_src may point outside the image after this adjustment; it is only dereferenced
// in the branch of the bounds check below that has verified the tap is inside.
"im_src += (c_inp * height_inp + i_inp) * width_inp + j_inp + im_src_offset;\n"
"im_col += (c_out * height_out + i_out) * width_out + j_out + im_col_offset;\n"
"for (int ki = 0; ki < kernel_h; ++ki)\n"
"for (int kj = 0; kj < kernel_w; ++kj) {\n"
"int i = i_inp + ki;\n"
"int j = j_inp + kj;\n"
"*im_col = (i >= 0 && j >= 0 && i < height_inp && j < width_inp) ?\n"
"im_src[ki * width_inp + kj] : 0;\n"
// Consecutive taps of one window are height_out * width_out elements apart in im_col.
"im_col += height_out * width_out;\n"
"}\n"
"}\n"
, "609f199a321eef4535e1eff3ab281090", NULL};
// Embedded OpenCL program lrn, registered under moduleName. The adjacent string literals
// below (license header included) are concatenated into the kernel text; they are data.
//
// Kernels in this program:
//   LRNComputeOutput - out[index] = in[index] * pow(scale[index], negative_beta) for every
//                      index below nthreads, starting at the global id and stepping by the
//                      global size.
//   LRNFillScale     - for each (n, h, w) position, keeps a running sum of squared inputs
//                      over a window of size channels and stores
//                      k + accum_scale * alpha_over_size into scale for each channel.
//
// T is not defined in this text; presumably it is supplied through the build options
// - TODO confirm against the host code that compiles this program.
// NOTE(review): the 32-hex-digit string after the source looks like a digest of the text.
// Prefer editing the originating kernel file and regenerating over editing here - confirm.
struct cv::ocl::internal::ProgramEntry lrn_oclsrc={moduleName, "lrn",
"/*************************************************************************************\n"
"* Copyright (c) 2015, Advanced Micro Devices, Inc.\n"
"* All rights reserved.\n"
"*\n"
"* Redistribution and use in source and binary forms, with or without modification,\n"
"* are permitted provided that the following conditions are met:\n"
"*\n"
"* 1. Redistributions of source code must retain the above copyright notice, this\n"
"* list of conditions and the following disclaimer.\n"
"*\n"
"* 2. Redistributions in binary form must reproduce the above copyright notice,\n"
"* this list of conditions and the following disclaimer in the documentation and/or\n"
"*  other materials provided with the distribution.\n"
"*\n"
"* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND\n"
"* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED\n"
"* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.\n"
"* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,\n"
"* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,\n"
"* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,\n"
"* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,\n"
"* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)\n"
"* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE\n"
"* POSSIBILITY OF SUCH DAMAGE.\n"
"**************************************************************************************/\n"
"__kernel void LRNComputeOutput(const int nthreads, __global T* in, __global T* scale, const T negative_beta, __global T* out) {\n"
"int index = get_global_id(0);\n"
"int tmp = get_global_size(0);\n"
"for(index; index < nthreads; index += tmp)\n"
"out[index] = in[index] * pow(scale[index], negative_beta);\n"
"}\n"
"__kernel void LRNFillScale(const int nthreads, __global T* in, const int num, const int channels, const int height, const int width, const int size, const T alpha_over_size, const T k, __global T* scale) {\n"
"int index = get_global_id(0);\n"
"int tmp = get_global_size(0);\n"
"for(index; index < nthreads; index += tmp) {\n"
"const int w = index % width;\n"
"const int h = (index / width) % height;\n"
"const int n = index / width / height;\n"
"const int offset = (n * channels * height + h) * width + w;\n"
// Distance between the same (h, w) position in two consecutive channels.
"const int step = height * width;\n"
// NOTE(review): in and scale are the kernel parameters themselves and are reassigned
// inside the loop, so a work-item that runs a second iteration adds the new offset on
// top of the previous one. This is only harmless if the launch guarantees at most one
// iteration per work-item - confirm against the host launch configuration.
"in = in + offset;\n"
"scale = scale + offset;\n"
"int head = 0;\n"
"const int pre_pad = (size - 1) / 2;\n"
"const int post_pad = size - pre_pad - 1;\n"
"T accum_scale = 0;\n"
// Phase 1: accumulate the first post_pad channels without emitting any output.
"while (head < post_pad && head < channels) {\n"
"accum_scale += in[head * step] * in[head * step];\n"
"++head;\n"
"}\n"
// Phase 2: add the entering channel, drop the channel that left the window (once
// head - size is non-negative), and write the value for channel head - post_pad.
"while (head < channels) {\n"
"accum_scale += in[head * step] * in[head * step];\n"
"if (head - size >= 0) {\n"
"accum_scale -= in[(head - size) * step]\n"
"* in[(head - size) * step];\n"
"}\n"
"scale[(head - post_pad) * step] = k + accum_scale * alpha_over_size;\n"
"++head;\n"
"}\n"
// Phase 3: no channels left to add; keep dropping and write the remaining outputs.
"while (head < channels + post_pad) {\n"
"if (head - size >= 0) {\n"
"accum_scale -= in[(head - size) * step]\n"
"* in[(head - size) * step];\n"
"}\n"
"scale[(head - post_pad) * step] = k + accum_scale * alpha_over_size;\n"
"++head;\n"
"}\n"
"}\n"
"}\n"
, "0c65eb40713b6261f88bfa6731e32733", NULL};
// Embedded OpenCL program math, registered under moduleName. The adjacent string literals
// below are concatenated into the kernel text; they are data, not host code.
//
// Kernel TEMPLATE(axpy,Dtype): for every index below n, starting at the global id and
// stepping by the global size, stores convert_Dtype(alpha) * x[offx + index] + y[offy + index]
// back into y[offy + index]. The kernel name is built by pasting axpy, an underscore and
// the expansion of Dtype through the CONCAT / TEMPLATE macros.
//
// Dtype and convert_Dtype are not defined in this text; presumably they are supplied
// through the build options - TODO confirm against the host code.
// NOTE(review): the 32-hex-digit string after the source looks like a digest of the text.
// Prefer editing the originating kernel file and regenerating over editing here - confirm.
struct cv::ocl::internal::ProgramEntry math_oclsrc={moduleName, "math",
// The fp16 extension is enabled only when the compiler reports cl_khr_fp16.
"#if defined(cl_khr_fp16)\n"
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
"#endif\n"
"#define CONCAT(A,B) A##_##B\n"
"#define TEMPLATE(name,type) CONCAT(name,type)\n"
// Scalar kernel arguments are always declared as float, independent of Dtype.
"#define KERNEL_ARG_DTYPE float\n"
"__kernel void TEMPLATE(axpy,Dtype)(const int n, const KERNEL_ARG_DTYPE alpha, __global const Dtype* x,\n"
"const int offx, __global Dtype* y,\n"
"const int offy) {\n"
"for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n"
"Dtype src = x[offx + index];\n"
"Dtype dst = y[offy + index];\n"
"y[offy + index] = convert_Dtype(alpha) * src + dst;\n"
"}\n"
"}\n"
, "a76839299bc739767433b6d55915e1b7", NULL};
// Embedded OpenCL program matvec_mul, registered under moduleName. The adjacent string
// literals below are concatenated into the kernel text; they are data, not host code.
//
// Kernels in this program (names are pasted with the expansion of Dtype via TEMPLATE):
//   matvec_mul4 - each work-group handles four consecutive rows of A, starting at row
//                 4 * group id, and writes one Dtype4 of dot products with v.
//   matvec_mul1 - each work-group handles the single row row_offset + group id and writes
//                 one Dtype.
// In both kernels the work-items of a group stride over 4-wide column chunks, store their
// partial sums in the local buffer work, and reduce that buffer by repeated halving. The
// result is alpha * sum, plus beta * previous result when beta is non-zero.
//
// Dtype, Dtype4 and convert_Dtype are not defined in this text; presumably they are
// supplied through the build options - TODO confirm against the host code.
// NOTE(review): the halving reduction reads work[lid + stride] with stride starting at
// get_local_size(0) / 2, which looks like it assumes a power-of-two local size - confirm.
// NOTE(review): the 32-hex-digit string after the source looks like a digest of the text.
// Prefer editing the originating kernel file and regenerating over editing here - confirm.
struct cv::ocl::internal::ProgramEntry matvec_mul_oclsrc={moduleName, "matvec_mul",
"#if defined(cl_khr_fp16)\n"
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
"#endif\n"
"#define CONCAT(A,B) A##_##B\n"
"#define TEMPLATE(name,type) CONCAT(name,type)\n"
"#define KERNEL_ARG_DTYPE float\n"
"__kernel void TEMPLATE(matvec_mul4,Dtype)(\n"
"__global const Dtype * A,\n"
"int offA,\n"
"unsigned int A_col_size,\n"
"unsigned int trail_item,\n"
"__global const Dtype * v,\n"
"int offv,\n"
"KERNEL_ARG_DTYPE alpha,\n"
"KERNEL_ARG_DTYPE beta,\n"
"__global Dtype4* result,\n"
"int offr,\n"
"__local Dtype4* work)\n"
"{\n"
"unsigned int row_gid = get_group_id(0);\n"
"unsigned int lid = get_local_id(0);\n"
"const __global Dtype *src0_read = A + row_gid * 4 * A_col_size + offA;\n"
"const __global Dtype *src1_read = v + offv;\n"
// offr is applied in units of Dtype, not Dtype4, hence the round trip through a Dtype pointer.
"result = (__global Dtype4*)((__global Dtype*)result + offr);\n"
"Dtype4 dot0 = (Dtype4)(0.f);\n"
"Dtype4 dot1 = (Dtype4)(0.f);\n"
"Dtype4 dot2 = (Dtype4)(0.f);\n"
"Dtype4 dot3 = (Dtype4)(0.f);\n"
"unsigned int i = lid;\n"
// Each work-item visits 4-wide column chunks lid, lid + local size, and so on.
"while( i < A_col_size / 4) {\n"
"const Dtype4 a0 = vload4(i, src0_read);\n"
"const Dtype4 a1 = vload4(i, src0_read + A_col_size);\n"
"const Dtype4 a2 = vload4(i, src0_read + 2 * A_col_size);\n"
"const Dtype4 a3 = vload4(i, src0_read + 3 * A_col_size);\n"
"const Dtype4 b0 = vload4(i, src1_read);\n"
"dot0 += a0 * b0;\n"
"dot1 += a1 * b0;\n"
"dot2 += a2 * b0;\n"
"dot3 += a3 * b0;\n"
"i += get_local_size(0);\n"
"}\n"
// Horizontal sum of each accumulator gives this work-item's partial result per row.
"work[lid].s0 = dot0.x + dot0.y + dot0.z + dot0.w;\n"
"work[lid].s1 = dot1.x + dot1.y + dot1.z + dot1.w;\n"
"work[lid].s2 = dot2.x + dot2.y + dot2.z + dot2.w;\n"
"work[lid].s3 = dot3.x + dot3.y + dot3.z + dot3.w;\n"
// The work-item whose index stopped exactly at A_col_size / 4 also adds the
// trail_item scalar elements that follow the last full 4-wide chunk.
"if(i == A_col_size / 4)\n"
"{\n"
"if(trail_item != 0)\n"
"{\n"
"const __global Dtype *src0_trail = src0_read + i * 4;\n"
"const __global Dtype *src1_trail = src1_read + i * 4;\n"
// This inner i shadows the outer chunk index on purpose; the outer value is not used again.
"for(unsigned int i = 0; i < trail_item; ++i) {\n"
"const Dtype at0 = src0_trail[i];\n"
"const Dtype at1 = src0_trail[i + A_col_size];\n"
"const Dtype at2 = src0_trail[i + 2 * A_col_size];\n"
"const Dtype at3 = src0_trail[i + 3 * A_col_size];\n"
"const Dtype bt = src1_trail[i];\n"
"work[lid].s0 += at0 * bt;\n"
"work[lid].s1 += at1 * bt;\n"
"work[lid].s2 += at2 * bt;\n"
"work[lid].s3 += at3 * bt;\n"
"}\n"
"}\n"
"}\n"
// Local-memory reduction: the barrier at the top of each pass makes the previous
// pass (or the initial stores) visible before work[lid + stride] is read.
"for(unsigned int stride=get_local_size(0)/2 ; stride>0 ; stride>>=1) {\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"if(lid < stride)\n"
"work[lid] += work[lid+stride];\n"
"}\n"
"if(lid == 0) {\n"
// With beta equal to zero the previous contents of result are never read.
"if(beta == (Dtype)0)\n"
"result[row_gid] = convert_Dtype(alpha) * work[0];\n"
"else\n"
"result[row_gid] = convert_Dtype(alpha) * work[0] + convert_Dtype(beta) * result[row_gid];\n"
"}\n"
"}\n"
"__kernel void TEMPLATE(matvec_mul1,Dtype)(\n"
"__global const Dtype * A,\n"
"int offA,\n"
"unsigned int A_col_size,\n"
"unsigned int row_offset,\n"
"unsigned int trail_item,\n"
"__global const Dtype * v,\n"
"int offv,\n"
"KERNEL_ARG_DTYPE alpha,\n"
"KERNEL_ARG_DTYPE beta,\n"
"__global Dtype * result,\n"
"int offr,\n"
"__local Dtype * work)\n"
"{\n"
"unsigned int row_gid = get_group_id(0);\n"
"unsigned int lid = get_local_id(0);\n"
"const __global Dtype *src0_read = A + (row_offset + row_gid) * A_col_size + offA;\n"
"const __global Dtype *src1_read = v + + offv;\n"
"result = result + offr;\n"
"Dtype4 dot0 = (Dtype4)(0.f);\n"
"unsigned int i = lid;\n"
"while( i < A_col_size / 4)\n"
"{\n"
"const Dtype4 a0 = vload4(i, src0_read);\n"
"const Dtype4 b0 = vload4(i, src1_read);\n"
"dot0 += a0 * b0;\n"
"i += get_local_size(0);\n"
"}\n"
"work[lid] = dot0.x + dot0.y + dot0.z + dot0.w;\n"
// Same tail handling as matvec_mul4, for a single row.
"if(i == A_col_size / 4)\n"
"{\n"
"if(trail_item != 0)\n"
"{\n"
"const __global Dtype *src0_trail = src0_read + i * 4;\n"
"const __global Dtype *src1_trail = src1_read + i * 4;\n"
"for(unsigned int i = 0; i < trail_item; ++i) {\n"
"const Dtype at0 = src0_trail[i];\n"
"const Dtype bt = src1_trail[i];\n"
"work[lid] += at0 * bt;\n"
"}\n"
"}\n"
"}\n"
"for(unsigned int stride=get_local_size(0)/2 ; stride>0 ; stride>>=1) {\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"if(lid < stride)\n"
"work[lid] += work[lid+stride];\n"
"}\n"
"if(lid == 0) {\n"
"if(beta == (Dtype)0) {\n"
"result[row_gid+row_offset] = convert_Dtype(alpha) * work[0];\n"
"} else {\n"
"result[row_gid+row_offset] *= convert_Dtype(beta);\n"
"result[row_gid+row_offset] += convert_Dtype(alpha) * work[0];\n"
"}\n"
"}\n"
"}\n"
, "b1ea7917f8161740ee6102617a54cfe1", NULL};
// Embedded OpenCL program mvn, registered under moduleName. The adjacent string literals
// below are concatenated into the kernel text; they are data, not host code.
//
// Exactly one kernel is compiled per build, chosen by which of KERNEL_MEAN, KERNEL_MVN,
// KERNEL_MEAN_FUSE or KERNEL_MVN_FUSE is defined; if none is, the text hits its #error.
// NUM (8, 4 or 1) selects the vector width and the kernel name suffix.
//   CALC_MEAN - dst = (src - mean[x])^2 for NUM elements of row x.
//   MVN       - dst = (src - mean[x]) * alpha * w + b, with alpha = 1 / sqrt(eps + dev[x])
//               under NORM_VARIANCE (else 1), w and b taken from bnorm_weight / bnorm_bias
//               under FUSE_BATCH_NORM (else 1 and 0), and a leaky ReLU under FUSE_RELU.
//   MEAN_FUSE - per group of four rows: sums each row, stores alpha * sum into mean and
//               writes the squared deviations from that value into tmp.
//   MVN_FUSE  - per group of four rows: sums the rows of tmp, derives
//               1 / sqrt(sum * alpha_val + eps), and writes the normalized rows of A to B
//               with the same optional batch-norm and ReLU steps.
//
// T, T4, convert_T and LOCAL_SIZE are not defined in this text; presumably they are
// supplied through the build options - TODO confirm against the host code.
// NOTE(review): the fused kernels only loop over A_col_size / 4 full chunks and halve
// LOCAL_SIZE in their reductions; this looks like it assumes A_col_size is a multiple of 4
// and LOCAL_SIZE is a power of two - confirm against the host-side dispatch.
// NOTE(review): the 32-hex-digit string after the source looks like a digest of the text.
// Prefer editing the originating kernel file and regenerating over editing here - confirm.
struct cv::ocl::internal::ProgramEntry mvn_oclsrc={moduleName, "mvn",
"#if defined(cl_khr_fp16)\n"
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
"#endif\n"
// Intermediate arithmetic in this program is always float, whatever T is.
"#define Dtype  float\n"
"#define Dtype4 float4\n"
"#define Dtype8 float8\n"
"#if NUM == 8\n"
"#define load(src, index) vload8(0, src + index)\n"
"#define store(vec, dst, index) vstore8(vec, 0, dst + index)\n"
"#define vec_type Dtype8\n"
"#define CALC_MEAN calc_mean8\n"
"#define MVN mvn8\n"
"#define MEAN_FUSE mean_fuse8\n"
"#define MVN_FUSE mvn_fuse8\n"
"#elif NUM == 4\n"
"#define load(src, index) vload4(0, src + index)\n"
"#define store(vec, dst, index) vstore4(vec, 0, dst + index)\n"
"#define vec_type Dtype4\n"
"#define CALC_MEAN calc_mean4\n"
"#define MVN mvn4\n"
"#define MEAN_FUSE mean_fuse4\n"
"#define MVN_FUSE mvn_fuse4\n"
"#elif NUM == 1\n"
"#define load(src, index) src[index]\n"
"#define store(vec, dst, index) dst[index] = vec\n"
"#define vec_type Dtype\n"
"#define CALC_MEAN calc_mean1\n"
"#define MVN mvn1\n"
"#define MEAN_FUSE mean_fuse1\n"
"#define MVN_FUSE mvn_fuse1\n"
"#endif\n"
"#ifdef KERNEL_MEAN\n"
"__kernel void CALC_MEAN(__global const Dtype* src,\n"
"const int rows,\n"
"const int cols,\n"
"__global Dtype* mean,\n"
"__global Dtype* dst)\n"
"{\n"
// Dimension 0 indexes rows; dimension 1 indexes groups of NUM columns.
"int x = get_global_id(0);\n"
"int y = get_global_id(1) * NUM;\n"
"int index = x * cols + y;\n"
"if (x >= rows || y >= cols)\n"
"return;\n"
"Dtype mean_val = mean[x];\n"
"vec_type src_vec = load(src, index);\n"
"vec_type dst_vec = src_vec - (vec_type)mean_val;\n"
"dst_vec = dst_vec * dst_vec;\n"
"store(dst_vec, dst, index);\n"
"}\n"
"#elif defined KERNEL_MVN\n"
"__kernel void MVN(__global const Dtype* src,\n"
"const int rows,\n"
"const int cols,\n"
"const Dtype eps,\n"
"__global const Dtype* mean,\n"
"__global const Dtype* dev,\n"
"__global const Dtype* bnorm_weight,\n"
"__global const Dtype* bnorm_bias,\n"
"const int channels,\n"
"const float relu_slope,\n"
"__global Dtype* dst)\n"
"{\n"
"int x = get_global_id(0);\n"
"int y = get_global_id(1) * NUM;\n"
"int index = x * cols + y;\n"
"if (x >= rows || y >= cols)\n"
"return;\n"
"Dtype mean_val = mean[x];\n"
"Dtype dev_val = dev[x];\n"
"Dtype alpha;\n"
"#ifdef NORM_VARIANCE\n"
"alpha = 1 / sqrt(eps + dev_val);\n"
"#else\n"
"alpha = 1;\n"
"#endif\n"
"Dtype w = 1.f, b = 0.f;\n"
"#ifdef FUSE_BATCH_NORM\n"
// Row x maps to batch-norm channel x % channels.
"w = bnorm_weight[x % channels];\n"
"b = bnorm_bias[x % channels];\n"
"#endif\n"
"vec_type src_vec = load(src, index) - (vec_type)mean_val;\n"
"vec_type dst_vec = src_vec * alpha;\n"
"dst_vec = dst_vec * w + (vec_type)b;\n"
"#ifdef FUSE_RELU\n"
// Lanes that are not positive are replaced by the value scaled with relu_slope.
"vec_type new_val = dst_vec * relu_slope;\n"
"dst_vec = select(new_val, dst_vec, dst_vec > (vec_type)0.f);\n"
"#endif\n"
"store(dst_vec, dst, index);\n"
"}\n"
"#elif defined KERNEL_MEAN_FUSE\n"
"__kernel void MEAN_FUSE(__global const T * A,\n"
"unsigned int A_col_size,\n"
"float alpha,\n"
"__global T4 * mean,\n"
"__global Dtype * tmp)\n"
"{\n"
"unsigned int row_gid = get_group_id(0);\n"
"unsigned int lid = get_local_id(0);\n"
// Each work-group owns four consecutive rows of A and the matching rows of tmp.
"const __global T *src0_read = A + row_gid * 4 * A_col_size;\n"
"__global Dtype *dst0_read = tmp + row_gid * 4 * A_col_size;\n"
"Dtype4 dot0, dot1, dot2, dot3;\n"
"dot0 = dot1 = dot2 = dot3 = (Dtype4)(0.f);\n"
"unsigned int i = lid;\n"
// Dot product with a vector of ones is used below as a horizontal sum.
"const Dtype4 b0 = (Dtype4)1.f;\n"
"while( i < A_col_size / 4)\n"
"{\n"
"const T4 a0 = vload4(i, src0_read);\n"
"const T4 a1 = vload4(i, src0_read + A_col_size);\n"
"const T4 a2 = vload4(i, src0_read + 2 * A_col_size);\n"
"const T4 a3 = vload4(i, src0_read + 3 * A_col_size);\n"
"dot0 += convert_float4(a0);\n"
"dot1 += convert_float4(a1);\n"
"dot2 += convert_float4(a2);\n"
"dot3 += convert_float4(a3);\n"
"i += LOCAL_SIZE;\n"
"}\n"
"__local Dtype4 work[LOCAL_SIZE];\n"
"work[lid].s0 = dot(dot0, b0);\n"
"work[lid].s1 = dot(dot1, b0);\n"
"work[lid].s2 = dot(dot2, b0);\n"
"work[lid].s3 = dot(dot3, b0);\n"
"for(unsigned int stride=LOCAL_SIZE/2 ; stride>0 ; stride>>=1)\n"
"{\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"if(lid < stride)\n"
"work[lid] += work[lid+stride];\n"
"}\n"
// This barrier publishes the final work[0] to every work-item, since all of them read it below.
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"if(lid == 0)\n"
"{\n"
"mean[row_gid] = convert_T(alpha * work[0]);\n"
"}\n"
// sum holds alpha times the four row sums, one row per component.
"Dtype4 sum = work[0] * alpha;\n"
"i = lid;\n"
// Second pass over the same rows: store the squared deviation of every element.
"while( i < A_col_size / 4)\n"
"{\n"
"const T4 a0 = vload4(i, src0_read);\n"
"const T4 a1 = vload4(i, src0_read + A_col_size);\n"
"const T4 a2 = vload4(i, src0_read + 2 * A_col_size);\n"
"const T4 a3 = vload4(i, src0_read + 3 * A_col_size);\n"
"dot0 = convert_float4(a0) - (Dtype4)sum.x;\n"
"dot1 = convert_float4(a1) - (Dtype4)sum.y;\n"
"dot2 = convert_float4(a2) - (Dtype4)sum.z;\n"
"dot3 = convert_float4(a3) - (Dtype4)sum.w;\n"
"dot0 = dot0 * dot0;\n"
"dot1 = dot1 * dot1;\n"
"dot2 = dot2 * dot2;\n"
"dot3 = dot3 * dot3;\n"
"vstore4(dot0, i, dst0_read);\n"
"vstore4(dot1, i, dst0_read + A_col_size);\n"
"vstore4(dot2, i, dst0_read + 2 * A_col_size);\n"
"vstore4(dot3, i, dst0_read + 3 * A_col_size);\n"
"i += LOCAL_SIZE;\n"
"}\n"
"}\n"
"#elif defined KERNEL_MVN_FUSE\n"
"__kernel void MVN_FUSE(__global const Dtype * tmp,\n"
"__global const T * A,\n"
"__global const T4 * mean,\n"
"unsigned int A_col_size,\n"
"const float alpha_val,\n"
"const float eps,\n"
"const float relu_slope,\n"
"__global const Dtype4 * bnorm_weight,\n"
"__global const Dtype4 * bnorm_bias,\n"
"__global T * B)\n"
"{\n"
"unsigned int row_gid = get_group_id(0);\n"
"unsigned int lid = get_local_id(0);\n"
"const __global Dtype *src0_read = tmp + row_gid * 4 * A_col_size;\n"
"const __global T *src1_read = A + row_gid * 4 * A_col_size;\n"
"__global T *dst0_read = B + row_gid * 4 * A_col_size;\n"
"Dtype4 dot0, dot1, dot2, dot3;\n"
"dot0 = dot1 = dot2 = dot3 = (Dtype4)(0.f);\n"
"unsigned int i = lid;\n"
"const Dtype4 b0 = (Dtype4)1.f;\n"
// First pass: sum the four rows of tmp owned by this work-group.
"while( i < A_col_size / 4)\n"
"{\n"
"const Dtype4 a0 = vload4(i, src0_read);\n"
"const Dtype4 a1 = vload4(i, src0_read + A_col_size);\n"
"const Dtype4 a2 = vload4(i, src0_read + 2 * A_col_size);\n"
"const Dtype4 a3 = vload4(i, src0_read + 3 * A_col_size);\n"
"dot0 += a0;\n"
"dot1 += a1;\n"
"dot2 += a2;\n"
"dot3 += a3;\n"
"i += LOCAL_SIZE;\n"
"}\n"
"__local Dtype4 work[LOCAL_SIZE];\n"
"work[lid].s0 = dot(dot0, b0);\n"
"work[lid].s1 = dot(dot1, b0);\n"
"work[lid].s2 = dot(dot2, b0);\n"
"work[lid].s3 = dot(dot3, b0);\n"
"for(unsigned int stride=LOCAL_SIZE/2 ; stride>0 ; stride>>=1)\n"
"{\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"if(lid < stride)\n"
"work[lid] += work[lid+stride];\n"
"}\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"Dtype4 mean_val = convert_float4(mean[row_gid]);\n"
// eps is added under the square root; alpha is the reciprocal used to scale each row.
"Dtype4 dev_val = sqrt(work[0] * alpha_val + (Dtype4)eps);\n"
"Dtype4 alpha = (Dtype4)1.f / dev_val;\n"
"Dtype4 w = (Dtype4)1.f;\n"
"Dtype4 b = (Dtype4)0.f;\n"
"#ifdef FUSE_BATCH_NORM\n"
// One Dtype4 of weights and biases per work-group, one component per row.
"w = bnorm_weight[row_gid];\n"
"b = bnorm_bias[row_gid];\n"
"#endif\n"
"i = lid;\n"
// Second pass: normalize the rows of A and write them to B.
"while( i < A_col_size / 4)\n"
"{\n"
"const T4 a0 = vload4(i, src1_read);\n"
"const T4 a1 = vload4(i, src1_read + A_col_size);\n"
"const T4 a2 = vload4(i, src1_read + 2 * A_col_size);\n"
"const T4 a3 = vload4(i, src1_read + 3 * A_col_size);\n"
"dot0 = (convert_float4(a0) - (Dtype4)mean_val.x) * alpha.x;\n"
"dot1 = (convert_float4(a1) - (Dtype4)mean_val.y) * alpha.y;\n"
"dot2 = (convert_float4(a2) - (Dtype4)mean_val.z) * alpha.z;\n"
"dot3 = (convert_float4(a3) - (Dtype4)mean_val.w) * alpha.w;\n"
"dot0 = dot0 * w.x + (Dtype4)b.x;\n"
"dot1 = dot1 * w.y + (Dtype4)b.y;\n"
"dot2 = dot2 * w.z + (Dtype4)b.z;\n"
"dot3 = dot3 * w.w + (Dtype4)b.w;\n"
"#ifdef FUSE_RELU\n"
"Dtype4 new0 = dot0 * relu_slope;\n"
"dot0 = select(new0, dot0, dot0 > (Dtype4)0.f);\n"
"Dtype4 new1 = dot1 * relu_slope;\n"
"dot1 = select(new1, dot1, dot1 > (Dtype4)0.f);\n"
"Dtype4 new2 = dot2 * relu_slope;\n"
"dot2 = select(new2, dot2, dot2 > (Dtype4)0.f);\n"
"Dtype4 new3 = dot3 * relu_slope;\n"
"dot3 = select(new3, dot3, dot3 > (Dtype4)0.f);\n"
"#endif\n"
"vstore4(convert_T(dot0), i, dst0_read);\n"
"vstore4(convert_T(dot1), i, dst0_read + A_col_size);\n"
"vstore4(convert_T(dot2), i, dst0_read + 2 * A_col_size);\n"
"vstore4(convert_T(dot3), i, dst0_read + 3 * A_col_size);\n"
"i += LOCAL_SIZE;\n"
"}\n"
"}\n"
"#else\n"
"#error \"Configuration error!\"\n"
"#endif\n"
, "d0e6334dcdc9ef67a14d01801722c035", NULL};
// Embedded OpenCL program "ocl4dnn_lrn" registered under moduleName.
//
// Defines a single kernel, TEMPLATE(lrn_full_no_scale,Dtype). Each loop
// iteration handles one (n, h, w) position and walks the channel axis with a
// running sum of squares, writing
//   out = in * native_powr(k + accum_scale * alpha_over_size, negative_beta)
// for every channel at that position.
//
// Dtype is not defined in this source; presumably it is supplied through the
// program build options -- TODO confirm at the call site.
// NOTE(review): the trailing 32-hex-digit string looks like a digest of the
// source text; confirm how ProgramEntry consumes it before editing any literal.
struct cv::ocl::internal::ProgramEntry ocl4dnn_lrn_oclsrc={moduleName, "ocl4dnn_lrn",
"#define CONCAT(A,B) A##_##B\n"
"#define TEMPLATE(name,type) CONCAT(name,type)\n"
"#define KERNEL_ARG_DTYPE float\n"
"#if defined(cl_khr_fp16)\n"
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
"#endif\n"
"__kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int nthreads, __global const Dtype* in,\n"
"const int num, const int channels,\n"
"const int height, const int width, const int size,\n"
"const KERNEL_ARG_DTYPE alpha_over_size, const KERNEL_ARG_DTYPE k,\n"
"__global Dtype* const out,\n"
"const KERNEL_ARG_DTYPE negative_beta) {\n"
"for (int index = get_global_id(0); index < nthreads;\n"
"index += get_global_size(0)) {\n"
// Split the flat index into (n, h, w); `offset` addresses channel 0 at that
// position and `step` is the distance between consecutive channels.
"const int w = index % width;\n"
"const int h = (index / width) % height;\n"
"const int n = index / width / height;\n"
"const int offset = (n * channels * height + h) * width + w;\n"
"const int step = height * width;\n"
"__global const Dtype* in_off = in + offset;\n"
"__global Dtype* out_off = out + offset;\n"
"KERNEL_ARG_DTYPE scale_val;\n"
"int head = 0;\n"
"const int pre_pad = (size - 1) / 2;\n"
"const int post_pad = size - pre_pad - 1;\n"
"KERNEL_ARG_DTYPE accum_scale = 0;\n"
// Phase 1: accumulate squares of the first post_pad channels; nothing is
// written yet.
"while (head < post_pad && head < channels) {\n"
"accum_scale += in_off[head * step] * in_off[head * step];\n"
"++head;\n"
"}\n"
// Phase 2: add the incoming channel, drop the channel that is `size` behind
// (once one exists), and write the output for channel head - post_pad.
"while (head < channels) {\n"
"accum_scale += in_off[head * step] * in_off[head * step];\n"
"if (head - size >= 0) {\n"
"accum_scale -= in_off[(head - size) * step]\n"
"* in_off[(head - size) * step];\n"
"}\n"
"scale_val = k + accum_scale * alpha_over_size;\n"
"out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr(scale_val, negative_beta);\n"
"++head;\n"
"}\n"
// Phase 3: no more channels to add; keep dropping old ones while writing the
// outputs for the remaining channels.
"while (head < channels + post_pad) {\n"
"if (head - size >= 0) {\n"
"accum_scale -= in_off[(head - size) * step]\n"
"* in_off[(head - size) * step];\n"
"}\n"
"scale_val = k + accum_scale * alpha_over_size;\n"
"out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr(scale_val, negative_beta);\n"
"++head;\n"
"}\n"
"}\n"
"}\n"
, "5b3b0615ca2e06228fef74b23250379b", NULL};
// Embedded OpenCL program "ocl4dnn_pooling" registered under moduleName.
//
// Exactly one of three kernels is compiled, selected by a preprocessor symbol:
//   KERNEL_MAX_POOL -> max_pool_forward (or max_pool_forward_mask with HAVE_MASK)
//   KERNEL_AVE_POOL -> ave_pool_forward
//   KERNEL_STO_POOL -> sto_pool_forward_test
// Window geometry (KERNEL_H/W, STRIDE_H/W, PAD_T/L/B/R) and Dtype are macros
// that this source does not define; presumably they come from the build
// options -- TODO confirm at the call site.
// NOTE(review): the trailing 32-hex-digit string looks like a digest of the
// source text; confirm how ProgramEntry consumes it before editing any literal.
struct cv::ocl::internal::ProgramEntry ocl4dnn_pooling_oclsrc={moduleName, "ocl4dnn_pooling",
"#define CONCAT(A,B) A##_##B\n"
"#define TEMPLATE(name,type) CONCAT(name,type)\n"
"#if defined(cl_khr_fp16)\n"
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
"#endif\n"
// ---- Max pooling: one work-item per output element; work-items with
// index >= nthreads return immediately.
"#if defined KERNEL_MAX_POOL\n"
"__kernel void\n"
"#ifdef HAVE_MASK\n"
"TEMPLATE(max_pool_forward_mask, Dtype)\n"
"#else\n"
"TEMPLATE(max_pool_forward, Dtype)\n"
"#endif\n"
"(\n"
"const int nthreads, __global const Dtype* bottom_data,\n"
"const int channels, const int height, const int width,\n"
"const int pooled_height, const int pooled_width,\n"
"__global Dtype* top_data\n"
"#ifdef HAVE_MASK\n"
", __global Dtype* mask\n"
"#endif\n"
")\n"
"{\n"
"int index = get_global_id(0);\n"
"if (index >= nthreads)\n"
"return;\n"
// Split the flat output index into (ch, ph, pw); `ch` is used directly as the
// plane index into bottom_data.
"const int pw = index % pooled_width;\n"
"const int xx = index / pooled_width;\n"
"const int ph = xx % pooled_height;\n"
"const int ch = xx / pooled_height;\n"
"int hstart = ph * STRIDE_H - PAD_T;\n"
"int wstart = pw * STRIDE_W - PAD_L;\n"
"Dtype maxval = -FLT_MAX;\n"
"int maxidx = -1;\n"
"int in_offset = ch * height * width;\n"
// Scan the full KERNEL_H x KERNEL_W window, skipping taps that fall outside
// the input plane.
"for (int h = 0; h < KERNEL_H; ++h)\n"
"{\n"
"int off_y = hstart + h;\n"
"if (off_y >= 0 && off_y < height)\n"
"{\n"
"for (int w = 0; w < KERNEL_W; ++w)\n"
"{\n"
"int off_x = wstart + w;\n"
"if (off_x >= 0 && off_x < width)\n"
"{\n"
"Dtype val = bottom_data[in_offset + off_y * width + off_x];\n"
// maxidx is the position inside the plane (off_y * width + off_x); it stays
// -1 when no tap exceeded the initial maxval.
"maxidx = (val > maxval) ? (off_y * width + off_x) : maxidx;\n"
"maxval = fmax(val, maxval);\n"
"}\n"
"}\n"
"}\n"
"}\n"
"top_data[index] = maxval;\n"
"#ifdef HAVE_MASK\n"
"mask[index] = maxidx;\n"
"#endif\n"
"}\n"
// ---- Average pooling: one work-item per output element.
"#elif defined KERNEL_AVE_POOL\n"
"__kernel void TEMPLATE(ave_pool_forward, Dtype)(\n"
"const int nthreads, __global const Dtype* bottom_data,\n"
"const int channels, const int height, const int width,\n"
"const int pooled_height, const int pooled_width,\n"
"__global Dtype* top_data)\n"
"{\n"
"int index = get_global_id(0);\n"
"if (index >= nthreads)\n"
"return;\n"
"const int pw = index % pooled_width;\n"
"const int xx = index / pooled_width;\n"
"const int ph = xx % pooled_height;\n"
"const int ch = xx / pooled_height;\n"
"int hstart = ph * STRIDE_H - PAD_T;\n"
"int wstart = pw * STRIDE_W - PAD_L;\n"
"int hend = min(hstart + KERNEL_H, height + PAD_B);\n"
"int wend = min(wstart + KERNEL_W, width + PAD_R);\n"
"int pool_size;\n"
// With AVE_POOL_PADDING_AREA the divisor is taken before the window is clamped
// to the input plane (so padded cells count); otherwise it is taken after
// clamping (only in-bounds cells count).
"#ifdef AVE_POOL_PADDING_AREA\n"
"pool_size = (hend - hstart) * (wend - wstart);\n"
"hstart = max(hstart, (int)0);\n"
"wstart = max(wstart, (int)0);\n"
"hend = min(hend, height);\n"
"wend = min(wend, width);\n"
"#else\n"
"hstart = max(hstart, (int)0);\n"
"wstart = max(wstart, (int)0);\n"
"hend = min(hend, height);\n"
"wend = min(wend, width);\n"
"pool_size = (hend - hstart) * (wend - wstart);\n"
"#endif\n"
"Dtype aveval = 0;\n"
"int in_offset = ch * height * width;\n"
"for (int h = hstart; h < hend; ++h)\n"
"{\n"
"for (int w = wstart; w < wend; ++w)\n"
"{\n"
"aveval += bottom_data[in_offset + h * width + w];\n"
"}\n"
"}\n"
"top_data[index] = aveval / pool_size;\n"
"}\n"
// ---- Stochastic pooling (test-time form): each work-item loops over output
// indices in steps of get_global_size(0). No padding macros are used here.
"#elif defined KERNEL_STO_POOL\n"
"__kernel void TEMPLATE(sto_pool_forward_test,Dtype)(\n"
"const int nthreads, __global const Dtype* bottom_data,\n"
"const int channels, const int height, const int width,\n"
"const int pooled_height, const int pooled_width,\n"
"__global Dtype* top_data)\n"
"{\n"
"for (int index = get_global_id(0); index < nthreads;\n"
"index += get_global_size(0))\n"
"{\n"
"const int pw = index % pooled_width;\n"
"const int ph = (index / pooled_width) % pooled_height;\n"
"const int c = (index / pooled_width / pooled_height) % channels;\n"
"const int n = index / pooled_width / pooled_height / channels;\n"
"const int hstart = ph * STRIDE_H;\n"
"const int hend = min(hstart + KERNEL_H, height);\n"
"const int wstart = pw * STRIDE_W;\n"
"const int wend = min(wstart + KERNEL_W, width);\n"
// cumsum starts at FLT_MIN rather than 0, so the final division never has an
// exactly-zero initial denominator.
"Dtype cumsum = FLT_MIN;\n"
"Dtype cumvalues = 0.;\n"
"__global const Dtype* bottom_slice = bottom_data\n"
"+ (n * channels + c) * height * width;\n"
"for (int h = hstart; h < hend; ++h) {\n"
"for (int w = wstart; w < wend; ++w) {\n"
"Dtype v = bottom_slice[h * width + w];\n"
"cumsum += v;\n"
"cumvalues += v * v;\n"
"}\n"
"}\n"
// Output is sum(v*v) / sum(v) over the window.
"top_data[index] = cumvalues / cumsum;\n"
"}\n"
"}\n"
"#endif\n"
, "323321c5f6f114f2693c552e81e87230", NULL};
// Embedded OpenCL program "permute" registered under moduleName.
//
// Defines the kernel `permute`: for every output element i it decomposes i
// axis by axis using newStride[j], and accumulates the matching source offset
// using oldStride[permute_order[j]]; then copies bottom_data[oldPosition] to
// top_data[i].
//
// Dtype is not defined in this source; presumably it is supplied through the
// program build options -- TODO confirm at the call site.
// NOTE(review): the trailing 32-hex-digit string looks like a digest of the
// source text; confirm how ProgramEntry consumes it before editing any literal.
struct cv::ocl::internal::ProgramEntry permute_oclsrc={moduleName, "permute",
"#if defined(cl_khr_fp16)\n"
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
"#endif\n"
"__kernel void permute(const int nthreads,\n"
"__global Dtype* bottom_data,\n"
"global int* permute_order,\n"
"global int* oldStride,\n"
"global int* newStride,\n"
"const int num_axes,\n"
"__global Dtype* top_data)\n"
"{\n"
"for (int i = get_global_id(0); i < nthreads; i += get_global_size(0))\n"
"{\n"
"int oldPosition = 0;\n"
"int newPosition = i;\n"
"for (int j = 0; j < num_axes; ++j)\n"
"{\n"
"int order = permute_order[j];\n"
// newPosition / newStride[j] is the coordinate along output axis j; the
// remainder carries over to the next axis.
"oldPosition += (newPosition / newStride[j]) * oldStride[order];\n"
"newPosition %= newStride[j];\n"
"}\n"
"top_data[i] = bottom_data[oldPosition];\n"
"}\n"
"}\n"
, "81803672217de8c6fc01de8a6e7f283a", NULL};
// Embedded OpenCL program "pooling" registered under moduleName.
//
// Defines two kernels that take the window geometry as runtime arguments:
//   MaxPoolForward - maximum over the window clamped to the input plane;
//                    with MASK defined also stores the in-plane index of the
//                    maximum into `mask`.
//   AvePoolForward - sum over the clamped window divided by pool_size, where
//                    pool_size is computed before clamping to the input plane.
//                    With MASK defined a `mask` argument is declared but this
//                    kernel never writes to it.
//
// T is not defined in this source; presumably it is supplied through the
// program build options -- TODO confirm at the call site.
// NOTE(review): both kernels reassign the `bottom_data` parameter inside the
// stride loop, so if a work-item executes more than one iteration the plane
// offsets accumulate. Confirm that every launch covers nthreads in a single
// pass, or fix this in the originating kernel source and regenerate.
// NOTE(review): the trailing 32-hex-digit string looks like a digest of the
// source text; confirm how ProgramEntry consumes it before editing any literal.
struct cv::ocl::internal::ProgramEntry pooling_oclsrc={moduleName, "pooling",
"/*************************************************************************************\n"
"* Copyright (c) 2015, Advanced Micro Devices, Inc.\n"
"* All rights reserved.\n"
"*\n"
"* Redistribution and use in source and binary forms, with or without modification,\n"
"* are permitted provided that the following conditions are met:\n"
"*\n"
"* 1. Redistributions of source code must retain the above copyright notice, this\n"
"* list of conditions and the following disclaimer.\n"
"*\n"
"* 2. Redistributions in binary form must reproduce the above copyright notice,\n"
"* this list of conditions and the following disclaimer in the documentation and/or\n"
"*  other materials provided with the distribution.\n"
"*\n"
"* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND\n"
"* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED\n"
"* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.\n"
"* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,\n"
"* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,\n"
"* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,\n"
"* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,\n"
"* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)\n"
"* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE\n"
"* POSSIBILITY OF SUCH DAMAGE.\n"
"**************************************************************************************/\n"
// ---- MaxPoolForward
"__kernel void MaxPoolForward(const int nthreads,\n"
"__global T* bottom_data, const int num, const int channels, const int height, const int width,\n"
"const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w,\n"
"const int stride_h, const int stride_w, const int pad_t, const int pad_l, const int pad_b, const int pad_r,\n"
"__global T* top_data\n"
"#ifdef MASK\n"
", __global float* mask\n"
"#endif\n"
")\n"
"{\n"
"int index = get_global_id(0);\n"
"int tmp = get_global_size(0);\n"
"for(index; index < nthreads; index += tmp) {\n"
// Split the flat output index into (n, c, ph, pw).
"int pw = index % pooled_width;\n"
"int ph = (index / pooled_width) % pooled_height;\n"
"int c = (index / pooled_width / pooled_height) % channels;\n"
"int n = index / pooled_width / pooled_height / channels;\n"
"int hstart = ph * stride_h - pad_t;\n"
"int wstart = pw * stride_w - pad_l;\n"
"const int hend = min(hstart + kernel_h, height);\n"
"const int wend = min(wstart + kernel_w, width);\n"
"hstart = max(hstart, 0);\n"
"wstart = max(wstart, 0);\n"
"T maxval = -FLT_MAX;\n"
"int maxidx = -1;\n"
// NOTE(review): this advances the kernel parameter itself, not a local copy.
"bottom_data =\n"
"bottom_data + (n * channels + c) * height * width;\n"
"for (int h = hstart; h < hend; ++h) {\n"
"for (int w = wstart; w < wend; ++w) {\n"
"if (bottom_data[h * width + w] > maxval) {\n"
"maxidx = h * width + w;\n"
"maxval = bottom_data[maxidx];\n"
"}\n"
"}\n"
"}\n"
"top_data[index] = maxval;\n"
"#ifdef MASK\n"
"mask[index] = maxidx;\n"
"#endif\n"
"}\n"
"}\n"
// ---- AvePoolForward
"__kernel void AvePoolForward(const int nthreads,\n"
"__global T* bottom_data, const int num, const int channels, const int height, const int width,\n"
"const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w,\n"
"const int stride_h, const int stride_w, const int pad_t, const int pad_l, const int pad_b, const int pad_r,\n"
"__global T* top_data\n"
"#ifdef MASK\n"
", __global float* mask\n"
"#endif\n"
")\n"
"{\n"
"int index = get_global_id(0);\n"
"int tmp = get_global_size(0);\n"
"for(index; index < nthreads; index+=tmp) {\n"
"int pw = index % pooled_width;\n"
"int ph = (index / pooled_width) % pooled_height;\n"
"int c = (index / pooled_width / pooled_height) % channels;\n"
"int n = index / pooled_width / pooled_height / channels; int hstart = ph * stride_h - pad_t; int wstart = pw * stride_w - pad_l;\n"
"int hend = min(hstart + kernel_h, height + pad_b);\n"
"int wend = min(wstart + kernel_w, width + pad_r);\n"
// The divisor is taken before clamping, so cells in the padding area count.
"const int pool_size = (hend - hstart) * (wend - wstart);\n"
"hstart = max(hstart, 0);\n"
"wstart = max(wstart, 0);\n"
"hend = min(hend, height);\n"
"wend = min(wend, width);\n"
"T aveval = 0;\n"
// NOTE(review): this advances the kernel parameter itself, not a local copy.
"bottom_data =\n"
"bottom_data + (n * channels + c) * height * width;\n"
"for (int h = hstart; h < hend; ++h) {\n"
"for (int w = wstart; w < wend; ++w) {\n"
"aveval += bottom_data[h * width + w];\n"
"}\n"
"}\n"
"top_data[index] = aveval / pool_size;\n"
"}\n"
"}\n"
, "d2fa86ff9f1a4b51458f3caf054ff85f", NULL};
// Embedded OpenCL program "prior_box" registered under moduleName.
//
// Defines three kernels:
//   prior_box    - for each cell (w = index % _layerWidth, h = index / _layerWidth)
//                  writes widths_size * offsetsX_size boxes of four values
//                  (x_min, y_min, x_max, y_max), each divided by the image size.
//   set_variance - writes one 4-vector per index at dst + offset + index * 4:
//                  variance[0] replicated when variance_size == 1, otherwise
//                  the first four entries of `variance`.
//   clip         - clamps every 4-vector of dst to the range [0, 1] in place.
//
// Dtype, Dtype4 and convert_T are not defined in this source; presumably they
// are supplied through the program build options -- TODO confirm at the call site.
// NOTE(review): the trailing 32-hex-digit string looks like a digest of the
// source text; confirm how ProgramEntry consumes it before editing any literal.
struct cv::ocl::internal::ProgramEntry prior_box_oclsrc={moduleName, "prior_box",
"#if defined(cl_khr_fp16)\n"
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
"#endif\n"
// ---- prior_box
"__kernel void prior_box(const int nthreads,\n"
"const float stepX,\n"
"const float stepY,\n"
"__global const float* _offsetsX,\n"
"__global const float* _offsetsY,\n"
"const int offsetsX_size,\n"
"__global const float* _widths,\n"
"__global const float* _heights,\n"
"const int widths_size,\n"
"__global Dtype* dst,\n"
"const int _layerHeight,\n"
"const int _layerWidth,\n"
"const int imgHeight,\n"
"const int imgWidth)\n"
"{\n"
"for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))\n"
"{\n"
"int w = index % _layerWidth;\n"
"int h = index / _layerWidth;\n"
"__global Dtype* outputPtr;\n"
// Each cell owns 4 * offsetsX_size * widths_size consecutive output values.
"outputPtr = dst + index * 4 * offsetsX_size * widths_size;\n"
"float _boxWidth, _boxHeight;\n"
"Dtype4 vec;\n"
"for (int i = 0; i < widths_size; ++i)\n"
"{\n"
"_boxWidth = _widths[i];\n"
"_boxHeight = _heights[i];\n"
"for (int j = 0; j < offsetsX_size; ++j)\n"
"{\n"
"Dtype center_x = (w + _offsetsX[j]) * (Dtype)stepX;\n"
"Dtype center_y = (h + _offsetsY[j]) * (Dtype)stepY;\n"
"vec.x = (center_x - _boxWidth * 0.5f) / imgWidth;\n"
"vec.y = (center_y - _boxHeight * 0.5f) / imgHeight;\n"
"vec.z = (center_x + _boxWidth * 0.5f) / imgWidth;\n"
"vec.w = (center_y + _boxHeight * 0.5f) / imgHeight;\n"
"vstore4(vec, 0, outputPtr);\n"
"outputPtr += 4;\n"
"}\n"
"}\n"
"}\n"
"}\n"
// ---- set_variance
"__kernel void set_variance(const int nthreads,\n"
"const int offset,\n"
"const int variance_size,\n"
"__global const float* variance,\n"
"__global Dtype* dst)\n"
"{\n"
"for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))\n"
"{\n"
"Dtype4 var_vec;\n"
"if (variance_size == 1)\n"
"var_vec = (Dtype4)(variance[0]);\n"
"else\n"
"var_vec = convert_T(vload4(0, variance));\n"
"vstore4(var_vec, 0, dst + offset + index * 4);\n"
"}\n"
"}\n"
// ---- clip
"__kernel void clip(const int nthreads,\n"
"__global Dtype* dst)\n"
"{\n"
"for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))\n"
"{\n"
"Dtype4 vec = vload4(index, dst);\n"
"vstore4(clamp(vec, (Dtype)0.0f, (Dtype)1.0f), index, dst);\n"
"}\n"
"}\n"
, "1675591e0cb26d5b10d9a5a7e111007e", NULL};
// Embedded OpenCL program "region" registered under moduleName.
// Dtype is fixed to float by the first line of the source.
//
// Defines two kernels operating on cells of `cell_size` values:
//   logistic_activ - applies 1 / (1 + exp(-x)) to element 4 of each cell and
//                    stores it at the same position in dst.
//   softmax_activ  - per cell: computes a max-subtracted softmax over the
//                    `classes` values starting at element 5, fills elements
//                    0..3 from src, biasData and the (x, y, a) position derived
//                    from the cell index, then scales the class values by
//                    dst[element 4] and zeroes those not above `thresh`.
//
// NOTE(review): softmax_activ reads dst[box_index + 4], which this kernel does
// not write; presumably logistic_activ has run first -- verify against caller.
// NOTE(review): the trailing 32-hex-digit string looks like a digest of the
// source text; confirm how ProgramEntry consumes it before editing any literal.
struct cv::ocl::internal::ProgramEntry region_oclsrc={moduleName, "region",
"#define Dtype float\n"
// ---- logistic_activ
"__kernel void logistic_activ(const int count,\n"
"__global const Dtype* src,\n"
"const int cell_size,\n"
"__global Dtype* dst)\n"
"{\n"
"for (int i = get_global_id(0); i < count; i += get_global_size(0))\n"
"{\n"
"int index = cell_size * i;\n"
"Dtype x = src[index + 4];\n"
"dst[index + 4] = 1.f / (1.f + exp(-x));\n"
"}\n"
"}\n"
// ---- softmax_activ
"__kernel void softmax_activ(const int count,\n"
"__global const Dtype* src,\n"
"__global const Dtype* biasData,\n"
"const int cell_size,\n"
"const int classes,\n"
"const int classfix,\n"
"const int rows,\n"
"const int cols,\n"
"const int anchors,\n"
"const float thresh,\n"
"__global Dtype* dst)\n"
"{\n"
"for (int index = get_global_id(0); index < count; index += get_global_size(0))\n"
"{\n"
"int box_index = index * cell_size;\n"
"float largest = -FLT_MAX;\n"
"__global const Dtype *input = src + box_index + 5;\n"
"__global Dtype *output = dst + box_index + 5;\n"
// Softmax numerator: subtract the largest class value before exponentiating.
"for (int i = 0; i < classes; ++i)\n"
"largest = fmax(largest, input[i]);\n"
"float sum = 0;\n"
"for (int i = 0; i < classes; ++i)\n"
"{\n"
"float e = exp((input[i] - largest));\n"
"sum += e;\n"
"output[i] = e;\n"
"}\n"
// Recover (row y, column x, anchor a) from the flat cell index.
"int y = (index / (anchors * cols)) % rows;\n"
"int x = (index / anchors) % cols;\n"
"int a = index % anchors;\n"
"float scale = dst[box_index + 4];\n"
"if (classfix == -1 && scale < .5) scale = 0;\n"
"float v1 = src[box_index + 0];\n"
"float v2 = src[box_index + 1];\n"
"float l1 = 1.f / (1.f + exp(-v1));\n"
"float l2 = 1.f / (1.f + exp(-v2));\n"
"dst[box_index + 0] = (x + l1) / cols;\n"
"dst[box_index + 1] = (y + l2) / rows;\n"
"dst[box_index + 2] = exp(src[box_index + 2]) * biasData[2 * a] / cols;\n"
"dst[box_index + 3] = exp(src[box_index + 3]) * biasData[2 * a + 1] / rows;\n"
// Normalise, scale, and suppress values that do not exceed thresh.
"for (int i = 0; i < classes; ++i)\n"
"{\n"
"float prob = scale * output[i] / sum;\n"
"output[i] = (prob > thresh) ? prob : 0;\n"
"}\n"
"}\n"
"}\n"
, "974d8f1dbe16bfbf98914ab49bd55d11", NULL};
// Embedded OpenCL program "slice" registered under moduleName.
//
// Defines one byte-copy kernel named slice_<SLICE_KERNEL_SUFFIX> with a
// required work-group size of (WSZ, 1, 1). get_global_id(1) selects a
// destination block of BLOCK_SIZE bytes; the matching source offset is built
// per dimension from idx_<dim>, SRC_START_<dim> and SRC_STEP_<dim>.
// Two copy strategies exist:
//   USE_COPY_1D defined - copy BLOCK_COLS contiguous bytes using 16-byte,
//                         then 4-byte, then single-byte passes as needed.
//   otherwise           - copy BLOCK_ROWS rows of BLOCK_COLS bytes, with the
//                         source rows BLOCK_SRC_STRIDE bytes apart.
//
// WSZ, DIMS, BLOCK_DIMS, BLOCK_SIZE, BLOCK_COLS, BLOCK_ROWS, BLOCK_SRC_STRIDE,
// SLICE_KERNEL_SUFFIX and the DST_STEP_/SRC_STEP_/SRC_START_ families are
// macros this source does not define; presumably they come from the build
// options -- TODO confirm at the call site.
// NOTE(review): the trailing 32-hex-digit string looks like a digest of the
// source text; confirm how ProgramEntry consumes it before editing any literal.
struct cv::ocl::internal::ProgramEntry slice_oclsrc={moduleName, "slice",
"#define CONCAT_(A, B) A##B\n"
"#define CONCAT(A, B) CONCAT_(A, B)\n"
"#define BLOCK_COLS_X4 (BLOCK_COLS / 4)\n"
"#define BLOCK_COLS_X16 (BLOCK_COLS / 16)\n"
"__attribute__((reqd_work_group_size(WSZ, 1, 1)))\n"
"__kernel void\n"
"CONCAT(slice_, SLICE_KERNEL_SUFFIX)(\n"
"__global const uchar* src0,\n"
"__global uchar* dst0\n"
")\n"
"{\n"
"uint block_id = get_global_id(1);\n"
"uint dst_offset0 = block_id * BLOCK_SIZE;\n"
"uint src_offset0 = 0;\n"
"{\n"
// CALC_SRC_INDEX(dim): peel the coordinate along `dim` off block_id.
"#define CALC_SRC_INDEX(dim) \\\n"
"{ \\\n"
"uint plane_sz = CONCAT(DST_STEP_, dim) / BLOCK_SIZE; \\\n"
"CONCAT(idx_, dim) = block_id / plane_sz; \\\n"
"block_id = block_id - CONCAT(idx_, dim) * plane_sz; \\\n"
"}\n"
// UPDATE_SRC_OFFSET(dim): src_offset0 += (idx + SRC_START) * SRC_STEP.
"#define UPDATE_SRC_OFFSET(dim) \\\n"
"src_offset0 = mad24((uint)(CONCAT(idx_, dim) + CONCAT(SRC_START_, dim)), (uint)CONCAT(SRC_STEP_, dim), (uint)src_offset0);\n"
// At most 5 dimensions are supported. A dimension's index is derived from
// block_id only when BLOCK_DIMS <= dim; otherwise it stays 0.
"#if DIMS > 5\n"
"#error \"invalid configuration\"\n"
"#endif\n"
"#if DIMS > 4\n"
"uint idx_4 = 0;\n"
"#if BLOCK_DIMS <= 4\n"
"CALC_SRC_INDEX(4)\n"
"#endif\n"
"UPDATE_SRC_OFFSET(4)\n"
"#endif\n"
"#if DIMS > 3\n"
"uint idx_3 = 0;\n"
"#if BLOCK_DIMS <= 3\n"
"CALC_SRC_INDEX(3)\n"
"#endif\n"
"UPDATE_SRC_OFFSET(3)\n"
"#endif\n"
"#if DIMS > 2\n"
"uint idx_2 = 0;\n"
"#if BLOCK_DIMS <= 2\n"
"CALC_SRC_INDEX(2)\n"
"#endif\n"
"UPDATE_SRC_OFFSET(2)\n"
"#endif\n"
"#if DIMS > 1\n"
"uint idx_1 = 0;\n"
"#if BLOCK_DIMS <= 1\n"
"CALC_SRC_INDEX(1)\n"
"#endif\n"
"UPDATE_SRC_OFFSET(1)\n"
"#endif\n"
"#if DIMS > 0\n"
"uint idx_0 = 0;\n"
"UPDATE_SRC_OFFSET(0)\n"
"#endif\n"
"}\n"
// ---- 1D copy path
"#ifdef USE_COPY_1D\n"
"{\n"
"__global const uchar* src = src0 + src_offset0;\n"
"__global uchar* dst = dst0 + dst_offset0;\n"
"uint processed = 0;\n"
// Pass 1 (only when at least four 16-byte chunks exist): each work-item
// copies up to four chunks spaced 16 * WSZ bytes apart per iteration.
"#if BLOCK_COLS_X16 >= 4\n"
"{\n"
"uint i = get_local_id(0) * 16;\n"
"while (i < BLOCK_COLS_X16 * 16)\n"
"{\n"
"uint4 idx0 = (uint4)i;\n"
"uint4 idx = idx0 + (uint4)(0, 16 * WSZ, 32 * WSZ, 48 * WSZ);\n"
// Offsets past the 16-byte-aligned region fall back to idx0, so those lanes
// re-copy the first chunk instead of touching bytes beyond the region.
"idx = select(idx0, idx, idx < (BLOCK_COLS_X16 * 16));\n"
"uchar16 a0 = vload16(0, src + idx.s0);\n"
"uchar16 a1 = vload16(0, src + idx.s1);\n"
"uchar16 a2 = vload16(0, src + idx.s2);\n"
"uchar16 a3 = vload16(0, src + idx.s3);\n"
"vstore16(a0, 0, dst + idx.s0);\n"
"vstore16(a1, 0, dst + idx.s1);\n"
"vstore16(a2, 0, dst + idx.s2);\n"
"vstore16(a3, 0, dst + idx.s3);\n"
"i += WSZ * 16 * 4;\n"
"}\n"
"processed = BLOCK_COLS_X16 * 16;\n"
"}\n"
"#else\n"
"#define SKIP_1D_BLOCK_COLS_X16 1\n"
"#endif\n"
// Pass 2: 4-byte chunks, starting after whatever pass 1 handled. Compiled only
// when pass 1 was skipped or left a 4-byte-aligned remainder.
"#if BLOCK_COLS_X4 > 0 && (defined(SKIP_1D_BLOCK_COLS_X16) || (BLOCK_COLS_X16 * 16 != BLOCK_COLS_X4 * 4))\n"
"{\n"
"uint i = get_local_id(0) * 4 + processed;\n"
"while (i < BLOCK_COLS_X4 * 4)\n"
"{\n"
"uint4 idx0 = (uint4)i;\n"
"uint4 idx = idx0 + (uint4)(0, 4 * WSZ, 8 * WSZ, 12 * WSZ);\n"
"idx = select(idx0, idx, idx < (BLOCK_COLS_X4 * 4));\n"
"uchar4 a0 = vload4(0, src + idx.s0);\n"
"uchar4 a1 = vload4(0, src + idx.s1);\n"
"uchar4 a2 = vload4(0, src + idx.s2);\n"
"uchar4 a3 = vload4(0, src + idx.s3);\n"
"vstore4(a0, 0, dst + idx.s0);\n"
"vstore4(a1, 0, dst + idx.s1);\n"
"vstore4(a2, 0, dst + idx.s2);\n"
"vstore4(a3, 0, dst + idx.s3);\n"
"i += WSZ * 4 * 4;\n"
"}\n"
"processed = BLOCK_COLS_X4 * 4;\n"
"}\n"
"#else\n"
"#define SKIP_1D_BLOCK_COLS_X4 1\n"
"#endif\n"
// Pass 3: remaining bytes one at a time. Compiled when both vector passes were
// skipped or BLOCK_COLS is not a multiple of 4.
"#if (defined(SKIP_1D_BLOCK_COLS_X16) && defined(SKIP_1D_BLOCK_COLS_X4)) || BLOCK_COLS_X4 * 4 != BLOCK_COLS\n"
"{\n"
"uint i = get_local_id(0) + processed;\n"
"while (i < BLOCK_COLS)\n"
"{\n"
"uchar a0 = src[i];\n"
"dst[i] = a0;\n"
"i += WSZ;\n"
"}\n"
"}\n"
"#endif\n"
"}\n"
// ---- Row-by-row copy path
"#else\n"
"{\n"
"__global const uchar* src = src0 + src_offset0;\n"
"__global uchar* dst = dst0 + dst_offset0;\n"
"uint i = get_local_id(0) * 4;\n"
// Iterate over a grid whose row length is BLOCK_COLS rounded up to a multiple
// of 4, so every work-item step addresses one 4-byte group inside a row.
"#define BLOCK_COLS_FILL_X4 (((BLOCK_COLS + 3) / 4) * 4)\n"
"#define BLOCK_SIZE_FILL_X4 (BLOCK_COLS_FILL_X4 * BLOCK_ROWS)\n"
"while (i < BLOCK_SIZE_FILL_X4)\n"
"{\n"
"int row = i / BLOCK_COLS_FILL_X4;\n"
"int col = i % BLOCK_COLS_FILL_X4;\n"
"uint src_offset = row * BLOCK_SRC_STRIDE + col;\n"
"#if BLOCK_COLS_FILL_X4 == BLOCK_COLS\n"
"uint dst_offset = i;\n"
"#else\n"
"uint dst_offset = row * BLOCK_COLS + col;\n"
"#endif\n"
// Full 4-byte group: vector copy. When BLOCK_COLS is already a multiple of 4
// the guard is compiled out and every group takes this branch.
"#if BLOCK_COLS_FILL_X4 != BLOCK_COLS\n"
"if (col <= BLOCK_COLS - 4)\n"
"#endif\n"
"{\n"
"uchar4 a = vload4(0, src + src_offset);\n"
"vstore4(a, 0, dst + dst_offset);\n"
"}\n"
// Partial group at the end of a row: copy 1 to 3 bytes individually; lanes
// beyond BLOCK_COLS have their shift forced to 0.
"#if BLOCK_COLS_FILL_X4 != BLOCK_COLS\n"
"else\n"
"{\n"
"uint4 shift = (uint4)(0, 1, 2, 3);\n"
"shift = select((uint4)0, shift, col + shift < BLOCK_COLS);\n"
"dst[dst_offset + shift.s0] = src[src_offset + shift.s0];\n"
"#if BLOCK_COLS_FILL_X4 - BLOCK_COLS <= 2\n"
"dst[dst_offset + shift.s1] = src[src_offset + shift.s1];\n"
"#endif\n"
"#if BLOCK_COLS_FILL_X4 - BLOCK_COLS <= 1\n"
"dst[dst_offset + shift.s2] = src[src_offset + shift.s2];\n"
"#endif\n"
"}\n"
"#endif\n"
"i += WSZ * 4;\n"
"}\n"
"}\n"
"#endif\n"
"}\n"
, "94dad8ab7b7b1da7fd128d8ba047b152", NULL};
// Embedded OpenCL program "softmax" registered under moduleName.
//
// Defines four kernels that together compute a softmax along the channel axis
// of data indexed as (n * channels + c) * spatial_dim + s:
//   kernel_channel_max      - out[n, s]  = max over c of data[n, c, s]
//   kernel_channel_subtract - data[i]    = exp(src[i] - channel_max[n, s])
//   kernel_channel_sum      - channel_sum[n, s] = sum over c of data[n, c, s]
//   kernel_channel_div      - data[i]   /= channel_sum[n, s], followed by
//                             log() when LOG_SOFTMAX is defined
// Each kernel handles one element per work-item and ignores work-items past
// the valid range.
//
// T is not defined in this source; presumably it is supplied through the
// program build options -- TODO confirm at the call site.
// NOTE(review): the trailing 32-hex-digit string looks like a digest of the
// source text; confirm how ProgramEntry consumes it before editing any literal.
struct cv::ocl::internal::ProgramEntry softmax_oclsrc={moduleName, "softmax",
"/*************************************************************************************\n"
"* Copyright (c) 2015, Advanced Micro Devices, Inc.\n"
"* All rights reserved.\n"
"*\n"
"* Redistribution and use in source and binary forms, with or without modification,\n"
"* are permitted provided that the following conditions are met:\n"
"*\n"
"* 1. Redistributions of source code must retain the above copyright notice, this\n"
"* list of conditions and the following disclaimer.\n"
"*\n"
"* 2. Redistributions in binary form must reproduce the above copyright notice,\n"
"* this list of conditions and the following disclaimer in the documentation and/or\n"
"*  other materials provided with the distribution.\n"
"*\n"
"* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND\n"
"* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED\n"
"* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.\n"
"* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,\n"
"* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,\n"
"* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,\n"
"* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,\n"
"* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)\n"
"* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE\n"
"* POSSIBILITY OF SUCH DAMAGE.\n"
"**************************************************************************************/\n"
"#if defined(cl_khr_fp16)\n"
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
"#endif\n"
// ---- kernel_channel_max: one work-item per (n, s) pair.
"__kernel void kernel_channel_max(const int num, const int channels,\n"
"const int spatial_dim, __global const T* data, __global T* out) {\n"
"int index = get_global_id(0);\n"
"if(index < num * spatial_dim) {\n"
"int n = index / spatial_dim;\n"
"int s = index % spatial_dim;\n"
"T maxval = -FLT_MAX;\n"
"for (int c = 0; c < channels; ++c) {\n"
"maxval = max(data[(n * channels + c) * spatial_dim + s], maxval);\n"
"}\n"
"out[index] = maxval;\n"
"}\n"
"}\n"
// ---- kernel_channel_subtract: one work-item per element.
"__kernel void kernel_channel_subtract(const int count,\n"
"const int num, const int channels,\n"
"const int spatial_dim, __global const T* channel_max, __global const T* src, __global T* data) {\n"
"int index = get_global_id(0);\n"
"if(index < count) {\n"
"int n = index / channels / spatial_dim;\n"
"int s = index % spatial_dim;\n"
"data[index] = exp(src[index] - channel_max[n * spatial_dim + s]);\n"
"}\n"
"}\n"
// ---- kernel_channel_sum: one work-item per (n, s) pair.
"__kernel void kernel_channel_sum(const int num, const int channels,\n"
"const int spatial_dim, __global const T* data, __global T* channel_sum) {\n"
"int index = get_global_id(0);\n"
"if(index < num * spatial_dim) {\n"
"int n = index / spatial_dim;\n"
"int s = index % spatial_dim;\n"
"T sum = 0;\n"
"for (int c = 0; c < channels; ++c) {\n"
"sum += data[(n * channels + c) * spatial_dim + s];\n"
"}\n"
"channel_sum[index] = sum;\n"
"}\n"
"}\n"
// ---- kernel_channel_div: one work-item per element, updates data in place.
"__kernel void kernel_channel_div(const int count,\n"
"const int num, const int channels,\n"
"const int spatial_dim, __global const T* channel_sum, __global T* data) {\n"
"int index = get_global_id(0);\n"
"if(index < count) {\n"
"int n = index / channels / spatial_dim;\n"
"int s = index % spatial_dim;\n"
"T v = data[index] / channel_sum[n * spatial_dim + s];\n"
"#ifdef LOG_SOFTMAX\n"
"v = log(v);\n"
"#endif\n"
"data[index] = v;\n"
"}\n"
"}\n"
, "db5bfbbe4215a169392800a28b6834c4", NULL};
// Embedded OpenCL program "softmax_loss" registered under moduleName.
//
// Defines two kernels that compute a channel-wise softmax for the sample
// n = get_global_id(1), using sub_group_reduce_max / sub_group_reduce_add:
//   softmax_forward_slm - keeps intermediates in the __local buffers out_tmp,
//                         scale_tmp and group_tmp passed as arguments.
//   softmax_forward     - keeps intermediates in global memory: `out` holds
//                         the exponentials and a region of `scale` located
//                         after the first spatial_dim * num entries is used as
//                         group_tmp.
// Both run the same six phases separated by barriers: per-sub-group maximum,
// reduction of those maxima, exp(data - max), per-sub-group sum, reduction of
// those sums, and the final division (followed by log() when LOG_SOFTMAX is
// defined).
//
// Dtype and DTYPE_MAX are not defined in this source; presumably they are
// supplied through the program build options -- TODO confirm at the call site.
// NOTE(review): the sub-group built-ins are used unconditionally, while the
// cl_intel_subgroups pragma is only emitted when that extension is defined;
// presumably callers only build this program on devices that provide them.
// NOTE(review): the trailing 32-hex-digit string looks like a digest of the
// source text; confirm how ProgramEntry consumes it before editing any literal.
struct cv::ocl::internal::ProgramEntry softmax_loss_oclsrc={moduleName, "softmax_loss",
"#define CONCAT(A,B) A##_##B\n"
"#define TEMPLATE(name,type) CONCAT(name,type)\n"
"#if defined(cl_intel_subgroups)\n"
"#pragma OPENCL EXTENSION  cl_intel_subgroups : enable\n"
"#endif\n"
"#if defined(cl_khr_fp16)\n"
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
"#endif\n"
// ---- softmax_forward_slm (local-memory variant)
"__kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int channels,\n"
"const int spatial_dim,\n"
"__global Dtype* scale,\n"
"__global const Dtype* data,\n"
"__global Dtype* out,\n"
"__local Dtype *out_tmp,\n"
"__local Dtype *scale_tmp,\n"
"__local Dtype *group_tmp) {\n"
"int n = get_global_id(1);\n"
// Phase 1: each work-item takes the max over its strided subset of channels,
// then the sub-group reduces it; one value per (sub-group, s) goes to group_tmp.
"for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index +=\n"
"get_global_size(0), ++s) {\n"
"Dtype maxval = -DTYPE_MAX;\n"
"for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {\n"
"Dtype tmp = data[(n * channels + c) * spatial_dim + s];\n"
"maxval = max((Dtype)tmp, (Dtype)maxval);\n"
"}\n"
"maxval = sub_group_reduce_max(maxval);\n"
"group_tmp[get_sub_group_id() * spatial_dim + s] = maxval;\n"
"}\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
// Phase 2: reduce the per-sub-group maxima into scale_tmp[s].
"for (int index = get_global_id(0); index < spatial_dim * get_max_sub_group_size(); index +=\n"
"get_global_size(0)) {\n"
"int s = index / get_max_sub_group_size();\n"
"Dtype maxval = sub_group_reduce_max(group_tmp[get_sub_group_local_id() * spatial_dim + s]);\n"
"scale_tmp[s] = maxval;\n"
"}\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
// Phase 3: exponentiate the max-subtracted inputs into out_tmp.
"for (int index = get_global_id(0); index < channels * spatial_dim;\n"
"index += get_global_size(0)) {\n"
"int s = index % spatial_dim;\n"
"out_tmp[index] = exp(data[n * channels * spatial_dim + index] - scale_tmp[s]);\n"
"}\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
// Phase 4: per-sub-group partial sums of the exponentials.
"for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index +=\n"
"get_global_size(0), ++s) {\n"
"Dtype sum = 0;\n"
"for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {\n"
"sum += out_tmp[c * spatial_dim + s];\n"
"}\n"
"sum = sub_group_reduce_add(sum);\n"
"group_tmp[get_sub_group_id() * spatial_dim + s] = sum;\n"
"}\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
// Phase 5: reduce the partial sums into scale_tmp[s].
"for (int index = get_global_id(0); index < spatial_dim * get_max_sub_group_size(); index +=\n"
"get_global_size(0)) {\n"
"int s = index / get_max_sub_group_size();\n"
"Dtype sum = sub_group_reduce_add(group_tmp[get_sub_group_local_id() * spatial_dim + s]);\n"
"scale_tmp[s] = sum;\n"
"}\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
// Phase 6: normalise and write the result to global memory.
"for (int index = get_global_id(0); index < channels * spatial_dim;\n"
"index += get_global_size(0)) {\n"
"int s = index % spatial_dim;\n"
"Dtype v = out_tmp[index] / scale_tmp[s];\n"
"#ifdef LOG_SOFTMAX\n"
"v = log(v);\n"
"#endif\n"
"out[n * channels * spatial_dim + index] = v;\n"
"}\n"
"}\n"
// ---- softmax_forward (global-memory variant; same six phases)
"__kernel void TEMPLATE(softmax_forward,Dtype)(const int num, const int channels,\n"
"const int spatial_dim,\n"
"__global Dtype* scale,\n"
"__global const Dtype* data,\n"
"__global Dtype* out) {\n"
"int n = get_global_id(1);\n"
// Scratch area for this sample lives in `scale`, after the spatial_dim * num
// entries that hold the per-(n, s) max / sum values.
"__global Dtype *group_tmp = scale + spatial_dim * num + n * get_max_sub_group_size() * spatial_dim;\n"
"for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index +=\n"
"get_global_size(0), ++s) {\n"
"Dtype maxval = -DTYPE_MAX;\n"
"for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {\n"
"Dtype tmp = data[(n * channels + c) * spatial_dim + s];\n"
"maxval = max((Dtype)tmp, (Dtype)maxval);\n"
"}\n"
"maxval = sub_group_reduce_max(maxval);\n"
"group_tmp[get_sub_group_id() * spatial_dim + s] = maxval;\n"
"}\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"for (int index = get_global_id(0); index < spatial_dim * get_max_sub_group_size(); index +=\n"
"get_global_size(0)) {\n"
"int s = index / get_max_sub_group_size();\n"
"Dtype maxval = sub_group_reduce_max(group_tmp[get_sub_group_local_id() * spatial_dim + s]);\n"
"scale[n * spatial_dim + s] = maxval;\n"
"}\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"for (int index = get_global_id(0); index < channels * spatial_dim;\n"
"index += get_global_size(0)) {\n"
"int s = index % spatial_dim;\n"
"out[n * channels * spatial_dim + index] = exp(data[n * channels * spatial_dim + index] - scale[n * spatial_dim + s]);\n"
"}\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index +=\n"
"get_global_size(0), ++s) {\n"
"Dtype sum = 0;\n"
"for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {\n"
"sum += out[n * channels * spatial_dim + c * spatial_dim + s];\n"
"}\n"
"sum = sub_group_reduce_add(sum);\n"
"group_tmp[get_sub_group_id() * spatial_dim + s] = sum;\n"
"}\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"for (int index = get_global_id(0); index < spatial_dim * get_max_sub_group_size(); index +=\n"
"get_global_size(0)) {\n"
"int s = index / get_max_sub_group_size();\n"
"Dtype sum = sub_group_reduce_add(group_tmp[get_sub_group_local_id() * spatial_dim + s]);\n"
"scale[n * spatial_dim + s] = sum;\n"
"}\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"for (int index = get_global_id(0); index < channels * spatial_dim;\n"
"index += get_global_size(0)) {\n"
"int s = index % spatial_dim;\n"
"Dtype v = out[n * channels * spatial_dim + index] / scale[n * spatial_dim + s];\n"
"#ifdef LOG_SOFTMAX\n"
"v = log(v);\n"
"#endif\n"
"out[n * channels * spatial_dim + index] = v;\n"
"}\n"
"}\n"
, "9b1ebb425bf1e67c1294d8bf60783216", NULL};

}}}
#endif