#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <stdio.h>

#include "cuda_kernels.h"

namespace cuda_common
{

	__forceinline__ __device__ float3 get(uchar3* src, int x, int y, int w, int h) {
		if (x < 0 || x >= w || y < 0 || y >= h) return make_float3(0.5, 0.5, 0.5);
		uchar3 temp = src[y*w + x];
		return make_float3(float(temp.x) / 255., float(temp.y) / 255., float(temp.z) / 255.);
	}

	__global__ void resizeNormKernel(uchar3* src, float *dst, int dstW, int dstH, int srcW, int srcH,
		float scaleX, float scaleY, float shiftX, float shiftY) {
		int idx = blockIdx.x * blockDim.x + threadIdx.x;
		const int x = idx % dstW;
		const int y = idx / dstW;
		if (x >= dstW || y >= dstH)
			return;
		float w = (x - shiftX + 0.5) * scaleX - 0.5;        // Ëõ·ÅµÄ·´ÏòÓ³Éä¾ØÕó
		float h = (y - shiftY + 0.5) * scaleY - 0.5;        // opencv 
		int h_low = (int)h;
		int w_low = (int)w;
		int h_high = h_low + 1;
		int w_high = w_low + 1;
		float lh = h - h_low;
		float lw = w - w_low;
		float hh = 1 - lh, hw = 1 - lw;
		float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
		float3 v1 = get(src, w_low, h_low, srcW, srcH);
		float3 v2 = get(src, w_high, h_low, srcW, srcH);
		float3 v3 = get(src, w_low, h_high, srcW, srcH);
		float3 v4 = get(src, w_high, h_high, srcW, srcH);
		int stride = dstW*dstH;
		dst[y*dstW + x] = w1 *v1.x + w2 * v2.x + w3 *v3.x + w4 * v4.x;
		dst[stride + y*dstW + x] = w1 *v1.y + w2 * v2.y + w3 *v3.y + w4 * v4.y;
		dst[stride * 2 + y*dstW + x] = w1 *v1.z + w2 * v2.z + w3 *v3.z + w4 * v4.z;
	}

	__global__ void copy2square(uchar3 *dataIn, uchar3 *dataOut, int imgWidth, int imgHeight, int squareWidth)
	{
		// Pad borders with duplicate pixels, and we multiply by 2 because we process 2 pixels per thread
		int32 x = blockIdx.x * blockDim.x + threadIdx.x;
		int32 y = blockIdx.y *  blockDim.y + threadIdx.y;

		if (x >= imgWidth)
		{
			return;
		}

		if (y >= imgHeight)
		{
			return;
		}

		dataOut[y*squareWidth + x] = dataIn[y*imgWidth + x];
	}

	__global__ void kernel_bilinear(uint8 *src_img, int src_width, int src_height, float *dst_img, int dst_width, int dst_height)
	{
		const int x = blockIdx.x * blockDim.x + threadIdx.x;
		const int y = blockIdx.y * blockDim.y + threadIdx.y;

		if (x < dst_width && y < dst_height)
		{
			float fx = (x + 0.5)*src_width / (float)dst_width - 0.5;
			float fy = (y + 0.5)*src_height / (float)dst_height - 0.5;
			int ax = floor(fx);
			int ay = floor(fy);
			if (ax < 0)
			{
				ax = 0;
			}
			else if (ax > src_width - 2)
			{
				ax = src_width - 2;
			}

			if (ay < 0) {
				ay = 0;
			}
			else if (ay > src_height - 2)
			{
				ay = src_height - 2;
			}

			int A = ax + ay*src_width;
			int B = ax + ay*src_width + 1;
			int C = ax + ay*src_width + src_width;
			int D = ax + ay*src_width + src_width + 1;

			float w1, w2, w3, w4;
			w1 = fx - ax;
			w2 = 1 - w1;
			w3 = fy - ay;
			w4 = 1 - w3;

			float blue = src_img[A] * w2*w4 + src_img[B] * w1*w4 + src_img[C] * w2*w3 + src_img[D] * w1*w3;

			float green = src_img[src_width * src_height + A] * w2*w4 + src_img[src_width * src_height + B] * w1*w4
				+ src_img[src_width * src_height + C] * w2*w3 + src_img[src_width * src_height + D] * w1*w3;

			float red = src_img[src_width * src_height * 2 + A] * w2*w4 + src_img[src_width * src_height * 2 + B] * w1*w4
				+ src_img[src_width * src_height * 2 + C] * w2*w3 + src_img[src_width * src_height * 2 + D] * w1*w3;

			dst_img[y * dst_width + x] = red;
			dst_img[dst_width * dst_height + y * dst_width + x] = green;
			dst_img[dst_width * dst_height * 2 + y * dst_width + x] = blue;
		}
	}

	__global__ void resize_norm_kernel(uchar3 *src_img, int src_width, int src_height, float *dataOut, int dst_width, int dst_height)
	{
		// Pad borders with duplicate pixels, and we multiply by 2 because we process 2 pixels per thread
		const int x = blockIdx.x * blockDim.x + threadIdx.x;
		const int y = blockIdx.y * blockDim.y + threadIdx.y;

		if (x >= dst_width || y >= dst_height) {
			return;
		}

		double ratio = 0;
		if (src_width >= src_height) {
			ratio = src_width / (float)dst_width;
		}
		else 
		{
			ratio = src_height / (float)dst_height;
		}

		float fx = (x + 0.5)*ratio - 0.5;
		float fy = (y + 0.5)*ratio - 0.5;
		int ax = floor(fx);
		int ay = floor(fy);
		if (ax < 0)
		{
			ax = 0;
		}
		else if (ax >= (src_width - 2))
		{
			return;
		}

		if (ay < 0) {
			ay = 0;
		}
		else if (ay >= (src_height - 2))
		{
			return;
		}

		//int A = ay * src_width + ax;

		//dataOut[y * dst_width + x].x = src_img[A].x / 255.0;
		//dataOut[y * dst_width + x].y = src_img[A].x / 255.0;
		//dataOut[y * dst_width + x].z = src_img[A].x / 255.0;

		int A = ax + ay*src_width;
		int B = ax + ay*src_width + 1;
		int C = ax + ay*src_width + src_width;
		int D = ax + ay*src_width + src_width + 1;

		float w1, w2, w3, w4;
		w1 = fx - ax;
		w2 = 1 - w1;
		w3 = fy - ay;
		w4 = 1 - w3;

		float blue = src_img[A].x * w2*w4 + src_img[B].x * w1*w4 + src_img[C].x * w2*w3 + src_img[D].x * w1*w3;
		float green = src_img[A].y * w2*w4 + src_img[B].y * w1*w4 + src_img[C].y * w2*w3 + src_img[D].y * w1*w3;
		float red = src_img[A].z * w2*w4 + src_img[B].z * w1*w4 + src_img[C].z * w2*w3 + src_img[D].z * w1*w3;

	/*	dataOut[y * dst_width + x].x = red / 255.0;
		dataOut[y * dst_width + x].y = green / 255.0;
		dataOut[y * dst_width + x].z = blue / 255.0;*/

		// Clamp the results to RRRRR....GGGGGGG.......BBBBBBB....
		dataOut[y * dst_width + x] = red / 255.0;
		dataOut[dst_width * dst_height + y * dst_width + x] = green / 255.0;
		dataOut[dst_width * dst_height * 2 + y * dst_width + x] = blue / 255.0;
	}

	cudaError_t resizeAndNorm(unsigned char* d_srcRGB, int src_width, int src_height, float* d_dstRGB, int dst_width, int dst_height)
	{
		dim3 block(32, 16, 1);
		dim3 grid((dst_width + (block.x - 1)) / block.x, (dst_height + (block.y - 1)) / block.y, 1);

		resize_norm_kernel << < grid, block >> >((uchar3 *)d_srcRGB, src_width, src_height, d_dstRGB, dst_width, dst_height);

		cudaError_t cudaStatus = cudaGetLastError();
		if (cudaStatus != cudaSuccess) {
			fprintf(stderr, "kernel_bilinear launch failed: %s\n", cudaGetErrorString(cudaStatus));
			return cudaStatus;
		}

		cudaStatus = cudaDeviceSynchronize();
		if (cudaStatus != cudaSuccess) {
			fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_bilinear!\n", cudaStatus);
			return cudaStatus;
		}

		return cudaStatus;
	}

	//int resizeAndNorm(void * p, int in_w, int in_h, float *d, int w, int h, bool keepration, bool keepcenter, cudaStream_t stream) {
	//	float scaleX = (w*1.0f / in_w);
	//	float scaleY = (h*1.0f / in_h);
	//	float shiftX = 0.f, shiftY = 0.f;
	//	if (keepration)scaleX = scaleY = scaleX > scaleY ? scaleX : scaleY;
	//	if (keepration && keepcenter) { shiftX = (in_w - w / scaleX) / 2.f; shiftY = (in_h - h / scaleY) / 2.f; }
	//	const int n = in_w*in_h;
	//	int blockSize = 1024;
	//	const int gridSize = (n + blockSize - 1) / blockSize;
	//	resizeNormKernel << <gridSize, blockSize, 0, stream >> > ((uchar3*)(p), d, in_w, in_h, w, h, scaleX, scaleY, shiftX, shiftY);
	//	return 0;
	//}

	//int resizeAndNorm(void * p, int in_w, int in_h, float *d, int w, int h, bool keepration, bool keepcenter) {
	//	float scaleX = (w*1.0f / in_w);
	//	float scaleY = (h*1.0f / in_h);
	//	float shiftX = 0.f, shiftY = 0.f;
	//	if (keepration)scaleX = scaleY = scaleX > scaleY ? scaleX : scaleY;
	//	if (keepration && keepcenter) { shiftX = (in_w - w / scaleX) / 2.f; shiftY = (in_h - h / scaleY) / 2.f; }
	//	const int n = in_w*in_h;
	//	int blockSize = 1024;
	//	const int gridSize = (n + blockSize - 1) / blockSize;
	//	resizeNormKernel << <gridSize, blockSize, 0 >> > ((uchar3*)(p), d, in_w, in_h, w, h, scaleX, scaleY, shiftX, shiftY);
	//	return 0;
	//}

	int copy2square(void * p, void *d, int w, int h, int squareWidth, cudaStream_t stream) {
		dim3 block(32, 16, 1);
		dim3 grid((w + (block.x - 1)) / (block.x), (h + (block.y - 1)) / block.y, 1);
		copy2square << <grid, block, 0, stream>> > ((uchar3 *)(p), (uchar3 *)d, w, h, squareWidth);
		return 0;
	}

}