PartMemCopy.cu 11.2 KB
#include "cuda_kernels.h"
#include <algorithm>
typedef unsigned char   uchar;
typedef unsigned int    uint32;
typedef int             int32;

#define MAX_SNAPSHOT_WIDTH 320
#define MAX_SNAPSHOT_HEIGHT 320

namespace cuda_common
{
	__global__ void kernel_memcopy(unsigned char* d_srcRGB, int src_width, int src_height,
		unsigned char* d_dstRGB, int left, int top, int right, int bottom)
	{
		const int dst_x = blockIdx.x * blockDim.x + threadIdx.x;
		const int dst_y = blockIdx.y * blockDim.y + threadIdx.y;
		const int dst_width = right - left;
		const int dst_height = bottom - top;
		if (dst_x < dst_width && dst_y < dst_height)
		{
			int src_x = left + dst_x;
			int src_y = top + dst_y;

			//bgr...bgr...bgr...
			d_dstRGB[(dst_y*dst_width + dst_x) * 3] = (unsigned char)d_srcRGB[(src_y*src_width + src_x) * 3];
			d_dstRGB[(dst_y*dst_width + dst_x)
				* 3 + 1] = (unsigned char)d_srcRGB[(src_y*src_width + src_x) * 3 + 1];
			d_dstRGB[(dst_y*dst_width + dst_x) * 3 + 2] = (unsigned char)d_srcRGB[(src_y*src_width + src_x) * 3 + 2];

			//bbb...ggg...rrr...
			//d_dstRGB[(dst_y*dst_width) + dst_x] = (unsigned char)d_srcRGB[(src_y*src_width) + src_x];
			//d_dstRGB[(dst_width*dst_height) + (dst_y*dst_width) + dst_x] = (unsigned char)d_srcRGB[(src_width*src_height) + (src_y*src_width) + src_x];
			//d_dstRGB[(2 * dst_width*dst_height) + (dst_y*dst_width) + dst_x] = (unsigned char)d_srcRGB[(2 * src_width*src_height) + (src_y*src_width) + src_x];

			/*	memcpy(d_dstRGB + (dst_y*src_width) + dst_x, d_srcRGB + (src_y*src_width) + src_x, sizeof(float));
			memcpy(d_dstRGB + (src_width*src_height) + (dst_y*src_width) + dst_x, d_srcRGB + (src_width*src_height) + (src_y*src_width) + src_x, sizeof(float));
			memcpy(d_dstRGB + (2 * src_width*src_height) + (dst_y*src_width) + dst_x, d_srcRGB + (2 * src_width*src_height) + (src_y*src_width) + src_x, sizeof(float));*/
		}
	}

	cudaError_t PartMemCopy(unsigned char* d_srcRGB, int src_width, int src_height, unsigned char* d_dstRGB, int left, int top, int right, int bottom)
	{
		dim3 block(32, 16, 1);
		dim3 grid(((right - left) + (block.x - 1)) / block.x, ((bottom - top) + (block.y - 1)) / block.y, 1);

		kernel_memcopy << < grid, block >> > (d_srcRGB, src_width, src_height, d_dstRGB, left, top, right, bottom);

		cudaError_t cudaStatus = cudaGetLastError();
		if (cudaStatus != cudaSuccess) {
			fprintf(stderr, "Part 50 kernel_memcopy launch failed: %s\n", cudaGetErrorString(cudaStatus));
			return cudaStatus;
		}
		cudaStatus = cudaDeviceSynchronize();
		if (cudaStatus != cudaSuccess) {
			fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_bilinear!\n", cudaStatus);
			return cudaStatus;
		}
		return cudaStatus;
	}


	//    __global__ void kernel_memcopy_mean_variance(float* d_srcRGB, int src_width, int src_height, 
	//            unsigned char* vd_dstRGB, int count, int * vleft, int* vtop, int* vright, int * vbottom, float submeanb,float submeang, float submeanr, float varianceb,float varianceg, float variancer)
	//    {
	//        const int dst_x = blockIdx.x * blockDim.x + threadIdx.x;
	//        const int dst_y = blockIdx.y * blockDim.y + threadIdx.y;
	//        for (int i=0;i<count;i++)
	//        {
	//                const int left = vleft[i];
	//                const int right = vright[i];
	//                const int top = vtop[i];
	//                const int bottom = vbottom[i];
	//        
	//                const int dst_width = right - left;
	//                const int dst_height = bottom - top;
	//
	//
	//                unsigned char * d_dstRGB = vd_dstRGB + i *   ;
	//
	//                if (dst_x < dst_width && dst_y < dst_height)
	//                {
	//                    int src_x = left + dst_x;
	//                    int src_y = top + dst_y;
	//        
	//                    d_dstRGB[(dst_y*dst_width) + dst_x] = (d_srcRGB[(src_y*src_width) + src_x] - submeanb)*varianceb;
	//                    d_dstRGB[(dst_width*dst_height) + (dst_y*dst_width) + dst_x] = (d_srcRGB[(src_width*src_height) + (src_y*src_width) + src_x] -submeang)*varianceg;
	//                    d_dstRGB[(2 * dst_width*dst_height) + (dst_y*dst_width) + dst_x] = (d_srcRGB[(2 * src_width*src_height) + (src_y*src_width) + src_x] - submeanr) * variancer;
	//        
	//                }
	//        }
	//    }
	__global__ void PartCopy_ResizeImgBilinearBGR_Mean_Variance_CUDAKernel(
		unsigned char * d_srcRGB, int srcimg_width, int srcimg_height,
		int* vleft, int* vtop, int* vright, int * vbottom,
		unsigned char** vd_dstRGB, int count, int *dst_width, int *dst_height,
		float submeanb, float submeang, float submeanr,
		float varianceb, float varianceg, float variancer)
	{
		int i = blockIdx.z;

		//for (int i = 0; i<count; i++)
		{
			const int left = vleft[i];
			const int right = vright[i];
			const int top = vtop[i];
			const int bottom = vbottom[i];
			const int cur_dst_width = dst_width[i];
			const int cur_dst_height = dst_height[i];

			unsigned char* d_dstRGB =  vd_dstRGB[i];

			const int src_width = right - left;
			const int src_height = bottom - top;
			const int x = blockIdx.x * blockDim.x + threadIdx.x;// + left;
			const int y = blockIdx.y * blockDim.y + threadIdx.y;//+ top;
			const int dst_x = blockIdx.x * blockDim.x + threadIdx.x;
			const int dst_y = blockIdx.y * blockDim.y + threadIdx.y;

			/*if (dst_x == 0 && dst_y == 0)
				printf("%d %d %d %d %d\n", i, vleft[i], vright[i], cur_dst_width, cur_dst_height);*/

			unsigned char * src_img = d_srcRGB;
			unsigned char * dst_img = d_dstRGB;
			if (dst_x < cur_dst_width && dst_y < cur_dst_height)
			{
				float fx = (x + 0.5)*src_width / (float)cur_dst_width - 0.5 + left;
				float fy = (y + 0.5)*src_height / (float)cur_dst_height - 0.5 + top;
				int ax = floor(fx);
				int ay = floor(fy);
				if (ax < 0)
				{
					ax = 0;
				}
				if (ax > srcimg_width - 2)
				{
					ax = srcimg_width - 2;
				}
				if (ay < 0) {
					ay = 0;
				}
				if (ay > srcimg_height - 2)
				{
					ay = srcimg_height - 2;
				}

				int A = ax + ay*srcimg_width;
				int B = ax + ay*srcimg_width + 1;
				int C = ax + ay*srcimg_width + srcimg_width;
				int D = ax + ay*srcimg_width + srcimg_width + 1;

				float w1, w2, w3, w4;
				w1 = fx - ax;
				w2 = 1 - w1;
				w3 = fy - ay;
				w4 = 1 - w3;
				float blue = src_img[A * 3] * w2*w4 + src_img[B * 3] * w1*w4 + src_img[C * 3] * w2*w3 + src_img[D * 3] * w1*w3;
				float green = src_img[A * 3 + 1] * w2*w4 + src_img[B * 3 + 1] * w1*w4
					+ src_img[C * 3 + 1] * w2*w3 + src_img[D * 3 + 1] * w1*w3;
				float red = src_img[A * 3 + 2] * w2*w4 + src_img[B * 3 + 2] * w1*w4
					+ src_img[C * 3 + 2] * w2*w3 + src_img[D * 3 + 2] * w1*w3;

				/*dst_img[(dst_y * dst_width + dst_x) * 3] = (unsigned char)(blue - submeanb)*varianceb;
				dst_img[(dst_y * dst_width + dst_x) * 3 + 1] =(unsigned char) (green - submeang)*varianceg;
				dst_img[(dst_y * dst_width + dst_x) * 3 + 2] = (unsigned char) (red - submeanr)*variancer;*/

				if (blue < 0)
					blue = 0;
				else if (blue > 255)
					blue = 255;

				if (green < 0)
					green = 0;
				else if (green > 255)
					green = 255;

				if (red < 0)
					red = 0;
				else if (red > 255)
					red = 255;

				dst_img[(dst_y * cur_dst_width + dst_x) * 3] = (unsigned char)blue;
				dst_img[(dst_y * cur_dst_width + dst_x) * 3 + 1] = (unsigned char)green;
				dst_img[(dst_y * cur_dst_width + dst_x) * 3 + 2] = (unsigned char)red;


				/*if (src_img[(dst_y * dst_width + dst_x) * 3] < 0)
					src_img[(dst_y * dst_width + dst_x) * 3] = 0;
				else if (src_img[(dst_y * dst_width + dst_x) * 3] > 255)
					src_img[(dst_y * dst_width + dst_x) * 3] = 255;

				if (src_img[(dst_y * dst_width + dst_x) * 3 + 1] < 0)
					src_img[(dst_y * dst_width + dst_x) * 3 + 1] = 0;
				else if (src_img[(dst_y * dst_width + dst_x) * 3 + 1] > 255)
					src_img[(dst_y * dst_width + dst_x) * 3 + 1] = 255;

				if (src_img[(dst_y * dst_width + dst_x) * 3 + 2] < 0)
					src_img[(dst_y * dst_width + dst_x) * 3 + 2] = 0;
				else if (src_img[(dst_y * dst_width + dst_x) * 3 + 2] > 255)
					src_img[(dst_y * dst_width + dst_x) * 3 + 2] = 255;


				dst_img[(dst_y * dst_width + dst_x) * 3] = (unsigned char)src_img[(dst_y * dst_width + dst_x) * 3];
				dst_img[(dst_y * dst_width + dst_x) * 3 + 1] = (unsigned char)src_img[(dst_y * dst_width + dst_x) * 3 + 1];
				dst_img[(dst_y * dst_width + dst_x) * 3 + 2] = (unsigned char)src_img[(dst_y * dst_width + dst_x) * 3 + 2];*/
			}
		}
	}

	cudaError_t PartMemResizeBatch(unsigned char* d_srcRGB, int src_width, int src_height, unsigned char** d_dstRGB, int count, int* left, int* top, int* right, int* bottom, int *dst_w, int *dst_h, float submeanb, float submeang, float submeanr,
		float varianceb, float varianceg, float variancer)
	{
	/*	cudaEvent_t start, stop;
		float time;
		cudaEventCreate(&start);
		cudaEventCreate(&stop);
		cudaEventRecord(start, 0);*/

		dim3 block(32, 16, 1);
		dim3 grid((*std::max_element(dst_w, dst_w+ count) + (block.x - 1)) / block.x, (*std::max_element(dst_h, dst_h + count) + (block.y - 1)) / block.y, count);

		int * gpu_left;
		cudaMalloc(&gpu_left, 1000 * sizeof(int));
		cudaMemcpy(gpu_left, left, count * sizeof(int), cudaMemcpyHostToDevice);

		int * gpu_right;
		cudaMalloc(&gpu_right, 1000 * sizeof(int));
		cudaMemcpy(gpu_right, right, count * sizeof(int), cudaMemcpyHostToDevice);

		int * gpu_top;
		cudaMalloc(&gpu_top, 1000 * sizeof(int));
		cudaMemcpy(gpu_top, top, count * sizeof(int), cudaMemcpyHostToDevice);

		int * gpu_bottom;
		cudaMalloc(&gpu_bottom, 1000 * sizeof(int));
		cudaMemcpy(gpu_bottom, bottom, count * sizeof(int), cudaMemcpyHostToDevice);

		int * gpu_dst_w;
		cudaMalloc(&gpu_dst_w, 1000 * sizeof(int));
		cudaMemcpy(gpu_dst_w, dst_w, count * sizeof(int), cudaMemcpyHostToDevice);

		int * gpu_dst_h;
		cudaMalloc(&gpu_dst_h, 1000 * sizeof(int));
		cudaMemcpy(gpu_dst_h, dst_h, count * sizeof(int), cudaMemcpyHostToDevice);

		unsigned char** gpu_dst_rgb;
		cudaMalloc(&gpu_dst_rgb, 1000 * sizeof(unsigned char*));
		cudaMemcpy(gpu_dst_rgb, d_dstRGB, count * sizeof(unsigned char*), cudaMemcpyHostToDevice);

		//cudaMemcpy(cpu_personfloat, d_srcRGB, 112*224*2*sizeof(float), cudaMemcpyDeviceToHost);
		//            for(int i=0;i<100;i++)
		//            {
		//                  printf("the score is %f\t",cpu_personfloat[i]);
		//            }
		PartCopy_ResizeImgBilinearBGR_Mean_Variance_CUDAKernel << < grid, block >> > (
			d_srcRGB, src_width, src_height,
			gpu_left, gpu_top, gpu_right, gpu_bottom,
			gpu_dst_rgb, count, gpu_dst_w, gpu_dst_h,
			submeanb, submeang, submeanr,
			varianceb, varianceg, variancer);
		cudaFree(gpu_top);
		cudaFree(gpu_bottom);
		cudaFree(gpu_left);
		cudaFree(gpu_right);
		cudaFree(gpu_dst_w);
		cudaFree(gpu_dst_h);
		cudaFree(gpu_dst_rgb);
	
		cudaError_t cudaStatus = cudaGetLastError();
		if (cudaStatus != cudaSuccess) {
			fprintf(stderr, "Part 270 kernel_memcopy launch failed: %s\n", cudaGetErrorString(cudaStatus));
			return cudaStatus;
		}
		cudaStatus = cudaDeviceSynchronize();
		if (cudaStatus != cudaSuccess) {
			fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_bilinear!\n", cudaStatus);
			return cudaStatus;
		}

		/*cudaEventRecord(stop, 0);
		cudaEventSynchronize(stop);
		cudaEventElapsedTime(&time, start, stop);
		cudaEventDestroy(start);
		cudaEventDestroy(stop);
		printf("ºËº¯ÊýÏûºÄʱ¼ä:%f\n", time);*/

		return cudaStatus;
	}

}