#include "cuda_kernels.h" #include typedef unsigned char uchar; typedef unsigned int uint32; typedef int int32; #define MAX_SNAPSHOT_WIDTH 320 #define MAX_SNAPSHOT_HEIGHT 320 namespace cuda_common { __global__ void kernel_memcopy(unsigned char* d_srcRGB, int src_width, int src_height, unsigned char* d_dstRGB, int left, int top, int right, int bottom) { const int dst_x = blockIdx.x * blockDim.x + threadIdx.x; const int dst_y = blockIdx.y * blockDim.y + threadIdx.y; const int dst_width = right - left; const int dst_height = bottom - top; if (dst_x < dst_width && dst_y < dst_height) { int src_x = left + dst_x; int src_y = top + dst_y; //bgr...bgr...bgr... d_dstRGB[(dst_y*dst_width + dst_x) * 3] = (unsigned char)d_srcRGB[(src_y*src_width + src_x) * 3]; d_dstRGB[(dst_y*dst_width + dst_x) * 3 + 1] = (unsigned char)d_srcRGB[(src_y*src_width + src_x) * 3 + 1]; d_dstRGB[(dst_y*dst_width + dst_x) * 3 + 2] = (unsigned char)d_srcRGB[(src_y*src_width + src_x) * 3 + 2]; //bbb...ggg...rrr... //d_dstRGB[(dst_y*dst_width) + dst_x] = (unsigned char)d_srcRGB[(src_y*src_width) + src_x]; //d_dstRGB[(dst_width*dst_height) + (dst_y*dst_width) + dst_x] = (unsigned char)d_srcRGB[(src_width*src_height) + (src_y*src_width) + src_x]; //d_dstRGB[(2 * dst_width*dst_height) + (dst_y*dst_width) + dst_x] = (unsigned char)d_srcRGB[(2 * src_width*src_height) + (src_y*src_width) + src_x]; /* memcpy(d_dstRGB + (dst_y*src_width) + dst_x, d_srcRGB + (src_y*src_width) + src_x, sizeof(float)); memcpy(d_dstRGB + (src_width*src_height) + (dst_y*src_width) + dst_x, d_srcRGB + (src_width*src_height) + (src_y*src_width) + src_x, sizeof(float)); memcpy(d_dstRGB + (2 * src_width*src_height) + (dst_y*src_width) + dst_x, d_srcRGB + (2 * src_width*src_height) + (src_y*src_width) + src_x, sizeof(float));*/ } } cudaError_t PartMemCopy(unsigned char* d_srcRGB, int src_width, int src_height, unsigned char* d_dstRGB, int left, int top, int right, int bottom) { dim3 block(32, 16, 1); dim3 grid(((right - left) + (block.x - 1)) / block.x, ((bottom - top) + (block.y - 1)) / block.y, 1); kernel_memcopy << < grid, block >> > (d_srcRGB, src_width, src_height, d_dstRGB, left, top, right, bottom); cudaError_t cudaStatus = cudaGetLastError(); if (cudaStatus != cudaSuccess) { fprintf(stderr, "Part 50 kernel_memcopy launch failed: %s\n", cudaGetErrorString(cudaStatus)); return cudaStatus; } cudaStatus = cudaDeviceSynchronize(); if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_bilinear!\n", cudaStatus); return cudaStatus; } return cudaStatus; } // __global__ void kernel_memcopy_mean_variance(float* d_srcRGB, int src_width, int src_height, // unsigned char* vd_dstRGB, int count, int * vleft, int* vtop, int* vright, int * vbottom, float submeanb,float submeang, float submeanr, float varianceb,float varianceg, float variancer) // { // const int dst_x = blockIdx.x * blockDim.x + threadIdx.x; // const int dst_y = blockIdx.y * blockDim.y + threadIdx.y; // for (int i=0;i srcimg_width - 2) { ax = srcimg_width - 2; } if (ay < 0) { ay = 0; } if (ay > srcimg_height - 2) { ay = srcimg_height - 2; } int A = ax + ay*srcimg_width; int B = ax + ay*srcimg_width + 1; int C = ax + ay*srcimg_width + srcimg_width; int D = ax + ay*srcimg_width + srcimg_width + 1; float w1, w2, w3, w4; w1 = fx - ax; w2 = 1 - w1; w3 = fy - ay; w4 = 1 - w3; float blue = src_img[A * 3] * w2*w4 + src_img[B * 3] * w1*w4 + src_img[C * 3] * w2*w3 + src_img[D * 3] * w1*w3; float green = src_img[A * 3 + 1] * w2*w4 + src_img[B * 3 + 1] * w1*w4 + src_img[C * 3 + 1] * w2*w3 + src_img[D * 3 + 1] * w1*w3; float red = src_img[A * 3 + 2] * w2*w4 + src_img[B * 3 + 2] * w1*w4 + src_img[C * 3 + 2] * w2*w3 + src_img[D * 3 + 2] * w1*w3; /*dst_img[(dst_y * dst_width + dst_x) * 3] = (unsigned char)(blue - submeanb)*varianceb; dst_img[(dst_y * dst_width + dst_x) * 3 + 1] =(unsigned char) (green - submeang)*varianceg; dst_img[(dst_y * dst_width + dst_x) * 3 + 2] = (unsigned char) (red - submeanr)*variancer;*/ if (blue < 0) blue = 0; else if (blue > 255) blue = 255; if (green < 0) green = 0; else if (green > 255) green = 255; if (red < 0) red = 0; else if (red > 255) red = 255; dst_img[(dst_y * cur_dst_width + dst_x) * 3] = (unsigned char)blue; dst_img[(dst_y * cur_dst_width + dst_x) * 3 + 1] = (unsigned char)green; dst_img[(dst_y * cur_dst_width + dst_x) * 3 + 2] = (unsigned char)red; /*if (src_img[(dst_y * dst_width + dst_x) * 3] < 0) src_img[(dst_y * dst_width + dst_x) * 3] = 0; else if (src_img[(dst_y * dst_width + dst_x) * 3] > 255) src_img[(dst_y * dst_width + dst_x) * 3] = 255; if (src_img[(dst_y * dst_width + dst_x) * 3 + 1] < 0) src_img[(dst_y * dst_width + dst_x) * 3 + 1] = 0; else if (src_img[(dst_y * dst_width + dst_x) * 3 + 1] > 255) src_img[(dst_y * dst_width + dst_x) * 3 + 1] = 255; if (src_img[(dst_y * dst_width + dst_x) * 3 + 2] < 0) src_img[(dst_y * dst_width + dst_x) * 3 + 2] = 0; else if (src_img[(dst_y * dst_width + dst_x) * 3 + 2] > 255) src_img[(dst_y * dst_width + dst_x) * 3 + 2] = 255; dst_img[(dst_y * dst_width + dst_x) * 3] = (unsigned char)src_img[(dst_y * dst_width + dst_x) * 3]; dst_img[(dst_y * dst_width + dst_x) * 3 + 1] = (unsigned char)src_img[(dst_y * dst_width + dst_x) * 3 + 1]; dst_img[(dst_y * dst_width + dst_x) * 3 + 2] = (unsigned char)src_img[(dst_y * dst_width + dst_x) * 3 + 2];*/ } } } cudaError_t PartMemResizeBatch(unsigned char* d_srcRGB, int src_width, int src_height, unsigned char** d_dstRGB, int count, int* left, int* top, int* right, int* bottom, int *dst_w, int *dst_h, float submeanb, float submeang, float submeanr, float varianceb, float varianceg, float variancer) { /* cudaEvent_t start, stop; float time; cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord(start, 0);*/ dim3 block(32, 16, 1); dim3 grid((*std::max_element(dst_w, dst_w+ count) + (block.x - 1)) / block.x, (*std::max_element(dst_h, dst_h + count) + (block.y - 1)) / block.y, count); int * gpu_left; cudaMalloc(&gpu_left, 1000 * sizeof(int)); cudaMemcpy(gpu_left, left, count * sizeof(int), cudaMemcpyHostToDevice); int * gpu_right; cudaMalloc(&gpu_right, 1000 * sizeof(int)); cudaMemcpy(gpu_right, right, count * sizeof(int), cudaMemcpyHostToDevice); int * gpu_top; cudaMalloc(&gpu_top, 1000 * sizeof(int)); cudaMemcpy(gpu_top, top, count * sizeof(int), cudaMemcpyHostToDevice); int * gpu_bottom; cudaMalloc(&gpu_bottom, 1000 * sizeof(int)); cudaMemcpy(gpu_bottom, bottom, count * sizeof(int), cudaMemcpyHostToDevice); int * gpu_dst_w; cudaMalloc(&gpu_dst_w, 1000 * sizeof(int)); cudaMemcpy(gpu_dst_w, dst_w, count * sizeof(int), cudaMemcpyHostToDevice); int * gpu_dst_h; cudaMalloc(&gpu_dst_h, 1000 * sizeof(int)); cudaMemcpy(gpu_dst_h, dst_h, count * sizeof(int), cudaMemcpyHostToDevice); unsigned char** gpu_dst_rgb; cudaMalloc(&gpu_dst_rgb, 1000 * sizeof(unsigned char*)); cudaMemcpy(gpu_dst_rgb, d_dstRGB, count * sizeof(unsigned char*), cudaMemcpyHostToDevice); //cudaMemcpy(cpu_personfloat, d_srcRGB, 112*224*2*sizeof(float), cudaMemcpyDeviceToHost); // for(int i=0;i<100;i++) // { // printf("the score is %f\t",cpu_personfloat[i]); // } PartCopy_ResizeImgBilinearBGR_Mean_Variance_CUDAKernel << < grid, block >> > ( d_srcRGB, src_width, src_height, gpu_left, gpu_top, gpu_right, gpu_bottom, gpu_dst_rgb, count, gpu_dst_w, gpu_dst_h, submeanb, submeang, submeanr, varianceb, varianceg, variancer); cudaFree(gpu_top); cudaFree(gpu_bottom); cudaFree(gpu_left); cudaFree(gpu_right); cudaFree(gpu_dst_w); cudaFree(gpu_dst_h); cudaFree(gpu_dst_rgb); cudaError_t cudaStatus = cudaGetLastError(); if (cudaStatus != cudaSuccess) { fprintf(stderr, "Part 270 kernel_memcopy launch failed: %s\n", cudaGetErrorString(cudaStatus)); return cudaStatus; } cudaStatus = cudaDeviceSynchronize(); if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_bilinear!\n", cudaStatus); return cudaStatus; } /*cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&time, start, stop); cudaEventDestroy(start); cudaEventDestroy(stop); printf("ºËº¯ÊýÏûºÄÊ±¼ä:%f\n", time);*/ return cudaStatus; } }