更新解码器

ming
1 parent 372e629f
Showing 35 changed files with 2765 additions and 340 deletions
.gitignore
.vscode/launch.json
README.md
src/AbstractDecoder.cpp
src/AbstractDecoder.h
src/DrawImageOnGPU.cu
src/FFCuContextManager.cpp
src/FFNvDecoder.cpp
src/FFNvDecoder.h
src/FFNvDecoderManager.cpp
src/FFNvDecoderManager.h
src/FrameQueue.cpp
src/FrameQueue.h
src/GpuRgbMemory.hpp
src/ImageSaveGPU.cpp
src/ImageSaveGPU.h
src/NV12ToRGB.cu
src/PartMemCopy.cu
src/RGB2YUV.cu
src/ResizeImage.cu
@@ -2,3 +2,5 @@ ffmpeg-4.2.2/
 .vscode/
 bin/
 .idea/
+
+3rdparty/
 \ No newline at end of file
@@ -6,7 +6,7 @@
         "type": "cppdbg",
         "request": "launch",
         "program": "${workspaceFolder}/bin/lib/test",
-        "args": ["rtsp://122.97.218.170:8604/openUrl/V5nXRHa?params=eyJwcm90b2NhbCI6InJ0c3AiLCJjbGllbnRUeXBlIjoib3Blbl9hcGkiLCJleHByaWVUaW1lIjotMSwicHJvdG9jb2wiOiJydHNwIiwiZXhwaXJlVGltZSI6MzAwLCJlbmFibGVNR0MiOnRydWUsImV4cGFuZCI6InN0YW5kYXJkPXJ0c3Amc3RyZWFtZm9ybT1ydHAiLCJhIjoiMTBjZjM4N2JjY2Y5NDg3YzhjNWYzNjE2M2ViMWUyNTJ8MXwwfDEiLCJ0IjoxfQ==","0"],
+        "args": ["rtsp","3", "30012"],
         "stopAtEntry": false,
         "cwd": "${workspaceFolder}/bin/lib",
         "environment": [],
@@ -4,7 +4,7 @@
     支持 cuvid 需要安装 nv-codec-headers， 进入 nv-codec-headers 文件夹后以sudo权限make && make install即可
 3. 编译ffmpeg
 ~~~
-./configure --enable-debug --extra-cflags=-g --extra-ldflags=-g --disable-optimizations --disable-stripping --enable-cuda --enable-cuvid --enable-nvenc --disable-x86asm --enable-nonfree --enable-libnpp --extra-cflags=-I/usr/local/cuda-11.7/targets/x86_64-linux/include --extra-cflags=-fPIC --extra-ldflags=-L/usr/local/cuda-11.7/targets/x86_64-linux/lib --enable-shared --enable-pic --enable-ffplay --prefix=../bin
+./configure --enable-debug --extra-cflags=-g --extra-ldflags=-g --disable-optimizations --disable-stripping --enable-cuda --enable-cuvid --enable-nvenc --disable-x86asm --enable-nonfree --enable-libnpp --disable-vaapi --extra-cflags=-I/usr/local/cuda-11.7/targets/x86_64-linux/include --extra-cflags=-fPIC --extra-ldflags=-L/usr/local/cuda-11.7/targets/x86_64-linux/lib --enable-shared --enable-pic --enable-ffplay --prefix=../bin
 ~~~
 其中以下是用于调试的，编译release可以去掉：
 ~~~
+#include "AbstractDecoder.h"
+
+#include "logger.hpp"
+#include "GpuRgbMemory.hpp"
+#include "cuda_kernels.h"
+
+#include "utiltools.hpp"
+
+
+FFImgInfo* AbstractDecoder::snapshot(){
+
+	// 锁住停止队列消耗
+	std::lock_guard<std::mutex> l(m_snapshot_mutex);
+
+	AVFrame * gpuFrame = nullptr;
+
+	bool bFirst = true;
+	while(true){
+		m_queue_mutex.lock();
+		if(mFrameQueue.size() <= 0){
+			m_queue_mutex.unlock();
+			if(bFirst){
+				std::this_thread::sleep_for(std::chrono::milliseconds(100));
+				bFirst = false;
+				continue;
+			}else{
+				// 再进来说明前面已经等了 100 ms
+				// 100 ms都没有等到解码数据，则退出
+				return nullptr;
+			}
+		}
+
+		// 队列中数据大于1 
+		gpuFrame = mFrameQueue.front();
+		m_queue_mutex.unlock();
+		break;
+	}
+
+	if (gpuFrame != nullptr && gpuFrame->format == AV_PIX_FMT_CUDA ){
+		LOG_DEBUG("decode task: gpuid: {}  width: {} height: {}", m_cfg.gpuid, gpuFrame->width, gpuFrame->height);
+		GpuRgbMemory* gpuMem = new GpuRgbMemory(3, gpuFrame->width, gpuFrame->height, getName(), m_cfg.gpuid , true);
+
+		if (gpuMem->getMem() == nullptr){
+			LOG_ERROR("new GpuRgbMemory failed !!!");
+			return nullptr;
+		}
+		
+		cudaSetDevice(atoi(m_cfg.gpuid.c_str()));
+		cuda_common::setColorSpace( ITU_709, 0 );
+		cudaError_t cudaStatus = cuda_common::CUDAToBGR((CUdeviceptr)gpuFrame->data[0],(CUdeviceptr)gpuFrame->data[1], gpuFrame->linesize[0], gpuFrame->linesize[1], gpuMem->getMem(), gpuFrame->width, gpuFrame->height);
+		cudaDeviceSynchronize();
+		if (cudaStatus != cudaSuccess) {
+			LOG_ERROR("CUDAToBGR failed failed !!!");
+			return nullptr;
+		}
+
+		unsigned char * pHwRgb = gpuMem->getMem();
+		int channel = gpuMem->getChannel();
+		int width = gpuMem->getWidth();
+		int height = gpuMem->getHeight();
+
+		if (pHwRgb != nullptr && channel > 0 && width > 0 && height > 0){
+			int nSize = channel * height * width;
+
+			LOG_INFO("channel:{} height:{} width:{}", channel, height, width);
+			// unsigned char* cpu_data = new unsigned char[nSize];
+
+            unsigned char* cpu_data = (unsigned char *)av_malloc(nSize * sizeof(unsigned char));
+
+			cudaMemcpy(cpu_data, pHwRgb, nSize * sizeof(unsigned char), cudaMemcpyDeviceToHost);
+			cudaDeviceSynchronize();
+
+			delete gpuMem;
+			gpuMem = nullptr;
+
+			FFImgInfo* imgInfo = new FFImgInfo();
+			imgInfo->dec_name = m_dec_name;
+			imgInfo->pData = cpu_data;
+			imgInfo->height = height;
+			imgInfo->width = width;
+			imgInfo->timestamp = UtilTools::get_cur_time_ms();
+			imgInfo->index = m_index;
+
+			m_index++;
+
+			return imgInfo;
+		}
+
+		delete gpuMem;
+		gpuMem = nullptr;
+	}
+
+	return nullptr;
+}
+
+bool AbstractDecoder::isSnapTime(){
+	if(m_snap_time_interval <= 0){
+		return false;
+	}
+	long cur_time = UtilTools::get_cur_time_ms();
+	if(cur_time - m_last_snap_time > m_snap_time_interval){
+		return true;
+	}
+	return false;
+}
+
+void AbstractDecoder::updateLastSnapTime(){
+	m_last_snap_time = UtilTools::get_cur_time_ms();
+}
+
+void AbstractDecoder::setSnapTimeInterval(long interval){
+	m_snap_time_interval = interval;
+	m_last_snap_time = UtilTools::get_cur_time_ms();
+}
 \ No newline at end of file
@@ -15,6 +15,9 @@ extern &quot;C&quot;
     #include <libavutil/imgutils.h>
 }
+#include <queue>
+#include <mutex>
+
 using namespace std;
 /**************************************************
@@ -32,7 +35,7 @@ typedef void(*POST_DECODE_CALLBACK)(const void * userPtr, AVFrame * gpuFrame);
 typedef void(*DECODE_FINISHED_CALLBACK)(const void* userPtr);
-typedef bool(*DECODE_REQUEST_STREAM_CALLBACK)();
+typedef bool(*DECODE_REQUEST_STREAM_CALLBACK)(const char* deviceId);
 struct FFDecConfig{
     string uri;                             // 视频地址
@@ -51,6 +54,15 @@ enum DECODER_TYPE{
     DECODER_TYPE_FFMPEG
 };
+struct FFImgInfo{
+    string dec_name;
+    int width;
+    int height;
+    unsigned char * pData;
+    long timestamp;
+    long index;
+};
+
 class AbstractDecoder {
 public:
     virtual ~AbstractDecoder(){};
@@ -83,6 +95,14 @@ public:
         return m_dec_name;
     }
+    FFImgInfo* snapshot();
+
+    bool isSnapTime();
+    
+    void updateLastSnapTime();
+
+    void setSnapTimeInterval(long interval);
+
 public:
     const void * m_postDecArg;
     POST_DECODE_CALLBACK post_decoded_cbk;
@@ -95,6 +115,14 @@ public:
     bool m_dec_keyframe;
     FFDecConfig m_cfg;
+
+    queue<AVFrame*> mFrameQueue;
+    mutex m_queue_mutex;
+    mutex m_snapshot_mutex;
+
+    long m_snap_time_interval{-1};
+    long m_last_snap_time;
+    long m_index{0};
 };
 #endif // _ABSTRACT_DECODER_H_
 \ No newline at end of file
+#include "cuda_kernels.h"
+
+#include "logger.hpp"
+
+typedef unsigned char   uchar;
+typedef unsigned int    uint32;
+typedef int             int32;
+
+namespace cuda_common
+{
+	__global__ void kernel_drawPixel(float* d_srcRGB, int src_width, int src_height,
+		int left, int top, int right, int bottom)
+	{
+		const int x = blockIdx.x * blockDim.x + threadIdx.x;
+		const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+		if (((x == left || x == right) && y >= top && y <= bottom) || ((y == top || y == bottom) && x >= left && x <= right))
+		{
+			d_srcRGB[(y*src_width) + x] = 0;
+			d_srcRGB[(src_width*src_height) + (y*src_width) + x] = 255;
+			d_srcRGB[(2 * src_width*src_height) + (y*src_width) + x] = 0;
+		}
+	}
+
+	cudaError_t DrawImage(float* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom)
+	{
+		dim3 block(32, 16, 1);
+		dim3 grid((src_width + (block.x - 1)) / block.x, (src_height + (block.y - 1)) / block.y, 1);
+
+		kernel_drawPixel << < grid, block >> >(d_srcRGB, src_width, src_height, left, top, right, bottom);
+
+		cudaError_t cudaStatus = cudaGetLastError();
+		if (cudaStatus != cudaSuccess) {
+			LOG_ERROR("Draw 32 kernel_memcopy launch failed:{}",cudaGetErrorString(cudaStatus));
+			return cudaStatus;
+		}
+
+		cudaStatus = cudaDeviceSynchronize();
+		if (cudaStatus != cudaSuccess) {
+			LOG_ERROR("cudaDeviceSynchronize returned error code {} after launching kernel_bilinear!", cudaStatus);
+			return cudaStatus;
+		}
+
+		return cudaStatus;
+	}
+
+	__global__ void kernel_drawPixel(unsigned char* d_srcRGB, int src_width, int src_height,
+		int left, int top, int right, int bottom)
+	{
+		const int x = blockIdx.x * blockDim.x + threadIdx.x;
+		const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+		if (((x == left || x == right) && y >= top && y <= bottom) || ((y == top || y == bottom) && x >= left && x <= right))
+		{
+			d_srcRGB[(y*src_width) + x] = 0;
+			d_srcRGB[(src_width*src_height) + (y*src_width) + x] = 255;
+			d_srcRGB[(2 * src_width*src_height) + (y*src_width) + x] = 0;
+		}
+	}
+
+	cudaError_t DrawImage(unsigned char* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom)
+	{
+		dim3 block(32, 16, 1);
+		dim3 grid((src_width + (block.x - 1)) / block.x, (src_height + (block.y - 1)) / block.y, 1);
+
+		kernel_drawPixel << < grid, block >> >(d_srcRGB, src_width, src_height, left, top, right, bottom);
+
+		cudaError_t cudaStatus = cudaGetLastError();
+		if (cudaStatus != cudaSuccess) {
+			LOG_ERROR("Draw 68 kernel_memcopy launch failed: {}",cudaGetErrorString(cudaStatus));
+			return cudaStatus;
+		}
+
+		cudaStatus = cudaDeviceSynchronize();
+		if (cudaStatus != cudaSuccess) {
+			LOG_ERROR("cudaDeviceSynchronize returned error code {} after launching kernel_bilinear!", cudaStatus);
+			return cudaStatus;
+		}
+
+		return cudaStatus;
+	}
+
+	__global__ void kernel_drawLine(float* d_srcRGB, int src_width, int src_height,
+		int begin_x, int begin_y, int end_x, int end_y)
+	{
+		int min_x = end_x < begin_x ? end_x : begin_x;
+		int max_x = end_x < begin_x ? begin_x : end_x;
+
+		int min_y = end_y < begin_y ? end_y : begin_y;
+		int max_y = end_y < begin_y ? begin_y : end_y;
+
+		const int x = blockIdx.x * blockDim.x + threadIdx.x;
+		const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+		if ((x - begin_x) * (end_y - begin_y) == (end_x - begin_x) * (y - begin_y)
+			&& min_x <= x && x <= max_x
+			&& min_y <= y && y <= max_y)
+		{
+			d_srcRGB[(y*src_width) + x] = 0;
+			d_srcRGB[(src_width*src_height) + (y*src_width) + x] = 255;
+			d_srcRGB[(2 * src_width*src_height) + (y*src_width) + x] = 0;
+		}
+	}
+
+	cudaError_t DrawLine(float* d_srcRGB, int src_width, int src_height, int begin_x, int begin_y, int end_x, int end_y)
+	{
+		dim3 block(32, 16, 1);
+		dim3 grid((src_width + (block.x - 1)) / block.x, (src_height + (block.y - 1)) / block.y, 1);
+
+		kernel_drawLine << < grid, block >> >(d_srcRGB, src_width, src_height, begin_x, begin_y, end_x, end_y);
+
+		cudaError_t cudaStatus = cudaGetLastError();
+		if (cudaStatus != cudaSuccess) {
+			LOG_ERROR("Draw 112 kernel_memcopy launch failed: {}",cudaGetErrorString(cudaStatus));
+			return cudaStatus;
+		}
+
+		cudaStatus = cudaDeviceSynchronize();
+		if (cudaStatus != cudaSuccess) {
+			LOG_ERROR("cudaDeviceSynchronize returned error code {} after launching kernel_bilinear!", cudaStatus);
+			return cudaStatus;
+		}
+
+		return cudaStatus;
+	}
+}
 \ No newline at end of file
 #include "FFCuContextManager.h"
-#include <iostream>
+
+#include "logger.hpp"
 using namespace std;
@@ -19,7 +20,7 @@ AVBufferRef *FFCuContextManager::getCuCtx(string gpuid)
         // 初始化硬件解码器
         if (av_hwdevice_ctx_create(&hw_device_ctx, AV_HWDEVICE_TYPE_CUDA, gpuid.c_str(), nullptr, 0) < 0) 
         {
-            av_log(nullptr, AV_LOG_ERROR, "Failed to create specified HW device ! \n");
+            LOG_ERROR("Failed to create specified HW device.");
             return nullptr;
         }
         ctxMap[gpuid] = hw_device_ctx;
@@ -10,6 +10,8 @@
 #include "logger.hpp"
+#include "utiltools.hpp"
+
 using namespace std;
 // 参考博客： https://blog.csdn.net/qq_40116098/article/details/120704340
@@ -175,18 +177,6 @@ bool FFNvDecoder::start(){
 	return true;
 }
-static long long get_cur_time(){
-    // 获取操作系统当前时间点（精确到微秒）
-    chrono::time_point<chrono::system_clock, chrono::microseconds> tpMicro
-        = chrono::time_point_cast<chrono::microseconds>(chrono::system_clock::now());
-    // (微秒精度的)时间点 => (微秒精度的)时间戳
-    time_t totalMicroSeconds = tpMicro.time_since_epoch().count();
-
-	long long currentTime = ((long long)totalMicroSeconds)/1000;
-
-    return currentTime;
-}
-
 void FFNvDecoder::decode_thread()
 {
 	AVPacket* pkt ;
@@ -202,7 +192,7 @@ void FFNvDecoder::decode_thread()
         }
     ,this);
-	// long start_time = get_cur_time();
+	// long start_time = UtilTools::get_cur_time_ms();
 	while (m_bRunning)
 	{
@@ -214,13 +204,6 @@ void FFNvDecoder::decode_thread()
 				continue;
 			}
 		}
-
-		AVFrame * gpuFrame = mFrameQueue.getTail();
-		if (gpuFrame == nullptr)
-		{
-			std::this_thread::sleep_for(std::chrono::milliseconds(1));
-			continue;
-		}
 		int result = av_read_frame(fmt_ctx, pkt);
 		if (result == AVERROR_EOF || result < 0)
@@ -247,25 +230,37 @@ void FFNvDecoder::decode_thread()
 		if (stream_index == pkt->stream_index){
 			result = avcodec_send_packet(avctx, pkt);
 			if (result < 0){
+				av_packet_unref(pkt);
 				LOG_ERROR("{} - Failed to send pkt: {}", m_dec_name, result);
 				continue;
 			}
+			AVFrame* gpuFrame = av_frame_alloc();
 			result = avcodec_receive_frame(avctx, gpuFrame);
 			if ((result == AVERROR(EAGAIN) || result == AVERROR_EOF) || result < 0){
 				LOG_ERROR("{} - Failed to receive frame: {}", m_dec_name, result);
+				av_frame_free(&gpuFrame); 
+				av_packet_unref(pkt);
 				continue;
 			}
+			av_packet_unref(pkt);
-			mFrameQueue.addTail();
+			if(gpuFrame != nullptr){
+				m_queue_mutex.lock();
+				if(mFrameQueue.size() <= 10){
+					mFrameQueue.push(gpuFrame);
+				}else{
+					av_frame_free(&gpuFrame); 
+				}
+				m_queue_mutex.unlock();
+			}
 		}
 		av_packet_unref(pkt);
 	}
 	m_bRunning = false;
-	// long end_time = get_cur_time();
-
+	// long end_time = UtilTools::get_cur_time_ms();
 	// cout << "解码用时：" << end_time - start_time << endl;
 	if (m_post_decode_thread != 0)
@@ -277,6 +272,13 @@ void FFNvDecoder::decode_thread()
 	decode_finished();
+	// 清空队列
+	while(mFrameQueue.size() > 0){
+		AVFrame * gpuFrame = mFrameQueue.front();
+		av_frame_free(&gpuFrame); 
+		mFrameQueue.pop();
+	}
+
 	LOG_INFO("{} - decode thread exited.", m_dec_name);
 }
@@ -302,24 +304,25 @@ void FFNvDecoder::post_decode_thread(){
 	}
 	int index = 0;
-	while (m_bRunning || mFrameQueue.length() > 0)
+	while (m_bRunning)
 	{
-		AVFrame * gpuFrame = mFrameQueue.getHead();
-		if (gpuFrame == nullptr)
-		{
-			std::this_thread::sleep_for(std::chrono::milliseconds(3));
-			continue;
-		}
+		if(mFrameQueue.size() > 0){
+			std::lock_guard<std::mutex> l(m_snapshot_mutex);
+			// 取队头数据
+			m_queue_mutex.lock();
+			AVFrame * gpuFrame = mFrameQueue.front();
+			mFrameQueue.pop();
+			m_queue_mutex.unlock();
+			// 跳帧
+			if (skip_frame == 1 || index % skip_frame == 0){
+				post_decoded_cbk(m_postDecArg, gpuFrame);
+				index = 0;
+			}
-		// 跳帧
-		if (skip_frame == 1 || index % skip_frame == 0){
-			post_decoded_cbk(m_postDecArg, gpuFrame);
-			index = 0;
-		}
-		
-		mFrameQueue.addHead();
+			av_frame_free(&gpuFrame); 
-		index++;
+			index++;
+		}
 	}
 	LOG_INFO("post decode thread exited.");
@@ -374,7 +377,10 @@ void FFNvDecoder::setDecKeyframe(bool bKeyframe)
 }
 int FFNvDecoder::getCachedQueueLength(){
-	return mFrameQueue.length();
+	m_queue_mutex.lock();
+	int queue_size = mFrameQueue.size(); 
+	m_queue_mutex.lock();
+	return queue_size;
 }
 float FFNvDecoder::fps(){
 #include<string>
 #include <pthread.h>
-#include "FrameQueue.h"
-
 #include "AbstractDecoder.h"
+#include <mutex>
+
 using namespace std;
 class FFNvDecoder : public AbstractDecoder{
@@ -55,7 +55,6 @@ private:
     bool m_bFinished;
     bool m_bPause;
-    FrameQueue mFrameQueue;
     bool m_bReal; // 是否实时流
@@ -116,11 +116,12 @@ AbstractDecoder* FFNvDecoderManager::getDecoderByName(const string name)
     return nullptr;
 }
-void FFNvDecoderManager::startDecode(AbstractDecoder* dec){
+bool FFNvDecoderManager::startDecode(AbstractDecoder* dec){
     if (dec != nullptr && !dec->isRunning())
     {
-        dec->start();
+        return dec->start();
     }
+    return false;
 }
 bool FFNvDecoderManager::startDecodeByName(const string name){
@@ -486,7 +487,7 @@ FFImgInfo* FFNvDecoderManager::snapshot(const string&amp; uri){
     }
     // 计算解码后原始数据所需缓冲区大小，并分配内存空间 Determine required buffer size and allocate buffer
-    numBytes = av_image_get_buffer_size(AV_PIX_FMT_RGB24, codec_ctx->width, codec_ctx->height, 1);
+    numBytes = av_image_get_buffer_size(AV_PIX_FMT_BGR24, codec_ctx->width, codec_ctx->height, 1);
     buffer = (uint8_t *)av_malloc(numBytes * sizeof(uint8_t));
 	pFrameRGB = av_frame_alloc();
@@ -560,3 +561,40 @@ void FFNvDecoderManager::releaseFFImgInfo(FFImgInfo* info){
 		info = nullptr;
 	}
 }
+
+FFImgInfo* FFNvDecoderManager::snapshot_in_task(const string name){
+    if (name.empty()){
+        LOG_ERROR("name 为空!");
+        return nullptr;
+    }
+
+    std::lock_guard<std::mutex> l(m_mutex);
+
+    auto dec = decoderMap.find(name);
+    if (dec != decoderMap.end()){
+        return dec->second->snapshot();
+    }
+    
+    LOG_ERROR("没有找到name为{}的解码器",name);
+    return nullptr;
+}
+
+vector<FFImgInfo*> FFNvDecoderManager::timing_snapshot_all(){
+
+    closeAllFinishedDecoder();
+
+    std::lock_guard<std::mutex> l(m_mutex);
+
+    vector<FFImgInfo*> vec;
+    for(auto it = decoderMap.begin(); it != decoderMap.end(); ++it){
+        if(it->second->isSnapTime()){
+            FFImgInfo* imginfo = it->second->snapshot();
+            if(imginfo != nullptr){
+                vec.push_back(imginfo);
+            }
+            it->second->updateLastSnapTime();
+        }
+    }
+
+    return vec;
+}
 \ No newline at end of file
@@ -14,14 +14,9 @@ struct MgrDecConfig
     string name{""};    // 解码器名称
 };
-struct FFImgInfo{
-    int width;
-    int height;
-    unsigned char * pData;
-};
-
 /**
  * 解码器管理类，单例类
+ * 谨防死锁
  **/
 class FFNvDecoderManager {
 public:
@@ -90,7 +85,7 @@ public:
 	* 返回：void
 	* 备注：
 	**************************************************/
-    void startDecode(AbstractDecoder*);
+    bool startDecode(AbstractDecoder*);
     /**************************************************
 	* 接口：startAllDecode
@@ -257,6 +252,10 @@ public:
 	**************************************************/
 	void releaseFFImgInfo(FFImgInfo* info);
+	FFImgInfo* snapshot_in_task(const string name);
+
+	vector<FFImgInfo*> timing_snapshot_all();
+
 private:
     FFNvDecoderManager(){}
-#include "FrameQueue.h"
-
-FrameQueue::FrameQueue(/* args */)
-{
-    for (size_t i = 0; i < Maxsize; i++)
-    {
-        base[i] = av_frame_alloc();
-    }
-
-	front = rear = 0;//头指针和尾指针置为零，队列为空
-}
-
-FrameQueue::~FrameQueue()
-{
-    if (base)
-	{
-        for (size_t i = 0; i < Maxsize; i++)
-        {
-            if (base[i])
-            {
-                av_frame_free(&base[i]);
-            }
-        }
-	}
-
-    rear = front = 0;
-}
- 
-//循环队列的入队
-AVFrame* FrameQueue::getTail()
-{
-	//插入一个元素e为Q的新的队尾元素
-	if ((rear + 1) % Maxsize == front)
-		return nullptr;//队满
-	return base[rear];//获取队尾元素
-}
-
-// 将队尾元素添加到队列中
-void FrameQueue::addTail()
-{
-     rear = (rear + 1) % Maxsize;//队尾指针加1
-}
- 
-//循环队列的出队
-AVFrame* FrameQueue::deQueue()
-{
-	//删除Q的队头元素，用e返回其值
-	if (front == rear)
-		return nullptr;//队空
-	AVFrame* e = base[front];//保存队头元素
-	front = (front + 1) % Maxsize;//队头指针加1
-	return e;
-}
- 
-//取循环队列的队头元素
-AVFrame* FrameQueue::getHead()
-{
-	//返回Q的队头元素，不修改队头指针
-	if (front == rear)
-		return nullptr;//队列为空，取元素失败
-	return base[front];
-}
-
-void FrameQueue::addHead()
-{
-    front = (front + 1) % Maxsize;//队头指针加1
-}
-
-int FrameQueue::length()
-{
-    return (rear - front + Maxsize) % Maxsize;
-}
-
-bool FrameQueue::isEmpty()
-{
-	if (front == rear)
-		return true;
-
-	return false;
-}
-
-void FrameQueue::clearQueue()
-{
-	rear = front = 0;
-}
 \ No newline at end of file
-#include <iostream>
-#include <atomic>
-
-extern "C"
-{
-	#include <libavcodec/avcodec.h> 
-	#include <libavdevice/avdevice.h> 
-	#include <libavformat/avformat.h> 
-	#include <libavfilter/avfilter.h> 
-	#include <libavutil/avutil.h> 
-    #include <libavutil/pixdesc.h> 
-	#include <libswscale/swscale.h>
-}
-
-using namespace std;
-
-#define Maxsize 5	// 循环队列的大小
-
-// 循环队列
-class FrameQueue
-{
-private:
-    /* data */
-public:
-    FrameQueue(/* args */);
-    ~FrameQueue();
-
-    AVFrame* getTail();
-    void addTail();
-    AVFrame* deQueue();
-    AVFrame* getHead();
-	void addHead();
-    void clearQueue();
-
-    int length();
-    bool isEmpty();
-
-private:
-    AVFrame* base[Maxsize];
-	atomic<int> front;
-	atomic<int> rear;
-};
 \ No newline at end of file
+#include<string>
+
+#include "cuda_kernels.h"
+#include "define.hpp"
+#include "utiltools.hpp"
+
+using namespace std;
+
+class GpuRgbMemory{
+
+public:
+     GpuRgbMemory(int _channel, int _width, int _height, string _id, string _gpuid, bool _isused){
+        channel = _channel;
+        width = _width;
+        height = _height;
+        size = channel * width * height;
+        isused = _isused;
+        id = _id;
+        gpuid = _gpuid;
+        timestamp = UtilTools::get_cur_time_ms();
+
+        cudaSetDevice(atoi(gpuid.c_str()));
+        CHECK_CUDA(cudaMalloc((void **)&pHwRgb, size * sizeof(unsigned char)));
+    }
+
+    ~GpuRgbMemory(){
+        if (pHwRgb) {
+            cudaSetDevice(atoi(gpuid.c_str()));
+            CHECK_CUDA(cudaFree(pHwRgb));
+            pHwRgb = nullptr;
+        }
+    }
+    
+    int getSize() {
+        return size;
+    }
+    
+    bool isIsused() {
+        return isused;
+    }
+
+    void setIsused(bool _isused) {
+        isused = _isused;
+        // 更新时间戳
+        timestamp = UtilTools::get_cur_time_ms();
+    }
+
+    string getId() {
+        return id;
+    }
+
+    string getGpuId() {
+        return gpuid;
+    }
+
+    unsigned char* getMem(){
+        return pHwRgb;
+    }
+
+    long long getTimesstamp(){
+        return timestamp;
+    }
+
+    int getWidth(){
+        return width;
+    }
+
+    int getHeight(){
+        return height;
+    }
+
+    int getChannel(){
+        return channel;
+    }
+
+private:
+    int size;
+    bool isused;
+    string id;
+    string gpuid;
+    unsigned char * pHwRgb{nullptr};
+    long long timestamp;
+    int width{0};
+    int height{0};
+    int channel{3};
+};
 \ No newline at end of file
+#include "cuda_kernels.h"
+
+#include "logger.hpp"
+
+
+//int saveJPEG(const char *szOutputFile, float* d_srcRGB, int img_width, int img_height)
+//{
+//	return jpegNPP(szOutputFile, d_srcRGB, img_width, img_height);
+//	//return 0;
+//}
+//
+//int saveJPEG(const char *szOutputFile, unsigned char* d_srcRGB, int img_width, int img_height)
+//{
+//	return jpegNPP(szOutputFile, d_srcRGB, img_width, img_height);
+//	//return 0;
+//}
+//
+//int saveJPEG(const char *szOutputFile, unsigned char* d_srcRGB)
+//{
+//	return jpegNPP(szOutputFile, d_srcRGB);
+//}
+//
+//int saveJPEG(const char *szOutputFile, float* d_srcRGB)
+//{
+//	return jpegNPP(szOutputFile, d_srcRGB);
+//}
+
+int resizeFrame(float* d_srcRGB, int src_width, int src_height, float* d_dstRGB, int dst_width, int dst_height)
+{
+	cudaError_t cudaStatus = cuda_common::ResizeImage(d_srcRGB, src_width, src_height, d_dstRGB, dst_width, dst_height);
+	if (cudaStatus != cudaSuccess) {
+		LOG_ERROR("cuda_common::ResizeImage failed: {}",cudaGetErrorString(cudaStatus));
+		return -1;
+	}
+
+	return 0;
+}
+
+//int initTables()
+//{
+//	initTable();
+//	return 0;
+//}
+//
+//int initTables(int flag, int width, int height)
+//{
+//	initTable(0, width, height);
+//	return 0;
+//}
+
+int drawImageOnGPU(float* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom)
+{
+	cuda_common::DrawImage(d_srcRGB, src_width, src_height, left, top, right, bottom);
+	return 0;
+}
+
+int drawImageOnGPU(unsigned char* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom)
+{
+	cuda_common::DrawImage(d_srcRGB, src_width, src_height, left, top, right, bottom);
+	return 0;
+}
+
+int drawLineOnGPU(float* d_srcRGB, int src_width, int src_height, int begin_x, int begin_y, int end_x, int end_y)
+{
+	cuda_common::DrawLine(d_srcRGB, src_width, src_height, begin_x, begin_y, end_x, end_y);
+	return 0;
+}
+
+//int releaseJpegSaver()
+//{
+//	releaseJpegNPP();
+//	return 0;
+//}
+
+int partMemCopy(unsigned char* d_srcRGB, int src_width, int src_height, unsigned char* d_dstRGB, int left, int top, int right, int bottom)
+{
+	cudaError_t cudaStatus = cuda_common::PartMemCopy(d_srcRGB, src_width, src_height, d_dstRGB, left, top, right, bottom);
+	if (cudaStatus != cudaSuccess) {
+		LOG_ERROR("cuda_common::77 PartMemCopy failed: {} {} {} {} {} {} {}",cudaGetErrorString(cudaStatus), left, top, right, bottom, src_height, d_dstRGB);
+		return -1;
+	}
+
+	return 0;
+}
+//#include <fstream>
+//extern std::ofstream g_os;
+int PartMemResizeBatch(unsigned char * d_srcRGB, int src_width, int src_height, unsigned char** d_dstRGB,
+	int count, int* vleft, int * vtop, int* vright, int* vbottom, int *dst_w, int *dst_h,
+	float submeanb, float submeang, float submeanr,
+	float varianceb, float varianceg, float variancer)
+{
+	//g_os << "cudaMemcpyHostToDevice begin 9" << std::endl;
+	cudaError_t cudaStatus = cuda_common::PartMemResizeBatch(
+		d_srcRGB, src_width, src_height, d_dstRGB, count, vleft, vtop, vright, vbottom, dst_w, dst_h,
+		submeanb, submeang, submeanr,
+		varianceb, varianceg, variancer);
+	//g_os << "cudaMemcpyHostToDevice end 9" << std::endl;
+	if (cudaStatus != cudaSuccess) {
+		LOG_ERROR("cuda_common::PartMemResizeBatch failed: {}",cudaGetErrorString(cudaStatus));
+		return -1;
+	}
+
+	return 0;
+}
+
+
+//int PartMemResizeBatch(float * d_srcRGB, int src_width, int src_height, unsigned char* d_dstRGB, 
+//	int count, int* vleft, int * vtop, int* vright, int* vbottom, int dst_w, int dst_h,
+//	float submeanb, float submeang, float submeanr,
+//	float varianceb, float varianceg, float variancer)
+//
+//{
+//	cudaError_t cudaStatus = cuda_common::PartMemResizeBatch(
+//		d_srcRGB, src_width, src_height, d_dstRGB, count, vleft, vtop, vright, vbottom, dst_w, dst_h,
+//		submeanb, submeang, submeanr,
+//		varianceb, varianceg, variancer);
+//	if (cudaStatus != cudaSuccess) {
+//		fprintf(stderr, "cuda_common::PartMemCopy failed: %s\n", cudaGetErrorString(cudaStatus));
+//		return -1;
+//	}
+//
+//	return 0;
+//}
 \ No newline at end of file
+/*******************************************************************************************
+* Version: VPT_x64_V2.0.0_20170904
+* CopyRight: 中科院自动化研究所模式识别实验室图像视频组
+* UpdateDate: 20170904
+* Content: 人车物监测跟踪
+********************************************************************************************/
+
+#ifndef IMAGESAVEGPU_H_
+#define IMAGESAVEGPU_H_
+
+#ifdef _MSC_VER
+	#ifdef IMAGESAVEGPU_EXPORTS
+		#define IMAGESAVEGPU_API __declspec(dllexport)
+	#else
+		#define IMAGESAVEGPU_API __declspec(dllimport)
+	#endif
+#else
+#define IMAGESAVEGPU_API __attribute__((visibility ("default")))
+#endif
+// 功能：保存成jpeg文件
+// szOutputFile		输出图片路径，如D:\\out.jpg
+// d_srcRGB			输入RGB数据，由cudaMalloc分配的显存空间，数据排列形式为：BBBBBB......GGGGGG......RRRRRRRR......
+// img_width		RGB数据图片的宽度
+// img_height		RGB数据图片的高度
+//
+//IMAGESAVEGPU_API int saveJPEG(const char *szOutputFile, float* d_srcRGB, int img_width, int img_height);
+//IMAGESAVEGPU_API int saveJPEG(const char *szOutputFile, float* d_srcRGB);
+//
+//IMAGESAVEGPU_API int saveJPEG(const char *szOutputFile, unsigned char* d_srcRGB, int img_width, int img_height);
+//IMAGESAVEGPU_API int saveJPEG(const char *szOutputFile, unsigned char* d_srcRGB);
+
+// 功能：防缩图像
+IMAGESAVEGPU_API int resizeFrame(float* d_srcRGB, int src_width, int src_height, float* d_dstRGB, int dst_width, int dst_height);
+
+// 功能：部分拷贝数据
+IMAGESAVEGPU_API int partMemCopy(unsigned char* d_srcRGB, int src_width, int src_height, unsigned char* d_dstRGB, int left, int top, int right, int bottom);
+
+//IMAGESAVEGPU_API int partMemResizeImage(float * d_srcRGB, int src_width, int src_height, unsigned char** d_dstRGB,
+//	int* vleft, int * vtop, int* vright, int* vbottom, int *dst_w, int *dst_h,
+//	float submeanb, float submeang, float submeanr,
+//	float varianceb, float varianceg, float variancer);
+
+
+IMAGESAVEGPU_API int PartMemResizeBatch(unsigned char * d_srcRGB, int src_width, int src_height, unsigned char** d_dstRGB,
+	int count, int* vleft, int * vtop, int* vright, int* vbottom, int *dst_w, int *dst_h,
+	float submeanb, float submeang, float submeanr,
+	float varianceb, float varianceg, float variancer);
+
+
+//// 功能：初始化GPU保存图像的各种量化表
+//IMAGESAVEGPU_API int initTables();
+//IMAGESAVEGPU_API int initTables(int falg, int width, int height);
+//
+//// 功能：释放资源
+//IMAGESAVEGPU_API int releaseJpegSaver();
+
+// 功能：在GPU中绘制快照包围框
+IMAGESAVEGPU_API int drawImageOnGPU(float* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom);
+
+IMAGESAVEGPU_API int drawImageOnGPU(unsigned char* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom);
+
+// 功能：在GPU中绘制直线
+IMAGESAVEGPU_API int drawLineOnGPU(float* d_srcRGB, int src_width, int src_height, int begin_x, int begin_y, int end_x, int end_y);
+
+#endif
@@ -257,14 +257,13 @@ namespace cuda_common
 		dstImage[width * y * 3 + x * 3 + 5] = clip_v(red[1] * 0.25,0 ,255);
 	}
-	cudaError_t setColorSpace(e_ColorSpace CSC, float hue)
+	cudaError_t setColorSpace(FF_ColorSpace CSC, float hue)
 	{
-
 		float hueSin = sin(hue);
 		float hueCos = cos(hue);
 		float hueCSC[9];
-		if (CSC == ITU601)
+		if (CSC == ITU_601)
 		{
 			//CCIR 601
 			hueCSC[0] = 1.1644f;
@@ -277,7 +276,7 @@ namespace cuda_common
 			hueCSC[7] = hueCos *  2.0172f;
 			hueCSC[8] = hueSin * -2.0172f;
 		}
-		else if (CSC == ITU709)
+		else if (CSC == ITU_709)
 		{
 			//CCIR 709
 			hueCSC[0] = 1.0f;
+#include "cuda_kernels.h"
+#include <algorithm>
+typedef unsigned char   uchar;
+typedef unsigned int    uint32;
+typedef int             int32;
+
+#define MAX_SNAPSHOT_WIDTH 320
+#define MAX_SNAPSHOT_HEIGHT 320
+
+namespace cuda_common
+{
+	__global__ void kernel_memcopy(unsigned char* d_srcRGB, int src_width, int src_height,
+		unsigned char* d_dstRGB, int left, int top, int right, int bottom)
+	{
+		const int dst_x = blockIdx.x * blockDim.x + threadIdx.x;
+		const int dst_y = blockIdx.y * blockDim.y + threadIdx.y;
+		const int dst_width = right - left;
+		const int dst_height = bottom - top;
+		if (dst_x < dst_width && dst_y < dst_height)
+		{
+			int src_x = left + dst_x;
+			int src_y = top + dst_y;
+
+			//bgr...bgr...bgr...
+			d_dstRGB[(dst_y*dst_width + dst_x) * 3] = (unsigned char)d_srcRGB[(src_y*src_width + src_x) * 3];
+			d_dstRGB[(dst_y*dst_width + dst_x)
+				* 3 + 1] = (unsigned char)d_srcRGB[(src_y*src_width + src_x) * 3 + 1];
+			d_dstRGB[(dst_y*dst_width + dst_x) * 3 + 2] = (unsigned char)d_srcRGB[(src_y*src_width + src_x) * 3 + 2];
+
+			//bbb...ggg...rrr...
+			//d_dstRGB[(dst_y*dst_width) + dst_x] = (unsigned char)d_srcRGB[(src_y*src_width) + src_x];
+			//d_dstRGB[(dst_width*dst_height) + (dst_y*dst_width) + dst_x] = (unsigned char)d_srcRGB[(src_width*src_height) + (src_y*src_width) + src_x];
+			//d_dstRGB[(2 * dst_width*dst_height) + (dst_y*dst_width) + dst_x] = (unsigned char)d_srcRGB[(2 * src_width*src_height) + (src_y*src_width) + src_x];
+
+			/*	memcpy(d_dstRGB + (dst_y*src_width) + dst_x, d_srcRGB + (src_y*src_width) + src_x, sizeof(float));
+			memcpy(d_dstRGB + (src_width*src_height) + (dst_y*src_width) + dst_x, d_srcRGB + (src_width*src_height) + (src_y*src_width) + src_x, sizeof(float));
+			memcpy(d_dstRGB + (2 * src_width*src_height) + (dst_y*src_width) + dst_x, d_srcRGB + (2 * src_width*src_height) + (src_y*src_width) + src_x, sizeof(float));*/
+		}
+	}
+
+	cudaError_t PartMemCopy(unsigned char* d_srcRGB, int src_width, int src_height, unsigned char* d_dstRGB, int left, int top, int right, int bottom)
+	{
+		dim3 block(32, 16, 1);
+		dim3 grid(((right - left) + (block.x - 1)) / block.x, ((bottom - top) + (block.y - 1)) / block.y, 1);
+
+		kernel_memcopy << < grid, block >> > (d_srcRGB, src_width, src_height, d_dstRGB, left, top, right, bottom);
+
+		cudaError_t cudaStatus = cudaGetLastError();
+		if (cudaStatus != cudaSuccess) {
+			fprintf(stderr, "Part 50 kernel_memcopy launch failed: %s\n", cudaGetErrorString(cudaStatus));
+			return cudaStatus;
+		}
+		cudaStatus = cudaDeviceSynchronize();
+		if (cudaStatus != cudaSuccess) {
+			fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_bilinear!\n", cudaStatus);
+			return cudaStatus;
+		}
+		return cudaStatus;
+	}
+
+
+	//    __global__ void kernel_memcopy_mean_variance(float* d_srcRGB, int src_width, int src_height, 
+	//            unsigned char* vd_dstRGB, int count, int * vleft, int* vtop, int* vright, int * vbottom, float submeanb,float submeang, float submeanr, float varianceb,float varianceg, float variancer)
+	//    {
+	//        const int dst_x = blockIdx.x * blockDim.x + threadIdx.x;
+	//        const int dst_y = blockIdx.y * blockDim.y + threadIdx.y;
+	//        for (int i=0;i<count;i++)
+	//        {
+	//                const int left = vleft[i];
+	//                const int right = vright[i];
+	//                const int top = vtop[i];
+	//                const int bottom = vbottom[i];
+	//        
+	//                const int dst_width = right - left;
+	//                const int dst_height = bottom - top;
+	//
+	//
+	//                unsigned char * d_dstRGB = vd_dstRGB + i *   ;
+	//
+	//                if (dst_x < dst_width && dst_y < dst_height)
+	//                {
+	//                    int src_x = left + dst_x;
+	//                    int src_y = top + dst_y;
+	//        
+	//                    d_dstRGB[(dst_y*dst_width) + dst_x] = (d_srcRGB[(src_y*src_width) + src_x] - submeanb)*varianceb;
+	//                    d_dstRGB[(dst_width*dst_height) + (dst_y*dst_width) + dst_x] = (d_srcRGB[(src_width*src_height) + (src_y*src_width) + src_x] -submeang)*varianceg;
+	//                    d_dstRGB[(2 * dst_width*dst_height) + (dst_y*dst_width) + dst_x] = (d_srcRGB[(2 * src_width*src_height) + (src_y*src_width) + src_x] - submeanr) * variancer;
+	//        
+	//                }
+	//        }
+	//    }
+	__global__ void PartCopy_ResizeImgBilinearBGR_Mean_Variance_CUDAKernel(
+		unsigned char * d_srcRGB, int srcimg_width, int srcimg_height,
+		int* vleft, int* vtop, int* vright, int * vbottom,
+		unsigned char** vd_dstRGB, int count, int *dst_width, int *dst_height,
+		float submeanb, float submeang, float submeanr,
+		float varianceb, float varianceg, float variancer)
+	{
+		int i = blockIdx.z;
+
+		//for (int i = 0; i<count; i++)
+		{
+			const int left = vleft[i];
+			const int right = vright[i];
+			const int top = vtop[i];
+			const int bottom = vbottom[i];
+			const int cur_dst_width = dst_width[i];
+			const int cur_dst_height = dst_height[i];
+
+			unsigned char* d_dstRGB =  vd_dstRGB[i];
+
+			const int src_width = right - left;
+			const int src_height = bottom - top;
+			const int x = blockIdx.x * blockDim.x + threadIdx.x;// + left;
+			const int y = blockIdx.y * blockDim.y + threadIdx.y;//+ top;
+			const int dst_x = blockIdx.x * blockDim.x + threadIdx.x;
+			const int dst_y = blockIdx.y * blockDim.y + threadIdx.y;
+
+			/*if (dst_x == 0 && dst_y == 0)
+				printf("%d %d %d %d %d\n", i, vleft[i], vright[i], cur_dst_width, cur_dst_height);*/
+
+			unsigned char * src_img = d_srcRGB;
+			unsigned char * dst_img = d_dstRGB;
+			if (dst_x < cur_dst_width && dst_y < cur_dst_height)
+			{
+				float fx = (x + 0.5)*src_width / (float)cur_dst_width - 0.5 + left;
+				float fy = (y + 0.5)*src_height / (float)cur_dst_height - 0.5 + top;
+				int ax = floor(fx);
+				int ay = floor(fy);
+				if (ax < 0)
+				{
+					ax = 0;
+				}
+				if (ax > srcimg_width - 2)
+				{
+					ax = srcimg_width - 2;
+				}
+				if (ay < 0) {
+					ay = 0;
+				}
+				if (ay > srcimg_height - 2)
+				{
+					ay = srcimg_height - 2;
+				}
+
+				int A = ax + ay*srcimg_width;
+				int B = ax + ay*srcimg_width + 1;
+				int C = ax + ay*srcimg_width + srcimg_width;
+				int D = ax + ay*srcimg_width + srcimg_width + 1;
+
+				float w1, w2, w3, w4;
+				w1 = fx - ax;
+				w2 = 1 - w1;
+				w3 = fy - ay;
+				w4 = 1 - w3;
+				float blue = src_img[A * 3] * w2*w4 + src_img[B * 3] * w1*w4 + src_img[C * 3] * w2*w3 + src_img[D * 3] * w1*w3;
+				float green = src_img[A * 3 + 1] * w2*w4 + src_img[B * 3 + 1] * w1*w4
+					+ src_img[C * 3 + 1] * w2*w3 + src_img[D * 3 + 1] * w1*w3;
+				float red = src_img[A * 3 + 2] * w2*w4 + src_img[B * 3 + 2] * w1*w4
+					+ src_img[C * 3 + 2] * w2*w3 + src_img[D * 3 + 2] * w1*w3;
+
+				/*dst_img[(dst_y * dst_width + dst_x) * 3] = (unsigned char)(blue - submeanb)*varianceb;
+				dst_img[(dst_y * dst_width + dst_x) * 3 + 1] =(unsigned char) (green - submeang)*varianceg;
+				dst_img[(dst_y * dst_width + dst_x) * 3 + 2] = (unsigned char) (red - submeanr)*variancer;*/
+
+				if (blue < 0)
+					blue = 0;
+				else if (blue > 255)
+					blue = 255;
+
+				if (green < 0)
+					green = 0;
+				else if (green > 255)
+					green = 255;
+
+				if (red < 0)
+					red = 0;
+				else if (red > 255)
+					red = 255;
+
+				dst_img[(dst_y * cur_dst_width + dst_x) * 3] = (unsigned char)blue;
+				dst_img[(dst_y * cur_dst_width + dst_x) * 3 + 1] = (unsigned char)green;
+				dst_img[(dst_y * cur_dst_width + dst_x) * 3 + 2] = (unsigned char)red;
+
+
+				/*if (src_img[(dst_y * dst_width + dst_x) * 3] < 0)
+					src_img[(dst_y * dst_width + dst_x) * 3] = 0;
+				else if (src_img[(dst_y * dst_width + dst_x) * 3] > 255)
+					src_img[(dst_y * dst_width + dst_x) * 3] = 255;
+
+				if (src_img[(dst_y * dst_width + dst_x) * 3 + 1] < 0)
+					src_img[(dst_y * dst_width + dst_x) * 3 + 1] = 0;
+				else if (src_img[(dst_y * dst_width + dst_x) * 3 + 1] > 255)
+					src_img[(dst_y * dst_width + dst_x) * 3 + 1] = 255;
+
+				if (src_img[(dst_y * dst_width + dst_x) * 3 + 2] < 0)
+					src_img[(dst_y * dst_width + dst_x) * 3 + 2] = 0;
+				else if (src_img[(dst_y * dst_width + dst_x) * 3 + 2] > 255)
+					src_img[(dst_y * dst_width + dst_x) * 3 + 2] = 255;
+
+
+				dst_img[(dst_y * dst_width + dst_x) * 3] = (unsigned char)src_img[(dst_y * dst_width + dst_x) * 3];
+				dst_img[(dst_y * dst_width + dst_x) * 3 + 1] = (unsigned char)src_img[(dst_y * dst_width + dst_x) * 3 + 1];
+				dst_img[(dst_y * dst_width + dst_x) * 3 + 2] = (unsigned char)src_img[(dst_y * dst_width + dst_x) * 3 + 2];*/
+			}
+		}
+	}
+
+	cudaError_t PartMemResizeBatch(unsigned char* d_srcRGB, int src_width, int src_height, unsigned char** d_dstRGB, int count, int* left, int* top, int* right, int* bottom, int *dst_w, int *dst_h, float submeanb, float submeang, float submeanr,
+		float varianceb, float varianceg, float variancer)
+	{
+	/*	cudaEvent_t start, stop;
+		float time;
+		cudaEventCreate(&start);
+		cudaEventCreate(&stop);
+		cudaEventRecord(start, 0);*/
+
+		dim3 block(32, 16, 1);
+		dim3 grid((*std::max_element(dst_w, dst_w+ count) + (block.x - 1)) / block.x, (*std::max_element(dst_h, dst_h + count) + (block.y - 1)) / block.y, count);
+
+		int * gpu_left;
+		cudaMalloc(&gpu_left, 1000 * sizeof(int));
+		cudaMemcpy(gpu_left, left, count * sizeof(int), cudaMemcpyHostToDevice);
+
+		int * gpu_right;
+		cudaMalloc(&gpu_right, 1000 * sizeof(int));
+		cudaMemcpy(gpu_right, right, count * sizeof(int), cudaMemcpyHostToDevice);
+
+		int * gpu_top;
+		cudaMalloc(&gpu_top, 1000 * sizeof(int));
+		cudaMemcpy(gpu_top, top, count * sizeof(int), cudaMemcpyHostToDevice);
+
+		int * gpu_bottom;
+		cudaMalloc(&gpu_bottom, 1000 * sizeof(int));
+		cudaMemcpy(gpu_bottom, bottom, count * sizeof(int), cudaMemcpyHostToDevice);
+
+		int * gpu_dst_w;
+		cudaMalloc(&gpu_dst_w, 1000 * sizeof(int));
+		cudaMemcpy(gpu_dst_w, dst_w, count * sizeof(int), cudaMemcpyHostToDevice);
+
+		int * gpu_dst_h;
+		cudaMalloc(&gpu_dst_h, 1000 * sizeof(int));
+		cudaMemcpy(gpu_dst_h, dst_h, count * sizeof(int), cudaMemcpyHostToDevice);
+
+		unsigned char** gpu_dst_rgb;
+		cudaMalloc(&gpu_dst_rgb, 1000 * sizeof(unsigned char*));
+		cudaMemcpy(gpu_dst_rgb, d_dstRGB, count * sizeof(unsigned char*), cudaMemcpyHostToDevice);
+
+		//cudaMemcpy(cpu_personfloat, d_srcRGB, 112*224*2*sizeof(float), cudaMemcpyDeviceToHost);
+		//            for(int i=0;i<100;i++)
+		//            {
+		//                  printf("the score is %f\t",cpu_personfloat[i]);
+		//            }
+		PartCopy_ResizeImgBilinearBGR_Mean_Variance_CUDAKernel << < grid, block >> > (
+			d_srcRGB, src_width, src_height,
+			gpu_left, gpu_top, gpu_right, gpu_bottom,
+			gpu_dst_rgb, count, gpu_dst_w, gpu_dst_h,
+			submeanb, submeang, submeanr,
+			varianceb, varianceg, variancer);
+		cudaFree(gpu_top);
+		cudaFree(gpu_bottom);
+		cudaFree(gpu_left);
+		cudaFree(gpu_right);
+		cudaFree(gpu_dst_w);
+		cudaFree(gpu_dst_h);
+		cudaFree(gpu_dst_rgb);
+	
+		cudaError_t cudaStatus = cudaGetLastError();
+		if (cudaStatus != cudaSuccess) {
+			fprintf(stderr, "Part 270 kernel_memcopy launch failed: %s\n", cudaGetErrorString(cudaStatus));
+			return cudaStatus;
+		}
+		cudaStatus = cudaDeviceSynchronize();
+		if (cudaStatus != cudaSuccess) {
+			fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_bilinear!\n", cudaStatus);
+			return cudaStatus;
+		}
+
+		/*cudaEventRecord(stop, 0);
+		cudaEventSynchronize(stop);
+		cudaEventElapsedTime(&time, start, stop);
+		cudaEventDestroy(start);
+		cudaEventDestroy(stop);
+		printf("ºËº¯ÊýÏûºÄÊ±¼ä:%f\n", time);*/
+
+		return cudaStatus;
+	}
+
+}
 \ No newline at end of file
+
+
+#include "cuda_kernels.h"
+
+typedef unsigned char   uint8;
+typedef unsigned int    uint32;
+typedef int             int32;
+
+namespace cuda_common
+{
+	__device__ unsigned char clip_value(unsigned char x, unsigned char min_val, unsigned char  max_val){
+		if (x>max_val){
+			return max_val;
+		}
+		else if (x<min_val){
+			return min_val;
+		}
+		else{
+			return x;
+		}
+	}
+
+	__global__ void kernel_rgb2yuv(unsigned char *src_img, unsigned char* Y, unsigned char* u, unsigned char* v,
+		int src_width, int src_height, size_t yPitch)
+	{
+		const int x = blockIdx.x * blockDim.x + threadIdx.x;
+		const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+		if (x >= src_width)
+			return; //x = width - 1;
+
+		if (y >= src_height)
+			return; // y = height - 1;
+		
+		int B = src_img[y * src_width * 3 + x * 3];
+		int G = src_img[y * src_width * 3 + x * 3 + 1];
+		int R = src_img[y * src_width * 3 + x * 3 + 2];
+
+		/*int B = src_img[y * src_width + x];
+		int G = src_img[src_width * src_height + y * src_width + x];
+		int R = src_img[src_width * src_height * 2 + y * src_width + x];*/
+
+		Y[y * yPitch + x] = clip_value((unsigned char)(0.299 * R + 0.587 * G + 0.114 * B), 0, 255);
+		u[y * src_width + x] = clip_value((unsigned char)(-0.147 * R - 0.289 * G + 0.436 * B + 128), 0, 255);
+		v[y * src_width + x] = clip_value((unsigned char)(0.615 * R - 0.515 * G - 0.100 * B + 128), 0, 255);
+
+		//Y[y * yPitch + x] = clip_value((unsigned char)(0.257 * R + 0.504 * G + 0.098 * B + 16), 0, 255);
+		//u[y * src_width + x] = clip_value((unsigned char)(-0.148 * R - 0.291 * G + 0.439 * B + 128), 0, 255);
+		//v[y * src_width + x] = clip_value((unsigned char)(0.439 * R - 0.368 * G - 0.071 * B + 128), 0, 255);
+	}
+
+	__global__ void kernel_rgb2yuv(float *src_img, unsigned char* Y, unsigned char* u, unsigned char* v,
+		int src_width, int src_height, size_t yPitch)
+	{
+		const int x = blockIdx.x * blockDim.x + threadIdx.x;
+		const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+		if (x >= src_width)
+			return; //x = width - 1;
+
+		if (y >= src_height)
+			return; // y = height - 1;
+
+		float B = src_img[y * src_width + x];
+		float G = src_img[src_width * src_height + y * src_width + x];
+		float R = src_img[src_width * src_height * 2 + y * src_width + x];
+
+		Y[y * yPitch + x] = clip_value((unsigned char)(0.299 * R + 0.587 * G + 0.114 * B), 0, 255);
+		u[y * src_width + x] = clip_value((unsigned char)(-0.147 * R - 0.289 * G + 0.436 * B + 128), 0, 255);
+		v[y * src_width + x] = clip_value((unsigned char)(0.615 * R - 0.515 * G - 0.100 * B + 128), 0, 255);
+
+		//Y[y * yPitch + x] = clip_value((unsigned char)(0.257 * R + 0.504 * G + 0.098 * B + 16), 0, 255);
+		//u[y * src_width + x] = clip_value((unsigned char)(-0.148 * R - 0.291 * G + 0.439 * B + 128), 0, 255);
+		//v[y * src_width + x] = clip_value((unsigned char)(0.439 * R - 0.368 * G - 0.071 * B + 128), 0, 255);
+	}
+
+	extern "C"
+	__global__ void kernel_resize_UV(unsigned char* src_img, unsigned char *dst_img,
+		int src_width, int src_height, int dst_width, int dst_height, int nPitch)
+	{
+		const int x = blockIdx.x * blockDim.x + threadIdx.x;
+		const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+		if (x >= dst_width)
+			return; //x = width - 1;
+
+		if (y >= dst_height)
+			return; // y = height - 1;
+
+		float fx = (x + 0.5)*src_width / (float)dst_width - 0.5;
+		float fy = (y + 0.5)*src_height / (float)dst_height - 0.5;
+		int ax = floor(fx);
+		int ay = floor(fy);
+		if (ax < 0)
+		{
+			ax = 0;
+		}
+		else if (ax > src_width - 2)
+		{
+			ax = src_width - 2;
+		}
+
+		if (ay < 0){
+			ay = 0;
+		}
+		else if (ay > src_height - 2)
+		{
+			ay = src_height - 2;
+		}
+
+		int A = ax + ay*src_width;
+		int B = ax + ay*src_width + 1;
+		int C = ax + ay*src_width + src_width;
+		int D = ax + ay*src_width + src_width + 1;
+
+		float w1, w2, w3, w4;
+		w1 = fx - ax;
+		w2 = 1 - w1;
+		w3 = fy - ay;
+		w4 = 1 - w3;
+
+		unsigned char val = src_img[A] * w2*w4 + src_img[B] * w1*w4 + src_img[C] * w2*w3 + src_img[D] * w1*w3;
+
+		dst_img[y * nPitch + x] = clip_value(val,0,255);
+	}
+
+	cudaError_t RGB2YUV(float* d_srcRGB, int src_width, int src_height,
+						unsigned char* Y, size_t yPitch, int yWidth, int yHeight,
+						unsigned char* U, size_t uPitch, int uWidth, int uHeight,
+						unsigned char* V, size_t vPitch, int vWidth, int vHeight)
+	{
+		unsigned char * u ;
+		unsigned char * v ;
+
+		cudaError_t cudaStatus;
+
+		cudaStatus = cudaMalloc((void**)&u, src_width * src_height * sizeof(unsigned char));
+		cudaStatus = cudaMalloc((void**)&v, src_width * src_height * sizeof(unsigned char));
+
+		dim3 block(32, 16, 1);
+		dim3 grid((src_width + (block.x - 1)) / block.x, (src_height + (block.y - 1)) / block.y, 1);
+		dim3 grid1((uWidth + (block.x - 1)) / block.x, (uHeight + (block.y - 1)) / block.y, 1);
+		dim3 grid2((vWidth + (block.x - 1)) / block.x, (vHeight + (block.y - 1)) / block.y, 1);
+
+		kernel_rgb2yuv << < grid, block >> >(d_srcRGB, Y, u, v, src_width, src_height, yPitch);
+
+		cudaStatus = cudaGetLastError();
+		if (cudaStatus != cudaSuccess) {
+			fprintf(stderr, "kernel_rgb2yuv launch failed: %s\n", cudaGetErrorString(cudaStatus));
+			goto Error;
+		}
+
+		cudaStatus = cudaDeviceSynchronize();
+		if (cudaStatus != cudaSuccess) {
+			fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_rgb2yuv!\n", cudaStatus);
+			goto Error;
+		}
+
+		kernel_resize_UV << < grid1, block >> >(u, U, src_width, src_height, uWidth, uHeight, uPitch);
+
+		cudaStatus = cudaGetLastError();
+		if (cudaStatus != cudaSuccess) {
+			fprintf(stderr, "kernel_resize_UV launch failed: %s\n", cudaGetErrorString(cudaStatus));
+			goto Error;
+		}
+
+		cudaStatus = cudaDeviceSynchronize();
+		if (cudaStatus != cudaSuccess) {
+			fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_resize_UV!\n", cudaStatus);
+			goto Error;
+		}
+
+		kernel_resize_UV << < grid2, block >> >(v, V, src_width, src_height, vWidth, vHeight, vPitch);
+
+		cudaStatus = cudaGetLastError();
+		if (cudaStatus != cudaSuccess) {
+			fprintf(stderr, "kernel_resize_UV launch failed: %s\n", cudaGetErrorString(cudaStatus));
+			goto Error;
+		}
+
+		cudaStatus = cudaDeviceSynchronize();
+		if (cudaStatus != cudaSuccess) {
+			fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_resize_UV!\n", cudaStatus);
+			goto Error;
+		}
+
+Error :
+		cudaFree(u);
+		cudaFree(v);
+
+		return cudaStatus;
+	}
+
+
+
+	cudaError_t RGB2YUV(unsigned char* d_srcRGB, int src_width, int src_height,
+		unsigned char* Y, size_t yPitch, int yWidth, int yHeight,
+		unsigned char* U, size_t uPitch, int uWidth, int uHeight,
+		unsigned char* V, size_t vPitch, int vWidth, int vHeight)
+	{
+		unsigned char * u;
+		unsigned char * v;
+
+		cudaError_t cudaStatus;
+
+		cudaStatus = cudaMalloc((void**)&u, src_width * src_height * sizeof(unsigned char));
+		cudaStatus = cudaMalloc((void**)&v, src_width * src_height * sizeof(unsigned char));
+
+		dim3 block(32, 16, 1);
+		dim3 grid((src_width + (block.x - 1)) / block.x, (src_height + (block.y - 1)) / block.y, 1);
+		dim3 grid1((uWidth + (block.x - 1)) / block.x, (uHeight + (block.y - 1)) / block.y, 1);
+		dim3 grid2((vWidth + (block.x - 1)) / block.x, (vHeight + (block.y - 1)) / block.y, 1);
+
+		kernel_rgb2yuv << < grid, block >> >(d_srcRGB, Y, u, v, src_width, src_height, yPitch);
+
+		cudaStatus = cudaGetLastError();
+		if (cudaStatus != cudaSuccess) {
+			fprintf(stderr, "kernel_rgb2yuv launch failed: %s\n", cudaGetErrorString(cudaStatus));
+			goto Error;
+		}
+
+		cudaStatus = cudaDeviceSynchronize();
+		if (cudaStatus != cudaSuccess) {
+			fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_rgb2yuv!\n", cudaStatus);
+			goto Error;
+		}
+
+		kernel_resize_UV << < grid1, block >> >(u, U, src_width, src_height, uWidth, uHeight, uPitch);
+
+		cudaStatus = cudaGetLastError();
+		if (cudaStatus != cudaSuccess) {
+			fprintf(stderr, "kernel_resize_UV launch failed: %s\n", cudaGetErrorString(cudaStatus));
+			goto Error;
+		}
+
+		cudaStatus = cudaDeviceSynchronize();
+		if (cudaStatus != cudaSuccess) {
+			fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_resize_UV!\n", cudaStatus);
+			goto Error;
+		}
+
+		kernel_resize_UV << < grid2, block >> >(v, V, src_width, src_height, vWidth, vHeight, vPitch);
+
+		cudaStatus = cudaGetLastError();
+		if (cudaStatus != cudaSuccess) {
+			fprintf(stderr, "kernel_resize_UV launch failed: %s\n", cudaGetErrorString(cudaStatus));
+			goto Error;
+		}
+
+		cudaStatus = cudaDeviceSynchronize();
+		if (cudaStatus != cudaSuccess) {
+			fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_resize_UV!\n", cudaStatus);
+			goto Error;
+		}
+
+	Error:
+		cudaFree(u);
+		cudaFree(v);
+
+		return cudaStatus;
+	}
+}
+
+#include "cuda_kernels.h"
+
+typedef unsigned char   uchar;
+typedef unsigned int    uint32;
+typedef int             int32;
+
+namespace cuda_common
+{
+	__global__ void kernel_bilinear(float *src_img, float *dst_img,
+		int src_width, int src_height, int dst_width, int dst_height)
+	{
+		const int x = blockIdx.x * blockDim.x + threadIdx.x;
+		const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+		if (x < dst_width && y < dst_height)
+		{
+			float fx = (x + 0.5)*src_width / (float)dst_width - 0.5;
+			float fy = (y + 0.5)*src_height / (float)dst_height - 0.5;
+			int ax = floor(fx);
+			int ay = floor(fy);
+			if (ax < 0)
+			{
+				ax = 0;
+			}
+			else if (ax > src_width - 2)
+			{
+				ax = src_width - 2;
+			}
+
+			if (ay < 0){
+				ay = 0;
+			}
+			else if (ay > src_height - 2)
+			{
+				ay = src_height - 2;
+			}
+
+			int A = ax + ay*src_width;
+			int B = ax + ay*src_width + 1;
+			int C = ax + ay*src_width + src_width;
+			int D = ax + ay*src_width + src_width + 1;
+
+			float w1, w2, w3, w4;
+			w1 = fx - ax;
+			w2 = 1 - w1;
+			w3 = fy - ay;
+			w4 = 1 - w3;
+
+			float blue = src_img[A] * w2*w4 + src_img[B] * w1*w4 + src_img[C] * w2*w3 + src_img[D] * w1*w3;
+
+			float green = src_img[src_width * src_height + A] * w2*w4 + src_img[src_width * src_height + B] * w1*w4 
+				+ src_img[src_width * src_height + C] * w2*w3 + src_img[src_width * src_height + D] * w1*w3;
+
+			float red = src_img[src_width * src_height * 2 + A] * w2*w4 + src_img[src_width * src_height * 2 + B] * w1*w4 
+				+ src_img[src_width * src_height * 2 + C] * w2*w3 + src_img[src_width * src_height * 2 + D] * w1*w3;
+
+			dst_img[y * dst_width + x] = blue;
+			dst_img[dst_width * dst_height + y * dst_width + x] = green;
+			dst_img[dst_width * dst_height * 2 + y * dst_width + x] = red;
+		}
+	}
+
+	cudaError_t ResizeImage(float* d_srcRGB, int src_width, int src_height, float* d_dstRGB, int dst_width, int dst_height)
+	{
+		dim3 block(32, 16, 1);
+		dim3 grid((dst_width + (block.x - 1)) / block.x, (dst_height + (block.y - 1)) / block.y, 1);
+
+		kernel_bilinear << < grid, block >> >(d_srcRGB, d_dstRGB, src_width, src_height, dst_width, dst_height);
+
+		cudaError_t cudaStatus = cudaGetLastError();
+		if (cudaStatus != cudaSuccess) {
+			fprintf(stderr, "kernel_bilinear launch failed: %s\n", cudaGetErrorString(cudaStatus));
+			return cudaStatus;
+		}
+
+		cudaStatus = cudaDeviceSynchronize();
+		if (cudaStatus != cudaSuccess) {
+			fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_bilinear!\n", cudaStatus);
+			return cudaStatus;
+		}
+
+		return cudaStatus;
+	}
+}
 \ No newline at end of file
@@ -218,8 +218,7 @@ inline int gpuGetMaxGflopsDeviceIdDRV()
     // Find the best major SM Architecture GPU device
     while (current_device < device_count)
     {
-        checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
-        checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));
+        checkCudaErrors(cuDeviceComputeCapability(&major, &minor, current_device));
         if (major > 0 && major < 9999)
         {
@@ -240,9 +239,7 @@ inline int gpuGetMaxGflopsDeviceIdDRV()
         checkCudaErrors(cuDeviceGetAttribute(&clockRate,
                                              CU_DEVICE_ATTRIBUTE_CLOCK_RATE,
                                              current_device));
-
-        checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
-        checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));
+        checkCudaErrors(cuDeviceComputeCapability(&major, &minor, current_device));
         int computeMode;
         getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device);
@@ -320,9 +317,7 @@ inline int gpuGetMaxGflopsGLDeviceIdDRV()
     while (current_device < device_count)
     {
         checkCudaErrors(cuDeviceGetName(deviceName, 256, current_device));
-
-        checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
-        checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));
+        checkCudaErrors(cuDeviceComputeCapability(&major, &minor, current_device));
 #if CUDA_VERSION >= 3020
         checkCudaErrors(cuDeviceGetAttribute(&bTCC,  CU_DEVICE_ATTRIBUTE_TCC_DRIVER, current_device));
@@ -374,9 +369,7 @@ inline int gpuGetMaxGflopsGLDeviceIdDRV()
         checkCudaErrors(cuDeviceGetAttribute(&clockRate,
                                              CU_DEVICE_ATTRIBUTE_CLOCK_RATE,
                                              current_device));
-
-        checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
-        checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));
+        checkCudaErrors(cuDeviceComputeCapability(&major, &minor, current_device));
 #if CUDA_VERSION >= 3020
         checkCudaErrors(cuDeviceGetAttribute(&bTCC,  CU_DEVICE_ATTRIBUTE_TCC_DRIVER, current_device));
@@ -507,9 +500,7 @@ inline bool checkCudaCapabilitiesDRV(int major_version, int minor_version, int d
     checkCudaErrors(cuDeviceGet(&cuDevice, devID));
     checkCudaErrors(cuDeviceGetName(name, 100, cuDevice));
-
-    checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, devID));
-    checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, devID));
+    checkCudaErrors(cuDeviceComputeCapability(&major, &minor, devID));
     if ((major > major_version) ||
         (major == major_version && minor >= minor_version))
@@ -12,15 +12,52 @@
 typedef enum
 {
-	ITU601 = 1,
-	ITU709 = 2
-} e_ColorSpace;
+	ITU_601 = 1,
+	ITU_709 = 2
+} FF_ColorSpace;
 namespace cuda_common
 {
-	cudaError_t setColorSpace(e_ColorSpace CSC, float hue);
+	cudaError_t setColorSpace(FF_ColorSpace CSC, float hue);
 	cudaError_t NV12ToRGBnot(CUdeviceptr d_srcNV12, size_t nSourcePitch, unsigned char* d_dstRGB, int width, int height);
 	cudaError_t CUDAToBGR(CUdeviceptr dataY, CUdeviceptr dataUV, size_t pitchY, size_t pitchUV, unsigned char* d_dstRGB, int width, int height);
+
+	
+	cudaError_t ResizeImage(float* d_srcRGB, int src_width, int src_height, float* d_dstRGB, int dst_width, int dst_height);
+
+	cudaError_t RGB2YUV(float* d_srcRGB, int src_width, int src_height,
+		unsigned char* Y, size_t yPitch, int yWidth, int yHeight,
+		unsigned char* U, size_t uPitch, int uWidth, int uHeight,
+		unsigned char* V, size_t vPitch, int vWidth, int vHeight);
+
+	cudaError_t RGB2YUV(unsigned char* d_srcRGB, int src_width, int src_height,
+		unsigned char* Y, size_t yPitch, int yWidth, int yHeight,
+		unsigned char* U, size_t uPitch, int uWidth, int uHeight,
+		unsigned char* V, size_t vPitch, int vWidth, int vHeight);
+
+	cudaError_t PartMemCopy(unsigned char* d_srcRGB, int src_width, int src_height, unsigned char* d_dstRGB, int left, int top, int right, int bottom);
+	//	cudaError_t PartMemResize(float* d_srcRGB, int src_width, int src_height, float* d_dstRGB, int left, int top, int right, int bottom);
+
+	cudaError_t PartMemResizeBatch(unsigned char* d_srcRGB, int srcimg_width, int srcimg_height, unsigned char** d_dstRGB, int count,
+		int* left, int* top, int* right, int* bottom, int *dst_w, int *dst_h,
+		float submeanb, float submeang, float submeanr,
+		float varianceb, float varianceg, float variancer);
+
+	cudaError_t DrawImage(float* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom);
+	cudaError_t DrawImage(unsigned char* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom);
+
+	cudaError_t DrawLine(float* d_srcRGB, int src_width, int src_height, int begin_x, int begin_y, int end_x, int end_y);
 }
+
+int jpegNPP(const char *szOutputFile, float* d_srcRGB, int img_width, int img_height);
+int jpegNPP(const char *szOutputFile, unsigned char* d_srcRGB, int img_width, int img_height);
+
+int jpegNPP(const char *szOutputFile, float* d_srcRGB);
+int jpegNPP(const char *szOutputFile, unsigned char* d_srcRGB);
+
+int initTable();
+int initTable(int flag, int width, int height);
+int releaseJpegNPP();
+
@@ -5,3 +5,9 @@
 #define __FILENAME__ (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
+#define CHECK_CUDA(call) \
+{\
+    const cudaError_t error_code = call;\
+    if (cudaSuccess != error_code)\
+        LOG_ERROR("CUDA error, code: {} reason: {}", error_code, cudaGetErrorString(error_code));\
+}
 \ No newline at end of file
@@ -10,11 +10,13 @@ extern &quot;C&quot; {
 #include "libswscale/swscale.h"
 }
-#include "../logger.hpp"
-
 #include"RTPTcpReceiver.h"
 #include"RTPUdpReceiver.h"
+#include <cuda_runtime.h>
+
+#include "common_header.h"
+
 #define ECLOSED 0
 #define ECLOSING 1
 #define ERUNNING 2
@@ -36,6 +38,7 @@ FFGB28181Decoder::FFGB28181Decoder() {
 	m_frameSkip = 1;
     m_port = -1;
     m_dec_keyframe = false;
+    m_post_decode_thread = 0;
 }
 FFGB28181Decoder::~FFGB28181Decoder()
@@ -47,11 +50,6 @@ FFGB28181Decoder::~FFGB28181Decoder()
         avcodec_free_context(&m_pAVCodecCtx);
     }
-    if (m_pAVFrame) {
-        av_frame_free(&m_pAVFrame);
-        m_pAVFrame = NULL;
-    }
-
     m_dec_keyframe = false;
     LOG_INFO("destroy OK--{}", m_dec_name);
@@ -74,9 +72,22 @@ void FFGB28181Decoder::close(){
         m_rtpPtr = nullptr;
     }
-    LOG_INFO("解码器关闭成功 --{}", m_dec_name);
+    if (gpu_options) av_dict_free(&gpu_options);
+
+    if (m_post_decode_thread != 0)
+	{
+		pthread_join(m_post_decode_thread,0);
+	}
+
+    while(mFrameQueue.size() > 0){
+		AVFrame * gpuFrame = mFrameQueue.front();
+		av_frame_free(&gpuFrame); 
+		mFrameQueue.pop();
+	}
     m_status = ECLOSED;
+
+    LOG_INFO("解码器关闭成功 --{}", m_dec_name);
 }
 bool FFGB28181Decoder::init(FFDecConfig& cfg){
@@ -124,7 +135,18 @@ bool FFGB28181Decoder::start() {
     LOG_INFO("start - {} {}: ", m_dec_name, m_port);
-    return m_rtpPtr->Open((uint16_t)m_port);
+    bool bRet = m_rtpPtr->Open((uint16_t)m_port);
+    if(bRet){
+        pthread_create(&m_post_decode_thread,0,
+            [](void* arg)
+            {
+                FFGB28181Decoder* a=(FFGB28181Decoder*)arg;
+                a->post_decode_thread();
+                return (void*)0;
+            }
+        ,this);
+    }
+    return bRet;
 }
 void FFGB28181Decoder::setDecKeyframe(bool bKeyframe){
@@ -151,15 +173,12 @@ void FFGB28181Decoder::stream_callback(int videoType, char* data, int len, int i
         return;
     }
-	AVPacket framePacket = {}, mp4Packet = {};
+	AVPacket framePacket = {};
     av_init_packet(&framePacket);
-	av_init_packet(&mp4Packet);
     framePacket.size = len;
     framePacket.data = (uint8_t*)data;
-	AVDictionary *gpu_options = nullptr;
-
 	if (m_pAVCodecCtx == nullptr) {
         LOG_INFO("frame data is zero --{}", m_dec_name);
 		if (VIDEO_TYPE_H264 == videoType) {
@@ -192,7 +211,6 @@ void FFGB28181Decoder::stream_callback(int videoType, char* data, int len, int i
 		}
         m_pAVCodecCtx = avcodec_alloc_context3(m_pAVCodec);
-       
         if (m_gpuid >= 0) {
             char gpui[8] = { 0 };
@@ -211,8 +229,6 @@ void FFGB28181Decoder::stream_callback(int videoType, char* data, int len, int i
         if (avcodec_open2(m_pAVCodecCtx, m_pAVCodec, &gpu_options) < 0)
 			return;
-
-		m_pAVFrame = av_frame_alloc();
 	}
     //开始解码
@@ -220,6 +236,7 @@ void FFGB28181Decoder::stream_callback(int videoType, char* data, int len, int i
     if (ret < 0) {
         //send_exception(RunMessageType::E2002, e_msg);
         LOG_ERROR("Real stream视频解码失败，请检查视频设备{}: avcodec_send_packet failed. ret={}", m_dec_name, ret);
+        av_packet_unref(&framePacket);
         return;
     }
@@ -228,61 +245,67 @@ void FFGB28181Decoder::stream_callback(int videoType, char* data, int len, int i
         frameH = m_pAVCodecCtx->height;
         if (frameW <= 0 || frameH <= 0) {
             LOG_ERROR("[{}] frame W or H is error! ({},{})", m_dec_name, frameW, frameH);
+            av_packet_unref(&framePacket);
             return;
         }	
     }
 	// m_fps = m_pAVCodecCtx->pkt_timebase.den == 0 ? 25.0 : av_q2d(m_pAVCodecCtx->pkt_timebase);
     m_fps = av_q2d(m_pAVCodecCtx->framerate);
-	LOG_DEBUG("frameW {}--frameH {}", frameW, frameH);
-    while (ret  >= 0) {
-        ret = avcodec_receive_frame(m_pAVCodecCtx, m_pAVFrame);
-        if (ret == AVERROR_EOF || ret == AVERROR(EAGAIN))
-            return;
-        else if (ret < 0) {       
-            if (m_frameCount % 10 == 0){
-                //send_exception(RunMessageType::E2002, e_msg);
-                LOG_ERROR("Real stream视频解码失败，请检查视频设备{}: avcodec_receive_frame failed. ret={}", m_dec_name, ret);
-            }
-            continue;
-        }
+	// LOG_DEBUG("frameW {}--frameH {}", frameW, frameH);
+
+    AVFrame* gpuFrame = av_frame_alloc();
+    ret = avcodec_receive_frame(m_pAVCodecCtx, gpuFrame);
+    if ((ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) || ret < 0){
+        LOG_ERROR("{} - Failed to receive frame: {}", m_dec_name, ret);
+        av_packet_unref(&framePacket);
+        av_frame_free(&gpuFrame);
+        return;
+    }
-        if (++m_frameCount % m_frameSkip != 0) continue;
-        
-        if (m_pAVFrame->width != frameW || m_pAVFrame->height != frameH){
-            LOG_INFO("AVFrame is inconsistent: width is {}, height is {}; original frameW is {}, frameH is {}--{}", m_pAVFrame->width, m_pAVFrame->height, frameW, frameH , m_dec_name);
-            continue;
-        }
-        
-        LOG_DEBUG("curpos is: {}", m_frameCount);
-
-		post_decoded_cbk(m_postDecArg, m_pAVFrame);
-
-            //LOG_count++;
-            //if (LOG_count > 100000) {
-            //    LOG_INFO("Real frame send_shm_videoframe pts={}-{}", localPts, m_dec_name);
-            //    //LOG_count = 0;
-            //}
-        //}
-        //catch (GeneralException2& e)
-        //{
-        //    LOG_ERROR("send_shm_videoframe failed! {}--{}--{}", e.err_code(), e.err_msg(), m_dec_name);
-        //    if (e.err_code() == -666) {
-        //        this->close();
-        //    }
-        //    
-        //    if (e.err_code() == ERROR_MEMORY) {
-        //        if (m_frameCount % 10 == 0) {
-        //            string e_msg;
-        //            format_string(e_msg, "服务器资源内存分配失败, 在vas模块%s文件%d行出现无法获取内存的情况!", __FILE__, __LINE__);
-        //            send_exception(RunMessageType::F4001, e_msg);
-        //            LOG_ERROR("{}", e_msg);
-        //        }
-        //    }
-        //    return;
-        //}
+    av_packet_unref(&framePacket);
+    
+    if (gpuFrame->width != frameW || gpuFrame->height != frameH){
+        LOG_INFO("AVFrame is inconsistent: width is {}, height is {}; original frameW is {}, frameH is {}--{}", gpuFrame->width, gpuFrame->height, frameW, frameH , m_dec_name);
+        av_frame_free(&gpuFrame);
+        return;
+    }
+
+    m_queue_mutex.lock();
+    if(mFrameQueue.size() <= 10){
+        mFrameQueue.push(gpuFrame);
+    }else{
+        av_frame_free(&gpuFrame); 
     }
+    m_queue_mutex.unlock();
+}
+
+void FFGB28181Decoder::post_decode_thread(){
+	
+	int index = 0;
+	while (isRunning())
+	{
+		if(mFrameQueue.size() > 0){
+			std::lock_guard<std::mutex> l(m_snapshot_mutex);
+			// 取队头数据
+			m_queue_mutex.lock();
+			AVFrame * gpuFrame = mFrameQueue.front();
+			mFrameQueue.pop();
+			m_queue_mutex.unlock();
+			// 跳帧
+			if (m_frameSkip == 1 || index % m_frameSkip == 0){
+				post_decoded_cbk(m_postDecArg, gpuFrame);
+			}
+
+			av_frame_free(&gpuFrame); 
+
+			index++;
+            if(index >= 100000){
+                index = 0;
+            }
+		}
+	}
-	if (gpu_options) av_dict_free(&gpu_options);
+	LOG_INFO("post decode thread exited.");
 }
 void FFGB28181Decoder::stream_end_callback()
@@ -6,6 +6,7 @@
 #include "../AbstractDecoder.h"
 #include <atomic>
+#include <mutex>
 struct AVFormatContext;
 struct AVCodecContext;
@@ -14,6 +15,7 @@ struct AVFrame;
 struct AVPacket;
 struct SwsContext;
+using namespace std;
 class FFGB28181Decoder: public AbstractDecoder
 {
@@ -45,19 +47,16 @@ public:
 public:
     void stream_callback(int videoType, char* data, int len, int isKey, uint64_t pts, uint64_t localPts);
 	void stream_end_callback();
+    void post_decode_thread();
 private:
     AVCodecContext* m_pAVCodecCtx {};
     const AVCodec* m_pAVCodec {};
-    AVFrame* m_pAVFrame {};
     int m_gpuid {-1};
 	RTPReceiver* m_rtpPtr;
     int m_port;
-    uint64_t m_frameCount {};
-	
-    AVFrame* pFrameRGB {};
     uint64_t  m_startPts {};
     uint64_t  m_lastPts {};   //上一次pts的值
@@ -71,6 +70,10 @@ private:
     int log_count {};
     std::atomic_int m_status {};
+
+    AVDictionary *gpu_options = nullptr;
+
+    pthread_t m_post_decode_thread;
 };
 #endif // _GB28181_DECODER_H_
-#include "RTPReceiver.h"
+#include "RTPReceiver.h"
 #include "rtppacket.h"
-#include "../logger.hpp"
 #include <thread>
+#include "common_header.h"
+
 #define BUFFERSIZE_1024     1024
 const int kVideoFrameSize         = BUFFERSIZE_1024*BUFFERSIZE_1024*5*2;
@@ -174,7 +175,7 @@ int RTPReceiver::OnPsProcess()
     LOG_INFO("[{}] started.", m_deviceID);
     while (!m_bPsExit) {
 		m_psFrameMutex.lock();
-        LOG_DEBUG("[{}] PS frame size : {}", m_deviceID, m_psVideoFrames.size());
+        // LOG_DEBUG("[{}] PS frame size : {}", m_deviceID, m_psVideoFrames.size());
 		if (m_psVideoFrames.size() <= 0){
 			m_psFrameMutex.unlock();
 			std::this_thread::sleep_for(std::chrono::milliseconds(10));
@@ -257,7 +258,7 @@ int RTPReceiver::ParsePacket(RTPPacket* packet){
             break;
         }
-        LOG_DEBUG("[{}] ParsePacket GetPayloadLength", m_deviceID);
+        // LOG_DEBUG("[{}] ParsePacket GetPayloadLength", m_deviceID);
         if (mark)
         {
@@ -271,7 +272,7 @@ int RTPReceiver::ParsePacket(RTPPacket* packet){
                 std::lock_guard<std::mutex> l(m_psFrameMutex);
                 if (m_psVideoFrames.size() < 100)
                 {
-                    LOG_DEBUG("[{}]ParsePacket push", m_deviceID);
+                    // LOG_DEBUG("[{}]ParsePacket push", m_deviceID);
                     m_psVideoFrames.push(new Frame(frameBuf, offset, false));
                 }
                 else {
@@ -32,7 +32,7 @@ typedef void(*CallBack_VodFileEnd)(void* userdata);
 /** 
  * 请求流
 */
-typedef bool(*CallBack_Request_Stream)();
+typedef bool(*CallBack_Request_Stream)(const char* deviceId);
 // 标识帧, 注意buffer需要自己开辟和释放
 struct Frame {
@@ -85,7 +85,7 @@ class RTPReceiver{
 public:
     RTPReceiver();
-    ~RTPReceiver();
+    virtual ~RTPReceiver();
     virtual bool Open(uint16_t localPort) = 0;
 	virtual bool IsOpened() = 0;
 #include"RTPTcpReceiver.h"
-#include "../logger.hpp"
+#include "common_header.h"
-static long long get_cur_time() {
-
-	chrono::time_point<chrono::system_clock, chrono::milliseconds> tpMicro
-		= chrono::time_point_cast<chrono::milliseconds>(chrono::system_clock::now());
-
-	return tpMicro.time_since_epoch().count();
-}
 // class TcpRTPSession : public RTPSession
 // {
@@ -65,7 +58,7 @@ public:
         LOG_ERROR("Error sending over socket {}, removing destination", sock);
 		DeleteDestination(RTPTCPAddress(sock));
         if(nullptr != tcpReceiver && !tcpReceiver->isClosing()){
-            tcpReceiver->RequestStream();
+            tcpReceiver->ReConnect();
         }
 	}
@@ -90,6 +83,16 @@ static int rtp_revc_thread_(void* param)
 	return self->OnRtpRecv();
 }
+static int listen_finish_thread_(void* param)
+{
+	if (!param)
+	{
+		return -1;
+	}
+
+	RTPTcpReceiver* self = (RTPTcpReceiver*)param;
+	return self->ListenFinish();
+}
 RTPTcpReceiver::RTPTcpReceiver()
 : m_bRtpExit(false)
@@ -143,11 +146,19 @@ bool RTPTcpReceiver::IsOpened(){
 }
 void RTPTcpReceiver::Close(){
+    m_bRtpExit = true;
+
+    if(m_listenFinishThread.joinable()){
+        m_listenFinishThread.join();
+    }
+}
+
+void RTPTcpReceiver::close_task(){
+    m_bRtpExit = true;
     m_bClosing = true;
     m_bAccepted = true;
-    m_bRtpExit = true;
     LOG_DEBUG("[{}] 1.", m_deviceID);
@@ -207,20 +218,22 @@ int RTPTcpReceiver::initSession(int localPort){
 	status = m_rtpSessionPtr->Create(*m_pSessparams, m_pTrans);
 	if (status < 0)
 	{
-        LOG_ERROR("[{}] create session error!!", m_deviceID);
+        // 若status = -59 ,需运行 export LOGNAME=root ，见 https://blog.csdn.net/m0_37876242/article/details/128588162
+        LOG_ERROR("[{}] create session error: {}", m_deviceID, status);
 		return -1;
 	}
     m_rtpThread = std::thread(rtp_revc_thread_, this);
+    m_listenFinishThread = std::thread(listen_finish_thread_, this);
     InitPS();
-    bool bRet = RequestStream();
-    if (!bRet)
-    {
-        LOG_INFO("[{}] 请求流失败！", m_deviceID);
-        return -1;
-    }
+    // bool bRet = RequestStream();
+    // if (!bRet)
+    // {
+    //     LOG_INFO("[{}] 请求流失败！", m_deviceID);
+    //     return -1;
+    // }
     LOG_INFO("[{}] 初始化成功, congratulations !!!", m_deviceID);
@@ -240,17 +253,56 @@ int RTPTcpReceiver::OnRtpRecv()
 	SocketType nServer = -1;
     LOG_INFO("[{}] Poll started.", m_deviceID);
-    int status = -1;
+    int reconn_times = 0;
+    int reaccept_times = 0;
+    bool bReconn = false;
     while(!m_bRtpExit){
         while(!m_bAccepted){
+            if(m_bRtpExit){
+                goto end_flag;
+            }
+            
+            while (!bReconn){
+                if(m_bRtpExit){
+                    goto end_flag;
+                }
+
+                reconn_times++;
+                if(reconn_times > 10){
+                    // 10次请求都失败，结束任务
+                    m_bRtpExit = true;
+                    goto end_flag;
+                }
+                LOG_DEBUG("[{}] RequestStream...", m_deviceID);
+                bReconn = RequestStream();
+                if (bReconn){
+                    LOG_DEBUG("[{}] RequestStream, True", m_deviceID);
+                   continue;
+                }
+                LOG_DEBUG("[{}] RequestStream, False", m_deviceID);
+                
+                std::this_thread::sleep_for(std::chrono::seconds(3));
+            }
+            
             LOG_DEBUG("[{}] accepting...", m_deviceID);
             nServer = accept(m_nListener, (sockaddr*)&clientAddr, (socklen_t * ) &nLen);
             if (-1 == nServer){
-                std::this_thread::sleep_for(std::chrono::milliseconds(10));
+                reaccept_times++;
+                LOG_DEBUG("[{}] reaccept_times =  {}", m_deviceID, reaccept_times);
+                if(reaccept_times > 600){
+                    LOG_DEBUG("[{}] reaccept_times > 600", m_deviceID);
+                    bReconn = false;
+                    reaccept_times = 0;
+                }
+                std::this_thread::sleep_for(std::chrono::milliseconds(50));
                 continue;
             }
+            LOG_DEBUG("[{}] accept success", m_deviceID);
             m_rtpSessionPtr->AddDestination(RTPTCPAddress(nServer));
             m_bAccepted = true;
+            bReconn = false;
+            reconn_times = 0;
+            reaccept_times = 0;
             LOG_INFO("[{}] nServer={}", m_deviceID, nServer);
             break;
@@ -265,7 +317,7 @@ int RTPTcpReceiver::OnRtpRecv()
                 while ((pack = m_rtpSessionPtr->GetNextPacket()) != NULL)
                 {
-                    LOG_DEBUG("[{}] time: {} ", m_deviceID, get_cur_time());
+                    // LOG_DEBUG("[{}] time: {} ", m_deviceID, UtilTools::get_cur_time_ms());
                     ParsePacket(pack);
                     m_rtpSessionPtr->DeletePacket(pack);
@@ -279,6 +331,8 @@ int RTPTcpReceiver::OnRtpRecv()
 		std::this_thread::sleep_for(std::chrono::milliseconds(10));
     }
+end_flag:
+
     m_rtpSessionPtr->Destroy();
     if(nServer > 0){
@@ -293,13 +347,18 @@ int RTPTcpReceiver::OnRtpRecv()
     return 0;
 }
-bool RTPTcpReceiver::RequestStream(){
-    bool bConnect = m_callback_request_stream();
-    if(!bConnect){
-        Close();
-        return false;
+int RTPTcpReceiver::ListenFinish(){
+    while(!m_bRtpExit){
+        std::this_thread::sleep_for(std::chrono::seconds(3));
     }
+
+    close_task();
+}
+
+bool RTPTcpReceiver::ReConnect(){
     m_bAccepted = false;
+}
-    return true;
+bool RTPTcpReceiver::RequestStream(){
+    return m_callback_request_stream(m_deviceID.c_str());
 }
 \ No newline at end of file
@@ -57,11 +57,14 @@ public:
 public:
 	int OnRtpRecv();
+    bool ReConnect();
+    int ListenFinish();
     bool RequestStream();
     bool isClosing();
 private:
     int initSession(int localPort);
+    void close_task();
 private:
@@ -77,9 +80,12 @@ private:
     std::thread      	m_rtpThread; 					// RTP接收线程
     SocketType          m_nListener;
-    RTPSession*	    m_rtpSessionPtr;					// RTP会话
-    RTPSessionParams*    m_pSessparams;
-    MyTCPTransmitter*    m_pTrans;
+    RTPSession*	        m_rtpSessionPtr;					// RTP会话
+    RTPSessionParams*   m_pSessparams;
+    MyTCPTransmitter*   m_pTrans;
+
+    std::thread      	m_listenFinishThread; 					// RTP接收线程
+    
 };
 #endif  // _RTP_TCP_RECEIVER_H_
@@ -6,7 +6,7 @@
 #include <thread>
 #include <chrono>
-#include "../logger.hpp"
+#include "common_header.h"
 using namespace std;
@@ -42,15 +42,6 @@ private:
 	}
 };
-
-static long long get_cur_time() {
-
-	chrono::time_point<chrono::system_clock, chrono::milliseconds> tpMicro
-		= chrono::time_point_cast<chrono::milliseconds>(chrono::system_clock::now());
-
-	return tpMicro.time_since_epoch().count();
-}
-
 static int rtp_revc_thread_(void* param)
 {
 	if (!param)
@@ -175,7 +166,7 @@ int RTPUdpReceiver::OnRtpRecv()
 			if (m_rtpSessionPtr->GotoFirstSourceWithData())
 			{
 				LOG_INFO("OnRtpRecv GotoFirstSourceWithData --{}", m_deviceID);
-    			last_recv_ts = get_cur_time();
+    			last_recv_ts = UtilTools::get_cur_time_ms();
 				m_idleCount = 0;
 				m_noDataCount = 0;
 				do
@@ -261,7 +252,7 @@ int RTPUdpReceiver::OnRtpRecv()
 			//		//若是30000，时长大约 18s
 			//		if(m_idleCount > 30000)
 			//		{
-   // 					uint64_t cts = get_cur_time();
+   // 					uint64_t cts = UtilTools::get_cur_time_ms();
    // 					float duration_not_recv = (cts - last_recv_ts) / 1000.0;
    // 					
    // 					//LOG_ERROR("************I haven't  got stream from hik gateway exceed {}s,send eof********{}******", duration_not_recv, m_deviceID);	 
+#ifndef _COMMON_HEADER_H_
+#define _COMMON_HEADER_H_
+
+
+#include "../logger.hpp"
+#include "../utiltools.hpp"
+
+#endif
 \ No newline at end of file
@@ -8,9 +8,11 @@
 {       CMpeg2Demux class.                              }
 {                                                       }
 {*******************************************************/
+
 #ifndef _DEMUXER_H_
 #define _DEMUXER_H_
+
 #include <stdint.h>
 #include "buffer.h"
+/*
+* Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
+*
+* NOTICE TO USER:
+*
+* This source code is subject to NVIDIA ownership rights under U.S. and
+* international Copyright laws.
+*
+* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
+* CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
+* IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
+* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
+* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
+* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
+* OR PERFORMANCE OF THIS SOURCE CODE.
+*
+* U.S. Government End Users.  This source code is a "commercial item" as
+* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting  of
+* "commercial computer software" and "commercial computer software
+* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
+* and is provided to the U.S. Government only as a commercial end item.
+* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
+* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
+* source code with only those rights set forth herein.
+*/
+
+// This sample needs at least CUDA 5.5 and a GPU that has at least Compute Capability 2.0
+
+// This sample demonstrates a simple image processing pipeline.
+// First, a JPEG file is huffman decoded and inverse DCT transformed and dequantized.
+// Then the different planes are resized. Finally, the resized image is quantized, forward
+// DCT transformed and huffman encoded.
+
+#include "cuda_kernels.h"
+
+#include <npp.h>
+#include <cuda_runtime.h>
+#include "common/UtilNPP/Exceptions.h"
+
+#include "Endianess.h"
+#include <math.h>
+
+#include <string.h>
+#include <fstream>
+#include <iostream>
+
+#include "common/inc/helper_string.h"
+#include "common/inc/helper_cuda.h"
+//#include "MacroDef.h"
+#include "cuda.h"
+
+using namespace std;
+
+struct FrameHeader
+{
+	unsigned char nSamplePrecision;
+	unsigned short nHeight;
+	unsigned short nWidth;
+	unsigned char nComponents;
+	unsigned char aComponentIdentifier[3];
+	unsigned char aSamplingFactors[3];
+	unsigned char aQuantizationTableSelector[3];
+};
+
+struct ScanHeader
+{
+	unsigned char nComponents;
+	unsigned char aComponentSelector[3];
+	unsigned char aHuffmanTablesSelector[3];
+	unsigned char nSs;
+	unsigned char nSe;
+	unsigned char nA;
+};
+
+struct QuantizationTable
+{
+	unsigned char nPrecisionAndIdentifier;
+	unsigned char aTable[64];
+};
+
+struct HuffmanTable
+{
+	unsigned char nClassAndIdentifier;
+	unsigned char aCodes[16];
+	unsigned char aTable[256];
+};
+
+//??准?炼??藕?量??模??
+//unsigned char std_Y_QT[64] =
+//{
+//	16, 11, 10, 16, 24, 40, 51, 61,
+//	12, 12, 14, 19, 26, 58, 60, 55,
+//	14, 13, 16, 24, 40, 57, 69, 56,
+//	14, 17, 22, 29, 51, 87, 80, 62,
+//	18, 22, 37, 56, 68, 109, 103, 77,
+//	24, 35, 55, 64, 81, 104, 113, 92,
+//	49, 64, 78, 87, 103, 121, 120, 101,
+//	72, 92, 95, 98, 112, 100, 103, 99
+//};
+//
+////??准色???藕?量??模??
+//unsigned char std_UV_QT[64] =
+//{
+//	17, 18, 24, 47, 99, 99, 99, 99,
+//	18, 21, 26, 66, 99, 99, 99, 99,
+//	24, 26, 56, 99, 99, 99, 99, 99,
+//	47, 66, 99, 99, 99, 99, 99, 99,
+//	99, 99, 99, 99, 99, 99, 99, 99,
+//	99, 99, 99, 99, 99, 99, 99, 99,
+//	99, 99, 99, 99, 99, 99, 99, 99,
+//	99, 99, 99, 99, 99, 99, 99, 99
+//};
+
+////?炼??藕?量??模??
+//unsigned char std_Y_QT[64] =
+//{
+//	6, 4, 5, 6, 5, 4, 6, 6,
+//	5, 6, 7, 7, 6, 8, 10, 16,
+//	10, 10, 9, 9, 10, 20, 14, 15,
+//	12, 16, 23, 20, 24, 24, 23, 20,
+//	22, 22, 26, 29, 37, 31, 26, 27,
+//	35, 28, 22, 22, 32, 44, 32, 35,
+//	38, 39, 41, 42, 41, 25, 31, 45,
+//	48, 45, 40, 48, 37, 40, 41, 40
+//};
+//
+////色???藕?量??模??
+//unsigned char std_UV_QT[64] =
+//{
+//	7, 7, 7, 10, 8, 10, 19, 10,
+//	10, 19, 40, 26, 22, 26, 40, 40,
+//	40, 40, 40, 40, 40, 40, 40, 40,
+//	40, 40, 40, 40, 40, 40, 40, 40,
+//	40, 40, 40, 40, 40, 40, 40, 40,
+//	40, 40, 40, 40, 40, 40, 40, 40,
+//	40, 40, 40, 40, 40, 40, 40, 40,
+//	40, 40, 40, 40, 40, 40, 40, 40
+//};
+
+//?炼??藕?量??模??
+unsigned char std_Y_QT[64] =
+{
+	0.75 * 6, 0.75 * 4, 0.75 * 5, 0.75 * 6, 0.75 * 5, 0.75 * 4, 0.75 * 6, 0.75 * 6,
+	0.75 * 5, 0.75 * 6, 0.75 * 7, 0.75 * 7, 0.75 * 6, 0.75 * 8, 0.75 * 10, 0.75 * 16,
+	0.75 * 10, 0.75 * 10, 0.75 * 9, 0.75 * 9, 0.75 * 10, 0.75 * 20, 0.75 * 14, 0.75 * 15,
+	0.75 * 12, 0.75 * 16, 0.75 * 23, 0.75 * 20, 0.75 * 24, 0.75 * 24, 0.75 * 23, 0.75 * 20,
+	0.75 * 22, 0.75 * 22, 0.75 * 26, 0.75 * 29, 0.75 * 37, 0.75 * 31, 0.75 * 26, 0.75 * 27,
+	0.75 * 35, 0.75 * 28, 0.75 * 22, 0.75 * 22, 0.75 * 32, 0.75 * 44, 0.75 * 32, 0.75 * 35,
+	0.75 * 38, 0.75 * 39, 0.75 * 41, 0.75 * 42, 0.75 * 41, 0.75 * 25, 0.75 * 31, 0.75 * 45,
+	0.75 * 48, 0.75 * 45, 0.75 * 40, 0.75 * 48, 0.75 * 37, 0.75 * 40, 0.75 * 41, 0.75 * 40
+};
+
+//色???藕?量??模??
+unsigned char std_UV_QT[64] =
+{
+	0.75 * 7, 0.75 * 7, 0.75 * 7, 0.75 * 10, 0.75 * 8, 0.75 * 10, 0.75 * 19, 0.75 * 10,
+	0.75 * 10, 0.75 * 19, 0.75 * 40, 0.75 * 26, 0.75 * 22, 0.75 * 26, 0.75 * 40, 0.75 * 40,
+	30, 30, 30, 30, 30, 30, 30, 30,
+	30, 30, 30, 30, 30, 30, 30, 30,
+	30, 30, 30, 30, 30, 30, 30, 30,
+	30, 30, 30, 30, 30, 30, 30, 30,
+	30, 30, 30, 30, 30, 30, 30, 30,
+	30, 30, 30, 30, 30, 30, 30, 30
+};
+
+unsigned char STD_DC_Y_NRCODES[16] = { 0, 1, 5, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0 };
+unsigned char STD_DC_Y_VALUES[12] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };
+
+unsigned char STD_DC_UV_NRCODES[16] = { 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0 };
+unsigned char STD_DC_UV_VALUES[12] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };
+
+unsigned char STD_AC_Y_NRCODES[16] = { 0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 0X7D };
+unsigned char STD_AC_Y_VALUES[162] =
+{
+	0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12,
+	0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07,
+	0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08,
+	0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0,
+	0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0a, 0x16,
+	0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28,
+	0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
+	0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
+	0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
+	0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
+	0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
+	0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
+	0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
+	0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+	0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6,
+	0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5,
+	0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4,
+	0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2,
+	0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea,
+	0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
+	0xf9, 0xfa
+};
+
+unsigned char STD_AC_UV_NRCODES[16] = { 0, 2, 1, 2, 4, 4, 3, 4, 7, 5, 4, 4, 0, 1, 2, 0X77 };
+unsigned char STD_AC_UV_VALUES[162] =
+{
+	0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21,
+	0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71,
+	0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91,
+	0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0,
+	0x15, 0x62, 0x72, 0xd1, 0x0a, 0x16, 0x24, 0x34,
+	0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26,
+	0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38,
+	0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
+	0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
+	0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
+	0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
+	0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+	0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96,
+	0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5,
+	0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4,
+	0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3,
+	0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2,
+	0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda,
+	0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9,
+	0xea, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
+	0xf9, 0xfa
+};
+
+int DivUp(int x, int d)
+{
+	return (x + d - 1) / d;
+}
+
+template<typename T>
+void writeAndAdvance(unsigned char *&pData, T nElement)
+{
+	writeBigEndian<T>(pData, nElement);
+	pData += sizeof(T);
+}
+
+void writeMarker(unsigned char nMarker, unsigned char *&pData)
+{
+	*pData++ = 0x0ff;
+	*pData++ = nMarker;
+}
+
+void writeJFIFTag(unsigned char *&pData)
+{
+	const char JFIF_TAG[] =
+	{
+		0x4a, 0x46, 0x49, 0x46, 0x00,
+		0x01, 0x02,
+		0x00,
+		0x00, 0x01, 0x00, 0x01,
+		0x00, 0x00
+	};
+
+	writeMarker(0x0e0, pData);
+	writeAndAdvance<unsigned short>(pData, sizeof(JFIF_TAG) + sizeof(unsigned short));
+	memcpy(pData, JFIF_TAG, sizeof(JFIF_TAG));
+	pData += sizeof(JFIF_TAG);
+}
+
+void writeFrameHeader(const FrameHeader &header, unsigned char *&pData)
+{
+	unsigned char aTemp[128];
+	unsigned char *pTemp = aTemp;
+
+	writeAndAdvance<unsigned char>(pTemp, header.nSamplePrecision);
+	writeAndAdvance<unsigned short>(pTemp, header.nHeight);
+	writeAndAdvance<unsigned short>(pTemp, header.nWidth);
+	writeAndAdvance<unsigned char>(pTemp, header.nComponents);
+
+	for (int c = 0; c<header.nComponents; ++c)
+	{
+		writeAndAdvance<unsigned char>(pTemp, header.aComponentIdentifier[c]);
+		writeAndAdvance<unsigned char>(pTemp, header.aSamplingFactors[c]);
+		writeAndAdvance<unsigned char>(pTemp, header.aQuantizationTableSelector[c]);
+	}
+
+	unsigned short nLength = (unsigned short)(pTemp - aTemp);
+
+	writeMarker(0x0C0, pData);
+	writeAndAdvance<unsigned short>(pData, nLength + 2);
+	memcpy(pData, aTemp, nLength);
+	pData += nLength;
+}
+
+void writeScanHeader(const ScanHeader &header, unsigned char *&pData)
+{
+	unsigned char aTemp[128];
+	unsigned char *pTemp = aTemp;
+
+	writeAndAdvance<unsigned char>(pTemp, header.nComponents);
+
+	for (int c = 0; c<header.nComponents; ++c)
+	{
+		writeAndAdvance<unsigned char>(pTemp, header.aComponentSelector[c]);
+		writeAndAdvance<unsigned char>(pTemp, header.aHuffmanTablesSelector[c]);
+	}
+
+	writeAndAdvance<unsigned char>(pTemp, header.nSs);
+	writeAndAdvance<unsigned char>(pTemp, header.nSe);
+	writeAndAdvance<unsigned char>(pTemp, header.nA);
+
+	unsigned short nLength = (unsigned short)(pTemp - aTemp);
+
+	writeMarker(0x0DA, pData);
+	writeAndAdvance<unsigned short>(pData, nLength + 2);
+	memcpy(pData, aTemp, nLength);
+	pData += nLength;
+}
+
+void writeQuantizationTable(const QuantizationTable &table, unsigned char *&pData)
+{
+	writeMarker(0x0DB, pData);
+	writeAndAdvance<unsigned short>(pData, sizeof(QuantizationTable) + 2);
+	memcpy(pData, &table, sizeof(QuantizationTable));
+	pData += sizeof(QuantizationTable);
+}
+
+void writeHuffmanTable(const HuffmanTable &table, unsigned char *&pData)
+{
+	writeMarker(0x0C4, pData);
+
+	// Number of Codes for Bit Lengths [1..16]
+	int nCodeCount = 0;
+
+	for (int i = 0; i < 16; ++i)
+	{
+		nCodeCount += table.aCodes[i];
+	}
+
+	writeAndAdvance<unsigned short>(pData, 17 + nCodeCount + 2);
+	memcpy(pData, &table, 17 + nCodeCount);
+	pData += 17 + nCodeCount;
+}
+
+bool printfNPPinfo(int cudaVerMajor, int cudaVerMinor)
+{
+	const NppLibraryVersion *libVer = nppGetLibVersion();
+
+	printf("NPP Library Version %d.%d.%d\n", libVer->major, libVer->minor, libVer->build);
+
+	int driverVersion, runtimeVersion;
+	cudaDriverGetVersion(&driverVersion);
+	cudaRuntimeGetVersion(&runtimeVersion);
+
+	printf("  CUDA Driver  Version: %d.%d\n", driverVersion / 1000, (driverVersion % 100) / 10);
+	printf("  CUDA Runtime Version: %d.%d\n", runtimeVersion / 1000, (runtimeVersion % 100) / 10);
+
+	bool bVal = checkCudaCapabilities(cudaVerMajor, cudaVerMinor);
+	return bVal;
+}
+
+NppiDCTState *pDCTState;
+FrameHeader oFrameHeader;
+FrameHeader oFrameHeaderFixedSize;
+ScanHeader oScanHeader;
+QuantizationTable aQuantizationTables[4];
+Npp8u *pdQuantizationTables;
+HuffmanTable aHuffmanTables[4];
+HuffmanTable *pHuffmanDCTables;
+HuffmanTable *pHuffmanACTables;
+int nMCUBlocksH;
+int nMCUBlocksV;
+int nMCUBlocksHFixedSize;
+int nMCUBlocksVFixedSize;
+Npp8u *pdScan;
+NppiEncodeHuffmanSpec *apHuffmanDCTable[3];
+NppiEncodeHuffmanSpec *apHuffmanACTable[3];
+unsigned char *pDstJpeg;
+unsigned char *pDstOutput;
+int nRestartInterval;
+
+int initTable()
+{
+	NPP_CHECK_NPP(nppiDCTInitAlloc(&pDCTState));
+
+	nRestartInterval = -1;
+
+	cudaMalloc(&pdQuantizationTables, 64 * 4);
+	pHuffmanDCTables = aHuffmanTables;
+	pHuffmanACTables = &aHuffmanTables[2];
+	memset(aQuantizationTables, 0, 4 * sizeof(QuantizationTable));
+	memset(aHuffmanTables, 0, 4 * sizeof(HuffmanTable));
+	memset(&oFrameHeader, 0, sizeof(FrameHeader));
+
+
+	//????Huffman??
+	aHuffmanTables[0].nClassAndIdentifier = 0;
+	memcpy(aHuffmanTables[0].aCodes, STD_DC_Y_NRCODES, 16);
+	memcpy(aHuffmanTables[0].aTable, STD_DC_Y_VALUES, 12);
+
+	aHuffmanTables[1].nClassAndIdentifier = 1;
+	memcpy(aHuffmanTables[1].aCodes, STD_DC_UV_NRCODES, 16);
+	memcpy(aHuffmanTables[1].aTable, STD_DC_UV_VALUES, 12);
+
+	aHuffmanTables[2].nClassAndIdentifier = 16;
+	memcpy(aHuffmanTables[2].aCodes, STD_AC_Y_NRCODES, 16);
+	memcpy(aHuffmanTables[2].aTable, STD_AC_Y_VALUES, 162);
+
+	aHuffmanTables[3].nClassAndIdentifier = 17;
+	memcpy(aHuffmanTables[3].aCodes, STD_AC_UV_NRCODES, 16);
+	memcpy(aHuffmanTables[3].aTable, STD_AC_UV_VALUES, 162);
+
+
+	//????量????
+	aQuantizationTables[0].nPrecisionAndIdentifier = 0;
+	memcpy(aQuantizationTables[0].aTable, std_Y_QT, 64);
+	aQuantizationTables[1].nPrecisionAndIdentifier = 1;
+	memcpy(aQuantizationTables[1].aTable, std_UV_QT, 64);
+
+	NPP_CHECK_CUDA(cudaMemcpyAsync(pdQuantizationTables, aQuantizationTables[0].aTable, 64, cudaMemcpyHostToDevice));
+	NPP_CHECK_CUDA(cudaMemcpyAsync(pdQuantizationTables + 64, aQuantizationTables[1].aTable, 64, cudaMemcpyHostToDevice));
+
+	oFrameHeader.nSamplePrecision = 8;
+	oFrameHeader.nComponents = 3;
+	oFrameHeader.aComponentIdentifier[0] = 1;
+	oFrameHeader.aComponentIdentifier[1] = 2;
+	oFrameHeader.aComponentIdentifier[2] = 3;
+	oFrameHeader.aSamplingFactors[0] = 34;
+	oFrameHeader.aSamplingFactors[1] = 17;
+	oFrameHeader.aSamplingFactors[2] = 17;
+	oFrameHeader.aQuantizationTableSelector[0] = 0;
+	oFrameHeader.aQuantizationTableSelector[1] = 1;
+	oFrameHeader.aQuantizationTableSelector[2] = 1;
+
+	for (int i = 0; i < oFrameHeader.nComponents; ++i)
+	{
+		nMCUBlocksV = max(nMCUBlocksV, oFrameHeader.aSamplingFactors[i] & 0x0f);
+		nMCUBlocksH = max(nMCUBlocksH, oFrameHeader.aSamplingFactors[i] >> 4);
+	}
+	NPP_CHECK_CUDA(cudaMalloc(&pdScan, 4 << 20));
+
+
+
+	oScanHeader.nComponents = 3;
+	oScanHeader.aComponentSelector[0] = 1;
+	oScanHeader.aComponentSelector[1] = 2;
+	oScanHeader.aComponentSelector[2] = 3;
+	oScanHeader.aHuffmanTablesSelector[0] = 0;
+	oScanHeader.aHuffmanTablesSelector[1] = 17;
+	oScanHeader.aHuffmanTablesSelector[2] = 17;
+	oScanHeader.nSs = 0;
+	oScanHeader.nSe = 63;
+	oScanHeader.nA = 0;
+
+
+	return 0;
+}
+
+NppiSize aSrcSize[3];
+Npp16s *apdDCT[3];// = { 0, 0, 0 };
+Npp32s aDCTStep[3];
+
+Npp8u *apSrcImage[3];// = { 0, 0, 0 };
+Npp32s aSrcImageStep[3];
+size_t aSrcPitch[3];
+
+
+int releaseJpegNPP()
+{
+	nppiDCTFree(pDCTState);
+	cudaFree(pdQuantizationTables);
+	cudaFree(pdScan);
+	for (int i = 0; i < 3; ++i)
+	{
+		cudaFree(apdDCT[i]);
+		cudaFree(apSrcImage[i]);
+	}
+	return 0;
+}
+
+
+int initTable(int flag, int width, int height)
+{
+	//????帧头
+	oFrameHeaderFixedSize.nSamplePrecision = 8;
+	oFrameHeaderFixedSize.nComponents = 3;
+	oFrameHeaderFixedSize.aComponentIdentifier[0] = 1;
+	oFrameHeaderFixedSize.aComponentIdentifier[1] = 2;
+	oFrameHeaderFixedSize.aComponentIdentifier[2] = 3;
+	oFrameHeaderFixedSize.aSamplingFactors[0] = 34;
+	oFrameHeaderFixedSize.aSamplingFactors[1] = 17;
+	oFrameHeaderFixedSize.aSamplingFactors[2] = 17;
+	oFrameHeaderFixedSize.aQuantizationTableSelector[0] = 0;
+	oFrameHeaderFixedSize.aQuantizationTableSelector[1] = 1;
+	oFrameHeaderFixedSize.aQuantizationTableSelector[2] = 1;
+	oFrameHeaderFixedSize.nWidth = width;
+	oFrameHeaderFixedSize.nHeight = height;
+
+	for (int i = 0; i < oFrameHeaderFixedSize.nComponents; ++i)
+	{
+		nMCUBlocksVFixedSize = max(nMCUBlocksVFixedSize, oFrameHeaderFixedSize.aSamplingFactors[i] & 0x0f);
+		nMCUBlocksHFixedSize = max(nMCUBlocksHFixedSize, oFrameHeaderFixedSize.aSamplingFactors[i] >> 4);
+	}
+
+	for (int i = 0; i < oFrameHeaderFixedSize.nComponents; ++i)
+	{
+		NppiSize oBlocks;
+		NppiSize oBlocksPerMCU = { oFrameHeaderFixedSize.aSamplingFactors[i] >> 4, oFrameHeaderFixedSize.aSamplingFactors[i] & 0x0f };
+
+		oBlocks.width = (int)ceil((oFrameHeaderFixedSize.nWidth + 7) / 8 *
+			static_cast<float>(oBlocksPerMCU.width) / nMCUBlocksHFixedSize);
+		oBlocks.width = DivUp(oBlocks.width, oBlocksPerMCU.width) * oBlocksPerMCU.width;
+
+		oBlocks.height = (int)ceil((oFrameHeaderFixedSize.nHeight + 7) / 8 *
+			static_cast<float>(oBlocksPerMCU.height) / nMCUBlocksVFixedSize);
+		oBlocks.height = DivUp(oBlocks.height, oBlocksPerMCU.height) * oBlocksPerMCU.height;
+
+		aSrcSize[i].width = oBlocks.width * 8;
+		aSrcSize[i].height = oBlocks.height * 8;
+
+		// Allocate Memory
+		size_t nPitch;
+		NPP_CHECK_CUDA(cudaMallocPitch(&apdDCT[i], &nPitch, oBlocks.width * 64 * sizeof(Npp16s), oBlocks.height));
+		aDCTStep[i] = static_cast<Npp32s>(nPitch);
+
+		NPP_CHECK_CUDA(cudaMallocPitch(&apSrcImage[i], &nPitch, aSrcSize[i].width, aSrcSize[i].height));
+
+		aSrcPitch[i] = nPitch;
+		aSrcImageStep[i] = static_cast<Npp32s>(nPitch);
+	}
+
+	return 0;
+}
+
+int jpegNPP(const char *szOutputFile, float* d_srcRGB)
+{
+	//RGB2YUV
+	cudaError_t cudaStatus;
+	cudaStatus = cuda_common::RGB2YUV(d_srcRGB, oFrameHeaderFixedSize.nWidth, oFrameHeaderFixedSize.nHeight,
+		apSrcImage[0], aSrcPitch[0], aSrcSize[0].width, aSrcSize[0].height,
+		apSrcImage[1], aSrcPitch[1], aSrcSize[1].width, aSrcSize[1].height,
+		apSrcImage[2], aSrcPitch[2], aSrcSize[2].width, aSrcSize[2].height);
+
+	/**
+	* Forward DCT, quantization and level shift part of the JPEG encoding.
+	* Input is expected in 8x8 macro blocks and output is expected to be in 64x1
+	* macro blocks. The new version of the primitive takes the ROI in image pixel size and
+	* works with DCT coefficients that are in zig-zag order.
+	*/
+	int k = 0;
+	//LOG_INFO("NPP_CHECK_NPP:%d", 1);
+	if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[0], aSrcImageStep[0],
+		apdDCT[0], aDCTStep[0],
+		pdQuantizationTables + k * 64,
+		aSrcSize[0],
+		pDCTState)))
+	{
+		printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n");
+		return EXIT_FAILURE;
+	}
+
+	k = 1;
+	//LOG_INFO("NPP_CHECK_NPP:%d", 2);
+	if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[1], aSrcImageStep[1],
+		apdDCT[1], aDCTStep[1],
+		pdQuantizationTables + k * 64,
+		aSrcSize[1],
+		pDCTState)))
+	{
+		printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n");
+		return EXIT_FAILURE;
+	}
+
+	//LOG_INFO("NPP_CHECK_NPP:%d", 3);
+	if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[2], aSrcImageStep[2],
+		apdDCT[2], aDCTStep[2],
+		pdQuantizationTables + k * 64,
+		aSrcSize[2],
+		pDCTState)))
+	{
+		printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n");
+		return EXIT_FAILURE;
+	}
+
+	// Huffman Encoding
+
+	Npp32s nScanLength;
+	Npp8u *pJpegEncoderTemp;
+
+#if (CUDA_VERSION == 8000)
+		Npp32s nTempSize; //when using CUDA8
+#else
+		size_t nTempSize; //when using CUDA9
+#endif
+	//modified by Junlin 190221
+
+	//LOG_INFO("NPP_CHECK_NPP:%d",4);
+	if (NPP_SUCCESS != (nppiEncodeHuffmanGetSize(aSrcSize[0], 3, &nTempSize)))
+	{
+		printf("nppiEncodeHuffmanGetSize Failed!\n");
+		return EXIT_FAILURE;
+	}
+
+	//LOG_INFO("NPP_CHECK_CUDA:%d",5);
+	NPP_CHECK_CUDA(cudaMalloc(&pJpegEncoderTemp, nTempSize));
+
+	/**
+	* Allocates memory and creates a Huffman table in a format that is suitable for the encoder.
+	*/
+	NppStatus t_status;
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[0].aCodes, nppiDCTable, &apHuffmanDCTable[0]);
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[0].aCodes, nppiACTable, &apHuffmanACTable[0]);
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[1]);
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[1]);
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[2]);
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[2]);
+
+	/**
+	* Huffman Encoding of the JPEG Encoding.
+	* Input is expected to be 64x1 macro blocks and output is expected as byte stuffed huffman encoded JPEG scan.
+	*/
+	Npp32s nSs = 0;
+	Npp32s nSe = 63;
+	Npp32s nH = 0;
+	Npp32s nL = 0;
+	//LOG_INFO("NPP_CHECK_NPP:%d",6);
+	if (NPP_SUCCESS != (nppiEncodeHuffmanScan_JPEG_8u16s_P3R(apdDCT, aDCTStep,
+		0, nSs, nSe, nH, nL,
+		pdScan, &nScanLength,
+		apHuffmanDCTable,
+		apHuffmanACTable,
+		aSrcSize,
+		pJpegEncoderTemp)))
+	{
+		printf("nppiEncodeHuffmanScan_JPEG_8u16s_P3R Failed!\n");
+		return EXIT_FAILURE;
+	}
+
+	for (int i = 0; i < 3; ++i)
+	{
+		nppiEncodeHuffmanSpecFree_JPEG(apHuffmanDCTable[i]);
+		nppiEncodeHuffmanSpecFree_JPEG(apHuffmanACTable[i]);
+	}
+	// Write JPEG
+	pDstJpeg = new unsigned char[4 << 20]{};
+	pDstOutput = pDstJpeg;
+
+	writeMarker(0x0D8, pDstOutput);
+	writeJFIFTag(pDstOutput);
+	writeQuantizationTable(aQuantizationTables[0], pDstOutput);
+	writeQuantizationTable(aQuantizationTables[1], pDstOutput);
+	writeHuffmanTable(pHuffmanDCTables[0], pDstOutput);
+	writeHuffmanTable(pHuffmanACTables[0], pDstOutput);
+	writeHuffmanTable(pHuffmanDCTables[1], pDstOutput);
+	writeHuffmanTable(pHuffmanACTables[1], pDstOutput);
+	writeFrameHeader(oFrameHeaderFixedSize, pDstOutput);
+	writeScanHeader(oScanHeader, pDstOutput);
+
+	//LOG_INFO("NPP_CHECK_CUDA:%d",7);
+	NPP_CHECK_CUDA(cudaMemcpy(pDstOutput, pdScan, nScanLength, cudaMemcpyDeviceToHost));
+
+	pDstOutput += nScanLength;
+	writeMarker(0x0D9, pDstOutput);
+	{
+		// Write result to file.
+		std::ofstream outputFile(szOutputFile, ios::out | ios::binary);
+		outputFile.write(reinterpret_cast<const char *>(pDstJpeg), static_cast<int>(pDstOutput - pDstJpeg));
+	}
+
+	// Cleanup
+	cudaFree(pJpegEncoderTemp);
+	delete[] pDstJpeg;
+
+
+	return EXIT_SUCCESS;
+}
+
+int jpegNPP(const char *szOutputFile, unsigned char* d_srcRGB)
+{
+	//RGB2YUV
+	cudaError_t cudaStatus;
+	cudaStatus = cuda_common::RGB2YUV(d_srcRGB, oFrameHeaderFixedSize.nWidth, oFrameHeaderFixedSize.nHeight,
+		apSrcImage[0], aSrcPitch[0], aSrcSize[0].width, aSrcSize[0].height,
+		apSrcImage[1], aSrcPitch[1], aSrcSize[1].width, aSrcSize[1].height,
+		apSrcImage[2], aSrcPitch[2], aSrcSize[2].width, aSrcSize[2].height);
+
+	/**
+	* Forward DCT, quantization and level shift part of the JPEG encoding.
+	* Input is expected in 8x8 macro blocks and output is expected to be in 64x1
+	* macro blocks. The new version of the primitive takes the ROI in image pixel size and
+	* works with DCT coefficients that are in zig-zag order.
+	*/
+	int k = 0;
+	//LOG_INFO("NPP_CHECK_NPP:%d", 1);
+	if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[0], aSrcImageStep[0],
+		apdDCT[0], aDCTStep[0],
+		pdQuantizationTables + k * 64,
+		aSrcSize[0],
+		pDCTState)))
+	{
+		printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n");
+		return EXIT_FAILURE;
+	}
+
+	k = 1;
+	//LOG_INFO("NPP_CHECK_NPP:%d", 2);
+	if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[1], aSrcImageStep[1],
+		apdDCT[1], aDCTStep[1],
+		pdQuantizationTables + k * 64,
+		aSrcSize[1],
+		pDCTState)))
+	{
+		printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n");
+		return EXIT_FAILURE;
+	}
+
+	//LOG_INFO("NPP_CHECK_NPP:%d", 3);
+	if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[2], aSrcImageStep[2],
+		apdDCT[2], aDCTStep[2],
+		pdQuantizationTables + k * 64,
+		aSrcSize[2],
+		pDCTState)))
+	{
+		printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n");
+		return EXIT_FAILURE;
+	}
+
+	// Huffman Encoding
+
+	Npp32s nScanLength;
+	Npp8u *pJpegEncoderTemp;
+
+#if (CUDA_VERSION == 8000)
+	Npp32s nTempSize; //when using CUDA8
+#else
+	size_t nTempSize; //when using CUDA9
+#endif
+					  //modified by Junlin 190221
+
+					  //LOG_INFO("NPP_CHECK_NPP:%d",4);
+	if (NPP_SUCCESS != (nppiEncodeHuffmanGetSize(aSrcSize[0], 3, &nTempSize)))
+	{
+		printf("nppiEncodeHuffmanGetSize Failed!\n");
+		return EXIT_FAILURE;
+	}
+
+	//LOG_INFO("NPP_CHECK_CUDA:%d",5);
+	NPP_CHECK_CUDA(cudaMalloc(&pJpegEncoderTemp, nTempSize));
+
+	/**
+	* Allocates memory and creates a Huffman table in a format that is suitable for the encoder.
+	*/
+	NppStatus t_status;
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[0].aCodes, nppiDCTable, &apHuffmanDCTable[0]);
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[0].aCodes, nppiACTable, &apHuffmanACTable[0]);
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[1]);
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[1]);
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[2]);
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[2]);
+
+	/**
+	* Huffman Encoding of the JPEG Encoding.
+	* Input is expected to be 64x1 macro blocks and output is expected as byte stuffed huffman encoded JPEG scan.
+	*/
+	Npp32s nSs = 0;
+	Npp32s nSe = 63;
+	Npp32s nH = 0;
+	Npp32s nL = 0;
+	//LOG_INFO("NPP_CHECK_NPP:%d",6);
+	if (NPP_SUCCESS != (nppiEncodeHuffmanScan_JPEG_8u16s_P3R(apdDCT, aDCTStep,
+		0, nSs, nSe, nH, nL,
+		pdScan, &nScanLength,
+		apHuffmanDCTable,
+		apHuffmanACTable,
+		aSrcSize,
+		pJpegEncoderTemp)))
+	{
+		printf("nppiEncodeHuffmanScan_JPEG_8u16s_P3R Failed!\n");
+		return EXIT_FAILURE;
+	}
+
+	for (int i = 0; i < 3; ++i)
+	{
+		nppiEncodeHuffmanSpecFree_JPEG(apHuffmanDCTable[i]);
+		nppiEncodeHuffmanSpecFree_JPEG(apHuffmanACTable[i]);
+	}
+	// Write JPEG
+	pDstJpeg = new unsigned char[4 << 20]{};
+	pDstOutput = pDstJpeg;
+
+	writeMarker(0x0D8, pDstOutput);
+	writeJFIFTag(pDstOutput);
+	writeQuantizationTable(aQuantizationTables[0], pDstOutput);
+	writeQuantizationTable(aQuantizationTables[1], pDstOutput);
+	writeHuffmanTable(pHuffmanDCTables[0], pDstOutput);
+	writeHuffmanTable(pHuffmanACTables[0], pDstOutput);
+	writeHuffmanTable(pHuffmanDCTables[1], pDstOutput);
+	writeHuffmanTable(pHuffmanACTables[1], pDstOutput);
+	writeFrameHeader(oFrameHeaderFixedSize, pDstOutput);
+	writeScanHeader(oScanHeader, pDstOutput);
+
+	//LOG_INFO("NPP_CHECK_CUDA:%d",7);
+	NPP_CHECK_CUDA(cudaMemcpy(pDstOutput, pdScan, nScanLength, cudaMemcpyDeviceToHost));
+
+	pDstOutput += nScanLength;
+	writeMarker(0x0D9, pDstOutput);
+	{
+		// Write result to file.
+		std::ofstream outputFile(szOutputFile, ios::out | ios::binary);
+		outputFile.write(reinterpret_cast<const char *>(pDstJpeg), static_cast<int>(pDstOutput - pDstJpeg));
+	}
+
+	// Cleanup
+	cudaFree(pJpegEncoderTemp);
+	delete[] pDstJpeg;
+
+
+	return EXIT_SUCCESS;
+}
+
+
+int jpegNPP(const char *szOutputFile, float* d_srcRGB, int img_width, int img_height)
+{
+	NppiSize aSrcSize[3];
+	Npp16s *apdDCT[3] = { 0, 0, 0 };
+	Npp32s aDCTStep[3];
+
+	Npp8u *apSrcImage[3] = { 0, 0, 0 };
+	Npp32s aSrcImageStep[3];
+	size_t aSrcPitch[3];
+
+
+	//????帧头
+	oFrameHeader.nWidth = img_width;
+	oFrameHeader.nHeight = img_height;
+
+	for (int i = 0; i < oFrameHeader.nComponents; ++i)
+	{
+		NppiSize oBlocks;
+		NppiSize oBlocksPerMCU = { oFrameHeader.aSamplingFactors[i] >> 4, oFrameHeader.aSamplingFactors[i] & 0x0f };
+
+		oBlocks.width = (int)ceil((oFrameHeader.nWidth + 7) / 8 *
+			static_cast<float>(oBlocksPerMCU.width) / nMCUBlocksH);
+		oBlocks.width = DivUp(oBlocks.width, oBlocksPerMCU.width) * oBlocksPerMCU.width;
+
+		oBlocks.height = (int)ceil((oFrameHeader.nHeight + 7) / 8 *
+			static_cast<float>(oBlocksPerMCU.height) / nMCUBlocksV);
+		oBlocks.height = DivUp(oBlocks.height, oBlocksPerMCU.height) * oBlocksPerMCU.height;
+
+		aSrcSize[i].width = oBlocks.width * 8;
+		aSrcSize[i].height = oBlocks.height * 8;
+
+		// Allocate Memory
+		size_t nPitch;
+		//LOG_INFO("NPP_CHECK_CUDA:%d",1);
+		NPP_CHECK_CUDA(cudaMallocPitch(&apdDCT[i], &nPitch, oBlocks.width * 64 * sizeof(Npp16s), oBlocks.height));
+		aDCTStep[i] = static_cast<Npp32s>(nPitch);
+
+		//LOG_INFO("NPP_CHECK_CUDA:%d",2);
+		NPP_CHECK_CUDA(cudaMallocPitch(&apSrcImage[i], &nPitch, aSrcSize[i].width, aSrcSize[i].height));
+
+		aSrcPitch[i] = nPitch;
+		aSrcImageStep[i] = static_cast<Npp32s>(nPitch);
+	}
+
+	//RGB2YUV
+	cudaError_t cudaStatus;
+	cudaStatus = cuda_common::RGB2YUV(d_srcRGB, img_width, img_height,
+		apSrcImage[0], aSrcPitch[0], aSrcSize[0].width, aSrcSize[0].height,
+		apSrcImage[1], aSrcPitch[1], aSrcSize[1].width, aSrcSize[1].height,
+		apSrcImage[2], aSrcPitch[2], aSrcSize[2].width, aSrcSize[2].height);
+
+	/**
+	* Forward DCT, quantization and level shift part of the JPEG encoding.
+	* Input is expected in 8x8 macro blocks and output is expected to be in 64x1
+	* macro blocks. The new version of the primitive takes the ROI in image pixel size and
+	* works with DCT coefficients that are in zig-zag order.
+	*/
+	int k = 0;
+	//LOG_INFO("NPP_CHECK_CUDA:%d",3);
+	if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[0], aSrcImageStep[0],
+		apdDCT[0], aDCTStep[0],
+		pdQuantizationTables + k * 64,
+		aSrcSize[0],
+		pDCTState)))
+	{
+		printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n");
+		return EXIT_FAILURE;
+	}
+	k = 1;
+
+	//LOG_INFO("NPP_CHECK_CUDA:%d",4);
+	if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[1], aSrcImageStep[1],
+		apdDCT[1], aDCTStep[1],
+		pdQuantizationTables + k * 64,
+		aSrcSize[1],
+		pDCTState)))
+	{
+		printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n");
+		return EXIT_FAILURE;
+	}
+
+	//LOG_INFO("NPP_CHECK_CUDA:%d",5);
+	if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[2], aSrcImageStep[2],
+		apdDCT[2], aDCTStep[2],
+		pdQuantizationTables + k * 64,
+		aSrcSize[2],
+		pDCTState)))
+	{
+		printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n");
+		return EXIT_FAILURE;
+	}
+
+	// Huffman Encoding
+
+	Npp32s nScanLength;
+	Npp8u *pJpegEncoderTemp;
+
+#if (CUDA_VERSION == 8000)
+	Npp32s nTempSize; //when using CUDA8
+#else
+	size_t nTempSize; //when using CUDA9
+#endif
+					  //modified by Junlin 190221
+
+	//LOG_INFO("NPP_CHECK_CUDA:%d",6);
+	if (NPP_SUCCESS != (nppiEncodeHuffmanGetSize(aSrcSize[0], 3, &nTempSize)))
+	{
+		printf("nppiEncodeHuffmanGetSize Failed!\n");
+		return EXIT_FAILURE;
+	}
+
+	//LOG_INFO("NPP_CHECK_CUDA:%d",7);
+	NPP_CHECK_CUDA(cudaMalloc(&pJpegEncoderTemp, nTempSize));
+
+	/**
+	* Allocates memory and creates a Huffman table in a format that is suitable for the encoder.
+	*/
+	NppStatus t_status;
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[0].aCodes, nppiDCTable, &apHuffmanDCTable[0]);
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[0].aCodes, nppiACTable, &apHuffmanACTable[0]);
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[1]);
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[1]);
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[2]);
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[2]);
+
+	/**
+	* Huffman Encoding of the JPEG Encoding.
+	* Input is expected to be 64x1 macro blocks and output is expected as byte stuffed huffman encoded JPEG scan.
+	*/
+	Npp32s nSs = 0;
+	Npp32s nSe = 63;
+	Npp32s nH = 0;
+	Npp32s nL = 0;
+	//LOG_INFO("NPP_CHECK_CUDA:%d",8);
+	if (NPP_SUCCESS != (nppiEncodeHuffmanScan_JPEG_8u16s_P3R(apdDCT, aDCTStep,
+		0, nSs, nSe, nH, nL,
+		pdScan, &nScanLength,
+		apHuffmanDCTable,
+		apHuffmanACTable,
+		aSrcSize,
+		pJpegEncoderTemp)))
+	{
+		printf("nppiEncodeHuffmanScan_JPEG_8u16s_P3R Failed!\n");
+		return EXIT_FAILURE;
+	}
+
+	for (int i = 0; i < 3; ++i)
+	{
+		nppiEncodeHuffmanSpecFree_JPEG(apHuffmanDCTable[i]);
+		nppiEncodeHuffmanSpecFree_JPEG(apHuffmanACTable[i]);
+	}
+	// Write JPEG
+	pDstJpeg = new unsigned char[4 << 20]{};
+	pDstOutput = pDstJpeg;
+
+	writeMarker(0x0D8, pDstOutput);
+	writeJFIFTag(pDstOutput);
+	writeQuantizationTable(aQuantizationTables[0], pDstOutput);
+	writeQuantizationTable(aQuantizationTables[1], pDstOutput);
+	writeHuffmanTable(pHuffmanDCTables[0], pDstOutput);
+	writeHuffmanTable(pHuffmanACTables[0], pDstOutput);
+	writeHuffmanTable(pHuffmanDCTables[1], pDstOutput);
+	writeHuffmanTable(pHuffmanACTables[1], pDstOutput);
+	writeFrameHeader(oFrameHeader, pDstOutput);
+	writeScanHeader(oScanHeader, pDstOutput);
+
+	//LOG_INFO("NPP_CHECK_CUDA:%d",9);
+	NPP_CHECK_CUDA(cudaMemcpy(pDstOutput, pdScan, nScanLength, cudaMemcpyDeviceToHost));
+
+	pDstOutput += nScanLength;
+	writeMarker(0x0D9, pDstOutput);
+
+	{
+		// Write result to file.
+		std::ofstream outputFile(szOutputFile, ios::out | ios::binary);
+		outputFile.write(reinterpret_cast<const char *>(pDstJpeg), static_cast<int>(pDstOutput - pDstJpeg));
+	}
+
+	// Cleanup
+	cudaFree(pJpegEncoderTemp);
+	delete[] pDstJpeg;
+	for (int i = 0; i < 3; ++i)
+	{
+		cudaFree(apdDCT[i]);
+		cudaFree(apSrcImage[i]);
+	}
+
+	return EXIT_SUCCESS;
+}
+
+
+int jpegNPP(const char *szOutputFile, unsigned char* d_srcRGB, int img_width, int img_height)
+{
+	NppiSize aSrcSize[3];
+	Npp16s *apdDCT[3] = { 0, 0, 0 };
+	Npp32s aDCTStep[3];
+
+	Npp8u *apSrcImage[3] = { 0, 0, 0 };
+	Npp32s aSrcImageStep[3];
+	size_t aSrcPitch[3];
+
+
+	//????帧头
+	oFrameHeader.nWidth = img_width;
+	oFrameHeader.nHeight = img_height;
+
+	for (int i = 0; i < oFrameHeader.nComponents; ++i)
+	{
+		NppiSize oBlocks;
+		NppiSize oBlocksPerMCU = { oFrameHeader.aSamplingFactors[i] >> 4, oFrameHeader.aSamplingFactors[i] & 0x0f };
+
+		oBlocks.width = (int)ceil((oFrameHeader.nWidth + 7) / 8 *
+			static_cast<float>(oBlocksPerMCU.width) / nMCUBlocksH);
+		oBlocks.width = DivUp(oBlocks.width, oBlocksPerMCU.width) * oBlocksPerMCU.width;
+
+		oBlocks.height = (int)ceil((oFrameHeader.nHeight + 7) / 8 *
+			static_cast<float>(oBlocksPerMCU.height) / nMCUBlocksV);
+		oBlocks.height = DivUp(oBlocks.height, oBlocksPerMCU.height) * oBlocksPerMCU.height;
+
+		aSrcSize[i].width = oBlocks.width * 8;
+		aSrcSize[i].height = oBlocks.height * 8;
+
+		// Allocate Memory
+		size_t nPitch;
+		//LOG_INFO("NPP_CHECK_CUDA:%d",1);
+		NPP_CHECK_CUDA(cudaMallocPitch(&apdDCT[i], &nPitch, oBlocks.width * 64 * sizeof(Npp16s), oBlocks.height));
+		aDCTStep[i] = static_cast<Npp32s>(nPitch);
+
+		//LOG_INFO("NPP_CHECK_CUDA:%d",2);
+		NPP_CHECK_CUDA(cudaMallocPitch(&apSrcImage[i], &nPitch, aSrcSize[i].width, aSrcSize[i].height));
+
+		aSrcPitch[i] = nPitch;
+		aSrcImageStep[i] = static_cast<Npp32s>(nPitch);
+	}
+
+	//RGB2YUV
+	cudaError_t cudaStatus;
+	cudaStatus = cuda_common::RGB2YUV(d_srcRGB, img_width, img_height,
+		apSrcImage[0], aSrcPitch[0], aSrcSize[0].width, aSrcSize[0].height,
+		apSrcImage[1], aSrcPitch[1], aSrcSize[1].width, aSrcSize[1].height,
+		apSrcImage[2], aSrcPitch[2], aSrcSize[2].width, aSrcSize[2].height);
+
+	/**
+	* Forward DCT, quantization and level shift part of the JPEG encoding.
+	* Input is expected in 8x8 macro blocks and output is expected to be in 64x1
+	* macro blocks. The new version of the primitive takes the ROI in image pixel size and
+	* works with DCT coefficients that are in zig-zag order.
+	*/
+	int k = 0;
+	//LOG_INFO("NPP_CHECK_CUDA:%d",3);
+	if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[0], aSrcImageStep[0],
+		apdDCT[0], aDCTStep[0],
+		pdQuantizationTables + k * 64,
+		aSrcSize[0],
+		pDCTState)))
+	{
+		printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n");
+		return EXIT_FAILURE;
+	}
+	k = 1;
+
+	//LOG_INFO("NPP_CHECK_CUDA:%d",4);
+	if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[1], aSrcImageStep[1],
+		apdDCT[1], aDCTStep[1],
+		pdQuantizationTables + k * 64,
+		aSrcSize[1],
+		pDCTState)))
+	{
+		printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n");
+		return EXIT_FAILURE;
+	}
+
+	//LOG_INFO("NPP_CHECK_CUDA:%d",5);
+	if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[2], aSrcImageStep[2],
+		apdDCT[2], aDCTStep[2],
+		pdQuantizationTables + k * 64,
+		aSrcSize[2],
+		pDCTState)))
+	{
+		printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n");
+		return EXIT_FAILURE;
+	}
+
+	// Huffman Encoding
+
+	Npp32s nScanLength;
+	Npp8u *pJpegEncoderTemp;
+
+#if (CUDA_VERSION == 8000)
+	Npp32s nTempSize; //when using CUDA8
+#else
+	size_t nTempSize; //when using CUDA9
+#endif
+					  //modified by Junlin 190221
+
+					  //LOG_INFO("NPP_CHECK_CUDA:%d",6);
+	if (NPP_SUCCESS != (nppiEncodeHuffmanGetSize(aSrcSize[0], 3, &nTempSize)))
+	{
+		printf("nppiEncodeHuffmanGetSize Failed!\n");
+		return EXIT_FAILURE;
+	}
+
+	//LOG_INFO("NPP_CHECK_CUDA:%d",7);
+	NPP_CHECK_CUDA(cudaMalloc(&pJpegEncoderTemp, nTempSize));
+
+	/**
+	* Allocates memory and creates a Huffman table in a format that is suitable for the encoder.
+	*/
+	NppStatus t_status;
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[0].aCodes, nppiDCTable, &apHuffmanDCTable[0]);
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[0].aCodes, nppiACTable, &apHuffmanACTable[0]);
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[1]);
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[1]);
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[2]);
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[2]);
+
+	/**
+	* Huffman Encoding of the JPEG Encoding.
+	* Input is expected to be 64x1 macro blocks and output is expected as byte stuffed huffman encoded JPEG scan.
+	*/
+	Npp32s nSs = 0;
+	Npp32s nSe = 63;
+	Npp32s nH = 0;
+	Npp32s nL = 0;
+	//LOG_INFO("NPP_CHECK_CUDA:%d",8);
+	if (NPP_SUCCESS != (nppiEncodeHuffmanScan_JPEG_8u16s_P3R(apdDCT, aDCTStep,
+		0, nSs, nSe, nH, nL,
+		pdScan, &nScanLength,
+		apHuffmanDCTable,
+		apHuffmanACTable,
+		aSrcSize,
+		pJpegEncoderTemp)))
+	{
+		printf("nppiEncodeHuffmanScan_JPEG_8u16s_P3R Failed!\n");
+		return EXIT_FAILURE;
+	}
+
+	for (int i = 0; i < 3; ++i)
+	{
+		nppiEncodeHuffmanSpecFree_JPEG(apHuffmanDCTable[i]);
+		nppiEncodeHuffmanSpecFree_JPEG(apHuffmanACTable[i]);
+	}
+	// Write JPEG
+	pDstJpeg = new unsigned char[4 << 20]{};
+	pDstOutput = pDstJpeg;
+
+	writeMarker(0x0D8, pDstOutput);
+	writeJFIFTag(pDstOutput);
+	writeQuantizationTable(aQuantizationTables[0], pDstOutput);
+	writeQuantizationTable(aQuantizationTables[1], pDstOutput);
+	writeHuffmanTable(pHuffmanDCTables[0], pDstOutput);
+	writeHuffmanTable(pHuffmanACTables[0], pDstOutput);
+	writeHuffmanTable(pHuffmanDCTables[1], pDstOutput);
+	writeHuffmanTable(pHuffmanACTables[1], pDstOutput);
+	writeFrameHeader(oFrameHeader, pDstOutput);
+	writeScanHeader(oScanHeader, pDstOutput);
+
+	//LOG_INFO("NPP_CHECK_CUDA:%d",9);
+	NPP_CHECK_CUDA(cudaMemcpy(pDstOutput, pdScan, nScanLength, cudaMemcpyDeviceToHost));
+
+	pDstOutput += nScanLength;
+	writeMarker(0x0D9, pDstOutput);
+
+	{
+		// Write result to file.
+		std::ofstream outputFile(szOutputFile, ios::out | ios::binary);
+		outputFile.write(reinterpret_cast<const char *>(pDstJpeg), static_cast<int>(pDstOutput - pDstJpeg));
+	}
+
+	// Cleanup
+	cudaFree(pJpegEncoderTemp);
+	delete[] pDstJpeg;
+	for (int i = 0; i < 3; ++i)
+	{
+		cudaFree(apdDCT[i]);
+		cudaFree(apSrcImage[i]);
+	}
+
+	return EXIT_SUCCESS;
+}
@@ -10,8 +10,6 @@
 #include <chrono>
-
-
 #include <unistd.h>
@@ -24,6 +22,7 @@
 #include "arpa/inet.h"
 #endif
+#include "utiltools.hpp"
 #define MIN_RTP_PORT		10000
 #define MAX_RTP_PORT		60000
@@ -88,7 +87,7 @@ int sum2 = 0;
 cudaStream_t stream[2];
-string data_home = "/data/tongtu/";
+string data_home = "/mnt/data/cmhu/tmp/";
 #define checkCudaErrors(S) do {CUresult  status; \
@@ -183,7 +182,7 @@ void postDecoded(const void * userPtr, AVFrame * gpuFrame){
                     cudaError_t cudaStatus;
                     if(pHwRgb[0] == nullptr){
                         // cudaStreamCreate(&stream[0]);
-                        cuda_common::setColorSpace( ITU709, 0 );
+                        cuda_common::setColorSpace( ITU_709, 0 );
                         cudaStatus = cudaMalloc((void **)&pHwRgb[0], 3 * gpuFrame->width * gpuFrame->height * sizeof(unsigned char));
                     }
                     cudaStatus = cuda_common::CUDAToBGR((CUdeviceptr)gpuFrame->data[0],(CUdeviceptr)gpuFrame->data[1], gpuFrame->linesize[0], gpuFrame->linesize[1], pHwRgb[0], gpuFrame->width, gpuFrame->height);
@@ -208,7 +207,7 @@ void postDecoded(const void * userPtr, AVFrame * gpuFrame){
                     cudaError_t cudaStatus;
                     if(pHwRgb[1] == nullptr){
                         // cudaStreamCreate(&stream[1]);
-                        cuda_common::setColorSpace( ITU709, 0 );
+                        cuda_common::setColorSpace( ITU_709, 0 );
                         cudaStatus = cudaMalloc((void **)&pHwRgb[1], 3 * gpuFrame->width * gpuFrame->height * sizeof(unsigned char));
                     }
                     cudaStatus = cuda_common::CUDAToBGR((CUdeviceptr)gpuFrame->data[0],(CUdeviceptr)gpuFrame->data[1], gpuFrame->linesize[0], gpuFrame->linesize[1], pHwRgb[1], gpuFrame->width, gpuFrame->height);
@@ -231,13 +230,6 @@ bool count_flag = false;
 int count = 0;
 int count_std = 100;
-static long long get_cur_time(){
-    // 获取操作系统当前时间点（精确到ms）
-    chrono::time_point<chrono::system_clock, chrono::milliseconds> tpMicro
-        = chrono::time_point_cast<chrono::milliseconds>(chrono::system_clock::now());
-
-    return tpMicro.time_since_epoch().count();
-}
 static int sum = 0;
 unsigned char *pHwData = nullptr;
@@ -255,13 +247,13 @@ void postDecoded0(const void * userPtr, AVFrame * gpuFrame){
             {
                 count_flag = true;
                 count = 0;
-                end_time = start_time = get_cur_time();
+                end_time = start_time = UtilTools::get_cur_time_ms();
             }
             count++;
             sum ++ ;
             if (count >= count_std)
             {
-                // end_time = get_cur_time();
+                // end_time = UtilTools::get_cur_time_ms();
                 // long time_using = end_time - start_time;
                 // double time_per_frame = double(time_using)/count_std ;
                 // cout << count_std << "帧用时:" << time_using << "ms 每帧用时：" << time_per_frame << "ms" << endl;
@@ -278,7 +270,7 @@ void postDecoded0(const void * userPtr, AVFrame * gpuFrame){
                 // cout << "gpu id : " << decoder->m_cfg.gpuid.c_str() << endl;
                 cudaError_t cudaStatus;
                 if(pHwData == nullptr){
-                    cuda_common::setColorSpace( ITU709, 0 );
+                    cuda_common::setColorSpace( ITU_709, 0 );
                     cudaStatus = cudaMalloc((void **)&pHwData, 3 * gpuFrame->width * gpuFrame->height * sizeof(unsigned char));
                 }
                 cudaStatus = cuda_common::CUDAToBGR((CUdeviceptr)gpuFrame->data[0],(CUdeviceptr)gpuFrame->data[1], gpuFrame->linesize[0], gpuFrame->linesize[1], pHwData, gpuFrame->width, gpuFrame->height);
@@ -296,10 +288,10 @@ void postDecoded0(const void * userPtr, AVFrame * gpuFrame){
 }
 void decode_finished_cbk(const void* userPtr){
-    cout << "当前时间戳: " << get_cur_time() << endl;
+    cout << "当前时间戳: " << UtilTools::get_cur_time_ms() << endl;
 }
-bool decode_request_stream_cbk(){
+bool decode_request_stream_cbk(const char* deviceId){
     cout << "需在此请求流" << endl;
     return true;
 }
@@ -374,7 +366,7 @@ void logFF(void *, int level, const char *fmt, va_list ap)
 int main(int argc, char* argv[]){
-    test_uri = argv[1];
+    test_uri = "rtsp://admin:admin@123456@192.168.60.176:554/cam/realmonitor?channel=1&subtype=0";//argv[1];
     char* gpuid = argv[2];
     int port = atoi(argv[3]);
     cout << test_uri << "   gpu_id:" << gpuid << "   port:" << port << endl;
@@ -393,7 +385,7 @@ int main(int argc, char* argv[]){
                 std::this_thread::sleep_for(std::chrono::minutes(1));
                 FFNvDecoderManager* pDecManager = FFNvDecoderManager::getInstance();
                 int count = pDecManager->count();
-                cout << "当前时间：" << get_cur_time() << "  当前运行路数： " << pDecManager->count() << endl;
+                cout << "当前时间：" << UtilTools::get_cur_time_ms() << "  当前运行路数： " << pDecManager->count() << endl;
             }  
             return (void*)0;
+#ifndef _UTIL_TOOLS_HPP_
+#define _UTIL_TOOLS_HPP_
+
+#include<chrono>
+
+using namespace std;
+
+namespace UtilTools{
+
+    static long get_cur_time_ms() {
+        chrono::time_point<chrono::system_clock, chrono::milliseconds> tpMicro
+            = chrono::time_point_cast<chrono::milliseconds>(chrono::system_clock::now());
+        return tpMicro.time_since_epoch().count();
+    }
+
+}
+
+#endif
 \ No newline at end of file