From 5686354a692a7987f1bb58f2dbe10f39dcbcbd1e Mon Sep 17 00:00:00 2001
From: fiss <2657262686@qq.com>
Date: Mon, 20 Mar 2023 23:30:34 -0400
Subject: [PATCH] 初步编译成功cuvid部分的

---
 src/Makefile                         |    8 ++++----
 src/Makefile.bak                     |    4 ++--
 src/demo/Makefile                    |  104 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------------------
 src/demo/Makefile.o.nvdec            |   61 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/demo/main_nvdec.cpp              |  101 ++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------------------------------
 src/gb28181/FFGB28181Decoder.cpp     |    7 ++++---
 src/gb28181/Makefile                 |    4 +++-
 src/interface/FFNvDecoderManager.cpp |    2 +-
 src/interface/Makefile               |   10 ++++++----
 src/nvdec/DrawImageOnGPU.cu          |  126 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/nvdec/FFCuContextManager.cpp     |   41 +++++++++++++++++++++++++++++++++++++++++
 src/nvdec/FFCuContextManager.h       |   28 ++++++++++++++++++++++++++++
 src/nvdec/FFNvDecoder.cpp            |
 src/nvdec/FFNvDecoder.h              |  107 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/nvdec/GpuRgbMemory.hpp           |   34 ++++++++++++++++++++++++++++++++++
 src/nvdec/ImageSaveGPU.cpp           |  123 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/nvdec/ImageSaveGPU.h             |   65 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/nvdec/Makefile                   |  101 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/nvdec/NV12ToRGB.cu               |  345 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/nvdec/NvDecoderApi.cpp           |  133 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/nvdec/NvDecoderApi.h             |   44 ++++++++++++++++++++++++++++++++++++++++++++
 src/nvdec/NvJpegEncoder.cpp          |   90 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/nvdec/NvJpegEncoder.h            |    3 +++
 src/nvdec/PartMemCopy.cu             |  289 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/nvdec/RGB2YUV.cu                 |  263 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/nvdec/ResizeImage.cu             |   84 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/nvdec/common_header.h            |    9 +++++++++
 src/nvdec/cuda_kernels.h             |   63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/nvdec/define.hpp                 |   11 +++++++++++
 src/nvdec/jpegNPP.cpp-1              |
 src/nvdecoder/DrawImageOnGPU.cu      |  126 ------------------------------------------------------------------------------------------------------------------------------
 src/nvdecoder/FFCuContextManager.cpp |   41 -----------------------------------------
 src/nvdecoder/FFCuContextManager.h   |   28 ----------------------------
 src/nvdecoder/FFNvDecoder.cpp        |
 src/nvdecoder/FFNvDecoder.h          |  107 -----------------------------------------------------------------------------------------------------------
 src/nvdecoder/GpuRgbMemory.hpp       |   34 ----------------------------------
 src/nvdecoder/ImageSaveGPU.cpp       |  123 ---------------------------------------------------------------------------------------------------------------------------
 src/nvdecoder/ImageSaveGPU.h         |   65 -----------------------------------------------------------------
 src/nvdecoder/Makefile               |  102 ------------------------------------------------------------------------------------------------------
 src/nvdecoder/NV12ToRGB.cu           |  345 ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 src/nvdecoder/NvDecoderApi.cpp       |  133 -------------------------------------------------------------------------------------------------------------------------------------
 src/nvdecoder/NvDecoderApi.h         |   44 --------------------------------------------
 src/nvdecoder/NvJpegEncoder.cpp      |   90 ------------------------------------------------------------------------------------------
 src/nvdecoder/NvJpegEncoder.h        |    3 ---
 src/nvdecoder/PartMemCopy.cu         |  289 -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 src/nvdecoder/RGB2YUV.cu             |  263 -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 src/nvdecoder/ResizeImage.cu         |   84 ------------------------------------------------------------------------------------
 src/nvdecoder/common_header.h        |    9 ---------
 src/nvdecoder/cuda_kernels.h         |   63 ---------------------------------------------------------------
 src/nvdecoder/define.hpp             |   11 -----------
 src/nvdecoder/jpegNPP.cpp-1          |
 51 files changed, 3857 insertions(+), 3775 deletions(-)
 create mode 100644 src/demo/Makefile.o.nvdec
 create mode 100644 src/nvdec/DrawImageOnGPU.cu
 create mode 100644 src/nvdec/FFCuContextManager.cpp
 create mode 100644 src/nvdec/FFCuContextManager.h
 create mode 100644 src/nvdec/FFNvDecoder.cpp
 create mode 100644 src/nvdec/FFNvDecoder.h
 create mode 100644 src/nvdec/GpuRgbMemory.hpp
 create mode 100644 src/nvdec/ImageSaveGPU.cpp
 create mode 100644 src/nvdec/ImageSaveGPU.h
 create mode 100644 src/nvdec/Makefile
 create mode 100644 src/nvdec/NV12ToRGB.cu
 create mode 100644 src/nvdec/NvDecoderApi.cpp
 create mode 100644 src/nvdec/NvDecoderApi.h
 create mode 100644 src/nvdec/NvJpegEncoder.cpp
 create mode 100644 src/nvdec/NvJpegEncoder.h
 create mode 100644 src/nvdec/PartMemCopy.cu
 create mode 100644 src/nvdec/RGB2YUV.cu
 create mode 100644 src/nvdec/ResizeImage.cu
 create mode 100644 src/nvdec/common_header.h
 create mode 100644 src/nvdec/cuda_kernels.h
 create mode 100644 src/nvdec/define.hpp
 create mode 100644 src/nvdec/jpegNPP.cpp-1
 delete mode 100644 src/nvdecoder/DrawImageOnGPU.cu
 delete mode 100644 src/nvdecoder/FFCuContextManager.cpp
 delete mode 100644 src/nvdecoder/FFCuContextManager.h
 delete mode 100644 src/nvdecoder/FFNvDecoder.cpp
 delete mode 100644 src/nvdecoder/FFNvDecoder.h
 delete mode 100644 src/nvdecoder/GpuRgbMemory.hpp
 delete mode 100644 src/nvdecoder/ImageSaveGPU.cpp
 delete mode 100644 src/nvdecoder/ImageSaveGPU.h
 delete mode 100644 src/nvdecoder/Makefile
 delete mode 100644 src/nvdecoder/NV12ToRGB.cu
 delete mode 100644 src/nvdecoder/NvDecoderApi.cpp
 delete mode 100644 src/nvdecoder/NvDecoderApi.h
 delete mode 100644 src/nvdecoder/NvJpegEncoder.cpp
 delete mode 100644 src/nvdecoder/NvJpegEncoder.h
 delete mode 100644 src/nvdecoder/PartMemCopy.cu
 delete mode 100644 src/nvdecoder/RGB2YUV.cu
 delete mode 100644 src/nvdecoder/ResizeImage.cu
 delete mode 100644 src/nvdecoder/common_header.h
 delete mode 100644 src/nvdecoder/cuda_kernels.h
 delete mode 100644 src/nvdecoder/define.hpp
 delete mode 100644 src/nvdecoder/jpegNPP.cpp-1

diff --git a/src/Makefile b/src/Makefile
index 2b6c8d8..994ba2a 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -29,11 +29,11 @@ LDFLAGS:=
 LIBS:= -L $(SPDLOG_ROOT)/lib -l:libspdlog.a \
 
 # 各个模块
-MODULES:= nvdecoder gb28181 interface demo
+MODULES:= nvdec gb28181 interface demo
 
 # 各个模块对应的库
-# MODULE_LIBS:=$(BUILD_DIR)/nvdecoder/lib/nvdecoder.a\
-# 			$(BUILD_DIR)/nvdecoder/lib/gb28181.a\
+# MODULE_LIBS:=$(BUILD_DIR)/nvdec/lib/nvdec.a\
+# 			$(BUILD_DIR)/nvdec/lib/gb28181.a\
 # 			$(BUILD_DIR)/interface/lib/interface.a\
 
 # 最终目标文件
@@ -46,7 +46,7 @@ all:$(TARGET)
 # 最终目标依赖关系
 $(TARGET):FORCE | $(BIN_DIR)
 	@for n in $(MODULES); do make -s -f $(TOP_DIR)/$$n/Makefile MODULE=$$n || exit "$$?"; done
-# @echo -e "\e[32m""Linking executable $(TARGET)""\e[0m"
+	@echo -e "\e[32m""Linking executable $(TARGET)""\e[0m"
 #@$(LD) $(LDFLAGS) -o $@ $(MODULE_LIBS) $(LIBS)
 
 # 若没有bin目录则自动生成
diff --git a/src/Makefile.bak b/src/Makefile.bak
index bddc482..af26493 100644
--- a/src/Makefile.bak
+++ b/src/Makefile.bak
@@ -38,7 +38,7 @@ CFLAGS= -g -fPIC -O0 $(INCLUDE) -pthread -lrt -lz -std=c++11 -fvisibility=hidden
 NFLAGS_LIB=-g -c -shared -Xcompiler -fPIC -Xcompiler -fvisibility=hidden
 NFLAGS = $(NFLAGS_LIB) $(INCLUDE) -std=c++11
 
-SRCS:=$(wildcard $(SRC_ROOT)/nvdecoder/*.cpp) \
+SRCS:=$(wildcard $(SRC_ROOT)/nvdec/*.cpp) \
 		$(wildcard $(SRC_ROOT)/gb28181/*.cpp) \
     $(wildcard $(SRC_ROOT)/dvpp/*.cpp) 
 OBJS = $(patsubst %.cpp, %.o, $(notdir $(SRCS)))
@@ -52,7 +52,7 @@ $(TARGET):$(OBJS) $(CU_OBJS)
 	$(XX) -o $@ $^ $(CFLAGS)  $(LIBSPATH) $(LIBS) -Wwrite-strings
 	rm -f *.o
 
-# %.o:$(SRC_ROOT)/nvdecoder/%.cpp
+# %.o:$(SRC_ROOT)/nvdec/%.cpp
 # 	$(XX) $(CFLAGS) -c $<
 
 %.o:$(SRC_ROOT)/gb28181/%.cpp
diff --git a/src/demo/Makefile b/src/demo/Makefile
index b19cdb2..e608a63 100644
--- a/src/demo/Makefile
+++ b/src/demo/Makefile
@@ -1,25 +1,41 @@
-XX = g++
+# 各项目录
+LIB_DIR:=$(BUILD_DIR)/$(MODULE)/lib
+DEP_DIR:=$(BUILD_DIR)/$(MODULE)/.dep
+OBJ_DIR:=$(BUILD_DIR)/$(MODULE)/obj
+SRC_DIR:=$(TOP_DIR)/$(MODULE)
 
+# 源文件以及中间目标文件和依赖文件
+SRCS:=$(notdir $(wildcard $(SRC_DIR)/*.cpp))
+OBJS:=$(addprefix $(OBJ_DIR)/, $(patsubst %.cpp, %.o, $(SRCS)))
+DEPS:=$(addprefix $(DEP_DIR)/, $(patsubst %.cpp, %.d, $(SRCS)))
 
-PROJECT_ROOT= /mnt/data/cmhu/FFNvDecoder
+# 自动生成头文件依赖选项
+DEPFLAGS=-MT $@ -MMD -MP -MF $(DEP_DIR)/$*.d
 
-CUDA_ROOT = /usr/local/cuda-11.1
+# 最终目标文件
+TARGET:=/mnt/data/cmhu/FFNvDecoder/bin/lib/demo
+
+
+
+PROJECT_ROOT= /mnt/data/cmhu/FFNvDecoder
 
 DEPEND_DIR = $(PROJECT_ROOT)/bin
 THIRDPARTY_ROOT = $(PROJECT_ROOT)/3rdparty
 SPDLOG_ROOT = $(THIRDPARTY_ROOT)/spdlog-1.9.2/release
 JRTP_ROOT = $(THIRDPARTY_ROOT)/jrtp_export
 
-SRC_ROOT = $(PROJECT_ROOT)/src
+CUDA_ROOT = /usr/local/cuda-11.1
 
-TARGET= $(PROJECT_ROOT)/bin/lib/demo
+LIBSPATH= -L $(DEPEND_DIR)/lib -lavformat -lavcodec -lswscale -lavutil -lavfilter -lswresample -lavdevice \
+   -L $(CUDA_ROOT)/lib64 -lcuda -lcudart -lnvcuvid -lcurand -lcublas -lnvjpeg \
+   -L $(SPDLOG_ROOT) -l:libspdlog.a \
+   -L $(JRTP_ROOT)/jthread/lib -l:libjthread.a \
+   -L $(JRTP_ROOT)/jrtplib/lib -l:libjrtp.a
 
-DEFS = -DENABLE_DVPP_INTERFACE
 
-INCLUDE= -I $(SRC_ROOT)/interface \
-		-I $(SRC_ROOT)/dvpp \
-		-I $(SRC_ROOT)/gb28181 \
-		-I $(SRC_ROOT)/nvdecoder \
+INCLUDE= -I $(TOP_DIR)/interface \
+		-I $(TOP_DIR)/nvdec \
+		-I $(TOP_DIR)/gb28181 \
 		-I $(DEPEND_DIR)/include \
 		-I $(CUDA_ROOT)/include \
 		-I $(TOP_DIR)/common/inc \
@@ -29,51 +45,53 @@ INCLUDE= -I $(SRC_ROOT)/interface \
 		-I $(JRTP_ROOT)/jrtplib/include/jrtplib3 \
 		-I $(JRTP_ROOT)/jthread/include/jthread
 
-LIBSPATH= -L $(DEPEND_DIR)/lib -lavformat -lavcodec -lswscale -lavutil -lavfilter -lswresample -lavdevice \
+LIBSPATH= -L $(BUILD_DIR)/interface/lib -l:interface.a \
+		-L $(BUILD_DIR)/nvdec/lib -l:nvdec.a \
+		-L $(BUILD_DIR)/gb28181/lib -l:gb28181.a \
+		-L $(DEPEND_DIR)/lib -lavformat -lavcodec -lswscale -lavutil -lavfilter -lswresample -lavdevice \
 		-L $(CUDA_ROOT)/lib64 -lcuda -lcudart -lnvcuvid -lcurand -lcublas -lnvjpeg \
 		-L $(SPDLOG_ROOT) -l:libspdlog.a \
 		-L $(JRTP_ROOT)/jthread/lib -l:libjthread.a \
 		-L $(JRTP_ROOT)/jrtplib/lib -l:libjrtp.a
 
+CXXFLAGS= -g -O0 -fPIC $(INCLUDE) -lpthread -lrt -lz -fexceptions -std=c++11 -fvisibility=hidden -Wl,-Bsymbolic -ldl -Wwrite-strings
+	# -DUNICODE -D_UNICODE
 
-# include_dir=-I/usr/local/Ascend/ascend-toolkit/6.3.RC1.alpha001/runtime/include
-
-# lib_dir=-L/usr/lib \
-# 		-L/usr/local/lib \
-# 		-L/usr/local/Ascend/driver/lib64 \
-# 		-L/usr/local/Ascend/ascend-toolkit/6.3.RC1.alpha001/atc/lib64\
-# 		-L/usr/local/Ascend/ascend-toolkit/6.3.RC1.alpha001/runtime/lib64 \
-# 		-L/usr/local/Ascend/ascend-toolkit/6.3.RC1.alpha001/runtime/lib64/stub \
-# 		-L/usr/local/Ascend/ascend-toolkit/6.3.RC1.alpha001/lib64 \
-# 		-L/usr/local/Ascend/driver/lib64/driver
-		
-# lib=-lacl_dvpp -lascendcl -lmmpa -lglog -lgflags -lpthread -lz -lacl_dvpp_mpi -lruntime -lascendalog -lc_sec -lmsprofiler -lgert -lge_executor -lge_common \
-# 	-lgraph -lascend_protobuf -lprofapi -lerror_manager -lexe_graph -lregister -lplatform
-# LIBS= -lavformat -lavcodec -lswscale -lavutil -lavfilter -lswresample -lavdevice
-# CXXFLAGS= -g -O0 -fPIC $(INCLUDE) $(include_dir) $(DEFS) -lpthread -lrt -lz -fexceptions -std=c++11 -fvisibility=hidden -Wl,-Bsymbolic -ldl
+# 默认最终目标
+.PHONY:all
+all:$(TARGET)
 
-CXXFLAGS= -g -O0 -fPIC $(INCLUDE) $(DEFS) -lpthread -lrt -lz -fexceptions -std=c++11 -fvisibility=hidden -Wl,-Bsymbolic -ldl
+# Link the demo executable from its objects; dependency-generation flags belong to the compile rule only
+$(TARGET): $(OBJS) | $(LIB_DIR)
+	@echo -e "\e[32m""Linking executable $(TARGET)""\e[0m"
+	@echo -e "$(CXX) -o $@ $^ $(CXXFLAGS) $(LIBSPATH) $(MACROS)"
+	$(CXX) -o $@ $^ $(CXXFLAGS) $(LIBSPATH) $(MACROS)
 
-SRCS:=$(wildcard $(SRC_ROOT)/demo/*.cpp)
-OBJS = $(patsubst %.cpp, %.o, $(notdir $(SRCS)))
+# 若没有lib目录则自动生成
+$(LIB_DIR):
+	@mkdir -p $@
 
-OBJ_ROOT = $(PROJECT_ROOT)/src/build
-# DVPP_SRCS:=$(wildcard $(OBJ_ROOT)/dvpp/obj/*.o)
-INTEFACE_SRCS:=$(wildcard $(OBJ_ROOT)/interface/obj/*.o)
-NVDECODER_SRCS:=$(wildcard $(OBJ_ROOT)/nvdecoder/obj/*.o)
-GB28181_SRCS:=$(wildcard $(OBJ_ROOT)/gb28181/obj/*.o)
+# 生成中间目标文件
+$(OBJ_DIR)/%.o:$(SRC_DIR)/%.cpp $(DEP_DIR)/%.d | $(OBJ_DIR) $(DEP_DIR)
+	@echo -e "\e[33m""Building object $@""\e[0m"
+	@echo -e "$(CXX) -c $(DEPFLAGS) $(CXXFLAGS) $(INCS) $(LIBSPATH) $(MACROS) -o $@ $<"
+	$(CXX) -c $(DEPFLAGS) $(CXXFLAGS) $(INCS) $(LIBSPATH) $(MACROS) -o $@ $<
 
+# 若没有obj目录则自动生成
+$(OBJ_DIR):
+	@mkdir -p $@
 
-$(TARGET):$(OBJS) $(INTEFACE_SRCS) $(NVDECODER_SRCS) $(GB28181_SRCS)
-	rm -f $(TARGET)
-# @echo -e "\e[33m""Building object $@""\e[0m"
-# $(XX) -o $@ $^ $(CXXFLAGS) $(LIBS) $(lib_dir) $(lib) -Wwrite-strings
-	$(XX) -o $@ $^ $(CXXFLAGS) $(LIBSPATH) -Wwrite-strings
-	rm -f *.o
+# 若没有.dep目录则自动生成
+$(DEP_DIR):
+	@mkdir -p $@
 
-%.o:$(SRC_ROOT)/demo/%.cpp
-	$(XX) $(CXXFLAGS) -c $<
+# 依赖文件会在生成中间文件的时候自动生成,这里只是为了防止报错
+$(DEPS):
 
+# 引入中间目标文件头文件依赖关系
+include $(wildcard $(DEPS))
 
+# 直接删除组件build目录
+.PHONY:clean
 clean:
-	rm -f *.o $(TARGET)
\ No newline at end of file
+	@rm -rf $(BUILD_DIR)/$(MODULE)
diff --git a/src/demo/Makefile.o.nvdec b/src/demo/Makefile.o.nvdec
new file mode 100644
index 0000000..a40488b
--- /dev/null
+++ b/src/demo/Makefile.o.nvdec
@@ -0,0 +1,61 @@
+XX = g++
+# Stand-alone build of the demo binary: compiles src/demo/*.cpp into the current directory and links the
+# result with the object files found under src/build/{interface,nvdec,gb28181}/obj.
+PROJECT_ROOT= /mnt/data/cmhu/FFNvDecoder
+
+CUDA_ROOT = /usr/local/cuda-11.1
+
+DEPEND_DIR = $(PROJECT_ROOT)/bin
+THIRDPARTY_ROOT = $(PROJECT_ROOT)/3rdparty
+SPDLOG_ROOT = $(THIRDPARTY_ROOT)/spdlog-1.9.2/release
+JRTP_ROOT = $(THIRDPARTY_ROOT)/jrtp_export
+
+SRC_ROOT = $(PROJECT_ROOT)/src
+
+TARGET= $(PROJECT_ROOT)/bin/lib/demo
+
+DEFS = -DENABLE_DVPP_INTERFACE
+# NOTE(review): TOP_DIR is not assigned in this file; the three TOP_DIR include paths depend on the caller or environment - confirm.
+INCLUDE= -I $(SRC_ROOT)/interface \
+		-I $(SRC_ROOT)/dvpp \
+		-I $(SRC_ROOT)/gb28181 \
+		-I $(SRC_ROOT)/nvdec \
+		-I $(DEPEND_DIR)/include \
+		-I $(CUDA_ROOT)/include \
+		-I $(TOP_DIR)/common/inc \
+		-I $(TOP_DIR)/common/UtilNPP \
+		-I $(TOP_DIR)/ \
+		-I $(SPDLOG_ROOT)/include \
+		-I $(JRTP_ROOT)/jrtplib/include/jrtplib3 \
+		-I $(JRTP_ROOT)/jthread/include/jthread
+
+LIBSPATH= -L $(DEPEND_DIR)/lib -lavformat -lavcodec -lswscale -lavutil -lavfilter -lswresample -lavdevice \
+		-L $(CUDA_ROOT)/lib64 -lcuda -lcudart -lnvcuvid -lcurand -lcublas -lnvjpeg \
+		-L $(SPDLOG_ROOT) -l:libspdlog.a \
+		-L $(JRTP_ROOT)/jthread/lib -l:libjthread.a \
+		-L $(JRTP_ROOT)/jrtplib/lib -l:libjrtp.a
+
+
+CXXFLAGS= -g -O0 -fPIC $(INCLUDE) $(DEFS) -lpthread -lrt -lz -fexceptions -std=c++11 -fvisibility=hidden -Wl,-Bsymbolic -ldl
+
+SRCS:=$(wildcard $(SRC_ROOT)/demo/*.cpp)
+OBJS = $(patsubst %.cpp, %.o, $(notdir $(SRCS)))
+# Object files picked up from the module build directories; they are linked directly into the demo.
+OBJ_ROOT = $(PROJECT_ROOT)/src/build
+INTEFACE_SRCS:=$(wildcard $(OBJ_ROOT)/interface/obj/*.o)
+NVDECODER_SRCS:=$(wildcard $(OBJ_ROOT)/nvdec/obj/*.o)
+GB28181_SRCS:=$(wildcard $(OBJ_ROOT)/gb28181/obj/*.o)
+
+# Link step: removes the previous binary first and deletes the local *.o files afterwards.
+$(TARGET):$(OBJS) $(INTEFACE_SRCS) $(NVDECODER_SRCS) $(GB28181_SRCS)
+	rm -f $(TARGET)
+	@echo -e "\e[33m""Building object $@""\e[0m"
+	$(XX) -o $@ $^ $(CXXFLAGS) $(LIBSPATH) -Wwrite-strings
+	rm -f *.o
+
+%.o:$(SRC_ROOT)/demo/%.cpp
+	$(XX) $(CXXFLAGS) -c $<
+
+# Removes the local object files and the demo binary.
+clean:
+	rm -f *.o $(TARGET)
\ No newline at end of file
diff --git a/src/demo/main_nvdec.cpp b/src/demo/main_nvdec.cpp
index db2da61..227bc8a 100644
--- a/src/demo/main_nvdec.cpp
+++ b/src/demo/main_nvdec.cpp
@@ -1,9 +1,8 @@
 #include "FFNvDecoderManager.h"
 #include <iostream>
 
-#include "cuda_kernels.h"
-
-#include "NvJpegEncoder.h"
+// #include "cuda_kernels.h"
+// #include "NvJpegEncoder.h"
 
 #include <pthread.h>
 #include <thread>
@@ -85,75 +84,75 @@ unsigned char *pHwRgb[2] = {nullptr, nullptr};
 int sum1 = 0;
 int sum2 = 0;
 
-cudaStream_t stream[2];
+// cudaStream_t stream[2];
 
 string data_home = "/mnt/data/cmhu/tmp/";
 
 
-#define checkCudaErrors(S) do {CUresult  status; \
-        status = S; \
-        if (status != CUDA_SUCCESS ) std::cout << __LINE__ <<" checkCudaErrors - status = " << status << std::endl; \
-        } while (false)
+// #define checkCudaErrors(S) do {CUresult  status; \
+//         status = S; \
+//         if (status != CUDA_SUCCESS ) std::cout << __LINE__ <<" checkCudaErrors - status = " << status << std::endl; \
+//         } while (false)
 
 
-static void gpu_helper(int gpuid)
-{
-    cudaSetDevice(gpuid);
+// static void gpu_helper(int gpuid)
+// {
+//     cudaSetDevice(gpuid);
 
-    // int *dn;
-    // cudaMalloc((void **)&dn, 1 * sizeof(int));
+//     // int *dn;
+//     // cudaMalloc((void **)&dn, 1 * sizeof(int));
 
-	size_t free_byte;
-	size_t total_byte;
+// 	size_t free_byte;
+// 	size_t total_byte;
 
-	CUresult cuda_status = cuMemGetInfo(&free_byte, &total_byte);
+// 	CUresult cuda_status = cuMemGetInfo(&free_byte, &total_byte);
 
-	const char *pStr = nullptr;
-	if (CUDA_SUCCESS != cuda_status) {
-		cuGetErrorString(cuda_status, &pStr);
-		printf("Error: cudaMemGetInfo fails, %s \n", pStr);
-		return;
-	}
+// 	const char *pStr = nullptr;
+// 	if (CUDA_SUCCESS != cuda_status) {
+// 		cuGetErrorString(cuda_status, &pStr);
+// 		printf("Error: cudaMemGetInfo fails, %s \n", pStr);
+// 		return;
+// 	}
 
-	double free_db = (double)free_byte;
-	double total_db = (double)total_byte;
-	double used_db_1 = (total_db - free_db) / 1024.0 / 1024.0;
+// 	double free_db = (double)free_byte;
+// 	double total_db = (double)total_byte;
+// 	double used_db_1 = (total_db - free_db) / 1024.0 / 1024.0;
 
-	std::cout <<"显存已使用 " << used_db_1 << " MB\n";
+// 	std::cout <<"显存已使用 " << used_db_1 << " MB\n";
 
-    // cudaFree(dn);
-}
+//     // cudaFree(dn);
+// }
 
-int CheckCUDAProperty( int devId )
-{
-    cuInit(0);
+// int CheckCUDAProperty( int devId )
+// {
+//     cuInit(0);
 
-	CUdevice dev = devId;
-	size_t memSize = 0;
-	char devName[256] = {0};
-	int major = 0, minor = 0;
-	CUresult rlt = CUDA_SUCCESS;
+// 	CUdevice dev = devId;
+// 	size_t memSize = 0;
+// 	char devName[256] = {0};
+// 	int major = 0, minor = 0;
+// 	CUresult rlt = CUDA_SUCCESS;
 
-    rlt = cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev);
-    checkCudaErrors( rlt );
+//     rlt = cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev);
+//     checkCudaErrors( rlt );
 
-    rlt = cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev);
-	checkCudaErrors( rlt );
+//     rlt = cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev);
+// 	checkCudaErrors( rlt );
 
-	rlt = cuDeviceGetName( devName, sizeof( devName ), dev );
-	checkCudaErrors( rlt );
+// 	rlt = cuDeviceGetName( devName, sizeof( devName ), dev );
+// 	checkCudaErrors( rlt );
 
-	printf( "Using GPU Device %d: %s has SM %d.%d compute capability\n",
-		    dev, devName, major, minor );
+// 	printf( "Using GPU Device %d: %s has SM %d.%d compute capability\n",
+// 		    dev, devName, major, minor );
 
-	rlt = cuDeviceTotalMem( &memSize, dev );
-	checkCudaErrors( rlt );
+// 	rlt = cuDeviceTotalMem( &memSize, dev );
+// 	checkCudaErrors( rlt );
 
-	printf( "Total amount of global memory:   %4.4f MB\n",
-		   (float)memSize / ( 1024 * 1024 ) );
+// 	printf( "Total amount of global memory:   %4.4f MB\n",
+// 		   (float)memSize / ( 1024 * 1024 ) );
 
-	return 0;
-}
+// 	return 0;
+// }
 
 /**
  * 注意: gpuFrame 在解码器设置的显卡上,后续操作要十分注意这一点,尤其是多线程情况
@@ -378,7 +377,7 @@ int main(int argc, char* argv[]){
 
     // av_log_set_callback(&logFF);
 
-    CheckCUDAProperty(atoi(gpuid));
+    // CheckCUDAProperty(atoi(gpuid));
 
     pthread_t m_decode_thread;
     pthread_create(&m_decode_thread,0,
diff --git a/src/gb28181/FFGB28181Decoder.cpp b/src/gb28181/FFGB28181Decoder.cpp
index a4ea5da..3ff9edd 100644
--- a/src/gb28181/FFGB28181Decoder.cpp
+++ b/src/gb28181/FFGB28181Decoder.cpp
@@ -2,7 +2,7 @@
 #include <iostream>
 #include "FFGB28181Decoder.h"
 
-#include "../nvdecoder/FFCuContextManager.h"
+
 
 extern "C" {
 #include "libavutil/avstring.h"
@@ -17,8 +17,9 @@ extern "C" {
 
 #include "common_header.h"
 
-#include "../nvdecoder/GpuRgbMemory.hpp"
-#include "../nvdecoder/cuda_kernels.h"
+#include "../nvdec/FFCuContextManager.h"
+#include "../nvdec/GpuRgbMemory.hpp"
+#include "../nvdec/cuda_kernels.h"
 
 #define ECLOSED 0
 #define ECLOSING 1
diff --git a/src/gb28181/Makefile b/src/gb28181/Makefile
index d15ead3..d154115 100644
--- a/src/gb28181/Makefile
+++ b/src/gb28181/Makefile
@@ -50,7 +50,8 @@ all:$(TARGET)
 # 生成最终目标
 $(TARGET):$(OBJS) | $(LIB_DIR)
 	@echo -e "\e[32m""Linking static library $(TARGET)""\e[0m"
-# @ar -rc $@ $^
+	@echo -e "ar -rc $@ $^"
+	@ar -rc $@ $^
 
 # 若没有lib目录则自动生成
 $(LIB_DIR):
@@ -59,6 +60,7 @@ $(LIB_DIR):
 # 生成中间目标文件
 $(OBJ_DIR)/%.o:$(SRC_DIR)/%.cpp $(DEP_DIR)/%.d | $(OBJ_DIR) $(DEP_DIR)
 	@echo -e "\e[33m""Building object $@""\e[0m"
+	@echo -e "$(CXX) -c $(DEPFLAGS) $(CXXFLAGS) $(INCS) $(LIBSPATH) $(MACROS) -o $@ $<"
 	@$(CXX) -c $(DEPFLAGS) $(CXXFLAGS) $(INCS) $(LIBSPATH) $(MACROS) -o $@ $<
 
 # 若没有obj目录则自动生成
diff --git a/src/interface/FFNvDecoderManager.cpp b/src/interface/FFNvDecoderManager.cpp
index b274b12..bd05873 100644
--- a/src/interface/FFNvDecoderManager.cpp
+++ b/src/interface/FFNvDecoderManager.cpp
@@ -1,7 +1,7 @@
 #include "FFNvDecoderManager.h"
 
 #ifdef USE_NVDEC
-#include "../nvdecoder/FFNvDecoder.h"
+#include "../nvdec/FFNvDecoder.h"
 #include "../gb28181/FFGB28181Decoder.h"
 #endif
 
diff --git a/src/interface/Makefile b/src/interface/Makefile
index 2b9ab32..5b38ce4 100644
--- a/src/interface/Makefile
+++ b/src/interface/Makefile
@@ -30,7 +30,7 @@ INCLUDE= -I $(DEPEND_DIR)/include \
   -I $(JRTP_ROOT)/jrtplib/include/jrtplib3 \
   -I $(JRTP_ROOT)/jthread/include/jthread \
   -I $(TOP_DIR)/src/gb28181 \
-  -I $(TOP_DIR)/src/nvdecoder \
+  -I $(TOP_DIR)/src/nvdec \
 
 LIBSPATH= -L $(DEPEND_DIR)/lib -lavformat -lavcodec -lswscale -lavutil -lavfilter -lswresample -lavdevice \
    -L $(CUDA_ROOT)/lib64 -lcuda -lcudart -lnvcuvid -lcurand -lcublas -lnvjpeg \
@@ -45,7 +45,7 @@ CXXFLAGS= -g -O0 -fPIC $(INCLUDE) $(DEFS) -lpthread -lrt -lz -fexceptions -std=c
 # 最终目标文件
 TARGET:=$(LIB_DIR)/$(MODULE).a
 
-# MODULE_LIBS:=$(BUILD_DIR)/nvdecoder/lib/nvdecoder.a \
+# MODULE_LIBS:=$(BUILD_DIR)/nvdec/lib/nvdec.a \
 # 			$(BUILD_DIR)/gb28181/lib/gb28181.a\
 
 # 默认最终目标
@@ -54,8 +54,9 @@ all:$(TARGET)
 
 # 生成最终目标
 $(TARGET):$(OBJS) | $(LIB_DIR)
-	# @echo -e "\e[32m""Linking static library $(TARGET)""\e[0m"
-# @ar -rc $@ $^
+	@echo -e "\e[32m""Linking static library $(TARGET)""\e[0m"
+	@echo -e "ar -rc $@ $^"
+	@ar -rc $@ $^
 
 # 若没有lib目录则自动生成
 $(LIB_DIR):
@@ -64,6 +65,7 @@ $(LIB_DIR):
 # 生成中间目标文件
 $(OBJ_DIR)/%.o:$(SRC_DIR)/%.cpp $(DEP_DIR)/%.d | $(OBJ_DIR) $(DEP_DIR)
 	@echo -e "\e[33m""Building object $@""\e[0m"
+	@echo -e "$(CXX) -c $(DEPFLAGS) $(CXXFLAGS) $(INCS) $(LIBSPATH) $(MACROS) -o $@ $<"
 # @$(CXX) -c $(DEPFLAGS) $(CXXFLAGS) $(INCS) $(LIBSPATH) $(MACROS) -o $@  $(MODULE_LIBS) $<
 	@$(CXX) -c $(DEPFLAGS) $(CXXFLAGS) $(INCS) $(LIBSPATH) $(MACROS) -o $@ $<
 
diff --git a/src/nvdec/DrawImageOnGPU.cu b/src/nvdec/DrawImageOnGPU.cu
new file mode 100644
index 0000000..1fa99dc
--- /dev/null
+++ b/src/nvdec/DrawImageOnGPU.cu
@@ -0,0 +1,126 @@
+#include "cuda_kernels.h"
+
+#include "../interface/logger.hpp"
+
+typedef unsigned char   uchar;
+typedef unsigned int    uint32;
+typedef int             int32;
+
+namespace cuda_common
+{
+	// One thread per pixel: writes 0/255/0 into planes 0/1/2 of a planar float image on the border of [left,right] x [top,bottom].
+	__global__ void kernel_drawPixel(float* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom)
+	{
+		const int x = blockIdx.x * blockDim.x + threadIdx.x;
+		const int y = blockIdx.y * blockDim.y + threadIdx.y;
+		// The grid is rounded up to whole blocks and the box may reach past the frame, so never write outside the image.
+		if (x < src_width && y < src_height && (((x == left || x == right) && y >= top && y <= bottom) || ((y == top || y == bottom) && x >= left && x <= right)))
+		{
+			d_srcRGB[(y*src_width) + x] = 0;
+			d_srcRGB[(src_width*src_height) + (y*src_width) + x] = 255;
+			d_srcRGB[(2 * src_width*src_height) + (y*src_width) + x] = 0;
+		}
+	}
+	// Launches the float box-outline kernel over the whole image and waits for it; returns the first CUDA error seen.
+	cudaError_t DrawImage(float* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom)
+	{
+		dim3 block(32, 16, 1);
+		dim3 grid((src_width + (block.x - 1)) / block.x, (src_height + (block.y - 1)) / block.y, 1);
+
+		kernel_drawPixel<<<grid, block>>>(d_srcRGB, src_width, src_height, left, top, right, bottom);
+		// A kernel launch reports configuration errors only through cudaGetLastError().
+		cudaError_t cudaStatus = cudaGetLastError();
+		if (cudaStatus != cudaSuccess) {
+			LOG_ERROR("kernel_drawPixel(float) launch failed: {}", cudaGetErrorString(cudaStatus));
+			return cudaStatus;
+		}
+		// Errors raised while the kernel runs surface at the next synchronizing call.
+		cudaStatus = cudaDeviceSynchronize();
+		if (cudaStatus != cudaSuccess) {
+			LOG_ERROR("cudaDeviceSynchronize failed after kernel_drawPixel(float): {}", cudaGetErrorString(cudaStatus));
+			return cudaStatus;
+		}
+
+		return cudaStatus;
+	}
+
+	// Same as the float kernel above, for a planar unsigned char image (plane stride is src_width*src_height).
+	__global__ void kernel_drawPixel(unsigned char* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom)
+	{
+		const int x = blockIdx.x * blockDim.x + threadIdx.x;
+		const int y = blockIdx.y * blockDim.y + threadIdx.y;
+		// Reject threads beyond the image before testing the box border.
+		if (x < src_width && y < src_height && (((x == left || x == right) && y >= top && y <= bottom) || ((y == top || y == bottom) && x >= left && x <= right)))
+		{
+			d_srcRGB[(y*src_width) + x] = 0;
+			d_srcRGB[(src_width*src_height) + (y*src_width) + x] = 255;
+			d_srcRGB[(2 * src_width*src_height) + (y*src_width) + x] = 0;
+		}
+	}
+	// Launches the unsigned char box-outline kernel over the whole image and waits for it; returns the first CUDA error seen.
+	cudaError_t DrawImage(unsigned char* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom)
+	{
+		dim3 block(32, 16, 1);
+		dim3 grid((src_width + (block.x - 1)) / block.x, (src_height + (block.y - 1)) / block.y, 1);
+
+		kernel_drawPixel<<<grid, block>>>(d_srcRGB, src_width, src_height, left, top, right, bottom);
+
+		cudaError_t cudaStatus = cudaGetLastError();
+		if (cudaStatus != cudaSuccess) {
+			LOG_ERROR("kernel_drawPixel(unsigned char) launch failed: {}", cudaGetErrorString(cudaStatus));
+			return cudaStatus;
+		}
+
+		cudaStatus = cudaDeviceSynchronize();
+		if (cudaStatus != cudaSuccess) {
+			LOG_ERROR("cudaDeviceSynchronize failed after kernel_drawPixel(unsigned char): {}", cudaGetErrorString(cudaStatus));
+			return cudaStatus;
+		}
+
+		return cudaStatus;
+	}
+
+	// One thread per pixel: writes 0/255/0 into planes 0/1/2 for pixels lying exactly on the segment (begin_x,begin_y)-(end_x,end_y).
+	__global__ void kernel_drawLine(float* d_srcRGB, int src_width, int src_height, int begin_x, int begin_y, int end_x, int end_y)
+	{
+		int min_x = end_x < begin_x ? end_x : begin_x;
+		int max_x = end_x < begin_x ? begin_x : end_x;
+
+		int min_y = end_y < begin_y ? end_y : begin_y;
+		int max_y = end_y < begin_y ? begin_y : end_y;
+
+		const int x = blockIdx.x * blockDim.x + threadIdx.x;
+		const int y = blockIdx.y * blockDim.y + threadIdx.y;
+		// Inside the image, exactly collinear with the end points (integer cross product is zero), and within their bounding box.
+		if (x < src_width && y < src_height && (x - begin_x) * (end_y - begin_y) == (end_x - begin_x) * (y - begin_y)
+			&& min_x <= x && x <= max_x
+			&& min_y <= y && y <= max_y)
+		{
+			d_srcRGB[(y*src_width) + x] = 0;
+			d_srcRGB[(src_width*src_height) + (y*src_width) + x] = 255;
+			d_srcRGB[(2 * src_width*src_height) + (y*src_width) + x] = 0;
+		}
+	}
+	// Launches the line kernel over the whole image and waits for it; returns the first CUDA error seen.
+	cudaError_t DrawLine(float* d_srcRGB, int src_width, int src_height, int begin_x, int begin_y, int end_x, int end_y)
+	{
+		dim3 block(32, 16, 1);
+		dim3 grid((src_width + (block.x - 1)) / block.x, (src_height + (block.y - 1)) / block.y, 1);
+
+		kernel_drawLine<<<grid, block>>>(d_srcRGB, src_width, src_height, begin_x, begin_y, end_x, end_y);
+
+		cudaError_t cudaStatus = cudaGetLastError();
+		if (cudaStatus != cudaSuccess) {
+			LOG_ERROR("kernel_drawLine launch failed: {}", cudaGetErrorString(cudaStatus));
+			return cudaStatus;
+		}
+
+		cudaStatus = cudaDeviceSynchronize();
+		if (cudaStatus != cudaSuccess) {
+			LOG_ERROR("cudaDeviceSynchronize failed after kernel_drawLine: {}", cudaGetErrorString(cudaStatus));
+			return cudaStatus;
+		}
+
+		return cudaStatus;
+	}
+}
\ No newline at end of file
diff --git a/src/nvdec/FFCuContextManager.cpp b/src/nvdec/FFCuContextManager.cpp
new file mode 100644
index 0000000..382c4d8
--- /dev/null
+++ b/src/nvdec/FFCuContextManager.cpp
@@ -0,0 +1,41 @@
+#include "FFCuContextManager.h"
+
+#include "common_header.h"
+
+using namespace std;
+
+extern "C"
+{
+	#include <libavcodec/avcodec.h> 
+	#include <libavdevice/avdevice.h> 
+	#include <libavformat/avformat.h> 
+	#include <libavfilter/avfilter.h> 
+	#include <libavutil/avutil.h> 
+    #include <libavutil/pixdesc.h> 
+	#include <libswscale/swscale.h>
+    #include <libavutil/imgutils.h>
+}
+
+// Drops the manager's reference to every cached device context, then empties the cache.
+FFCuContextManager::~FFCuContextManager()
+{
+    for (auto& entry : ctxMap)
+        av_buffer_unref(&entry.second);
+    ctxMap.clear();
+}
+
+// Returns the cached CUDA hardware device context for gpuid, creating and caching it on first use; nullptr on failure.
+AVBufferRef *FFCuContextManager::getCuCtx(string gpuid)
+{
+    auto cached = ctxMap.find(gpuid);
+    if (cached != ctxMap.end() && cached->second != nullptr)
+        return cached->second;
+    // Create the hardware device; only a successful result is stored, so a failure leaves no null placeholder in the map.
+    AVBufferRef *hw_device_ctx = nullptr;
+    if (av_hwdevice_ctx_create(&hw_device_ctx, AV_HWDEVICE_TYPE_CUDA, gpuid.c_str(), nullptr, 0) < 0) {
+        LOG_ERROR("Failed to create specified HW device.");
+        return nullptr;
+    }
+    ctxMap[gpuid] = hw_device_ctx;
+    return hw_device_ctx;
+}
\ No newline at end of file
diff --git a/src/nvdec/FFCuContextManager.h b/src/nvdec/FFCuContextManager.h
new file mode 100644
index 0000000..758167c
--- /dev/null
+++ b/src/nvdec/FFCuContextManager.h
@@ -0,0 +1,28 @@
+
+#include<map>
+#include<string>
+
+using namespace std;
+
+struct AVBufferRef;
+
+class FFCuContextManager{
+public:
+    // Returns the process-wide manager; it is created on first use and never destroyed.
+    static FFCuContextManager* getInstance(){
+		// A function-local static is initialized exactly once, even with concurrent first calls (C++11).
+		static FFCuContextManager* singleton = new FFCuContextManager();
+		return singleton;
+	}
+
+    // Returns the CUDA hardware device context for gpuid, or nullptr when it cannot be created.
+    AVBufferRef *getCuCtx(string gpuid);
+
+private:
+    FFCuContextManager(){}
+	~FFCuContextManager();
+    FFCuContextManager(const FFCuContextManager&) = delete;             // the singleton is not copyable
+    FFCuContextManager& operator=(const FFCuContextManager&) = delete;
+
+    map<string,AVBufferRef *> ctxMap;   // gpuid -> device context reference held by the manager
+};
\ No newline at end of file
diff --git a/src/nvdec/FFNvDecoder.cpp b/src/nvdec/FFNvDecoder.cpp
new file mode 100644
index 0000000..e64e2a5
--- /dev/null
+++ b/src/nvdec/FFNvDecoder.cpp
@@ -0,0 +1,513 @@
+#include "FFNvDecoder.h"
+
+#include <chrono>
+#include <thread>
+#include <fstream>
+
+#include <chrono>
+
+#include "FFCuContextManager.h"
+
+#include "common_header.h"
+
+#include "GpuRgbMemory.hpp"
+#include "cuda_kernels.h"
+
+using namespace std;
+
+// 参考博客: https://blog.csdn.net/qq_40116098/article/details/120704340
+
+static AVPixelFormat get_hw_format(AVCodecContext *avctx, const AVPixelFormat *pix_fmts)
+{
+	// get_format callback: choose the decoder's hardware pixel format from the offered list.
+	// avctx->opaque carries the owning FFNvDecoder (assigned in FFNvDecoder::init).
+	FFNvDecoder* decoder = (FFNvDecoder*)avctx->opaque;
+	// The candidate list is terminated by -1.
+	for (const AVPixelFormat *candidate = pix_fmts; *candidate != -1; ++candidate) {
+		if (decoder->getHwPixFmt() == *candidate)
+			return *candidate;
+	}
+
+	LOG_ERROR("Failed to get HW surface format");
+	return AV_PIX_FMT_NONE;
+}
+
+FFNvDecoder::FFNvDecoder()
+{
+	// FFmpeg handles and stream properties stay empty until init() fills them in.
+	fmt_ctx = nullptr;
+	avctx = nullptr;
+	stream = nullptr;
+	stream_index = -1;
+	hw_pix_fmt = AV_PIX_FMT_NONE;
+	m_dec_name = "";
+	m_fps = 0.0;
+
+	// Worker thread handles; close() treats 0 as "nothing to join".
+	m_decode_thread = 0;
+	m_post_decode_thread = 0;
+
+	// Run-state flags.
+	m_bRunning = false;
+	m_bFinished = false;
+	m_bPause = false;
+	m_bReal = true;
+	m_dec_keyframe = false;
+}
+
+FFNvDecoder::~FFNvDecoder()
+{
+	m_dec_keyframe = false;  // only resets the flag; threads and FFmpeg contexts are not released here
+}
+
+bool FFNvDecoder::init(FFDecConfig& cfg)
+{
+	// Stores the configuration and callbacks, classifies the source, then opens it.
+	m_cfg = cfg;
+	m_dec_name = cfg.dec_name;
+	post_decoded_cbk = cfg.post_decoded_cbk;
+	decode_finished_cbk = cfg.decode_finished_cbk;
+
+	// A uri that can be opened as a local file is a non-real-time source; anything
+	// else is treated as a live stream. The probe must be read-only: a default
+	// fstream also requests write access, so a read-only file failed to open and
+	// was misclassified as a live stream.
+	ifstream infile(cfg.uri);
+	m_bReal = !infile.is_open();
+	infile.close();
+
+	return init(cfg.uri.c_str(), cfg.gpuid.c_str(),cfg.force_tcp);
+}
+
+bool FFNvDecoder::init(const char* uri, const char* gpuid, bool force_tcp)
+{
+	// Opens uri, picks its best video stream and opens the matching "<codec>_cuvid" decoder on gpuid; false on any failure.
+	avformat_network_init();
+
+	// Demuxer options.
+	AVDictionary *options = nullptr;
+	av_dict_set( &options, "bufsize", "655360", 0 );
+	av_dict_set( &options, "rtsp_transport", force_tcp ? "tcp" : "udp", 0 );
+	// av_dict_set( &options, "listen_timeout", "30", 0 ); // unit: seconds
+	av_dict_set( &options, "stimeout", "30000000", 0 ); // unit: microseconds
+
+	fmt_ctx = avformat_alloc_context();
+	const int open_ret = avformat_open_input(&fmt_ctx, uri, nullptr, &options);
+	av_dict_free(&options); // entries the demuxer did not consume are handed back and must be freed here
+	if (open_ret != 0) {
+		LOG_ERROR("Cannot open input file:{}",uri);
+		return false;
+	}
+
+	// Probe the container for stream information.
+	if (avformat_find_stream_info(fmt_ctx, nullptr) < 0) {
+		LOG_ERROR("Cannot find input stream information");
+		return false;
+	}
+
+	// Select the video stream and learn which codec it uses.
+	AVCodec *decoder = nullptr;
+	stream_index = av_find_best_stream(fmt_ctx, AVMEDIA_TYPE_VIDEO, -1, -1, &decoder, 0);
+	if (stream_index < 0) {
+		LOG_ERROR("Cannot find a video stream in the input file");
+		return false;
+	}
+
+	// The hardware decoder is looked up by name: software codec name plus "_cuvid".
+	string cuvid_dec_name = string(decoder->name) + "_cuvid";
+	AVCodec *vcodec = avcodec_find_decoder_by_name(cuvid_dec_name.c_str());
+	if (vcodec == nullptr) {
+		LOG_ERROR("Cannot find decoder:{}", cuvid_dec_name);
+		return false;
+	}
+	// Allocation failure must report false; the former (bool)AVERROR(ENOMEM) evaluated to true.
+	if (!(avctx = avcodec_alloc_context3(vcodec)))
+		return false;
+
+	// Copy the stream's codec parameters into the decoder context.
+	stream = fmt_ctx->streams[stream_index];
+	if (avcodec_parameters_to_context(avctx, stream->codecpar) < 0)
+		return false;
+	m_fps = av_q2d(stream ->avg_frame_rate);
+	// get_hw_format() reads this decoder back from opaque to pick the pixel format.
+	avctx->opaque = this;
+	avctx->get_format = get_hw_format;
+	hw_pix_fmt = AV_PIX_FMT_CUDA;
+	// The CUDA device context comes from the shared per-gpuid cache; the codec holds its own reference.
+	AVBufferRef *hw_device_ctx = FFCuContextManager::getInstance()->getCuCtx(gpuid);
+	if(nullptr == hw_device_ctx){
+		av_log(nullptr, AV_LOG_ERROR, "create CUDA context failed ! \n");
+		return false;
+	}
+	avctx->hw_device_ctx = av_buffer_ref(hw_device_ctx);
+	if (nullptr == avctx->hw_device_ctx)
+		return false;
+
+	// Open the decoder.
+	AVDictionary *op = nullptr;
+	av_dict_set( &op, "gpu", gpuid, 0 );
+	// av_dict_set( &op, "surfaces", "5", 0 );
+	const int codec_ret = avcodec_open2(avctx, vcodec, &op);
+	av_dict_free(&op); // same ownership rule as the demuxer options above
+	if (codec_ret < 0) {
+		LOG_ERROR("Failed to open codec for stream");
+		return false;
+	}
+	return true;
+}
+
+bool FFNvDecoder::isSurport(FFDecConfig& cfg)
+{
+	bool bRet = init(cfg);   // probe: try to open the source and its decoder
+    decode_finished();       // release whatever init() opened; this also sets m_bFinished
+    return bRet;
+}
+
+bool FFNvDecoder::start(){
+	// Spawns the decode thread; returns false and leaves the decoder stopped when that fails (it used to report success).
+	m_bRunning = true;
+	const int err = pthread_create(&m_decode_thread, 0,
+        [](void* arg) -> void* {
+            ((FFNvDecoder*)arg)->decode_thread();
+            return nullptr;
+        }, this);
+	if (err == 0)
+		return true;
+	m_decode_thread = 0;   // the handle is unspecified after a failed create; 0 makes close() skip the join
+	m_bRunning = false;
+	LOG_ERROR("{} - Failed to create decode thread: {}", m_dec_name, err);
+	return false;
+}
+
+void FFNvDecoder::decode_thread()
+{
+	// Demux/decode loop: reads packets, decodes the video stream and queues GPU frames for post_decode_thread.
+	AVPacket* pkt = av_packet_alloc();   // already initialized to defaults; no av_init_packet() needed
+
+	pthread_create(&m_post_decode_thread,0,
+        [](void* arg)
+        {
+            FFNvDecoder* a=(FFNvDecoder*)arg;
+            a->post_decode_thread();
+            return (void*)0;
+        }
+    ,this);
+
+	while (m_bRunning)
+	{
+		// A paused file (non-real-time) source stops reading packets until resume().
+		if (!m_bReal && m_bPause)
+		{
+			std::this_thread::sleep_for(std::chrono::milliseconds(3));
+			continue;
+		}
+
+		int result = av_read_frame(fmt_ctx, pkt);
+		if (result < 0)   // AVERROR_EOF is negative too
+		{
+			LOG_ERROR("Failed to read frame!");
+			break;
+		}
+
+		// Key-frame-only mode: drop every packet that is not flagged as a key frame.
+		if (m_dec_keyframe && !(pkt->flags & AV_PKT_FLAG_KEY)) {
+			av_packet_unref(pkt);
+			continue;
+		}
+
+		if (stream_index == pkt->stream_index){
+			result = avcodec_send_packet(avctx, pkt);
+			if (result < 0){
+				av_packet_unref(pkt);
+				LOG_ERROR("{} - Failed to send pkt: {}", m_dec_name, result);
+				continue;
+			}
+
+			AVFrame* gpuFrame = av_frame_alloc();
+			result = avcodec_receive_frame(avctx, gpuFrame);
+			if (result < 0){
+				// EAGAIN only means the decoder needs more input first, so it is not logged as an error.
+				if (result != AVERROR(EAGAIN)){
+					LOG_ERROR("{} - Failed to receive frame: {}", m_dec_name, result);
+				}
+				av_frame_free(&gpuFrame); 
+				av_packet_unref(pkt);
+				continue;
+			}
+			av_packet_unref(pkt);
+
+			// A paused live source keeps reading and decoding but discards the decoded frame.
+			if (m_bReal && m_bPause){
+				av_frame_free(&gpuFrame); 
+				std::this_thread::sleep_for(std::chrono::milliseconds(3));
+				continue;
+			}
+
+			if(gpuFrame != nullptr){
+				// Bounded hand-off queue: once it holds more than 10 frames the new frame is dropped.
+				std::lock_guard<std::mutex> q(m_queue_mutex);
+				if(mFrameQueue.size() <= 10){
+					mFrameQueue.push(gpuFrame);
+				}else{
+					av_frame_free(&gpuFrame); 
+				}
+			}
+		}
+		av_packet_unref(pkt);
+	}
+
+	m_bRunning = false;
+	av_packet_free(&pkt);
+
+	// The consumer loop watches m_bRunning; wait for it before tearing anything down.
+	if (m_post_decode_thread != 0)
+	{
+		pthread_join(m_post_decode_thread,0);
+	}
+	// The finish callback comes from the configuration and may be unset.
+	if (decode_finished_cbk){
+		decode_finished_cbk(m_finishedDecArg);
+	}
+	decode_finished();
+
+	// Drain the queue under both locks so a concurrent snapshot() never sees a frame being freed.
+	{
+		std::lock_guard<std::mutex> snap(m_snapshot_mutex);
+		std::lock_guard<std::mutex> q(m_queue_mutex);
+		while(!mFrameQueue.empty()){
+			av_frame_free(&mFrameQueue.front());
+			mFrameQueue.pop();
+		}
+	}
+
+	LOG_INFO("{} - decode thread exited.", m_dec_name);
+}
+
+void FFNvDecoder::decode_finished(){
+	// Releases the codec and demuxer contexts (when present) and marks decoding as finished.
+	if (avctx != nullptr)
+	{
+		avcodec_free_context(&avctx);
+	}
+	if (fmt_ctx != nullptr)
+	{
+		avformat_close_input(&fmt_ctx);
+	}
+
+	m_bFinished = true;
+	m_dec_keyframe = false;
+}
+
+void FFNvDecoder::post_decode_thread(){
+	// Consumer loop: pops decoded frames and passes selected ones (see skip_frame), converted to BGR, to post_decoded_cbk.
+	const int skip_frame = m_cfg.skip_frame > 0 ? m_cfg.skip_frame : 1;
+	int index = 0;
+	while (m_bRunning)
+	{
+		// Check the queue under its mutex; the decode thread pushes concurrently.
+		m_queue_mutex.lock();
+		const bool has_frame = !mFrameQueue.empty();
+		m_queue_mutex.unlock();
+		if (!has_frame){
+			std::this_thread::sleep_for(std::chrono::milliseconds(1));   // do not spin on an empty queue
+			continue;
+		}
+		std::lock_guard<std::mutex> l(m_snapshot_mutex);   // snapshot() peeks at the head frame under this lock
+		m_queue_mutex.lock();
+		AVFrame * gpuFrame = mFrameQueue.front();
+		mFrameQueue.pop();
+		m_queue_mutex.unlock();
+		// Frame skipping: deliver only when the counter reaches a multiple of skip_frame.
+		if (skip_frame == 1 || index % skip_frame == 0){
+			if (post_decoded_cbk) post_decoded_cbk(m_postDecArg, convert2bgr(gpuFrame));
+			index = 0;
+		}
+		av_frame_free(&gpuFrame); 
+		index++;
+	}
+
+	LOG_INFO("post decode thread exited.");
+}
+
+void FFNvDecoder::close(){
+	m_bRunning=false;   // both worker loops poll this flag
+	if(m_decode_thread != 0)
+		pthread_join(m_decode_thread,0);
+	m_decode_thread = 0;   // joining an already-joined thread is undefined; this makes close() repeatable
+	m_dec_keyframe = false;
+}
+
+AVPixelFormat FFNvDecoder::getHwPixFmt(){
+	return hw_pix_fmt;  // AV_PIX_FMT_NONE until init() selects AV_PIX_FMT_CUDA
+}
+
+bool FFNvDecoder::isRunning(){
+	return m_bRunning;  // set by start(); cleared by close() and when the decode loop ends
+}
+
+bool FFNvDecoder::isFinished(){
+	return m_bFinished;  // set by decode_finished() after the FFmpeg contexts are released
+}
+
+bool FFNvDecoder::isPausing(){
+	return m_bPause;  // toggled by pause() / resume()
+}
+
+bool FFNvDecoder::getResolution( int &width, int &height ){
+	// Reports the codec context's frame size; the outputs are left untouched when no context exists.
+	if (avctx == nullptr)
+	{
+		return false;
+	}
+	width = avctx->width;
+	height = avctx->height;
+	return true;
+}
+
+void FFNvDecoder::pause(){
+	m_bPause = true;  // decode_thread checks this flag on every iteration
+}
+
+void FFNvDecoder::resume(){
+	m_bPause = false;  // lets decode_thread read (file source) or queue (live source) frames again
+}
+
+void FFNvDecoder::setDecKeyframe(bool bKeyframe)
+{
+	m_dec_keyframe = bKeyframe;  // when true, decode_thread drops packets not flagged as key frames
+}
+
+int FFNvDecoder::getCachedQueueLength(){
+	// Number of decoded frames currently waiting in the queue. The guard unlocks on return;
+	// the old code locked the mutex a second time instead of unlocking it and deadlocked.
+	std::lock_guard<std::mutex> guard(m_queue_mutex);
+	return (int)mFrameQueue.size();
+}
+
+float FFNvDecoder::fps(){
+	return m_fps;  // avg_frame_rate of the selected video stream, captured in init(); 0 before that
+}
+
+void FFNvDecoder::setPostDecArg(const void* postDecArg){
+	m_postDecArg = postDecArg;  // opaque pointer passed back as the first argument of post_decoded_cbk
+}
+
+void FFNvDecoder::setFinishedDecArg(const void* finishedDecArg){
+	m_finishedDecArg = finishedDecArg;  // opaque pointer passed back to decode_finished_cbk
+}
+
+DeviceRgbMemory* FFNvDecoder::convert2bgr(AVFrame * gpuFrame){
+	// Converts a decoded CUDA frame into a newly allocated device BGR buffer.
+	// Returns nullptr on failure; on success the caller owns the returned buffer.
+	if (gpuFrame == nullptr || gpuFrame->format != AV_PIX_FMT_CUDA ){
+		return nullptr;
+	}
+
+	LOG_DEBUG("decode task: gpuid: {}  width: {} height: {}", m_cfg.gpuid, gpuFrame->width, gpuFrame->height);
+	GpuRgbMemory* gpuMem = new GpuRgbMemory(3, gpuFrame->width, gpuFrame->height, getName(), m_cfg.gpuid, false, true);
+	if (gpuMem->getMem() == nullptr){
+		LOG_ERROR("new GpuRgbMemory failed !!!");
+		delete gpuMem;
+		return nullptr;
+	}
+
+	cudaSetDevice(atoi(m_cfg.gpuid.c_str()));
+	cuda_common::setColorSpace( ITU_709, 0 );
+	cudaError_t cudaStatus = cuda_common::CUDAToBGR((CUdeviceptr)gpuFrame->data[0],(CUdeviceptr)gpuFrame->data[1], gpuFrame->linesize[0], gpuFrame->linesize[1], gpuMem->getMem(), gpuFrame->width, gpuFrame->height);
+	// Errors raised while the conversion executes are reported by the synchronize call,
+	// so its result is checked as well instead of being discarded.
+	cudaError_t syncStatus = cudaDeviceSynchronize();
+	if (cudaStatus != cudaSuccess || syncStatus != cudaSuccess) {
+		LOG_ERROR("CUDAToBGR failed failed !!!");
+		delete gpuMem;
+		return nullptr;
+	}
+
+	return gpuMem;
+}
+
+FFImgInfo* FFNvDecoder::snapshot(){
+	// Converts the frame at the head of the decode queue (without removing it) to a host-side image.
+	// Returns nullptr when no frame shows up after one 100 ms wait, or when any step fails.
+	// On success the caller owns the FFImgInfo and its pData buffer (allocated with av_malloc).
+
+	// Hold the snapshot lock so post_decode_thread cannot pop and free the frame meanwhile.
+	std::lock_guard<std::mutex> l(m_snapshot_mutex);
+
+	AVFrame * gpuFrame = nullptr;
+	for (int attempt = 0; attempt < 2 && gpuFrame == nullptr; ++attempt){
+		if (attempt == 1){
+			// The first look found the queue empty: wait once for the decoder, then look again.
+			std::this_thread::sleep_for(std::chrono::milliseconds(100));
+		}
+		std::lock_guard<std::mutex> q(m_queue_mutex);
+		if (!mFrameQueue.empty()){
+			gpuFrame = mFrameQueue.front();
+		}
+	}
+
+	if (gpuFrame == nullptr || gpuFrame->format != AV_PIX_FMT_CUDA ){
+		return nullptr;
+	}
+
+	LOG_DEBUG("decode task: gpuid: {}  width: {} height: {}", m_cfg.gpuid, gpuFrame->width, gpuFrame->height);
+	GpuRgbMemory* gpuMem = new GpuRgbMemory(3, gpuFrame->width, gpuFrame->height, getName(), m_cfg.gpuid, false, true);
+	if (gpuMem->getMem() == nullptr){
+		LOG_ERROR("new GpuRgbMemory failed !!!");
+		delete gpuMem;   // this early return used to leak the wrapper object
+		return nullptr;
+	}
+
+	// Convert the decoded frame to BGR in the device buffer.
+	cudaSetDevice(atoi(m_cfg.gpuid.c_str()));
+	cuda_common::setColorSpace( ITU_709, 0 );
+	cudaError_t cudaStatus = cuda_common::CUDAToBGR((CUdeviceptr)gpuFrame->data[0],(CUdeviceptr)gpuFrame->data[1], gpuFrame->linesize[0], gpuFrame->linesize[1], gpuMem->getMem(), gpuFrame->width, gpuFrame->height);
+	cudaDeviceSynchronize();
+	if (cudaStatus != cudaSuccess) {
+		LOG_ERROR("CUDAToBGR failed failed !!!");
+		delete gpuMem;   // this early return used to leak the device buffer
+		return nullptr;
+	}
+
+	unsigned char * pHwRgb = gpuMem->getMem();
+	int channel = gpuMem->getChannel();
+	int width = gpuMem->getWidth();
+	int height = gpuMem->getHeight();
+	if (pHwRgb == nullptr || channel <= 0 || width <= 0 || height <= 0){
+		delete gpuMem;
+		return nullptr;
+	}
+
+	int nSize = channel * height * width;
+	LOG_INFO("channel:{} height:{} width:{}", channel, height, width);
+
+	unsigned char* cpu_data = (unsigned char *)av_malloc(nSize * sizeof(unsigned char));
+	if (cpu_data == nullptr){
+		LOG_ERROR("snapshot: av_malloc failed, size {}", nSize);
+		delete gpuMem;
+		return nullptr;
+	}
+
+	// Copy the converted image to the host; the device buffer is no longer needed afterwards.
+	cudaError_t copyStatus = cudaMemcpy(cpu_data, pHwRgb, nSize * sizeof(unsigned char), cudaMemcpyDeviceToHost);
+	cudaDeviceSynchronize();
+	delete gpuMem;
+	gpuMem = nullptr;
+	if (copyStatus != cudaSuccess){
+		LOG_ERROR("snapshot: cudaMemcpy failed: {}", cudaGetErrorString(copyStatus));
+		av_free(cpu_data);
+		return nullptr;
+	}
+
+	FFImgInfo* imgInfo = new FFImgInfo();
+	imgInfo->dec_name = m_dec_name;
+	imgInfo->pData = cpu_data;
+	imgInfo->height = height;
+	imgInfo->width = width;
+	imgInfo->timestamp = UtilTools::get_cur_time_ms();
+	imgInfo->index = m_index;
+
+	m_index++;
+
+	return imgInfo;
+}
\ No newline at end of file
diff --git a/src/nvdec/FFNvDecoder.h b/src/nvdec/FFNvDecoder.h
new file mode 100644
index 0000000..4784ab6
--- /dev/null
+++ b/src/nvdec/FFNvDecoder.h
@@ -0,0 +1,107 @@
+#include<string>
+#include <pthread.h>
+
+#include <mutex>
+
+extern "C"
+{
+	#include <libavcodec/avcodec.h> 
+	#include <libavdevice/avdevice.h> 
+	#include <libavformat/avformat.h> 
+	#include <libavfilter/avfilter.h> 
+	#include <libavutil/avutil.h> 
+    #include <libavutil/pixdesc.h> 
+	#include <libswscale/swscale.h>
+    #include <libavutil/imgutils.h>
+}
+
+#include "common_header.h"
+
+#include "../interface/AbstractDecoder.h"
+
+using namespace std;
+
+class FFNvDecoder : public AbstractDecoder {   // FFmpeg "*_cuvid" hardware decoder driven by a decode thread and a post-decode thread
+public:
+    FFNvDecoder();
+    ~FFNvDecoder();
+    bool init(FFDecConfig& cfg);   // stores cfg, then opens the source and its decoder
+    void close();                  // stops decoding and joins the decode thread
+    bool start();                  // spawns the decode thread
+    void pause();
+    void resume();
+
+    void setDecKeyframe(bool bKeyframe);   // true: only key-frame packets are decoded
+
+    bool isRunning();
+    bool isFinished();
+    bool isPausing();
+    bool getResolution( int &width, int &height );   // false while no codec context exists
+
+    bool isSurport(FFDecConfig& cfg);   // probes cfg with init(), then releases the contexts again
+
+    int getCachedQueueLength();   // decoded frames waiting for the post-decode thread
+
+    float fps();
+
+    DECODER_TYPE getDecoderType(){ return DECODER_TYPE_FFMPEG; }
+
+    FFImgInfo* snapshot();   // host copy of the frame at the head of the queue, or nullptr
+
+    void setName(string nm){
+        m_dec_name = nm;
+    }
+
+    string getName(){
+        return m_dec_name;
+    }
+
+    void setPostDecArg(const void* postDecArg);
+    void setFinishedDecArg(const void* finishedDecArg);
+
+public:
+    AVPixelFormat getHwPixFmt();   // used by the get_format callback
+
+private:
+    void decode_thread();        // producer: demux + decode, pushes frames into mFrameQueue
+    void post_decode_thread();   // consumer: pops frames and invokes post_decoded_cbk
+    bool init(const char* uri, const char* gpuid, bool force_tcp);
+    void decode_finished();      // frees avctx / fmt_ctx and sets m_bFinished
+
+    DeviceRgbMemory* convert2bgr(AVFrame * gpuFrame);
+
+private:
+    string m_dec_name;
+    FFDecConfig m_cfg;
+
+    AVStream* stream;
+    AVCodecContext *avctx;
+    int stream_index;
+    AVFormatContext *fmt_ctx;
+    AVPixelFormat hw_pix_fmt;
+
+    pthread_t m_decode_thread;
+    pthread_t m_post_decode_thread;
+    
+    // NOTE(review): the flags below are plain bools that are read and written from several threads.
+    bool m_bRunning;
+    bool m_bFinished;
+    bool m_bPause;
+
+    bool m_bReal; // true when the source is a real-time stream rather than a local file
+
+    float m_fps;
+
+    queue<AVFrame*> mFrameQueue;   // decoded frames; access is guarded by m_queue_mutex
+    mutex m_queue_mutex;
+    mutex m_snapshot_mutex;        // serializes snapshot() with the consumer's pop-and-free
+    long m_index{0};
+
+    bool m_dec_keyframe;
+
+    // The constructor does not assign the four members below, so they start out null here.
+    const void * m_postDecArg{nullptr};
+    POST_DECODE_CALLBACK post_decoded_cbk{nullptr};  // callback that receives each decoded frame
+    const void * m_finishedDecArg{nullptr};
+    DECODE_FINISHED_CALLBACK decode_finished_cbk{nullptr};
+};
\ No newline at end of file
diff --git a/src/nvdec/GpuRgbMemory.hpp b/src/nvdec/GpuRgbMemory.hpp
new file mode 100644
index 0000000..35eac65
--- /dev/null
+++ b/src/nvdec/GpuRgbMemory.hpp
@@ -0,0 +1,34 @@
+#include<string>
+
+#include "../interface/DeviceRgbMemory.hpp"
+#include "cuda_kernels.h"
+#include "define.hpp"
+#include "common_header.h"
+
+using namespace std;
+
+class GpuRgbMemory : public DeviceRgbMemory{
+// RGB image buffer in CUDA device memory: allocated on construction, freed on destruction.
+public:
+     GpuRgbMemory(int _channel, int _width, int _height, string _id, string _gpuid, bool _key_frame, bool _isused)
+     :DeviceRgbMemory(_channel, _width, _height, _id, _gpuid, _key_frame, _isused), gpuid(_gpuid){
+        // Select the target device, then allocate data_size bytes on it.
+        cudaSetDevice(atoi(gpuid.c_str()));
+        CHECK_CUDA(cudaMalloc((void **)&pHwRgb, data_size * sizeof(unsigned char)));
+    }
+
+    ~GpuRgbMemory(){
+        if (pHwRgb == nullptr) return;   // nothing was allocated
+        // Free on the same device the buffer was allocated on.
+        cudaSetDevice(atoi(gpuid.c_str()));
+        CHECK_CUDA(cudaFree(pHwRgb));
+        pHwRgb = nullptr;
+    }
+
+    string getGpuId() {
+        return gpuid;
+    }
+
+private:
+    string gpuid;
+};
\ No newline at end of file
diff --git a/src/nvdec/ImageSaveGPU.cpp b/src/nvdec/ImageSaveGPU.cpp
new file mode 100644
index 0000000..dde9b64
--- /dev/null
+++ b/src/nvdec/ImageSaveGPU.cpp
@@ -0,0 +1,123 @@
+#include "cuda_kernels.h"
+
+#include "common_header.h"
+
+
+//int saveJPEG(const char *szOutputFile, float* d_srcRGB, int img_width, int img_height)
+//{
+//	return jpegNPP(szOutputFile, d_srcRGB, img_width, img_height);
+//	//return 0;
+//}
+//
+//int saveJPEG(const char *szOutputFile, unsigned char* d_srcRGB, int img_width, int img_height)
+//{
+//	return jpegNPP(szOutputFile, d_srcRGB, img_width, img_height);
+//	//return 0;
+//}
+//
+//int saveJPEG(const char *szOutputFile, unsigned char* d_srcRGB)
+//{
+//	return jpegNPP(szOutputFile, d_srcRGB);
+//}
+//
+//int saveJPEG(const char *szOutputFile, float* d_srcRGB)
+//{
+//	return jpegNPP(szOutputFile, d_srcRGB);
+//}
+
+int resizeFrame(float* d_srcRGB, int src_width, int src_height, float* d_dstRGB, int dst_width, int dst_height)
+{
+	// Forwards to cuda_common::ResizeImage; returns 0 on cudaSuccess, otherwise logs the error and returns -1.
+	const cudaError_t status = cuda_common::ResizeImage(d_srcRGB, src_width, src_height, d_dstRGB, dst_width, dst_height);
+	if (status == cudaSuccess) {
+		return 0;
+	}
+	LOG_ERROR("cuda_common::ResizeImage failed: {}",cudaGetErrorString(status));
+	return -1;
+}
+
+//int initTables()
+//{
+//	initTable();
+//	return 0;
+//}
+//
+//int initTables(int flag, int width, int height)
+//{
+//	initTable(0, width, height);
+//	return 0;
+//}
+
+int drawImageOnGPU(float* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom)
+{
+	cuda_common::DrawImage(d_srcRGB, src_width, src_height, left, top, right, bottom);  // NOTE(review): any status returned by DrawImage is discarded
+	return 0;  // always reports success
+}
+
+int drawImageOnGPU(unsigned char* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom)
+{
+	cuda_common::DrawImage(d_srcRGB, src_width, src_height, left, top, right, bottom);  // NOTE(review): any status returned by DrawImage is discarded
+	return 0;  // always reports success
+}
+
+int drawLineOnGPU(float* d_srcRGB, int src_width, int src_height, int begin_x, int begin_y, int end_x, int end_y)
+{
+	// Same 0 / -1 convention as resizeFrame and partMemCopy; the CUDA status used to be dropped and 0 returned always.
+	return cuda_common::DrawLine(d_srcRGB, src_width, src_height, begin_x, begin_y, end_x, end_y) == cudaSuccess ? 0 : -1;
+}
+
+//int releaseJpegSaver()
+//{
+//	releaseJpegNPP();
+//	return 0;
+//}
+
+int partMemCopy(unsigned char* d_srcRGB, int src_width, int src_height, unsigned char* d_dstRGB, int left, int top, int right, int bottom)
+{
+	cudaError_t cudaStatus = cuda_common::PartMemCopy(d_srcRGB, src_width, src_height, d_dstRGB, left, top, right, bottom);
+	if (cudaStatus != cudaSuccess) {
+		// Log the destination as an address: an unsigned char* argument may be formatted as a C string, i.e. read through a device pointer on the host.
+		LOG_ERROR("cuda_common::77 PartMemCopy failed: {} {} {} {} {} {} {}",cudaGetErrorString(cudaStatus), left, top, right, bottom, src_height, (const void*)d_dstRGB);
+		return -1;
+	}
+	return 0;
+}
+//#include <fstream>
+//extern std::ofstream g_os;
+int PartMemResizeBatch(unsigned char * d_srcRGB, int src_width, int src_height, unsigned char** d_dstRGB,
+	int count, int* vleft, int * vtop, int* vright, int* vbottom, int *dst_w, int *dst_h,
+	float submeanb, float submeang, float submeanr,
+	float varianceb, float varianceg, float variancer)
+{
+	// Forwards every argument unchanged to cuda_common::PartMemResizeBatch.
+	// Returns 0 on cudaSuccess; otherwise logs the CUDA error string and returns -1.
+	const cudaError_t status = cuda_common::PartMemResizeBatch(
+		d_srcRGB, src_width, src_height, d_dstRGB, count, vleft, vtop, vright, vbottom, dst_w, dst_h,
+		submeanb, submeang, submeanr,
+		varianceb, varianceg, variancer);
+	if (status == cudaSuccess) {
+		return 0;
+	}
+
+	LOG_ERROR("cuda_common::PartMemResizeBatch failed: {}",cudaGetErrorString(status));
+	return -1;
+}
+
+
+//int PartMemResizeBatch(float * d_srcRGB, int src_width, int src_height, unsigned char* d_dstRGB, 
+//	int count, int* vleft, int * vtop, int* vright, int* vbottom, int dst_w, int dst_h,
+//	float submeanb, float submeang, float submeanr,
+//	float varianceb, float varianceg, float variancer)
+//
+//{
+//	cudaError_t cudaStatus = cuda_common::PartMemResizeBatch(
+//		d_srcRGB, src_width, src_height, d_dstRGB, count, vleft, vtop, vright, vbottom, dst_w, dst_h,
+//		submeanb, submeang, submeanr,
+//		varianceb, varianceg, variancer);
+//	if (cudaStatus != cudaSuccess) {
+//		fprintf(stderr, "cuda_common::PartMemCopy failed: %s\n", cudaGetErrorString(cudaStatus));
+//		return -1;
+//	}
+//
+//	return 0;
+//}
\ No newline at end of file
diff --git a/src/nvdec/ImageSaveGPU.h b/src/nvdec/ImageSaveGPU.h
new file mode 100644
index 0000000..272a6d2
--- /dev/null
+++ b/src/nvdec/ImageSaveGPU.h
@@ -0,0 +1,65 @@
+/*******************************************************************************************
+* Version: VPT_x64_V2.0.0_20170904
+* CopyRight: 中科院自动化研究所模式识别实验室图像视频组
+* UpdateDate: 20170904
+* Content: 人车物监测跟踪
+********************************************************************************************/
+
+#ifndef IMAGESAVEGPU_H_
+#define IMAGESAVEGPU_H_
+
+#ifdef _MSC_VER
+	#ifdef IMAGESAVEGPU_EXPORTS
+		#define IMAGESAVEGPU_API __declspec(dllexport)
+	#else
+		#define IMAGESAVEGPU_API __declspec(dllimport)
+	#endif
+#else
+#define IMAGESAVEGPU_API __attribute__((visibility ("default")))
+#endif
+// 功能:保存成jpeg文件
+// szOutputFile		输出图片路径,如D:\\out.jpg
+// d_srcRGB			输入RGB数据,由cudaMalloc分配的显存空间,数据排列形式为:BBBBBB......GGGGGG......RRRRRRRR......
+// img_width		RGB数据图片的宽度
+// img_height		RGB数据图片的高度
+//
+//IMAGESAVEGPU_API int saveJPEG(const char *szOutputFile, float* d_srcRGB, int img_width, int img_height);
+//IMAGESAVEGPU_API int saveJPEG(const char *szOutputFile, float* d_srcRGB);
+//
+//IMAGESAVEGPU_API int saveJPEG(const char *szOutputFile, unsigned char* d_srcRGB, int img_width, int img_height);
+//IMAGESAVEGPU_API int saveJPEG(const char *szOutputFile, unsigned char* d_srcRGB);
+
+// Function: scale (resize) an image
+IMAGESAVEGPU_API int resizeFrame(float* d_srcRGB, int src_width, int src_height, float* d_dstRGB, int dst_width, int dst_height);
+
+// 功能:部分拷贝数据
+IMAGESAVEGPU_API int partMemCopy(unsigned char* d_srcRGB, int src_width, int src_height, unsigned char* d_dstRGB, int left, int top, int right, int bottom);
+
+//IMAGESAVEGPU_API int partMemResizeImage(float * d_srcRGB, int src_width, int src_height, unsigned char** d_dstRGB,
+//	int* vleft, int * vtop, int* vright, int* vbottom, int *dst_w, int *dst_h,
+//	float submeanb, float submeang, float submeanr,
+//	float varianceb, float varianceg, float variancer);
+
+
+IMAGESAVEGPU_API int PartMemResizeBatch(unsigned char * d_srcRGB, int src_width, int src_height, unsigned char** d_dstRGB,
+	int count, int* vleft, int * vtop, int* vright, int* vbottom, int *dst_w, int *dst_h,
+	float submeanb, float submeang, float submeanr,
+	float varianceb, float varianceg, float variancer);
+
+
+//// 功能:初始化GPU保存图像的各种量化表
+//IMAGESAVEGPU_API int initTables();
+//IMAGESAVEGPU_API int initTables(int falg, int width, int height);
+//
+//// 功能:释放资源
+//IMAGESAVEGPU_API int releaseJpegSaver();
+
+// 功能:在GPU中绘制快照包围框
+IMAGESAVEGPU_API int drawImageOnGPU(float* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom);
+
+IMAGESAVEGPU_API int drawImageOnGPU(unsigned char* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom);
+
+// 功能:在GPU中绘制直线
+IMAGESAVEGPU_API int drawLineOnGPU(float* d_srcRGB, int src_width, int src_height, int begin_x, int begin_y, int end_x, int end_y);
+
+#endif
diff --git a/src/nvdec/Makefile b/src/nvdec/Makefile
new file mode 100644
index 0000000..1b49ca4
--- /dev/null
+++ b/src/nvdec/Makefile
@@ -0,0 +1,101 @@
+# 各项目录
+LIB_DIR:=$(BUILD_DIR)/$(MODULE)/lib
+DEP_DIR:=$(BUILD_DIR)/$(MODULE)/.dep
+OBJ_DIR:=$(BUILD_DIR)/$(MODULE)/obj
+SRC_DIR:=$(TOP_DIR)/$(MODULE)
+
+# Source files with their intermediate object files and dependency files (a stray ",a" used to add a bogus $(DEP_DIR)/a entry to DEPS)
+SRCS:=$(notdir $(wildcard $(SRC_DIR)/*.cpp))
+OBJS:=$(addprefix $(OBJ_DIR)/, $(patsubst %.cpp, %.o, $(SRCS)))
+DEPS:=$(addprefix $(DEP_DIR)/, $(patsubst %.cpp, %.d, $(SRCS)))
+
+CUDA_ROOT = /usr/local/cuda-11.1
+NVCC = $(CUDA_ROOT)/bin/nvcc
+
+# 自动生成头文件依赖选项
+DEPFLAGS=-MT $@ -MMD -MP -MF $(DEP_DIR)/$*.d
+
+DEFS = -DENABLE_DVPP_INTERFACE
+
+# 最终目标文件
+TARGET:=$(LIB_DIR)/$(MODULE).a
+
+
+PROJECT_ROOT= /mnt/data/cmhu/FFNvDecoder
+
+DEPEND_DIR = $(PROJECT_ROOT)/bin
+THIRDPARTY_ROOT = $(PROJECT_ROOT)/3rdparty
+SPDLOG_ROOT = $(THIRDPARTY_ROOT)/spdlog-1.9.2/release
+JRTP_ROOT = $(THIRDPARTY_ROOT)/jrtp_export
+
+
+INCLUDE= -I $(DEPEND_DIR)/include \
+  -I $(CUDA_ROOT)/include \
+  -I $(TOP_DIR)/common/inc \
+  -I $(TOP_DIR)/common/UtilNPP \
+  -I $(TOP_DIR)/ \
+  -I $(SPDLOG_ROOT)/include \
+  -I $(JRTP_ROOT)/jrtplib/include/jrtplib3 \
+  -I $(JRTP_ROOT)/jthread/include/jthread
+
+LIBSPATH= -L $(DEPEND_DIR)/lib -lavformat -lavcodec -lswscale -lavutil -lavfilter -lswresample -lavdevice \
+   -L $(CUDA_ROOT)/lib64 -lcuda -lcudart -lnvcuvid -lcurand -lcublas -lnvjpeg \
+   -L $(SPDLOG_ROOT) -l:libspdlog.a \
+   -L $(JRTP_ROOT)/jthread/lib -l:libjthread.a \
+   -L $(JRTP_ROOT)/jrtplib/lib -l:libjrtp.a
+
+
+CXXFLAGS= -g -O0 -fPIC $(INCLUDE) $(DEFS) -lpthread -lrt -lz -fexceptions -std=c++11 -fvisibility=hidden -Wl,-Bsymbolic -ldl -Wwrite-strings
+	# -DUNICODE -D_UNICODE
+
+NFLAGS_LIB=-g -c -shared -Xcompiler -fPIC -Xcompiler -fvisibility=hidden
+NFLAGS = $(NFLAGS_LIB) $(INCLUDE) -std=c++11
+
+CU_SOURCES:=$(notdir $(wildcard $(SRC_DIR)/*.cu))
+CU_OBJS:=$(addprefix $(OBJ_DIR)/, $(patsubst %.cu, %.o, $(CU_SOURCES)))
+
+
+# 默认最终目标
+.PHONY:all
+all:$(TARGET)
+
+# 生成最终目标
+$(TARGET):$(OBJS) $(CU_OBJS) | $(LIB_DIR)
+	@echo -e "\e[32m""Linking static library $(TARGET)""\e[0m"
+	@echo -e "ar -rc $@ $^"
+	@ar -rc $@ $^
+
+# 若没有lib目录则自动生成
+$(LIB_DIR):
+	@mkdir -p $@
+
+# 生成中间目标文件
+$(OBJ_DIR)/%.o:$(SRC_DIR)/%.cpp $(DEP_DIR)/%.d | $(OBJ_DIR) $(DEP_DIR)
+	@echo -e "\e[33m""Building object $@""\e[0m"
+	@echo "$(CXX) -c $(DEPFLAGS) $(CXXFLAGS) $(INCS) $(LIBSPATH) $(MACROS) -o $@ $<"
+	@$(CXX) -c $(DEPFLAGS) $(CXXFLAGS) $(INCS) $(LIBSPATH) $(MACROS) -o $@ $<
+
+$(OBJ_DIR)/%.o:$(SRC_DIR)/%.cu | $(OBJ_DIR) # CUDA objects; the obj directory must exist before nvcc writes into it
+	@echo -e "\e[33m""Building object $@""\e[0m"
+	@echo "$(NVCC) $(NFLAGS) -o $@ $<"
+	$(NVCC) $(NFLAGS) -o $@ $< 
+
+
+# 若没有obj目录则自动生成
+$(OBJ_DIR):
+	@mkdir -p $@
+
+# 若没有.dep目录则自动生成
+$(DEP_DIR):
+	@mkdir -p $@
+
+# 依赖文件会在生成中间文件的时候自动生成,这里只是为了防止报错
+$(DEPS):
+
+# 引入中间目标文件头文件依赖关系
+include $(wildcard $(DEPS))
+
+# 直接删除组件build目录
+.PHONY:clean
+clean:
+	@rm -rf $(BUILD_DIR)/$(MODULE)
diff --git a/src/nvdec/NV12ToRGB.cu b/src/nvdec/NV12ToRGB.cu
new file mode 100644
index 0000000..68e54ac
--- /dev/null
+++ b/src/nvdec/NV12ToRGB.cu
@@ -0,0 +1,345 @@
+
+#include "cuda_kernels.h"
+
+#include <builtin_types.h>
+#include "helper_cuda_drvapi.h"
+
+typedef unsigned char   uint8;
+typedef unsigned int    uint32;
+typedef int             int32;
+
+#define COLOR_COMPONENT_MASK            0x3FF
+#define COLOR_COMPONENT_BIT_SIZE        10
+
+namespace cuda_common
+{
+
+#define MUL(x,y)    ((x)*(y))
+
+	__constant__ float  constHueColorSpaceMat2[9];  // 3x3 YUV->RGB matrix: written by setColorSpace(), read by YUV2RGB2(). Translated original note: it is allocated on GPU 0 by default and no way to place it on a chosen GPU has been found yet.
+
+	__device__ void YUV2RGB2(uint32 *yuvi, float *red, float *green, float *blue)
+	{
+		float luma, chromaCb, chromaCr;
+		// yuvi[0..2] = luma, Cb, Cr of one pixel; the kernels in this file pass them multiplied by 4.
+		// Prepare for hue adjustment
+		luma = (float)yuvi[0];
+		chromaCb = (float)((int32)yuvi[1] - 512.0f);
+		chromaCr = (float)((int32)yuvi[2] - 512.0f);
+		// 512 is subtracted from both chroma values before the matrix is applied.
+		// Matrix entries 0-2, 3-5 and 6-8 of constHueColorSpaceMat2 produce red, green and blue.
+		// Convert YUV To RGB with hue adjustment
+		*red = MUL(luma, constHueColorSpaceMat2[0]) +
+			MUL(chromaCb, constHueColorSpaceMat2[1]) +
+			MUL(chromaCr, constHueColorSpaceMat2[2]);
+		*green = MUL(luma, constHueColorSpaceMat2[3]) +
+			MUL(chromaCb, constHueColorSpaceMat2[4]) +
+			MUL(chromaCr, constHueColorSpaceMat2[5]);
+		*blue = MUL(luma, constHueColorSpaceMat2[6]) +
+			MUL(chromaCb, constHueColorSpaceMat2[7]) +
+			MUL(chromaCr, constHueColorSpaceMat2[8]);
+		// The outputs are not clamped or rescaled here; the calling kernels do that.
+	}
+
+	// Clamps x to [min_val, max_val]; the upper bound is tested first. The result
+	// is narrowed to unsigned char by the conversion of the return statement.
+	__device__ unsigned char clip_v(int x, int min_val, int  max_val) {
+		int clamped = x;
+		if (x > max_val) {
+			clamped = max_val;
+		} else if (x < min_val) {
+			clamped = min_val;
+		}
+		return clamped;
+	}
+	// CUDA kernel for outputing the final RGB output from NV12;
+	// Converts a contiguous NV12 frame to packed 8-bit B,G,R.
+	//
+	//   srcImage     - frame bytes: `height` luma rows, immediately followed by
+	//                  the interleaved Cb/Cr rows; rows are nSourcePitch apart.
+	//   nSourcePitch - source row pitch in bytes (used as a 32-bit value).
+	//   dstImage     - output, 3 bytes per pixel (B, G, R), rows of width * 3 bytes.
+	//   width/height - frame size in pixels.
+	//
+	// Launch layout: every thread converts the pixel pair (x, x + 1) of one row,
+	// so the grid must cover ceil(width / 2) threads in x and height threads in y.
+	// The conversion matrix is read from constHueColorSpaceMat2 via YUV2RGB2.
+	extern "C"
+		__global__ void NV12ToRGB_drvapi2(uint32 *srcImage, size_t nSourcePitch, unsigned char *dstImage, int width, int height)
+	{
+		int32 x, y;
+		uint32 yuv101010Pel[2];
+		uint32 processingPitch = nSourcePitch;
+		uint8 *srcImageU8 = (uint8 *)srcImage;
+
+		// We multiply by 2 because we process 2 pixels per thread, so x is always even.
+		x = blockIdx.x * (blockDim.x << 1) + (threadIdx.x << 1);
+		y = blockIdx.y *  blockDim.y + threadIdx.y;
+
+		// Threads outside the image have nothing to do.
+		if (x >= width)
+		{
+			return;
+		}
+
+		if (y >= height)
+		{
+			return;
+		}
+
+		// With an odd width the last thread of a row owns a single pixel. Its
+		// second pixel must not be touched: the store would land in the next
+		// row, or past the end of dstImage on the last row.
+		const bool hasSecondPixel = (x + 1) < width;
+
+		// Read 2 Luma components at a time, so we don't waste processing since CbCr are decimated this way.
+		// if we move to texture we could read 4 luminance values
+		yuv101010Pel[0] = (srcImageU8[y * processingPitch + x]) << 2;
+		yuv101010Pel[1] = (srcImageU8[y * processingPitch + x + (hasSecondPixel ? 1 : 0)]) << 2;
+
+		uint32 chromaOffset = processingPitch * height;
+		int32 y_chroma = y >> 1;
+
+		if (y & 1)  // odd scanline ?
+		{
+			uint32 chromaCb;
+			uint32 chromaCr;
+
+			chromaCb = srcImageU8[chromaOffset + y_chroma * processingPitch + x];
+			chromaCr = srcImageU8[chromaOffset + y_chroma * processingPitch + x + 1];
+
+			if (y_chroma < ((height >> 1) - 1)) // interpolate chroma vertically
+			{
+				chromaCb = (chromaCb + srcImageU8[chromaOffset + (y_chroma + 1) * processingPitch + x] + 1) >> 1;
+				chromaCr = (chromaCr + srcImageU8[chromaOffset + (y_chroma + 1) * processingPitch + x + 1] + 1) >> 1;
+			}
+
+			yuv101010Pel[0] |= (chromaCb << (COLOR_COMPONENT_BIT_SIZE + 2));
+			yuv101010Pel[0] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
+
+			yuv101010Pel[1] |= (chromaCb << (COLOR_COMPONENT_BIT_SIZE + 2));
+			yuv101010Pel[1] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
+		}
+		else
+		{
+			yuv101010Pel[0] |= ((uint32)srcImageU8[chromaOffset + y_chroma * processingPitch + x] << (COLOR_COMPONENT_BIT_SIZE + 2));
+			yuv101010Pel[0] |= ((uint32)srcImageU8[chromaOffset + y_chroma * processingPitch + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
+
+			yuv101010Pel[1] |= ((uint32)srcImageU8[chromaOffset + y_chroma * processingPitch + x] << (COLOR_COMPONENT_BIT_SIZE + 2));
+			yuv101010Pel[1] |= ((uint32)srcImageU8[chromaOffset + y_chroma * processingPitch + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
+		}
+
+		// this steps performs the color conversion
+		uint32 yuvi[6];
+		float red[2], green[2], blue[2];
+
+		yuvi[0] = (yuv101010Pel[0] & COLOR_COMPONENT_MASK);
+		yuvi[1] = ((yuv101010Pel[0] >> COLOR_COMPONENT_BIT_SIZE)       & COLOR_COMPONENT_MASK);
+		yuvi[2] = ((yuv101010Pel[0] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK);
+
+		yuvi[3] = (yuv101010Pel[1] & COLOR_COMPONENT_MASK);
+		yuvi[4] = ((yuv101010Pel[1] >> COLOR_COMPONENT_BIT_SIZE)       & COLOR_COMPONENT_MASK);
+		yuvi[5] = ((yuv101010Pel[1] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK);
+
+		// YUV to RGB Transformation conversion
+		YUV2RGB2(&yuvi[0], &red[0], &green[0], &blue[0]);
+		YUV2RGB2(&yuvi[3], &red[1], &green[1], &blue[1]);
+
+		// The results are in the 10-bit scale; * 0.25 brings them back to 8 bits
+		// before clamping. Multiplying by a power of two is exact, so the float
+		// literal yields the same values as the former double literal.
+		unsigned char *dstPixel = dstImage + y * width * 3 + x * 3;
+
+		dstPixel[0] = clip_v(blue[0] * 0.25f, 0, 255);
+		dstPixel[1] = clip_v(green[0] * 0.25f, 0, 255);
+		dstPixel[2] = clip_v(red[0] * 0.25f, 0, 255);
+
+		if (hasSecondPixel)
+		{
+			dstPixel[3] = clip_v(blue[1] * 0.25f, 0, 255);
+			dstPixel[4] = clip_v(green[1] * 0.25f, 0, 255);
+			dstPixel[5] = clip_v(red[1] * 0.25f, 0, 255);
+		}
+
+		// Planar alternative (BBB...GGG...RRR...) kept for reference:
+		//   dstImage[y * width + x]                      = blue  * 0.25
+		//   dstImage[width * height + y * width + x]     = green * 0.25
+		//   dstImage[width * height * 2 + y * width + x] = red   * 0.25
+		return;
+
+	}
+
+		// CUDA kernel for outputing the final RGB output from NV12;
+	// Converts an NV12 frame with separate luma and chroma planes to packed 8-bit B,G,R.
+	//
+	//   dataY / pitchY   - luma plane bytes and its row pitch in bytes.
+	//   dataUV / pitchUV - interleaved Cb/Cr plane bytes and its row pitch in bytes.
+	//   dstImage         - output, 3 bytes per pixel (B, G, R), rows of width * 3 bytes.
+	//   width/height     - frame size in pixels.
+	//
+	// Launch layout: every thread converts the pixel pair (x, x + 1) of one row,
+	// so the grid must cover ceil(width / 2) threads in x and height threads in y.
+	// The conversion matrix is read from constHueColorSpaceMat2 via YUV2RGB2.
+	extern "C"
+		__global__ void CUDAToBGR_drvapi(uint32 *dataY, uint32 *dataUV, size_t pitchY, size_t pitchUV, unsigned char *dstImage, int width, int height)
+	{
+		int32 x, y;
+
+		// We multiply by 2 because we process 2 pixels per thread, so x is always even.
+		x = blockIdx.x * (blockDim.x << 1) + (threadIdx.x << 1);
+		y = blockIdx.y *  blockDim.y + threadIdx.y;
+
+		if (x >= width || y >= height)
+		{
+			return;
+		}
+
+		// With an odd width the last thread of a row owns a single pixel. Its
+		// second pixel must not be touched: the store would land in the next
+		// row, or past the end of dstImage on the last row.
+		const bool hasSecondPixel = (x + 1) < width;
+
+		uint32 yuv101010Pel[2];
+		uint8 *srcImageU8_Y = (uint8 *)dataY;
+		uint8 *srcImageU8_UV = (uint8 *)dataUV;
+
+		// Read 2 Luma components at a time, so we don't waste processing since CbCr are decimated this way.
+		// if we move to texture we could read 4 luminance values
+		yuv101010Pel[0] = (srcImageU8_Y[y * pitchY + x]) << 2;
+		yuv101010Pel[1] = (srcImageU8_Y[y * pitchY + x + (hasSecondPixel ? 1 : 0)]) << 2;
+
+		int32 y_chroma = y >> 1;
+
+		// Both pixels of the pair share one Cb/Cr sample.
+		uint32 chromaCb = srcImageU8_UV[y_chroma * pitchUV + x];
+		uint32 chromaCr = srcImageU8_UV[y_chroma * pitchUV + x + 1];
+
+		// Odd scanlines: interpolate chroma vertically unless this is the last chroma row.
+		if ((y & 1) && y_chroma < ((height >> 1) - 1))
+		{
+			chromaCb = (chromaCb + srcImageU8_UV[(y_chroma + 1) * pitchUV + x] + 1) >> 1;
+			chromaCr = (chromaCr + srcImageU8_UV[(y_chroma + 1) * pitchUV + x + 1] + 1) >> 1;
+		}
+
+		yuv101010Pel[0] |= (chromaCb << (COLOR_COMPONENT_BIT_SIZE + 2));
+		yuv101010Pel[0] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
+
+		yuv101010Pel[1] |= (chromaCb << (COLOR_COMPONENT_BIT_SIZE + 2));
+		yuv101010Pel[1] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
+
+		// this steps performs the color conversion
+		uint32 yuvi[6];
+		float red[2], green[2], blue[2];
+
+		yuvi[0] = (yuv101010Pel[0] & COLOR_COMPONENT_MASK);
+		yuvi[1] = ((yuv101010Pel[0] >> COLOR_COMPONENT_BIT_SIZE)       & COLOR_COMPONENT_MASK);
+		yuvi[2] = ((yuv101010Pel[0] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK);
+
+		yuvi[3] = (yuv101010Pel[1] & COLOR_COMPONENT_MASK);
+		yuvi[4] = ((yuv101010Pel[1] >> COLOR_COMPONENT_BIT_SIZE)       & COLOR_COMPONENT_MASK);
+		yuvi[5] = ((yuv101010Pel[1] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK);
+
+		// YUV to RGB Transformation conversion
+		YUV2RGB2(&yuvi[0], &red[0], &green[0], &blue[0]);
+		YUV2RGB2(&yuvi[3], &red[1], &green[1], &blue[1]);
+
+		// Scale back to 8 bits (* 0.25 is exact in float) and clamp.
+		unsigned char *dstPixel = dstImage + y * width * 3 + x * 3;
+		dstPixel[0] = clip_v(blue[0] * 0.25f, 0, 255);
+		dstPixel[1] = clip_v(green[0] * 0.25f, 0, 255);
+		dstPixel[2] = clip_v(red[0] * 0.25f, 0, 255);
+
+		if (hasSecondPixel)
+		{
+			dstPixel[3] = clip_v(blue[1] * 0.25f, 0, 255);
+			dstPixel[4] = clip_v(green[1] * 0.25f, 0, 255);
+			dstPixel[5] = clip_v(red[1] * 0.25f, 0, 255);
+		}
+	}
+
+	// Builds the 3x3 YUV->RGB matrix for CSC, rotated by `hue` (radians), and uploads it
+	// to constHueColorSpaceMat2. Returns the cudaMemcpyToSymbol status, or cudaErrorInvalidValue
+	// for an unknown CSC (previously an uninitialised matrix was uploaded in that case).
+	cudaError_t setColorSpace(FF_ColorSpace CSC, float hue)
+	{
+		float hueSin = sin(hue);
+		float hueCos = cos(hue);
+		float hueCSC[9];
+		if (CSC == ITU_601) // CCIR 601
+		{
+			hueCSC[0] = 1.1644f;
+			hueCSC[1] = hueSin * 1.5960f;
+			hueCSC[2] = hueCos * 1.5960f;
+			hueCSC[3] = 1.1644f;
+			hueCSC[4] = (hueCos * -0.3918f) - (hueSin * 0.8130f);
+			hueCSC[5] = (hueSin *  0.3918f) - (hueCos * 0.8130f);
+			hueCSC[6] = 1.1644f;
+			hueCSC[7] = hueCos *  2.0172f;
+			hueCSC[8] = hueSin * -2.0172f;
+		}
+		else if (CSC == ITU_709) // CCIR 709
+		{
+			hueCSC[0] = 1.0f;
+			hueCSC[1] = hueSin * 1.57480f;
+			hueCSC[2] = hueCos * 1.57480f;
+			hueCSC[3] = 1.0f;
+			hueCSC[4] = (hueCos * -0.18732f) - (hueSin * 0.46812f);
+			hueCSC[5] = (hueSin *  0.18732f) - (hueCos * 0.46812f);
+			hueCSC[6] = 1.0f;
+			hueCSC[7] = hueCos *  1.85560f;
+			hueCSC[8] = hueSin * -1.85560f;
+		}
+		else
+		{
+			fprintf(stderr, "setColorSpace: unsupported color space %d\n", (int)CSC);
+			return cudaErrorInvalidValue;
+		}
+
+		cudaError_t cudaStatus = cudaMemcpyToSymbol(constHueColorSpaceMat2, hueCSC, 9 * sizeof(float), 0, cudaMemcpyHostToDevice);
+		cudaDeviceSynchronize(); // kept from the original; the unused read-back of the symbol was dropped
+		if (cudaStatus != cudaSuccess) {
+			fprintf(stderr, "cudaMemcpyToSymbol failed: %s\n", cudaGetErrorString(cudaStatus));
+		}
+		return cudaStatus;
+	}
+
+	// Launches NV12ToRGB_drvapi2 for a width x height frame and waits for it to finish.
+	// Returns the launch error if any, otherwise the cudaDeviceSynchronize() status.
+	cudaError_t NV12ToRGBnot(CUdeviceptr d_srcNV12, size_t nSourcePitch, unsigned char* d_dstRGB, int width, int height)
+	{
+		const dim3 block(32, 16, 1);
+		// Each thread converts two pixels in x, so one block spans 2 * block.x columns.
+		const unsigned int pixelsPerBlockX = 2 * block.x;
+		const dim3 grid((width + (pixelsPerBlockX - 1)) / pixelsPerBlockX, (height + (block.y - 1)) / block.y, 1);
+		NV12ToRGB_drvapi2 << < grid, block >> >((uint32 *)d_srcNV12, nSourcePitch, d_dstRGB, width, height);
+		cudaError_t cudaStatus = cudaGetLastError();
+		if (cudaStatus != cudaSuccess) {
+			fprintf(stderr, "NV12ToRGB_drvapi launch failed: %s\n", cudaGetErrorString(cudaStatus));
+			return cudaStatus;
+		}
+		cudaStatus = cudaDeviceSynchronize();
+		if (cudaStatus != cudaSuccess)
+			fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching NV12ToRGB_drvapi !\n", cudaStatus);
+		return cudaStatus;
+	}
+
+	// Launches CUDAToBGR_drvapi for a width x height frame and waits for it to finish.
+	// Returns the launch error if any, otherwise the cudaDeviceSynchronize() status.
+	cudaError_t CUDAToBGR(CUdeviceptr dataY, CUdeviceptr dataUV, size_t pitchY, size_t pitchUV, unsigned char* d_dstRGB, int width, int height)
+	{
+		dim3 block(32, 16, 1);
+		dim3 grid((width + (2 * block.x - 1)) / (2 * block.x), (height + (block.y - 1)) / block.y, 1);
+		CUDAToBGR_drvapi << < grid, block >> >((uint32 *)dataY, (uint32 *)dataUV, pitchY, pitchUV, d_dstRGB, width, height);
+		cudaError_t cudaStatus = cudaGetLastError();
+		if (cudaStatus != cudaSuccess) {
+			fprintf(stderr, "CUDAToBGR_drvapi launch failed: %s\n", cudaGetErrorString(cudaStatus));
+			return cudaStatus;
+		}
+		// The diagnostics used to name NV12ToRGB_drvapi, which is a different kernel.
+		cudaStatus = cudaDeviceSynchronize();
+		if (cudaStatus != cudaSuccess) {
+			fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching CUDAToBGR_drvapi !\n", cudaStatus);
+		}
+		return cudaStatus;
+	}
+}
\ No newline at end of file
diff --git a/src/nvdec/NvDecoderApi.cpp b/src/nvdec/NvDecoderApi.cpp
new file mode 100644
index 0000000..efb63cd
--- /dev/null
+++ b/src/nvdec/NvDecoderApi.cpp
@@ -0,0 +1,133 @@
+#include "NvDecoderApi.h"
+#include "FFNvDecoder.h"
+
+// No decoder exists until init() is called.
+NvDecoderApi::NvDecoderApi() : m_pDecoder(nullptr){
+}
+
+// Destroys the owned FFNvDecoder, if any.
+NvDecoderApi::~NvDecoderApi(){
+    // delete on a null pointer is a no-op, so no explicit check is needed.
+    delete m_pDecoder;
+    m_pDecoder = nullptr;
+}
+
+bool NvDecoderApi::init(FFDecConfig& cfg){
+    // Destroy a decoder left over from an earlier init() instead of leaking it.
+    delete m_pDecoder;
+    m_pDecoder = nullptr; // stays null if the `new` below throws
+    m_pDecoder = new FFNvDecoder(); // plain new throws on failure, it never returns null
+    return m_pDecoder->init(cfg);
+}
+
+// Forwards to FFNvDecoder::close(); does nothing when no decoder exists.
+void NvDecoderApi::close(){
+    if(m_pDecoder == nullptr) return;
+    m_pDecoder->close();
+}
+
+// Forwards to FFNvDecoder::start(); returns false when no decoder exists.
+bool NvDecoderApi::start(){
+    if(m_pDecoder == nullptr)
+        return false;
+    return m_pDecoder->start();
+}
+
+// Forwards to FFNvDecoder::pause(); does nothing when no decoder exists.
+void NvDecoderApi::pause(){
+    if(m_pDecoder == nullptr) return;
+    m_pDecoder->pause();
+}
+
+// Forwards to FFNvDecoder::resume(); does nothing when no decoder exists.
+void NvDecoderApi::resume(){
+    if(m_pDecoder == nullptr) return;
+    m_pDecoder->resume();
+}
+
+// Forwards bKeyframe to FFNvDecoder::setDecKeyframe(); no-op when no decoder exists.
+void NvDecoderApi::setDecKeyframe(bool bKeyframe){
+    if(m_pDecoder == nullptr) return;
+    m_pDecoder->setDecKeyframe(bKeyframe);
+}
+
+// Forwards to FFNvDecoder::isRunning(); returns false when no decoder exists.
+bool NvDecoderApi::isRunning(){
+    if(m_pDecoder == nullptr)
+        return false;
+    return m_pDecoder->isRunning();
+}
+
+// Forwards to FFNvDecoder::isFinished(); returns false when no decoder exists.
+bool NvDecoderApi::isFinished(){
+    if(m_pDecoder == nullptr)
+        return false;
+    return m_pDecoder->isFinished();
+}
+
+// Forwards to FFNvDecoder::isPausing(); returns false when no decoder exists.
+bool NvDecoderApi::isPausing(){
+    if(m_pDecoder == nullptr)
+        return false;
+    return m_pDecoder->isPausing();
+}
+
+// Forwards to FFNvDecoder::getResolution(); returns false (outputs untouched) without a decoder.
+bool NvDecoderApi::getResolution(int &width, int &height){
+    if(m_pDecoder == nullptr)
+        return false;
+    return m_pDecoder->getResolution(width, height);
+}
+
+// Forwards to FFNvDecoder::isSurport(); returns false when no decoder exists.
+bool NvDecoderApi::isSurport(FFDecConfig& cfg){
+    if(m_pDecoder == nullptr)
+        return false;
+    return m_pDecoder->isSurport(cfg);
+}
+
+// Forwards to FFNvDecoder::fps(); returns 0.0 when no decoder exists.
+float NvDecoderApi::fps(){
+    if(m_pDecoder == nullptr)
+        return 0.0;
+    return m_pDecoder->fps();
+}
+
+// Forwards to FFNvDecoder::getCachedQueueLength(); returns 0 when no decoder exists.
+int NvDecoderApi::getCachedQueueLength(){
+    if(m_pDecoder == nullptr)
+        return 0;
+    return m_pDecoder->getCachedQueueLength();
+}
+
+// Forwards nm to FFNvDecoder::setName(); does nothing when no decoder exists.
+void NvDecoderApi::setName(string nm){
+    if(m_pDecoder == nullptr) return;
+    m_pDecoder->setName(nm);
+}
+
+// Returns FFNvDecoder::getName(), or an empty string when no decoder exists.
+string NvDecoderApi::getName(){
+    // A std::string must never be built from nullptr (undefined behaviour), so return an empty one.
+    if(m_pDecoder == nullptr) return string();
+    return m_pDecoder->getName();
+}
+
+// Forwards to FFNvDecoder::snapshot(); returns nullptr when no decoder exists.
+FFImgInfo* NvDecoderApi::snapshot(){
+    if(m_pDecoder == nullptr)
+        return nullptr;
+    return m_pDecoder->snapshot();
+}
+
+// Forwards postDecArg to FFNvDecoder::setPostDecArg(); no-op when no decoder exists.
+void NvDecoderApi::setPostDecArg(const void* postDecArg){
+    if(m_pDecoder == nullptr) return;
+    m_pDecoder->setPostDecArg(postDecArg);
+}
+
+// Forwards finishedDecArg to FFNvDecoder::setFinishedDecArg(); no-op when no decoder exists.
+void NvDecoderApi::setFinishedDecArg(const void* finishedDecArg){
+    if(m_pDecoder == nullptr) return;
+    m_pDecoder->setFinishedDecArg(finishedDecArg);
+}
\ No newline at end of file
diff --git a/src/nvdec/NvDecoderApi.h b/src/nvdec/NvDecoderApi.h
new file mode 100644
index 0000000..f742dd8
--- /dev/null
+++ b/src/nvdec/NvDecoderApi.h
@@ -0,0 +1,44 @@
+#include<string>
+#include <pthread.h>
+
+#include "common_header.h"
+#include "../interface/AbstractDecoder.h"
+
+using namespace std;
+
+class FFNvDecoder;
+
+class NvDecoderApi : public AbstractDecoder{ // forwards every call to an FFNvDecoder it owns
+public:
+    NvDecoderApi();
+    ~NvDecoderApi();
+    bool init(FFDecConfig& cfg);
+    void close();
+    bool start();
+    void pause();
+    void resume();
+    // NOTE(review): this header has no include guard / #pragma once.
+    void setDecKeyframe(bool bKeyframe);
+    // Until init() has created the decoder, the query methods below return false / 0 (see NvDecoderApi.cpp).
+    bool isRunning();
+    bool isFinished();
+    bool isPausing();
+    bool getResolution( int &width, int &height );
+
+    bool isSurport(FFDecConfig& cfg);
+
+    int getCachedQueueLength();
+
+    float fps();
+
+    FFImgInfo* snapshot();
+    // NOTE(review): reports DECODER_TYPE_DVPP although this class wraps FFNvDecoder - confirm the intended enum value.
+    DECODER_TYPE getDecoderType(){ return DECODER_TYPE_DVPP; }
+    void setName(string nm);
+    string getName();
+
+    void setPostDecArg(const void* postDecArg);
+    void setFinishedDecArg(const void* finishedDecArg);
+private:
+    FFNvDecoder* m_pDecoder; // owned: created by init(), deleted by the destructor
+};
\ No newline at end of file
diff --git a/src/nvdec/NvJpegEncoder.cpp b/src/nvdec/NvJpegEncoder.cpp
new file mode 100644
index 0000000..7ee0727
--- /dev/null
+++ b/src/nvdec/NvJpegEncoder.cpp
@@ -0,0 +1,90 @@
+#include "NvJpegEncoder.h"
+
+#include <fstream>
+#include <vector>
+#include <iostream>
+
+
+#define CHECK_NVJPEG(S) do {nvjpegStatus_t  status; \
+        status = S; \
+        if (status != NVJPEG_STATUS_SUCCESS ) std::cout << __LINE__ <<" CHECK_NVJPEG - status = " << status << std::endl; \
+        } while (false)
+
+
+// Encodes an interleaved BGR image that lives in device memory to a JPEG file.
+//
+//   filepath - output file, created or truncated.
+//   d_srcBGR - device pointer, 3 bytes per pixel (B,G,R), rows of width * 3 bytes.
+//   stream   - CUDA stream the nvJPEG work is issued on; it is synchronised
+//              before the bitstream is written to disk.
+//
+// Returns 0 on success and -1 when an nvJPEG call, the stream synchronisation
+// or the file write fails. (Previously 0 was returned unconditionally.)
+int saveJpeg(const char * filepath, unsigned char* d_srcBGR, int width, int height, cudaStream_t stream)
+{
+    nvjpegHandle_t nvjpeg_handle = nullptr;
+    nvjpegEncoderState_t encoder_state = nullptr;
+    nvjpegEncoderParams_t encoder_params = nullptr;
+
+    // Same diagnostic as CHECK_NVJPEG, but the outcome can be tested afterwards.
+    auto succeeded = [](nvjpegStatus_t status, int line) -> bool {
+        if (status != NVJPEG_STATUS_SUCCESS) std::cout << line <<" CHECK_NVJPEG - status = " << status << std::endl;
+        return status == NVJPEG_STATUS_SUCCESS;
+    };
+
+    // Zero-initialise so the unused channel/pitch entries hold no garbage.
+    nvjpegImage_t input = {};
+    input.channel[0] = d_srcBGR;
+    input.pitch[0] = width * 3;
+
+    std::vector<unsigned char> obuffer;
+    size_t length = 0;
+
+    bool ok = succeeded(nvjpegCreate(NVJPEG_BACKEND_DEFAULT, nullptr, &nvjpeg_handle), __LINE__);
+    ok = ok && succeeded(nvjpegEncoderParamsCreate(nvjpeg_handle, &encoder_params, stream), __LINE__);
+    ok = ok && succeeded(nvjpegEncoderStateCreate(nvjpeg_handle, &encoder_state, stream), __LINE__);
+
+    // set params
+    ok = ok && succeeded(nvjpegEncoderParamsSetEncoding(encoder_params, nvjpegJpegEncoding_t::NVJPEG_ENCODING_PROGRESSIVE_DCT_HUFFMAN, stream), __LINE__);
+    ok = ok && succeeded(nvjpegEncoderParamsSetOptimizedHuffman(encoder_params, 1, stream), __LINE__);
+    ok = ok && succeeded(nvjpegEncoderParamsSetQuality(encoder_params, 70, stream), __LINE__);
+    ok = ok && succeeded(nvjpegEncoderParamsSetSamplingFactors(encoder_params, nvjpegChromaSubsampling_t::NVJPEG_CSS_420, stream), __LINE__);
+
+    ok = ok && succeeded(nvjpegEncodeImage(nvjpeg_handle, encoder_state, encoder_params, &input, NVJPEG_INPUT_BGRI, width, height, stream), __LINE__);
+
+    // The first call only queries the bitstream size, the second one copies the data.
+    ok = ok && succeeded(nvjpegEncodeRetrieveBitstream(nvjpeg_handle, encoder_state, NULL, &length, stream), __LINE__);
+    if (ok)
+    {
+        obuffer.resize(length);
+        ok = succeeded(nvjpegEncodeRetrieveBitstream(nvjpeg_handle, encoder_state, obuffer.data(), &length, stream), __LINE__);
+    }
+
+    // The copy into obuffer is issued on `stream`; wait for it before reading the buffer.
+    // This replaces the timing events, which were never destroyed and leaked on every call.
+    if (ok && cudaStreamSynchronize(stream) != cudaSuccess)
+    {
+        ok = false;
+    }
+
+    // Destroy the nvJPEG objects on every path to avoid leaking device memory.
+    if (encoder_params != nullptr) nvjpegEncoderParamsDestroy(encoder_params);
+    if (encoder_state != nullptr) nvjpegEncoderStateDestroy(encoder_state);
+    if (nvjpeg_handle != nullptr) nvjpegDestroy(nvjpeg_handle);
+
+    if (!ok)
+    {
+        return -1;
+    }
+
+    std::ofstream outputFile(filepath, std::ios::out | std::ios::binary);
+    if (!outputFile.is_open())
+    {
+        return -1;
+    }
+    outputFile.write(reinterpret_cast<const char *>(obuffer.data()), static_cast<std::streamsize>(length));
+    outputFile.close();
+
+    // close() sets failbit when flushing fails, so this also covers write errors.
+    return outputFile.fail() ? -1 : 0;
+}
\ No newline at end of file
diff --git a/src/nvdec/NvJpegEncoder.h b/src/nvdec/NvJpegEncoder.h
new file mode 100644
index 0000000..3c27ba8
--- /dev/null
+++ b/src/nvdec/NvJpegEncoder.h
@@ -0,0 +1,3 @@
+#include <nvjpeg.h>
+
+int saveJpeg(const char * filepath, unsigned char* d_srcBGR, int width, int height, cudaStream_t stream);
\ No newline at end of file
diff --git a/src/nvdec/PartMemCopy.cu b/src/nvdec/PartMemCopy.cu
new file mode 100644
index 0000000..396765b
--- /dev/null
+++ b/src/nvdec/PartMemCopy.cu
@@ -0,0 +1,289 @@
+#include "cuda_kernels.h"
+#include <algorithm>
+typedef unsigned char   uchar;
+typedef unsigned int    uint32;
+typedef int             int32;
+
+#define MAX_SNAPSHOT_WIDTH 320
+#define MAX_SNAPSHOT_HEIGHT 320
+
+namespace cuda_common
+{
+	// Copies the rectangle [left, right) x [top, bottom) of a packed 3-byte-per-pixel
+	// image into a tightly packed destination of (right - left) x (bottom - top) pixels.
+	// One thread copies one destination pixel; threads outside the rectangle exit.
+	// src_height is not used by the kernel.
+	__global__ void kernel_memcopy(unsigned char* d_srcRGB, int src_width, int src_height,
+		unsigned char* d_dstRGB, int left, int top, int right, int bottom)
+	{
+		const int col = blockIdx.x * blockDim.x + threadIdx.x;
+		const int row = blockIdx.y * blockDim.y + threadIdx.y;
+		const int roi_width = right - left;
+		const int roi_height = bottom - top;
+
+		// The grid is rounded up, so some threads fall outside the rectangle.
+		if (col >= roi_width || row >= roi_height)
+		{
+			return;
+		}
+
+		// bgr...bgr...bgr... : interleaved layout, 3 bytes per pixel on both sides.
+		const unsigned char* src_pixel = d_srcRGB + ((top + row) * src_width + (left + col)) * 3;
+		unsigned char* dst_pixel = d_dstRGB + (row * roi_width + col) * 3;
+
+		// Copy the three colour bytes of this pixel.
+		for (int channel = 0; channel < 3; ++channel)
+		{
+			dst_pixel[channel] = src_pixel[channel];
+		}
+	}
+
+	// Launches kernel_memcopy for the given rectangle and waits for it; returns the first CUDA error.
+	cudaError_t PartMemCopy(unsigned char* d_srcRGB, int src_width, int src_height, unsigned char* d_dstRGB, int left, int top, int right, int bottom)
+	{
+		const int roi_width = right - left;
+		const int roi_height = bottom - top;
+		const dim3 block(32, 16, 1);
+		const dim3 grid((roi_width + (block.x - 1)) / block.x, (roi_height + (block.y - 1)) / block.y, 1);
+
+		kernel_memcopy << < grid, block >> > (d_srcRGB, src_width, src_height, d_dstRGB, left, top, right, bottom);
+		cudaError_t cudaStatus = cudaGetLastError();
+		if (cudaStatus != cudaSuccess) {
+			fprintf(stderr, "Part 50 kernel_memcopy launch failed: %s\n", cudaGetErrorString(cudaStatus));
+			return cudaStatus;
+		}
+		cudaStatus = cudaDeviceSynchronize();
+		if (cudaStatus != cudaSuccess)
+			fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_bilinear!\n", cudaStatus);
+		return cudaStatus;
+	}
+
+
+	//    __global__ void kernel_memcopy_mean_variance(float* d_srcRGB, int src_width, int src_height, 
+	//            unsigned char* vd_dstRGB, int count, int * vleft, int* vtop, int* vright, int * vbottom, float submeanb,float submeang, float submeanr, float varianceb,float varianceg, float variancer)
+	//    {
+	//        const int dst_x = blockIdx.x * blockDim.x + threadIdx.x;
+	//        const int dst_y = blockIdx.y * blockDim.y + threadIdx.y;
+	//        for (int i=0;i<count;i++)
+	//        {
+	//                const int left = vleft[i];
+	//                const int right = vright[i];
+	//                const int top = vtop[i];
+	//                const int bottom = vbottom[i];
+	//        
+	//                const int dst_width = right - left;
+	//                const int dst_height = bottom - top;
+	//
+	//
+	//                unsigned char * d_dstRGB = vd_dstRGB + i *   ;
+	//
+	//                if (dst_x < dst_width && dst_y < dst_height)
+	//                {
+	//                    int src_x = left + dst_x;
+	//                    int src_y = top + dst_y;
+	//        
+	//                    d_dstRGB[(dst_y*dst_width) + dst_x] = (d_srcRGB[(src_y*src_width) + src_x] - submeanb)*varianceb;
+	//                    d_dstRGB[(dst_width*dst_height) + (dst_y*dst_width) + dst_x] = (d_srcRGB[(src_width*src_height) + (src_y*src_width) + src_x] -submeang)*varianceg;
+	//                    d_dstRGB[(2 * dst_width*dst_height) + (dst_y*dst_width) + dst_x] = (d_srcRGB[(2 * src_width*src_height) + (src_y*src_width) + src_x] - submeanr) * variancer;
+	//        
+	//                }
+	//        }
+	//    }
+	// Batched crop + bilinear resize of a packed 3-byte-per-pixel (B,G,R) image.
+	//
+	// blockIdx.z selects the crop i; blockIdx.x/y and threadIdx.x/y select one
+	// destination pixel of that crop. For crop i the source rectangle
+	// [vleft[i], vright[i]) x [vtop[i], vbottom[i]) of d_srcRGB is resampled to
+	// dst_width[i] x dst_height[i] pixels and stored tightly packed in vd_dstRGB[i].
+	// Every pointer argument is dereferenced inside the kernel, so all of them
+	// (including the index arrays) must be device-accessible.
+	// Expected launch: grid.z = number of crops, grid.x/y * block.x/y covering the
+	// largest destination size of the batch (see PartMemResizeBatch).
+	// `count` and the submean*/variance* parameters are not used by this kernel:
+	// no mean subtraction or scaling is applied, the result is only clamped to 0..255.
+	__global__ void PartCopy_ResizeImgBilinearBGR_Mean_Variance_CUDAKernel(
+		unsigned char * d_srcRGB, int srcimg_width, int srcimg_height,
+		int* vleft, int* vtop, int* vright, int * vbottom,
+		unsigned char** vd_dstRGB, int count, int *dst_width, int *dst_height,
+		float submeanb, float submeang, float submeanr,
+		float varianceb, float varianceg, float variancer)
+	{
+		int i = blockIdx.z;
+
+		//for (int i = 0; i<count; i++)
+		{
+			const int left = vleft[i];
+			const int right = vright[i];
+			const int top = vtop[i];
+			const int bottom = vbottom[i];
+			const int cur_dst_width = dst_width[i];
+			const int cur_dst_height = dst_height[i];
+
+			unsigned char* d_dstRGB =  vd_dstRGB[i];
+
+			const int src_width = right - left;
+			const int src_height = bottom - top;
+			const int x = blockIdx.x * blockDim.x + threadIdx.x;// + left;
+			const int y = blockIdx.y * blockDim.y + threadIdx.y;//+ top;
+			const int dst_x = blockIdx.x * blockDim.x + threadIdx.x;
+			const int dst_y = blockIdx.y * blockDim.y + threadIdx.y;
+
+			// x/y and dst_x/dst_y hold the same values. Threads that fall outside
+			// this crop's destination size do nothing (see the guard below).
+
+			unsigned char * src_img = d_srcRGB;
+			unsigned char * dst_img = d_dstRGB;
+			if (dst_x < cur_dst_width && dst_y < cur_dst_height)
+			{
+				// Map the destination pixel centre back into the crop (half-pixel
+				// offsets), then shift by the crop origin (left, top).
+				float fx = (x + 0.5)*src_width / (float)cur_dst_width - 0.5 + left;
+				float fy = (y + 0.5)*src_height / (float)cur_dst_height - 0.5 + top;
+				int ax = floor(fx);
+				int ay = floor(fy);
+				// Clamp the top-left sample so that ax + 1 / ay + 1 stay inside the image.
+				if (ax < 0)
+				{
+					ax = 0;
+				}
+				if (ax > srcimg_width - 2)
+				{
+					ax = srcimg_width - 2;
+				}
+				if (ay < 0) {
+					ay = 0;
+				}
+				if (ay > srcimg_height - 2)
+				{
+					ay = srcimg_height - 2;
+				}
+
+				// Pixel indices of the 2x2 neighbourhood:
+				// A = top-left, B = top-right, C = bottom-left, D = bottom-right.
+				int A = ax + ay*srcimg_width;
+				int B = ax + ay*srcimg_width + 1;
+				int C = ax + ay*srcimg_width + srcimg_width;
+				int D = ax + ay*srcimg_width + srcimg_width + 1;
+
+				// w1/w3 are the offsets of (fx, fy) from the sample (ax, ay), w2/w4
+				// their complements. NOTE(review): ax/ay are already clamped here, so
+				// near the image border the weights can leave [0, 1] - confirm intended.
+				float w1, w2, w3, w4;
+				w1 = fx - ax;
+				w2 = 1 - w1;
+				w3 = fy - ay;
+				w4 = 1 - w3;
+				float blue = src_img[A * 3] * w2*w4 + src_img[B * 3] * w1*w4 + src_img[C * 3] * w2*w3 + src_img[D * 3] * w1*w3;
+				float green = src_img[A * 3 + 1] * w2*w4 + src_img[B * 3 + 1] * w1*w4
+					+ src_img[C * 3 + 1] * w2*w3 + src_img[D * 3 + 1] * w1*w3;
+				float red = src_img[A * 3 + 2] * w2*w4 + src_img[B * 3 + 2] * w1*w4
+					+ src_img[C * 3 + 2] * w2*w3 + src_img[D * 3 + 2] * w1*w3;
+
+				// Clamp to the 8-bit range before the narrowing casts below.
+
+				if (blue < 0)
+					blue = 0;
+				else if (blue > 255)
+					blue = 255;
+
+				if (green < 0)
+					green = 0;
+				else if (green > 255)
+					green = 255;
+
+				if (red < 0)
+					red = 0;
+				else if (red > 255)
+					red = 255;
+
+				dst_img[(dst_y * cur_dst_width + dst_x) * 3] = (unsigned char)blue;
+				dst_img[(dst_y * cur_dst_width + dst_x) * 3 + 1] = (unsigned char)green;
+				dst_img[(dst_y * cur_dst_width + dst_x) * 3 + 2] = (unsigned char)red;
+
+				// Output layout: interleaved B,G,R with cur_dst_width pixels per row;
+				// the float -> unsigned char casts truncate the fractional part.
+			}
+		}
+	}
+
+	// Crops `count` rectangles out of one packed B,G,R device image and resizes each
+	// of them with PartCopy_ResizeImgBilinearBGR_Mean_Variance_CUDAKernel.
+	//
+	//   d_srcRGB, src_width, src_height - source image in device memory.
+	//   d_dstRGB - host array of `count` device pointers, one output buffer per crop.
+	//   left/top/right/bottom - host arrays of `count` crop rectangles.
+	//   dst_w/dst_h - host arrays of `count` output sizes.
+	//   submean*/variance* - forwarded to the kernel.
+	//
+	// Returns cudaSuccess when count <= 0 (nothing to do), otherwise the first
+	// error from buffer setup, the kernel launch or cudaDeviceSynchronize().
+	// The staging buffers are sized from `count`; the former fixed 1000-element
+	// buffers overflowed for larger batches. All CUDA calls are now checked and
+	// the buffers are released on every path.
+	cudaError_t PartMemResizeBatch(unsigned char* d_srcRGB, int src_width, int src_height, unsigned char** d_dstRGB, int count, int* left, int* top, int* right, int* bottom, int *dst_w, int *dst_h, float submeanb, float submeang, float submeanr,
+		float varianceb, float varianceg, float variancer)
+	{
+		// std::max_element must not be dereferenced on an empty range.
+		if (count <= 0)
+		{
+			return cudaSuccess;
+		}
+
+		const size_t intBytes = count * sizeof(int);
+		const size_t ptrBytes = count * sizeof(unsigned char*);
+
+		// Device copies of the per-crop arrays: left, top, right, bottom, dst_w, dst_h.
+		const int *host_ints[6] = { left, top, right, bottom, dst_w, dst_h };
+		int *gpu_ints[6] = { NULL, NULL, NULL, NULL, NULL, NULL };
+		unsigned char** gpu_dst_rgb = NULL;
+
+		cudaError_t cudaStatus = cudaMalloc(&gpu_dst_rgb, ptrBytes);
+		if (cudaStatus == cudaSuccess)
+		{
+			cudaStatus = cudaMemcpy(gpu_dst_rgb, d_dstRGB, ptrBytes, cudaMemcpyHostToDevice);
+		}
+		for (int k = 0; k < 6 && cudaStatus == cudaSuccess; ++k)
+		{
+			cudaStatus = cudaMalloc(&gpu_ints[k], intBytes);
+			if (cudaStatus == cudaSuccess)
+			{
+				cudaStatus = cudaMemcpy(gpu_ints[k], host_ints[k], intBytes, cudaMemcpyHostToDevice);
+			}
+		}
+
+		if (cudaStatus != cudaSuccess)
+		{
+			fprintf(stderr, "PartMemResizeBatch buffer setup failed: %s\n", cudaGetErrorString(cudaStatus));
+		}
+		else
+		{
+			// One z-slice per crop; x/y cover the largest output of the batch.
+			dim3 block(32, 16, 1);
+			dim3 grid((*std::max_element(dst_w, dst_w+ count) + (block.x - 1)) / block.x, (*std::max_element(dst_h, dst_h + count) + (block.y - 1)) / block.y, count);
+
+			PartCopy_ResizeImgBilinearBGR_Mean_Variance_CUDAKernel << < grid, block >> > (
+				d_srcRGB, src_width, src_height,
+				gpu_ints[0], gpu_ints[1], gpu_ints[2], gpu_ints[3],
+				gpu_dst_rgb, count, gpu_ints[4], gpu_ints[5],
+				submeanb, submeang, submeanr,
+				varianceb, varianceg, variancer);
+
+			cudaStatus = cudaGetLastError();
+			if (cudaStatus != cudaSuccess) {
+				fprintf(stderr, "Part 270 kernel_memcopy launch failed: %s\n", cudaGetErrorString(cudaStatus));
+			}
+			else if ((cudaStatus = cudaDeviceSynchronize()) != cudaSuccess) {
+				fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_bilinear!\n", cudaStatus);
+			}
+		}
+
+		// cudaFree(NULL) is a no-op, so buffers that were never allocated are fine.
+		for (int k = 0; k < 6; ++k)
+		{
+			cudaFree(gpu_ints[k]);
+		}
+		cudaFree(gpu_dst_rgb);
+		return cudaStatus;
+	}
+
+}
\ No newline at end of file
diff --git a/src/nvdec/RGB2YUV.cu b/src/nvdec/RGB2YUV.cu
new file mode 100644
index 0000000..7202c3a
--- /dev/null
+++ b/src/nvdec/RGB2YUV.cu
@@ -0,0 +1,263 @@
+
+
+#include "cuda_kernels.h"
+
+typedef unsigned char   uint8;
+typedef unsigned int    uint32;
+typedef int             int32;
+
+namespace cuda_common
+{
+	// Clamps x to the closed interval [min_val, max_val] and returns the result.
+	// The upper bound is tested first, so when min_val > max_val any x above
+	// max_val yields max_val. All arguments are 8-bit unsigned: a value has
+	// already been narrowed to 0..255 by the time it is compared here.
+	__device__ unsigned char clip_value(unsigned char x, unsigned char min_val, unsigned char  max_val){
+		if (max_val < x)
+			return max_val;
+		if (x < min_val)
+			return min_val;
+		return x;
+	}
+
+	// Converts an interleaved 8-bit BGR image (3 bytes per pixel, rows packed at
+	// src_width * 3 bytes) into three full-resolution planes: Y (row stride
+	// yPitch) and u / v (row stride src_width). One thread handles one pixel;
+	// threads outside the image return without touching memory.
+	// Each result is clamped to [0, 255] in floating point BEFORE it is narrowed
+	// to unsigned char. The V expression spans roughly -29..285 for 8-bit input,
+	// so narrowing first would wrap the value, after which no 8-bit clamp can
+	// repair it.
+	__global__ void kernel_rgb2yuv(unsigned char *src_img, unsigned char* Y, unsigned char* u, unsigned char* v,
+		int src_width, int src_height, size_t yPitch)
+	{
+		const int x = blockIdx.x * blockDim.x + threadIdx.x;
+		const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+		if (x >= src_width || y >= src_height)
+			return;
+
+		// Interleaved source, channel order B, G, R.
+		const unsigned char* px = src_img + (y * src_width + x) * 3;
+		const int B = px[0];
+		const int G = px[1];
+		const int R = px[2];
+
+		// Coefficients are unchanged; only the clamp now happens before the cast.
+		Y[y * yPitch + x] = (unsigned char)fmin(fmax(0.299 * R + 0.587 * G + 0.114 * B, 0.0), 255.0);
+		u[y * src_width + x] = (unsigned char)fmin(fmax(-0.147 * R - 0.289 * G + 0.436 * B + 128, 0.0), 255.0);
+		v[y * src_width + x] = (unsigned char)fmin(fmax(0.615 * R - 0.515 * G - 0.100 * B + 128, 0.0), 255.0);
+	}
+
+	// Converts a planar float image (plane 0 = B, plane 1 = G, plane 2 = R, each
+	// src_width * src_height floats) into full-resolution 8-bit planes: Y (row
+	// stride yPitch) and u / v (row stride src_width). One thread per pixel.
+	// Results are clamped to [0, 255] in floating point BEFORE narrowing to
+	// unsigned char; narrowing first wraps out-of-range values irreparably.
+	__global__ void kernel_rgb2yuv(float *src_img, unsigned char* Y, unsigned char* u, unsigned char* v,
+		int src_width, int src_height, size_t yPitch)
+	{
+		const int x = blockIdx.x * blockDim.x + threadIdx.x;
+		const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+		if (x >= src_width || y >= src_height)
+			return;
+
+		const int plane = src_width * src_height;	// elements per colour plane
+		const int idx = y * src_width + x;
+		const float B = src_img[idx];
+		const float G = src_img[plane + idx];
+		const float R = src_img[plane * 2 + idx];
+
+		Y[y * yPitch + x] = (unsigned char)fmin(fmax(0.299 * R + 0.587 * G + 0.114 * B, 0.0), 255.0);
+		u[idx] = (unsigned char)fmin(fmax(-0.147 * R - 0.289 * G + 0.436 * B + 128, 0.0), 255.0);
+		v[idx] = (unsigned char)fmin(fmax(0.615 * R - 0.515 * G - 0.100 * B + 128, 0.0), 255.0);
+	}
+
+	// Bilinear resize of one 8-bit plane. src_img is read as a tightly packed
+	// src_width x src_height plane; dst_img is written with a row stride of
+	// nPitch bytes. One thread produces one destination pixel; threads outside
+	// dst_width x dst_height return immediately.
+	//
+	// The top-left sample (ax, ay) is limited to [0, src - 2] so the 2x2
+	// footprint stays inside the plane (this needs src_width >= 2 and
+	// src_height >= 2). The weights are taken relative to that limited corner,
+	// so border pixels are extrapolated and the blend can leave [0, 255]. It is
+	// therefore clamped as a float and only then narrowed; narrowing to
+	// unsigned char first wraps such values, and an 8-bit clamp afterwards
+	// cannot undo that.
+	extern "C"
+	__global__ void kernel_resize_UV(unsigned char* src_img, unsigned char *dst_img,
+		int src_width, int src_height, int dst_width, int dst_height, int nPitch)
+	{
+		const int x = blockIdx.x * blockDim.x + threadIdx.x;
+		const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+		if (x >= dst_width || y >= dst_height)
+			return;
+
+		// Map the destination pixel centre back into source coordinates.
+		float fx = (x + 0.5)*src_width / (float)dst_width - 0.5;
+		float fy = (y + 0.5)*src_height / (float)dst_height - 0.5;
+
+		// Keep the 2x2 footprint inside the source plane.
+		int ax = floor(fx);
+		int ay = floor(fy);
+		ax = (ax < 0) ? 0 : ((ax > src_width - 2) ? src_width - 2 : ax);
+		ay = (ay < 0) ? 0 : ((ay > src_height - 2) ? src_height - 2 : ay);
+
+		// Offsets of the footprint: A B on row ay, C D on row ay + 1.
+		const int A = ax + ay*src_width;
+		const int B = A + 1;
+		const int C = A + src_width;
+		const int D = C + 1;
+
+		// Horizontal (w1 / w2) and vertical (w3 / w4) interpolation weights.
+		const float w1 = fx - ax;
+		const float w2 = 1 - w1;
+		const float w3 = fy - ay;
+		const float w4 = 1 - w3;
+
+		const float val = src_img[A] * w2*w4 + src_img[B] * w1*w4 + src_img[C] * w2*w3 + src_img[D] * w1*w3;
+
+		// Clamp in float, then narrow.
+		dst_img[y * nPitch + x] = (unsigned char)fminf(fmaxf(val, 0.0f), 255.0f);
+	}
+
+	// Converts a planar float BGR device image into 8-bit Y, U and V planes. Y is
+	// written at source resolution (row stride yPitch); chroma is first computed
+	// at source resolution into two temporary device buffers and then resized to
+	// uWidth x uHeight / vWidth x vHeight. yWidth / yHeight are not used. Returns the first CUDA error, if any.
+	// Both temporary allocations are checked and the pointers start as NULL, so a failed cudaMalloc never reaches a kernel or frees garbage.
+	cudaError_t RGB2YUV(float* d_srcRGB, int src_width, int src_height,
+						unsigned char* Y, size_t yPitch, int yWidth, int yHeight,
+						unsigned char* U, size_t uPitch, int uWidth, int uHeight,
+						unsigned char* V, size_t vPitch, int vWidth, int vHeight)
+	{
+		unsigned char * u = NULL;
+		unsigned char * v = NULL;
+		const size_t planeBytes = (size_t)src_width * src_height * sizeof(unsigned char);
+		// Every local is declared ahead of the first goto so no jump crosses an initialisation.
+		dim3 block(32, 16, 1);
+		dim3 grid((src_width + (block.x - 1)) / block.x, (src_height + (block.y - 1)) / block.y, 1);
+		dim3 grid1((uWidth + (block.x - 1)) / block.x, (uHeight + (block.y - 1)) / block.y, 1);
+		dim3 grid2((vWidth + (block.x - 1)) / block.x, (vHeight + (block.y - 1)) / block.y, 1);
+		cudaError_t cudaStatus = cudaMalloc((void**)&u, planeBytes);
+		if (cudaStatus == cudaSuccess)
+			cudaStatus = cudaMalloc((void**)&v, planeBytes);
+		if (cudaStatus != cudaSuccess) {
+			fprintf(stderr, "cudaMalloc of temporary chroma planes failed: %s\n", cudaGetErrorString(cudaStatus));
+			goto Error;
+		}
+
+		kernel_rgb2yuv << < grid, block >> >(d_srcRGB, Y, u, v, src_width, src_height, yPitch);
+		cudaStatus = cudaGetLastError();
+		if (cudaStatus != cudaSuccess) {
+			fprintf(stderr, "kernel_rgb2yuv launch failed: %s\n", cudaGetErrorString(cudaStatus));
+			goto Error;
+		}
+		cudaStatus = cudaDeviceSynchronize();
+		if (cudaStatus != cudaSuccess) {
+			fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_rgb2yuv!\n", cudaStatus);
+			goto Error;
+		}
+
+		kernel_resize_UV << < grid1, block >> >(u, U, src_width, src_height, uWidth, uHeight, uPitch);
+		cudaStatus = cudaGetLastError();
+		if (cudaStatus != cudaSuccess) {
+			fprintf(stderr, "kernel_resize_UV launch failed: %s\n", cudaGetErrorString(cudaStatus));
+			goto Error;
+		}
+		cudaStatus = cudaDeviceSynchronize();
+		if (cudaStatus != cudaSuccess) {
+			fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_resize_UV!\n", cudaStatus);
+			goto Error;
+		}
+
+		kernel_resize_UV << < grid2, block >> >(v, V, src_width, src_height, vWidth, vHeight, vPitch);
+		cudaStatus = cudaGetLastError();
+		if (cudaStatus != cudaSuccess) {
+			fprintf(stderr, "kernel_resize_UV launch failed: %s\n", cudaGetErrorString(cudaStatus));
+			goto Error;
+		}
+		cudaStatus = cudaDeviceSynchronize();
+		if (cudaStatus != cudaSuccess) {
+			fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_resize_UV!\n", cudaStatus);
+			goto Error;
+		}
+Error :
+		cudaFree(u);	// cudaFree(NULL) is a no-op
+		cudaFree(v);
+		return cudaStatus;
+	}
+
+
+
+	// Converts an interleaved 8-bit BGR device image into 8-bit Y, U and V planes.
+	// Y is written at source resolution (row stride yPitch); chroma is first
+	// computed at source resolution into two temporary device buffers and then resized
+	// to uWidth x uHeight / vWidth x vHeight. yWidth / yHeight are not used. Returns the first CUDA error, if any.
+	// Both temporary allocations are checked and the pointers start as NULL, so a failed cudaMalloc never reaches a kernel or frees garbage.
+	cudaError_t RGB2YUV(unsigned char* d_srcRGB, int src_width, int src_height,
+		unsigned char* Y, size_t yPitch, int yWidth, int yHeight,
+		unsigned char* U, size_t uPitch, int uWidth, int uHeight,
+		unsigned char* V, size_t vPitch, int vWidth, int vHeight)
+	{
+		unsigned char * u = NULL;
+		unsigned char * v = NULL;
+		const size_t planeBytes = (size_t)src_width * src_height * sizeof(unsigned char);
+		// Every local is declared ahead of the first goto so no jump crosses an initialisation.
+		dim3 block(32, 16, 1);
+		dim3 grid((src_width + (block.x - 1)) / block.x, (src_height + (block.y - 1)) / block.y, 1);
+		dim3 grid1((uWidth + (block.x - 1)) / block.x, (uHeight + (block.y - 1)) / block.y, 1);
+		dim3 grid2((vWidth + (block.x - 1)) / block.x, (vHeight + (block.y - 1)) / block.y, 1);
+		cudaError_t cudaStatus = cudaMalloc((void**)&u, planeBytes);
+		if (cudaStatus == cudaSuccess)
+			cudaStatus = cudaMalloc((void**)&v, planeBytes);
+		if (cudaStatus != cudaSuccess) {
+			fprintf(stderr, "cudaMalloc of temporary chroma planes failed: %s\n", cudaGetErrorString(cudaStatus));
+			goto Error;
+		}
+
+		kernel_rgb2yuv << < grid, block >> >(d_srcRGB, Y, u, v, src_width, src_height, yPitch);
+		cudaStatus = cudaGetLastError();
+		if (cudaStatus != cudaSuccess) {
+			fprintf(stderr, "kernel_rgb2yuv launch failed: %s\n", cudaGetErrorString(cudaStatus));
+			goto Error;
+		}
+		cudaStatus = cudaDeviceSynchronize();
+		if (cudaStatus != cudaSuccess) {
+			fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_rgb2yuv!\n", cudaStatus);
+			goto Error;
+		}
+
+		kernel_resize_UV << < grid1, block >> >(u, U, src_width, src_height, uWidth, uHeight, uPitch);
+		cudaStatus = cudaGetLastError();
+		if (cudaStatus != cudaSuccess) {
+			fprintf(stderr, "kernel_resize_UV launch failed: %s\n", cudaGetErrorString(cudaStatus));
+			goto Error;
+		}
+		cudaStatus = cudaDeviceSynchronize();
+		if (cudaStatus != cudaSuccess) {
+			fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_resize_UV!\n", cudaStatus);
+			goto Error;
+		}
+
+		kernel_resize_UV << < grid2, block >> >(v, V, src_width, src_height, vWidth, vHeight, vPitch);
+		cudaStatus = cudaGetLastError();
+		if (cudaStatus != cudaSuccess) {
+			fprintf(stderr, "kernel_resize_UV launch failed: %s\n", cudaGetErrorString(cudaStatus));
+			goto Error;
+		}
+		cudaStatus = cudaDeviceSynchronize();
+		if (cudaStatus != cudaSuccess) {
+			fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_resize_UV!\n", cudaStatus);
+			goto Error;
+		}
+	Error:
+		cudaFree(u);	// cudaFree(NULL) is a no-op
+		cudaFree(v);
+		return cudaStatus;
+	}
+}
+
diff --git a/src/nvdec/ResizeImage.cu b/src/nvdec/ResizeImage.cu
new file mode 100644
index 0000000..fdc6961
--- /dev/null
+++ b/src/nvdec/ResizeImage.cu
@@ -0,0 +1,84 @@
+#include "cuda_kernels.h"
+
+typedef unsigned char   uchar;
+typedef unsigned int    uint32;
+typedef int             int32;
+
+namespace cuda_common
+{
+	// Bilinear resize of a planar 3-channel float image. Each source plane holds
+	// src_width * src_height floats and each destination plane dst_width *
+	// dst_height floats; the planes are stored back to back (B, G, R by this
+	// file's convention). One thread computes one destination pixel for all
+	// three planes; threads outside the destination image do nothing.
+	__global__ void kernel_bilinear(float *src_img, float *dst_img,
+		int src_width, int src_height, int dst_width, int dst_height)
+	{
+		const int x = blockIdx.x * blockDim.x + threadIdx.x;
+		const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+		if (x >= dst_width || y >= dst_height)
+			return;
+
+		// Destination pixel centre mapped into source coordinates.
+		float fx = (x + 0.5)*src_width / (float)dst_width - 0.5;
+		float fy = (y + 0.5)*src_height / (float)dst_height - 0.5;
+
+		// Top-left corner of the 2x2 footprint, kept within [0, size - 2].
+		int ax = floor(fx);
+		int ay = floor(fy);
+		if (ax < 0)
+			ax = 0;
+		else if (ax > src_width - 2)
+			ax = src_width - 2;
+		if (ay < 0)
+			ay = 0;
+		else if (ay > src_height - 2)
+			ay = src_height - 2;
+
+		// Footprint offsets inside one plane: A B on row ay, C D on row ay + 1.
+		const int A = ax + ay*src_width;
+		const int B = ax + ay*src_width + 1;
+		const int C = ax + ay*src_width + src_width;
+		const int D = ax + ay*src_width + src_width + 1;
+
+		// w1 / w2 weight the right / left columns, w3 / w4 the lower / upper rows.
+		const float w1 = fx - ax;
+		const float w2 = 1 - w1;
+		const float w3 = fy - ay;
+		const float w4 = 1 - w3;
+
+		const int srcPlane = src_width * src_height;
+		const int dstPlane = dst_width * dst_height;
+		const int dstIdx = y * dst_width + x;
+
+		// The blend is the same for every channel; only the plane offset changes.
+		for (int c = 0; c < 3; ++c)
+		{
+			const float *p = src_img + srcPlane * c;
+			dst_img[dstPlane * c + dstIdx] = p[A] * w2*w4 + p[B] * w1*w4 + p[C] * w2*w3 + p[D] * w1*w3;
+		}
+	}
+
+	// Host wrapper: launches kernel_bilinear with 32x16 thread blocks covering
+	// the destination image, then blocks until the device has finished.
+	// Returns the launch error or the synchronisation error, else cudaSuccess.
+	cudaError_t ResizeImage(float* d_srcRGB, int src_width, int src_height, float* d_dstRGB, int dst_width, int dst_height)
+	{
+		const dim3 threads(32, 16, 1);
+		const dim3 blocks((dst_width + (threads.x - 1)) / threads.x, (dst_height + (threads.y - 1)) / threads.y, 1);
+
+		kernel_bilinear << < blocks, threads >> >(d_srcRGB, d_dstRGB, src_width, src_height, dst_width, dst_height);
+
+		cudaError_t status = cudaGetLastError();
+		if (status == cudaSuccess) {
+			status = cudaDeviceSynchronize();
+			if (status != cudaSuccess)
+				fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_bilinear!\n", status);
+		}
+		else {
+			fprintf(stderr, "kernel_bilinear launch failed: %s\n", cudaGetErrorString(status));
+		}
+		return status;
+	}
+}
\ No newline at end of file
diff --git a/src/nvdec/common_header.h b/src/nvdec/common_header.h
new file mode 100644
index 0000000..cf45c91
--- /dev/null
+++ b/src/nvdec/common_header.h
@@ -0,0 +1,9 @@
+#ifndef _COMMON_HEADER_H_
+#define _COMMON_HEADER_H_
+
+
+#include "../interface/logger.hpp"
+#include "../interface/utiltools.hpp"
+#include "../interface/interface_headers.h"
+
+#endif
\ No newline at end of file
diff --git a/src/nvdec/cuda_kernels.h b/src/nvdec/cuda_kernels.h
new file mode 100644
index 0000000..cd1eb00
--- /dev/null
+++ b/src/nvdec/cuda_kernels.h
@@ -0,0 +1,63 @@
+#pragma once
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <string.h>
+#include <math.h>
+
+#include <cuda.h>
+
+typedef enum
+{
+	ITU_601 = 1,	// presumably the ITU-R BT.601 colour matrix -- confirm against setColorSpace()
+	ITU_709 = 2	// presumably the ITU-R BT.709 colour matrix -- confirm against setColorSpace()
+} FF_ColorSpace;	// type of the CSC argument of cuda_common::setColorSpace()
+
+namespace cuda_common	// host-side entry points; every function reports its outcome through a cudaError_t result
+{
+	cudaError_t setColorSpace(FF_ColorSpace CSC, float hue);
+	// NOTE(review): the two converters below are defined outside this view (presumably NV12ToRGB.cu) -- confirm their contracts there.
+	cudaError_t NV12ToRGBnot(CUdeviceptr d_srcNV12, size_t nSourcePitch, unsigned char* d_dstRGB, int width, int height);
+	cudaError_t CUDAToBGR(CUdeviceptr dataY, CUdeviceptr dataUV, size_t pitchY, size_t pitchUV, unsigned char* d_dstRGB, int width, int height);
+	// Bilinear resize of a planar 3-channel float device image; launches a kernel and
+	// synchronises the device before returning (ResizeImage.cu).
+	cudaError_t ResizeImage(float* d_srcRGB, int src_width, int src_height, float* d_dstRGB, int dst_width, int dst_height);
+	// Planar float device image -> 8-bit Y / U / V planes with the given pitches and sizes (RGB2YUV.cu).
+	cudaError_t RGB2YUV(float* d_srcRGB, int src_width, int src_height,
+		unsigned char* Y, size_t yPitch, int yWidth, int yHeight,
+		unsigned char* U, size_t uPitch, int uWidth, int uHeight,
+		unsigned char* V, size_t vPitch, int vWidth, int vHeight);
+	// Same conversion for an interleaved 8-bit (B, G, R) device image (RGB2YUV.cu).
+	cudaError_t RGB2YUV(unsigned char* d_srcRGB, int src_width, int src_height,
+		unsigned char* Y, size_t yPitch, int yWidth, int yHeight,
+		unsigned char* U, size_t uPitch, int uWidth, int uHeight,
+		unsigned char* V, size_t vPitch, int vWidth, int vHeight);
+
+	cudaError_t PartMemCopy(unsigned char* d_srcRGB, int src_width, int src_height, unsigned char* d_dstRGB, int left, int top, int right, int bottom);
+	//	cudaError_t PartMemResize(float* d_srcRGB, int src_width, int src_height, float* d_dstRGB, int left, int top, int right, int bottom);
+
+	cudaError_t PartMemResizeBatch(unsigned char* d_srcRGB, int srcimg_width, int srcimg_height, unsigned char** d_dstRGB, int count,
+		int* left, int* top, int* right, int* bottom, int *dst_w, int *dst_h,
+		float submeanb, float submeang, float submeanr,
+		float varianceb, float varianceg, float variancer);
+
+	cudaError_t DrawImage(float* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom);
+	cudaError_t DrawImage(unsigned char* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom);
+
+	cudaError_t DrawLine(float* d_srcRGB, int src_width, int src_height, int begin_x, int begin_y, int end_x, int end_y);
+}
+
+// JPEG output entry points. NOTE(review): the overloads taking explicit dimensions are not defined in the code visible here.
+int jpegNPP(const char *szOutputFile, float* d_srcRGB, int img_width, int img_height);
+int jpegNPP(const char *szOutputFile, unsigned char* d_srcRGB, int img_width, int img_height);
+// The float overload below works at the size stored by initTable(flag, width, height); presumably the 8-bit one does too.
+int jpegNPP(const char *szOutputFile, float* d_srcRGB);
+int jpegNPP(const char *szOutputFile, unsigned char* d_srcRGB);
+// Setup of tables and device buffers, per-size buffer allocation, and teardown.
+int initTable();
+int initTable(int flag, int width, int height);
+int releaseJpegNPP();
+
diff --git a/src/nvdec/define.hpp b/src/nvdec/define.hpp
new file mode 100644
index 0000000..ed20540
--- /dev/null
+++ b/src/nvdec/define.hpp
@@ -0,0 +1,11 @@
+#pragma once
+
+#include <string>
+// Evaluates `call` once and reports through LOG_ERROR when the result is not cudaSuccess; execution continues either way.
+// NOTE(review): expands to a bare { } block, so `if (c) CHECK_CUDA(x); else ...` will not compile. This header includes only <string>; LOG_ERROR and the CUDA runtime types must come from the including file.
+#define CHECK_CUDA(call) \
+{\
+    const cudaError_t error_code = call;\
+    if (cudaSuccess != error_code)\
+        LOG_ERROR("CUDA error, code: {} reason: {}", error_code, cudaGetErrorString(error_code));\
+}
diff --git a/src/nvdec/jpegNPP.cpp-1 b/src/nvdec/jpegNPP.cpp-1
new file mode 100644
index 0000000..f0bf2e6
--- /dev/null
+++ b/src/nvdec/jpegNPP.cpp-1
@@ -0,0 +1,1193 @@
+/*
+* Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
+*
+* NOTICE TO USER:
+*
+* This source code is subject to NVIDIA ownership rights under U.S. and
+* international Copyright laws.
+*
+* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
+* CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
+* IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
+* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
+* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
+* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
+* OR PERFORMANCE OF THIS SOURCE CODE.
+*
+* U.S. Government End Users.  This source code is a "commercial item" as
+* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting  of
+* "commercial computer software" and "commercial computer software
+* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
+* and is provided to the U.S. Government only as a commercial end item.
+* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
+* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
+* source code with only those rights set forth herein.
+*/
+
+// This sample needs at least CUDA 5.5 and a GPU that has at least Compute Capability 2.0
+
+// This sample demonstrates a simple image processing pipeline.
+// First, a JPEG file is huffman decoded and inverse DCT transformed and dequantized.
+// Then the different planes are resized. Finally, the resized image is quantized, forward
+// DCT transformed and huffman encoded.
+
+#include "cuda_kernels.h"
+
+#include <npp.h>
+#include <cuda_runtime.h>
+#include "common/UtilNPP/Exceptions.h"
+
+#include "Endianess.h"
+#include <math.h>
+
+#include <string.h>
+#include <fstream>
+#include <iostream>
+
+#include "common/inc/helper_string.h"
+#include "common/inc/helper_cuda.h"
+//#include "MacroDef.h"
+#include "cuda.h"
+
+using namespace std;
+
+struct FrameHeader	// serialised field by field, in this order, by writeFrameHeader()
+{
+	unsigned char nSamplePrecision;	// initTable() sets 8
+	unsigned short nHeight;	// image height; an int is narrowed into it by initTable(flag, width, height)
+	unsigned short nWidth;	// image width; same narrowing applies
+	unsigned char nComponents;	// number of entries of the arrays below that get written (3 in this file)
+	unsigned char aComponentIdentifier[3];
+	unsigned char aSamplingFactors[3];	// high nibble = horizontal blocks per MCU, low nibble = vertical
+	unsigned char aQuantizationTableSelector[3];
+};
+
+struct ScanHeader	// serialised by writeScanHeader() after a 0xFF 0xDA marker
+{
+	unsigned char nComponents;	// number of (selector, Huffman selector) pairs written
+	unsigned char aComponentSelector[3];
+	unsigned char aHuffmanTablesSelector[3];	// initTable() uses 0 for component 0 and 17 for the others
+	unsigned char nSs;	// initTable() sets 0; presumably the JPEG spectral-selection start
+	unsigned char nSe;	// initTable() sets 63; presumably the JPEG spectral-selection end
+	unsigned char nA;	// initTable() sets 0; presumably the successive-approximation byte
+};
+
+struct QuantizationTable	// copied byte for byte into the output by writeQuantizationTable()
+{
+	unsigned char nPrecisionAndIdentifier;	// initTable() stores the table index (0 or 1) here
+	unsigned char aTable[64];	// the 64 quantization entries
+};
+
+struct HuffmanTable	// writeHuffmanTable() copies the first 17 + sum(aCodes) bytes of this struct
+{
+	unsigned char nClassAndIdentifier;	// initTable() uses 0, 1, 16 and 17 for its four tables
+	unsigned char aCodes[16];	// number of codes for bit lengths 1..16
+	unsigned char aTable[256];	// symbol values; only the first sum(aCodes) entries are written out
+};
+
+// Standard luminance (Y) quantization table
+//unsigned char std_Y_QT[64] =
+//{
+//	16, 11, 10, 16, 24, 40, 51, 61,
+//	12, 12, 14, 19, 26, 58, 60, 55,
+//	14, 13, 16, 24, 40, 57, 69, 56,
+//	14, 17, 22, 29, 51, 87, 80, 62,
+//	18, 22, 37, 56, 68, 109, 103, 77,
+//	24, 35, 55, 64, 81, 104, 113, 92,
+//	49, 64, 78, 87, 103, 121, 120, 101,
+//	72, 92, 95, 98, 112, 100, 103, 99
+//};
+//
+//// Standard chrominance (UV) quantization table
+//unsigned char std_UV_QT[64] =
+//{
+//	17, 18, 24, 47, 99, 99, 99, 99,
+//	18, 21, 26, 66, 99, 99, 99, 99,
+//	24, 26, 56, 99, 99, 99, 99, 99,
+//	47, 66, 99, 99, 99, 99, 99, 99,
+//	99, 99, 99, 99, 99, 99, 99, 99,
+//	99, 99, 99, 99, 99, 99, 99, 99,
+//	99, 99, 99, 99, 99, 99, 99, 99,
+//	99, 99, 99, 99, 99, 99, 99, 99
+//};
+
+//// Luminance (Y) quantization table
+//unsigned char std_Y_QT[64] =
+//{
+//	6, 4, 5, 6, 5, 4, 6, 6,
+//	5, 6, 7, 7, 6, 8, 10, 16,
+//	10, 10, 9, 9, 10, 20, 14, 15,
+//	12, 16, 23, 20, 24, 24, 23, 20,
+//	22, 22, 26, 29, 37, 31, 26, 27,
+//	35, 28, 22, 22, 32, 44, 32, 35,
+//	38, 39, 41, 42, 41, 25, 31, 45,
+//	48, 45, 40, 48, 37, 40, 41, 40
+//};
+//
+//// Chrominance (UV) quantization table
+//unsigned char std_UV_QT[64] =
+//{
+//	7, 7, 7, 10, 8, 10, 19, 10,
+//	10, 19, 40, 26, 22, 26, 40, 40,
+//	40, 40, 40, 40, 40, 40, 40, 40,
+//	40, 40, 40, 40, 40, 40, 40, 40,
+//	40, 40, 40, 40, 40, 40, 40, 40,
+//	40, 40, 40, 40, 40, 40, 40, 40,
+//	40, 40, 40, 40, 40, 40, 40, 40,
+//	40, 40, 40, 40, 40, 40, 40, 40
+//};
+
+// Luminance (Y) quantization table
+// Each entry is the truncated value of 0.75 * q for the base table {6, 4, 5, 6, ...}, i.e. exactly what the former `0.75 * q` initialisers produced.
+unsigned char std_Y_QT[64] = {	// integer literals: a double initialiser here is a narrowing conversion, ill-formed in C++11 list-initialisation
+	4, 3, 3, 4, 3, 3, 4, 4,
+	3, 4, 5, 5, 4, 6, 7, 12,
+	7, 7, 6, 6, 7, 15, 10, 11,
+	9, 12, 17, 15, 18, 18, 17, 15,
+	16, 16, 19, 21, 27, 23, 19, 20,
+	26, 21, 16, 16, 24, 33, 24, 26,
+	28, 29, 30, 31, 30, 18, 23, 33,
+	36, 33, 30, 36, 27, 30, 30, 30
+};
+
+// Chrominance (UV) quantization table
+// The first two rows are the truncated values of 0.75 * q for {7, 7, 7, 10, ...}, exactly what the former `0.75 * q` initialisers produced; the rest is 30.
+unsigned char std_UV_QT[64] = {	// integer literals: a double initialiser here is a narrowing conversion, ill-formed in C++11 list-initialisation
+	5, 5, 5, 7, 6, 7, 14, 7,
+	7, 14, 30, 19, 16, 19, 30, 30,
+	30, 30, 30, 30, 30, 30, 30, 30,
+	30, 30, 30, 30, 30, 30, 30, 30,
+	30, 30, 30, 30, 30, 30, 30, 30,
+	30, 30, 30, 30, 30, 30, 30, 30,
+	30, 30, 30, 30, 30, 30, 30, 30,
+	30, 30, 30, 30, 30, 30, 30, 30
+};
+// DC / Y Huffman data: NRCODES[i] is the number of codes of bit length i + 1, VALUES the symbols. initTable() copies them into aHuffmanTables[0].
+unsigned char STD_DC_Y_NRCODES[16] = { 0, 1, 5, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0 };
+unsigned char STD_DC_Y_VALUES[12] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };
+// DC / UV Huffman data, copied into aHuffmanTables[1] by initTable().
+unsigned char STD_DC_UV_NRCODES[16] = { 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0 };
+unsigned char STD_DC_UV_VALUES[12] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };
+// AC / Y Huffman data (162 symbols), copied into aHuffmanTables[2] by initTable().
+unsigned char STD_AC_Y_NRCODES[16] = { 0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 0X7D };
+unsigned char STD_AC_Y_VALUES[162] =
+{
+	0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12,
+	0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07,
+	0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08,
+	0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0,
+	0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0a, 0x16,
+	0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28,
+	0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
+	0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
+	0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
+	0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
+	0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
+	0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
+	0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
+	0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+	0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6,
+	0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5,
+	0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4,
+	0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2,
+	0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea,
+	0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
+	0xf9, 0xfa
+};
+// AC / UV Huffman data (162 symbols), copied into aHuffmanTables[3] by initTable().
+unsigned char STD_AC_UV_NRCODES[16] = { 0, 2, 1, 2, 4, 4, 3, 4, 7, 5, 4, 4, 0, 1, 2, 0X77 };
+unsigned char STD_AC_UV_VALUES[162] =
+{
+	0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21,
+	0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71,
+	0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91,
+	0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0,
+	0x15, 0x62, 0x72, 0xd1, 0x0a, 0x16, 0x24, 0x34,
+	0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26,
+	0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38,
+	0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
+	0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
+	0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
+	0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
+	0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+	0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96,
+	0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5,
+	0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4,
+	0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3,
+	0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2,
+	0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda,
+	0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9,
+	0xea, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
+	0xf9, 0xfa
+};
+// Integer division of x by d rounded up; computed as (x + d - 1) / d, so it is meant for x >= 0 and d > 0.
+int DivUp(int x, int d) {
+	const int biased = x + d - 1;	// pushes any non-zero remainder over the next multiple of d
+	return biased / d;
+}
+// Stores nElement at pData through writeBigEndian<T>() and then moves pData
+// forward by sizeof(T) bytes. pData is taken by reference so the caller's cursor advances.
+template<typename T>
+void writeAndAdvance(unsigned char *&pData, T nElement) {
+	writeBigEndian<T>(pData, nElement);
+	pData = pData + sizeof(T);
+}
+// Emits the two-byte marker 0xFF <nMarker> at pData and advances pData by two bytes.
+void writeMarker(unsigned char nMarker, unsigned char *&pData) {
+	pData[0] = 0x0ff;
+	pData[1] = nMarker;
+	pData += 2;
+}
+// Writes a fixed segment: marker 0xFF 0xE0, a 16-bit length, then the 14-byte payload below; pData ends up just past it.
+void writeJFIFTag(unsigned char *&pData)
+{
+	const char JFIF_TAG[] =
+	{
+		0x4a, 0x46, 0x49, 0x46, 0x00,	// the characters "JFIF" followed by a zero byte
+		0x01, 0x02,	// presumably the JFIF version (1.02)
+		0x00,	// presumably the density-units byte
+		0x00, 0x01, 0x00, 0x01,	// presumably X and Y density, both 1
+		0x00, 0x00	// presumably thumbnail width and height, both 0
+	};
+	const size_t nPayload = sizeof(JFIF_TAG);
+	writeMarker(0x0e0, pData);
+	writeAndAdvance<unsigned short>(pData, nPayload + sizeof(unsigned short));	// the length counts its own two bytes
+	memcpy(pData, JFIF_TAG, nPayload);
+	pData = pData + nPayload;
+}
+// Serialises `header` as: marker 0xFF 0xC0, 16-bit length (payload + 2), precision, height, width, component
+// count, then one (identifier, sampling factors, quantization selector) triple per component.
+void writeFrameHeader(const FrameHeader &header, unsigned char *&pData)
+{
+	unsigned char aPayload[128];	// staging buffer, so the length is known before it is written
+	unsigned char *pCursor = aPayload;
+
+	writeAndAdvance<unsigned char>(pCursor, header.nSamplePrecision);
+	writeAndAdvance<unsigned short>(pCursor, header.nHeight);
+	writeAndAdvance<unsigned short>(pCursor, header.nWidth);
+	writeAndAdvance<unsigned char>(pCursor, header.nComponents);
+
+	for (int iComp = 0; iComp < header.nComponents; ++iComp)
+	{
+		writeAndAdvance<unsigned char>(pCursor, header.aComponentIdentifier[iComp]);
+		writeAndAdvance<unsigned char>(pCursor, header.aSamplingFactors[iComp]);
+		writeAndAdvance<unsigned char>(pCursor, header.aQuantizationTableSelector[iComp]);
+	}
+
+	const unsigned short nPayload = (unsigned short)(pCursor - aPayload);
+	writeMarker(0x0C0, pData);
+	writeAndAdvance<unsigned short>(pData, nPayload + 2);
+	memcpy(pData, aPayload, nPayload);
+	pData += nPayload;
+}
+// Serialises `header` as: marker 0xFF 0xDA, 16-bit length (payload + 2),
+// component count, one (component selector, Huffman selector) pair per
+// component, then nSs, nSe and nA.
+void writeScanHeader(const ScanHeader &header, unsigned char *&pData)
+{
+	unsigned char aPayload[128];	// staging buffer, so the length is known before it is written
+	unsigned char *pCursor = aPayload;
+
+	writeAndAdvance<unsigned char>(pCursor, header.nComponents);
+	for (int iComp = 0; iComp < header.nComponents; ++iComp)
+	{
+		writeAndAdvance<unsigned char>(pCursor, header.aComponentSelector[iComp]);
+		writeAndAdvance<unsigned char>(pCursor, header.aHuffmanTablesSelector[iComp]);
+	}
+
+	writeAndAdvance<unsigned char>(pCursor, header.nSs);
+	writeAndAdvance<unsigned char>(pCursor, header.nSe);
+	writeAndAdvance<unsigned char>(pCursor, header.nA);
+
+	const unsigned short nPayload = (unsigned short)(pCursor - aPayload);
+	writeMarker(0x0DA, pData);
+	writeAndAdvance<unsigned short>(pData, nPayload + 2);
+	memcpy(pData, aPayload, nPayload);
+	pData += nPayload;
+}
+// Writes marker 0xFF 0xDB, a 16-bit length (sizeof(QuantizationTable) + 2) and then the raw bytes of `table`.
+void writeQuantizationTable(const QuantizationTable &table, unsigned char *&pData) {
+	const size_t nTable = sizeof(QuantizationTable);	// identifier byte plus the 64 entries
+	writeMarker(0x0DB, pData);
+	writeAndAdvance<unsigned short>(pData, nTable + 2);
+	memcpy(pData, &table, nTable);
+	pData += nTable;
+}
+// Writes marker 0xFF 0xC4, a 16-bit length, and then the leading part of
+// `table`: the class/identifier byte, the 16 code-length counts and as many
+// symbol values as those counts add up to.
+void writeHuffmanTable(const HuffmanTable &table, unsigned char *&pData)
+{
+	writeMarker(0x0C4, pData);
+
+	// Total number of symbols = sum of the per-length code counts (bit lengths 1..16).
+	int nSymbols = 0;
+	for (const unsigned char *pCount = table.aCodes; pCount != table.aCodes + 16; ++pCount)
+		nSymbols += *pCount;
+
+	const int nBytes = 17 + nSymbols;	// 1 identifier byte + 16 counts + the symbols
+	writeAndAdvance<unsigned short>(pData, nBytes + 2);
+	memcpy(pData, &table, nBytes);
+	pData += nBytes;
+}
+// Prints the NPP library, CUDA driver and CUDA runtime versions to stdout and
+// returns the result of checkCudaCapabilities(cudaVerMajor, cudaVerMinor).
+bool printfNPPinfo(int cudaVerMajor, int cudaVerMinor)
+{
+	const NppLibraryVersion *pNpp = nppGetLibVersion();
+	printf("NPP Library Version %d.%d.%d\n", pNpp->major, pNpp->minor, pNpp->build);
+
+	int nDriver;
+	int nRuntime;
+	cudaDriverGetVersion(&nDriver);
+	cudaRuntimeGetVersion(&nRuntime);
+
+	// The printed pair is (value / 1000) and ((value % 100) / 10).
+	printf("  CUDA Driver  Version: %d.%d\n", nDriver / 1000, (nDriver % 100) / 10);
+	printf("  CUDA Runtime Version: %d.%d\n", nRuntime / 1000, (nRuntime % 100) / 10);
+	return checkCudaCapabilities(cudaVerMajor, cudaVerMinor);
+}
+
+NppiDCTState *pDCTState;	// created by nppiDCTInitAlloc() in initTable(), released in releaseJpegNPP()
+FrameHeader oFrameHeader;	// filled in by initTable(); width and height stay zero there
+FrameHeader oFrameHeaderFixedSize;	// filled in by initTable(flag, width, height)
+ScanHeader oScanHeader;	// filled in by initTable()
+QuantizationTable aQuantizationTables[4];	// host copies; initTable() fills [0] from std_Y_QT and [1] from std_UV_QT
+Npp8u *pdQuantizationTables;	// device buffer of 64 * 4 bytes; tables are stored 64 bytes apart
+HuffmanTable aHuffmanTables[4];	// initTable() fills [0] / [1] from the DC data and [2] / [3] from the AC data
+HuffmanTable *pHuffmanDCTables;	// set to aHuffmanTables by initTable()
+HuffmanTable *pHuffmanACTables;	// set to &aHuffmanTables[2] by initTable()
+int nMCUBlocksH;	// largest high nibble among oFrameHeader.aSamplingFactors
+int nMCUBlocksV;	// largest low nibble among oFrameHeader.aSamplingFactors
+int nMCUBlocksHFixedSize;	// same as nMCUBlocksH, for oFrameHeaderFixedSize
+int nMCUBlocksVFixedSize;	// same as nMCUBlocksV, for oFrameHeaderFixedSize
+Npp8u *pdScan;	// device buffer of 4 << 20 bytes allocated in initTable()
+NppiEncodeHuffmanSpec *apHuffmanDCTable[3];	// not assigned in the code visible here
+NppiEncodeHuffmanSpec *apHuffmanACTable[3];	// not assigned in the code visible here
+unsigned char *pDstJpeg;	// not assigned in the code visible here
+unsigned char *pDstOutput;	// not assigned in the code visible here
+int nRestartInterval;	// initTable() sets -1
+
+// One-time encoder setup: creates the NPP DCT state, allocates the device
+// buffers for the quantization tables and the scan data, loads the Huffman and
+// quantization tables, and fills oFrameHeader / oScanHeader. Always returns 0;
+// failures are handled by the NPP_CHECK_* macros from Exceptions.h.
+int initTable()
+{
+	NPP_CHECK_NPP(nppiDCTInitAlloc(&pDCTState));
+
+	nRestartInterval = -1;
+
+	// This allocation used to be the only unchecked CUDA call in the function.
+	NPP_CHECK_CUDA(cudaMalloc(&pdQuantizationTables, 64 * 4));
+	pHuffmanDCTables = aHuffmanTables;
+	pHuffmanACTables = &aHuffmanTables[2];
+	memset(aQuantizationTables, 0, 4 * sizeof(QuantizationTable));
+	memset(aHuffmanTables, 0, 4 * sizeof(HuffmanTable));
+	memset(&oFrameHeader, 0, sizeof(FrameHeader));
+
+	// Fill the Huffman tables: [0] / [1] take the DC data, [2] / [3] the AC data.
+	aHuffmanTables[0].nClassAndIdentifier = 0;
+	memcpy(aHuffmanTables[0].aCodes, STD_DC_Y_NRCODES, 16);
+	memcpy(aHuffmanTables[0].aTable, STD_DC_Y_VALUES, 12);
+
+	aHuffmanTables[1].nClassAndIdentifier = 1;
+	memcpy(aHuffmanTables[1].aCodes, STD_DC_UV_NRCODES, 16);
+	memcpy(aHuffmanTables[1].aTable, STD_DC_UV_VALUES, 12);
+
+	aHuffmanTables[2].nClassAndIdentifier = 16;
+	memcpy(aHuffmanTables[2].aCodes, STD_AC_Y_NRCODES, 16);
+	memcpy(aHuffmanTables[2].aTable, STD_AC_Y_VALUES, 162);
+
+	aHuffmanTables[3].nClassAndIdentifier = 17;
+	memcpy(aHuffmanTables[3].aCodes, STD_AC_UV_NRCODES, 16);
+	memcpy(aHuffmanTables[3].aTable, STD_AC_UV_VALUES, 162);
+
+	// Fill the quantization tables (host copies first, then the device buffer).
+	aQuantizationTables[0].nPrecisionAndIdentifier = 0;
+	memcpy(aQuantizationTables[0].aTable, std_Y_QT, 64);
+	aQuantizationTables[1].nPrecisionAndIdentifier = 1;
+	memcpy(aQuantizationTables[1].aTable, std_UV_QT, 64);
+
+	NPP_CHECK_CUDA(cudaMemcpyAsync(pdQuantizationTables, aQuantizationTables[0].aTable, 64, cudaMemcpyHostToDevice));
+	NPP_CHECK_CUDA(cudaMemcpyAsync(pdQuantizationTables + 64, aQuantizationTables[1].aTable, 64, cudaMemcpyHostToDevice));
+
+	oFrameHeader.nSamplePrecision = 8;
+	oFrameHeader.nComponents = 3;
+	oFrameHeader.aComponentIdentifier[0] = 1;
+	oFrameHeader.aComponentIdentifier[1] = 2;
+	oFrameHeader.aComponentIdentifier[2] = 3;
+	oFrameHeader.aSamplingFactors[0] = 34;
+	oFrameHeader.aSamplingFactors[1] = 17;
+	oFrameHeader.aSamplingFactors[2] = 17;
+	oFrameHeader.aQuantizationTableSelector[0] = 0;
+	oFrameHeader.aQuantizationTableSelector[1] = 1;
+	oFrameHeader.aQuantizationTableSelector[2] = 1;
+
+	for (int i = 0; i < oFrameHeader.nComponents; ++i)
+	{
+		nMCUBlocksV = max(nMCUBlocksV, oFrameHeader.aSamplingFactors[i] & 0x0f);
+		nMCUBlocksH = max(nMCUBlocksH, oFrameHeader.aSamplingFactors[i] >> 4);
+	}
+	NPP_CHECK_CUDA(cudaMalloc(&pdScan, 4 << 20));
+
+	oScanHeader.nComponents = 3;
+	oScanHeader.aComponentSelector[0] = 1;
+	oScanHeader.aComponentSelector[1] = 2;
+	oScanHeader.aComponentSelector[2] = 3;
+	oScanHeader.aHuffmanTablesSelector[0] = 0;
+	oScanHeader.aHuffmanTablesSelector[1] = 17;
+	oScanHeader.aHuffmanTablesSelector[2] = 17;
+	oScanHeader.nSs = 0;
+	oScanHeader.nSe = 63;
+	oScanHeader.nA = 0;
+
+	return 0;
+}
+
+NppiSize aSrcSize[3];	// per-component plane size in pixels (block count * 8), set by initTable(flag, width, height)
+Npp16s *apdDCT[3];	// per-component device buffers from cudaMallocPitch(), sized in initTable(flag, width, height)
+Npp32s aDCTStep[3];	// pitch returned for apdDCT[i], narrowed to Npp32s
+
+Npp8u *apSrcImage[3];	// per-component 8-bit device planes from cudaMallocPitch()
+Npp32s aSrcImageStep[3];	// pitch returned for apSrcImage[i], narrowed to Npp32s
+size_t aSrcPitch[3];	// the same pitch kept as size_t
+// Frees everything allocated by initTable() and initTable(flag, width, height). Always returns 0.
+// Every pointer is reset to NULL after it is freed, so a repeated call no longer frees stale pointers.
+int releaseJpegNPP() {
+	if (pDCTState != NULL)	// nothing to release when initTable() never created the state
+		nppiDCTFree(pDCTState);
+	pDCTState = NULL;
+	cudaFree(pdQuantizationTables); pdQuantizationTables = NULL;	// cudaFree(NULL) is a no-op
+	cudaFree(pdScan); pdScan = NULL;
+	for (int i = 0; i < 3; ++i) {
+		cudaFree(apdDCT[i]); apdDCT[i] = NULL;
+		cudaFree(apSrcImage[i]); apSrcImage[i] = NULL;
+	}
+	return 0;
+}
+// Records the fixed encode size in oFrameHeaderFixedSize and allocates, for
+// each of the three components, a pitched device buffer in apdDCT and a pitched
+// 8-bit plane in apSrcImage, both sized from a whole number of 8x8 blocks.
+// `flag` is not used. Always returns 0.
+int initTable(int flag, int width, int height)
+{
+	// Fill in the frame header.
+	FrameHeader &hdr = oFrameHeaderFixedSize;
+	hdr.nSamplePrecision = 8;
+	hdr.nComponents = 3;
+	hdr.aComponentIdentifier[0] = 1;
+	hdr.aComponentIdentifier[1] = 2;
+	hdr.aComponentIdentifier[2] = 3;
+	hdr.aSamplingFactors[0] = 34;
+	hdr.aSamplingFactors[1] = 17;
+	hdr.aSamplingFactors[2] = 17;
+	hdr.aQuantizationTableSelector[0] = 0;
+	hdr.aQuantizationTableSelector[1] = 1;
+	hdr.aQuantizationTableSelector[2] = 1;
+	hdr.nWidth = width;
+	hdr.nHeight = height;
+
+	for (int i = 0; i < hdr.nComponents; ++i)
+	{
+		nMCUBlocksVFixedSize = max(nMCUBlocksVFixedSize, hdr.aSamplingFactors[i] & 0x0f);
+		nMCUBlocksHFixedSize = max(nMCUBlocksHFixedSize, hdr.aSamplingFactors[i] >> 4);
+	}
+
+	for (int i = 0; i < hdr.nComponents; ++i)
+	{
+		const NppiSize perMCU = { hdr.aSamplingFactors[i] >> 4, hdr.aSamplingFactors[i] & 0x0f };
+
+		// Block counts for this component, rounded up to a multiple of its blocks-per-MCU count.
+		NppiSize blocks;
+		blocks.width = (int)ceil((hdr.nWidth + 7) / 8 *
+			static_cast<float>(perMCU.width) / nMCUBlocksHFixedSize);
+		blocks.width = DivUp(blocks.width, perMCU.width) * perMCU.width;
+		blocks.height = (int)ceil((hdr.nHeight + 7) / 8 *
+			static_cast<float>(perMCU.height) / nMCUBlocksVFixedSize);
+		blocks.height = DivUp(blocks.height, perMCU.height) * perMCU.height;
+
+		aSrcSize[i].width = blocks.width * 8;
+		aSrcSize[i].height = blocks.height * 8;
+
+		size_t nPitch;
+		NPP_CHECK_CUDA(cudaMallocPitch(&apdDCT[i], &nPitch, blocks.width * 64 * sizeof(Npp16s), blocks.height));
+		aDCTStep[i] = static_cast<Npp32s>(nPitch);
+
+		NPP_CHECK_CUDA(cudaMallocPitch(&apSrcImage[i], &nPitch, aSrcSize[i].width, aSrcSize[i].height));
+		aSrcPitch[i] = nPitch;
+		aSrcImageStep[i] = static_cast<Npp32s>(nPitch);
+	}
+	return 0;
+}
+
+int jpegNPP(const char *szOutputFile, float* d_srcRGB)
+{ // Encodes d_srcRGB (float samples; presumably a device pointer - TODO confirm) into a JPEG file at szOutputFile using the file-scope buffers sized by initTable(); returns EXIT_SUCCESS or EXIT_FAILURE.
+	// Convert the RGB input into the three component planes apSrcImage[0..2] (presumably Y, U, V).
+	cudaError_t cudaStatus;
+	cudaStatus = cuda_common::RGB2YUV(d_srcRGB, oFrameHeaderFixedSize.nWidth, oFrameHeaderFixedSize.nHeight,
+		apSrcImage[0], aSrcPitch[0], aSrcSize[0].width, aSrcSize[0].height,
+		apSrcImage[1], aSrcPitch[1], aSrcSize[1].width, aSrcSize[1].height,
+		apSrcImage[2], aSrcPitch[2], aSrcSize[2].width, aSrcSize[2].height);
+	// NOTE(review): cudaStatus is never checked afterwards, so a failed conversion goes unnoticed here.
+	/**
+	* Forward DCT, quantization and level shift part of the JPEG encoding.
+	* Input is expected in 8x8 macro blocks and output is expected to be in 64x1
+	* macro blocks. The new version of the primitive takes the ROI in image pixel size and
+	* works with DCT coefficients that are in zig-zag order.
+	*/
+	int k = 0; // index of the 64-entry quantization table inside pdQuantizationTables
+	// Component 0 is quantized with table 0.
+	if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[0], aSrcImageStep[0],
+		apdDCT[0], aDCTStep[0],
+		pdQuantizationTables + k * 64,
+		aSrcSize[0],
+		pDCTState)))
+	{
+		printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n");
+		return EXIT_FAILURE;
+	}
+
+	k = 1; // components 1 and 2 both use table 1
+	// Component 1.
+	if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[1], aSrcImageStep[1],
+		apdDCT[1], aDCTStep[1],
+		pdQuantizationTables + k * 64,
+		aSrcSize[1],
+		pDCTState)))
+	{
+		printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n");
+		return EXIT_FAILURE;
+	}
+
+	// Component 2 (k is still 1).
+	if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[2], aSrcImageStep[2],
+		apdDCT[2], aDCTStep[2],
+		pdQuantizationTables + k * 64,
+		aSrcSize[2],
+		pDCTState)))
+	{
+		printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n");
+		return EXIT_FAILURE;
+	}
+
+	// Huffman Encoding
+
+	Npp32s nScanLength; // byte length of the encoded scan, written by the Huffman encoder below
+	Npp8u *pJpegEncoderTemp; // scratch buffer for the Huffman encoder, allocated below
+
+#if (CUDA_VERSION == 8000)
+		Npp32s nTempSize; // CUDA 8 build: size is an Npp32s
+#else
+		size_t nTempSize; // every other CUDA version (the original comment said CUDA 9)
+#endif
+	//modified by Junlin 190221
+
+	// Ask NPP how large the scratch buffer must be for 3 components.
+	if (NPP_SUCCESS != (nppiEncodeHuffmanGetSize(aSrcSize[0], 3, &nTempSize)))
+	{
+		printf("nppiEncodeHuffmanGetSize Failed!\n");
+		return EXIT_FAILURE;
+	}
+
+	// NOTE(review): this buffer is released only at the end of the success path; the EXIT_FAILURE return below skips the cudaFree.
+	NPP_CHECK_CUDA(cudaMalloc(&pJpegEncoderTemp, nTempSize));
+
+	/**
+	* Allocates memory and creates a Huffman table in a format that is suitable for the encoder.
+	*/
+	NppStatus t_status; // reassigned by every call below and never read
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[0].aCodes, nppiDCTable, &apHuffmanDCTable[0]);
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[0].aCodes, nppiACTable, &apHuffmanACTable[0]);
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[1]);
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[1]);
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[2]); // component 2 reuses table 1
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[2]);
+
+	/**
+	* Huffman Encoding of the JPEG Encoding.
+	* Input is expected to be 64x1 macro blocks and output is expected as byte stuffed huffman encoded JPEG scan.
+	*/
+	Npp32s nSs = 0; // first coefficient of the scan
+	Npp32s nSe = 63; // last coefficient of the scan
+	Npp32s nH = 0; // bit position high
+	Npp32s nL = 0; // bit position low
+	// Encode all three components into pdScan; nScanLength receives the scan length.
+	if (NPP_SUCCESS != (nppiEncodeHuffmanScan_JPEG_8u16s_P3R(apdDCT, aDCTStep,
+		0, nSs, nSe, nH, nL, // leading 0 is the restart interval
+		pdScan, &nScanLength,
+		apHuffmanDCTable,
+		apHuffmanACTable,
+		aSrcSize,
+		pJpegEncoderTemp)))
+	{
+		printf("nppiEncodeHuffmanScan_JPEG_8u16s_P3R Failed!\n");
+		return EXIT_FAILURE; // NOTE(review): returns without freeing pJpegEncoderTemp or the Huffman specs
+	}
+
+	for (int i = 0; i < 3; ++i)
+	{
+		nppiEncodeHuffmanSpecFree_JPEG(apHuffmanDCTable[i]);
+		nppiEncodeHuffmanSpecFree_JPEG(apHuffmanACTable[i]);
+	}
+	// Write JPEG: assemble the file in a zero-initialized 4 MiB host buffer.
+	pDstJpeg = new unsigned char[4 << 20]{};
+	pDstOutput = pDstJpeg; // write cursor; presumably advanced by the write* helpers (TODO confirm)
+
+	writeMarker(0x0D8, pDstOutput); // 0xD8 = SOI (start of image)
+	writeJFIFTag(pDstOutput);
+	writeQuantizationTable(aQuantizationTables[0], pDstOutput);
+	writeQuantizationTable(aQuantizationTables[1], pDstOutput);
+	writeHuffmanTable(pHuffmanDCTables[0], pDstOutput);
+	writeHuffmanTable(pHuffmanACTables[0], pDstOutput);
+	writeHuffmanTable(pHuffmanDCTables[1], pDstOutput);
+	writeHuffmanTable(pHuffmanACTables[1], pDstOutput);
+	writeFrameHeader(oFrameHeaderFixedSize, pDstOutput);
+	writeScanHeader(oScanHeader, pDstOutput);
+
+	// Copy the encoded scan from pdScan to the current cursor position in the host buffer.
+	NPP_CHECK_CUDA(cudaMemcpy(pDstOutput, pdScan, nScanLength, cudaMemcpyDeviceToHost));
+	// NOTE(review): nothing checks that the headers plus nScanLength bytes fit into the 4 MiB host buffer.
+	pDstOutput += nScanLength;
+	writeMarker(0x0D9, pDstOutput); // 0xD9 = EOI (end of image)
+	{
+		// Write result to file. The stream state is not checked after opening or writing.
+		std::ofstream outputFile(szOutputFile, ios::out | ios::binary);
+		outputFile.write(reinterpret_cast<const char *>(pDstJpeg), static_cast<int>(pDstOutput - pDstJpeg));
+	}
+
+	// Cleanup
+	cudaFree(pJpegEncoderTemp);
+	delete[] pDstJpeg;
+
+
+	return EXIT_SUCCESS;
+}
+
+int jpegNPP(const char *szOutputFile, unsigned char* d_srcRGB)
+{ // Encodes d_srcRGB (8-bit samples; presumably a device pointer - TODO confirm) into a JPEG file at szOutputFile using the file-scope buffers sized by initTable(); returns EXIT_SUCCESS or EXIT_FAILURE.
+	// Convert the RGB input into the three component planes apSrcImage[0..2] (presumably Y, U, V).
+	cudaError_t cudaStatus;
+	cudaStatus = cuda_common::RGB2YUV(d_srcRGB, oFrameHeaderFixedSize.nWidth, oFrameHeaderFixedSize.nHeight,
+		apSrcImage[0], aSrcPitch[0], aSrcSize[0].width, aSrcSize[0].height,
+		apSrcImage[1], aSrcPitch[1], aSrcSize[1].width, aSrcSize[1].height,
+		apSrcImage[2], aSrcPitch[2], aSrcSize[2].width, aSrcSize[2].height);
+	// NOTE(review): cudaStatus is never checked afterwards, so a failed conversion goes unnoticed here.
+	/**
+	* Forward DCT, quantization and level shift part of the JPEG encoding.
+	* Input is expected in 8x8 macro blocks and output is expected to be in 64x1
+	* macro blocks. The new version of the primitive takes the ROI in image pixel size and
+	* works with DCT coefficients that are in zig-zag order.
+	*/
+	int k = 0; // index of the 64-entry quantization table inside pdQuantizationTables
+	// Component 0 is quantized with table 0.
+	if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[0], aSrcImageStep[0],
+		apdDCT[0], aDCTStep[0],
+		pdQuantizationTables + k * 64,
+		aSrcSize[0],
+		pDCTState)))
+	{
+		printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n");
+		return EXIT_FAILURE;
+	}
+
+	k = 1; // components 1 and 2 both use table 1
+	// Component 1.
+	if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[1], aSrcImageStep[1],
+		apdDCT[1], aDCTStep[1],
+		pdQuantizationTables + k * 64,
+		aSrcSize[1],
+		pDCTState)))
+	{
+		printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n");
+		return EXIT_FAILURE;
+	}
+
+	// Component 2 (k is still 1).
+	if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[2], aSrcImageStep[2],
+		apdDCT[2], aDCTStep[2],
+		pdQuantizationTables + k * 64,
+		aSrcSize[2],
+		pDCTState)))
+	{
+		printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n");
+		return EXIT_FAILURE;
+	}
+
+	// Huffman Encoding
+
+	Npp32s nScanLength; // byte length of the encoded scan, written by the Huffman encoder below
+	Npp8u *pJpegEncoderTemp; // scratch buffer for the Huffman encoder, allocated below
+
+#if (CUDA_VERSION == 8000)
+	Npp32s nTempSize; // CUDA 8 build: size is an Npp32s
+#else
+	size_t nTempSize; // every other CUDA version (the original comment said CUDA 9)
+#endif
+					  //modified by Junlin 190221
+
+	// Ask NPP how large the scratch buffer must be for 3 components.
+	if (NPP_SUCCESS != (nppiEncodeHuffmanGetSize(aSrcSize[0], 3, &nTempSize)))
+	{
+		printf("nppiEncodeHuffmanGetSize Failed!\n");
+		return EXIT_FAILURE;
+	}
+
+	// NOTE(review): this buffer is released only at the end of the success path; the EXIT_FAILURE return below skips the cudaFree.
+	NPP_CHECK_CUDA(cudaMalloc(&pJpegEncoderTemp, nTempSize));
+
+	/**
+	* Allocates memory and creates a Huffman table in a format that is suitable for the encoder.
+	*/
+	NppStatus t_status; // reassigned by every call below and never read
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[0].aCodes, nppiDCTable, &apHuffmanDCTable[0]);
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[0].aCodes, nppiACTable, &apHuffmanACTable[0]);
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[1]);
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[1]);
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[2]); // component 2 reuses table 1
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[2]);
+
+	/**
+	* Huffman Encoding of the JPEG Encoding.
+	* Input is expected to be 64x1 macro blocks and output is expected as byte stuffed huffman encoded JPEG scan.
+	*/
+	Npp32s nSs = 0; // first coefficient of the scan
+	Npp32s nSe = 63; // last coefficient of the scan
+	Npp32s nH = 0; // bit position high
+	Npp32s nL = 0; // bit position low
+	// Encode all three components into pdScan; nScanLength receives the scan length.
+	if (NPP_SUCCESS != (nppiEncodeHuffmanScan_JPEG_8u16s_P3R(apdDCT, aDCTStep,
+		0, nSs, nSe, nH, nL, // leading 0 is the restart interval
+		pdScan, &nScanLength,
+		apHuffmanDCTable,
+		apHuffmanACTable,
+		aSrcSize,
+		pJpegEncoderTemp)))
+	{
+		printf("nppiEncodeHuffmanScan_JPEG_8u16s_P3R Failed!\n");
+		return EXIT_FAILURE; // NOTE(review): returns without freeing pJpegEncoderTemp or the Huffman specs
+	}
+
+	for (int i = 0; i < 3; ++i)
+	{
+		nppiEncodeHuffmanSpecFree_JPEG(apHuffmanDCTable[i]);
+		nppiEncodeHuffmanSpecFree_JPEG(apHuffmanACTable[i]);
+	}
+	// Write JPEG: assemble the file in a zero-initialized 4 MiB host buffer.
+	pDstJpeg = new unsigned char[4 << 20]{};
+	pDstOutput = pDstJpeg; // write cursor; presumably advanced by the write* helpers (TODO confirm)
+
+	writeMarker(0x0D8, pDstOutput); // 0xD8 = SOI (start of image)
+	writeJFIFTag(pDstOutput);
+	writeQuantizationTable(aQuantizationTables[0], pDstOutput);
+	writeQuantizationTable(aQuantizationTables[1], pDstOutput);
+	writeHuffmanTable(pHuffmanDCTables[0], pDstOutput);
+	writeHuffmanTable(pHuffmanACTables[0], pDstOutput);
+	writeHuffmanTable(pHuffmanDCTables[1], pDstOutput);
+	writeHuffmanTable(pHuffmanACTables[1], pDstOutput);
+	writeFrameHeader(oFrameHeaderFixedSize, pDstOutput);
+	writeScanHeader(oScanHeader, pDstOutput);
+
+	// Copy the encoded scan from pdScan to the current cursor position in the host buffer.
+	NPP_CHECK_CUDA(cudaMemcpy(pDstOutput, pdScan, nScanLength, cudaMemcpyDeviceToHost));
+	// NOTE(review): nothing checks that the headers plus nScanLength bytes fit into the 4 MiB host buffer.
+	pDstOutput += nScanLength;
+	writeMarker(0x0D9, pDstOutput); // 0xD9 = EOI (end of image)
+	{
+		// Write result to file. The stream state is not checked after opening or writing.
+		std::ofstream outputFile(szOutputFile, ios::out | ios::binary);
+		outputFile.write(reinterpret_cast<const char *>(pDstJpeg), static_cast<int>(pDstOutput - pDstJpeg));
+	}
+
+	// Cleanup
+	cudaFree(pJpegEncoderTemp);
+	delete[] pDstJpeg;
+
+
+	return EXIT_SUCCESS;
+}
+
+
+int jpegNPP(const char *szOutputFile, float* d_srcRGB, int img_width, int img_height)
+{ // Encodes an img_width x img_height image from d_srcRGB (float samples; presumably a device pointer - TODO confirm) into a JPEG file, allocating the plane and DCT buffers on every call; returns EXIT_SUCCESS or EXIT_FAILURE.
+	NppiSize aSrcSize[3]; // these locals hide the file-scope arrays of the same names
+	Npp16s *apdDCT[3] = { 0, 0, 0 };
+	Npp32s aDCTStep[3];
+
+	Npp8u *apSrcImage[3] = { 0, 0, 0 };
+	Npp32s aSrcImageStep[3];
+	size_t aSrcPitch[3];
+
+
+	// Update the frame header size (the original comment text on this line was garbled); its other fields are set outside this function.
+	oFrameHeader.nWidth = img_width;
+	oFrameHeader.nHeight = img_height;
+
+	for (int i = 0; i < oFrameHeader.nComponents; ++i) // size and allocate the buffers of each component
+	{
+		NppiSize oBlocks;
+		NppiSize oBlocksPerMCU = { oFrameHeader.aSamplingFactors[i] >> 4, oFrameHeader.aSamplingFactors[i] & 0x0f }; // high nibble = horizontal factor, low nibble = vertical factor
+		// Width in 8x8 blocks: blocks across the image scaled by this component's factor relative to nMCUBlocksH.
+		oBlocks.width = (int)ceil((oFrameHeader.nWidth + 7) / 8 *
+			static_cast<float>(oBlocksPerMCU.width) / nMCUBlocksH);
+		oBlocks.width = DivUp(oBlocks.width, oBlocksPerMCU.width) * oBlocksPerMCU.width; // pad to a multiple of blocks-per-MCU (assumes DivUp is a ceiling division - TODO confirm)
+		// Height in 8x8 blocks, computed the same way from the vertical factors.
+		oBlocks.height = (int)ceil((oFrameHeader.nHeight + 7) / 8 *
+			static_cast<float>(oBlocksPerMCU.height) / nMCUBlocksV);
+		oBlocks.height = DivUp(oBlocks.height, oBlocksPerMCU.height) * oBlocksPerMCU.height;
+
+		aSrcSize[i].width = oBlocks.width * 8; // plane size in pixels (8 pixels per block side)
+		aSrcSize[i].height = oBlocks.height * 8;
+
+		// Allocate Memory
+		size_t nPitch;
+		// DCT coefficient buffer: 64 Npp16s coefficients per block, one row of blocks per pitched row.
+		NPP_CHECK_CUDA(cudaMallocPitch(&apdDCT[i], &nPitch, oBlocks.width * 64 * sizeof(Npp16s), oBlocks.height));
+		aDCTStep[i] = static_cast<Npp32s>(nPitch);
+
+		// 8-bit image plane, one byte per pixel.
+		NPP_CHECK_CUDA(cudaMallocPitch(&apSrcImage[i], &nPitch, aSrcSize[i].width, aSrcSize[i].height));
+
+		aSrcPitch[i] = nPitch; // the same pitch is kept both as size_t and as Npp32s
+		aSrcImageStep[i] = static_cast<Npp32s>(nPitch);
+	}
+
+	// Convert the RGB input into the three component planes apSrcImage[0..2] (presumably Y, U, V).
+	cudaError_t cudaStatus;
+	cudaStatus = cuda_common::RGB2YUV(d_srcRGB, img_width, img_height,
+		apSrcImage[0], aSrcPitch[0], aSrcSize[0].width, aSrcSize[0].height,
+		apSrcImage[1], aSrcPitch[1], aSrcSize[1].width, aSrcSize[1].height,
+		apSrcImage[2], aSrcPitch[2], aSrcSize[2].width, aSrcSize[2].height);
+	// NOTE(review): cudaStatus is never checked afterwards, so a failed conversion goes unnoticed here.
+	/**
+	* Forward DCT, quantization and level shift part of the JPEG encoding.
+	* Input is expected in 8x8 macro blocks and output is expected to be in 64x1
+	* macro blocks. The new version of the primitive takes the ROI in image pixel size and
+	* works with DCT coefficients that are in zig-zag order.
+	*/
+	int k = 0; // index of the 64-entry quantization table inside pdQuantizationTables
+	// Component 0 is quantized with table 0. NOTE(review): every EXIT_FAILURE return below leaves apdDCT/apSrcImage allocated.
+	if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[0], aSrcImageStep[0],
+		apdDCT[0], aDCTStep[0],
+		pdQuantizationTables + k * 64,
+		aSrcSize[0],
+		pDCTState)))
+	{
+		printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n");
+		return EXIT_FAILURE;
+	}
+	k = 1; // components 1 and 2 both use table 1
+
+	// Component 1.
+	if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[1], aSrcImageStep[1],
+		apdDCT[1], aDCTStep[1],
+		pdQuantizationTables + k * 64,
+		aSrcSize[1],
+		pDCTState)))
+	{
+		printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n");
+		return EXIT_FAILURE;
+	}
+
+	// Component 2 (k is still 1).
+	if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[2], aSrcImageStep[2],
+		apdDCT[2], aDCTStep[2],
+		pdQuantizationTables + k * 64,
+		aSrcSize[2],
+		pDCTState)))
+	{
+		printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n");
+		return EXIT_FAILURE;
+	}
+
+	// Huffman Encoding
+
+	Npp32s nScanLength; // byte length of the encoded scan, written by the Huffman encoder below
+	Npp8u *pJpegEncoderTemp; // scratch buffer for the Huffman encoder, allocated below
+
+#if (CUDA_VERSION == 8000)
+	Npp32s nTempSize; // CUDA 8 build: size is an Npp32s
+#else
+	size_t nTempSize; // every other CUDA version (the original comment said CUDA 9)
+#endif
+					  //modified by Junlin 190221
+
+	// Ask NPP how large the scratch buffer must be for 3 components.
+	if (NPP_SUCCESS != (nppiEncodeHuffmanGetSize(aSrcSize[0], 3, &nTempSize)))
+	{
+		printf("nppiEncodeHuffmanGetSize Failed!\n");
+		return EXIT_FAILURE;
+	}
+
+	// NOTE(review): this buffer is released only at the end of the success path; the EXIT_FAILURE return below skips the cudaFree.
+	NPP_CHECK_CUDA(cudaMalloc(&pJpegEncoderTemp, nTempSize));
+
+	/**
+	* Allocates memory and creates a Huffman table in a format that is suitable for the encoder.
+	*/
+	NppStatus t_status; // reassigned by every call below and never read
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[0].aCodes, nppiDCTable, &apHuffmanDCTable[0]);
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[0].aCodes, nppiACTable, &apHuffmanACTable[0]);
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[1]);
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[1]);
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[2]); // component 2 reuses table 1
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[2]);
+
+	/**
+	* Huffman Encoding of the JPEG Encoding.
+	* Input is expected to be 64x1 macro blocks and output is expected as byte stuffed huffman encoded JPEG scan.
+	*/
+	Npp32s nSs = 0; // first coefficient of the scan
+	Npp32s nSe = 63; // last coefficient of the scan
+	Npp32s nH = 0; // bit position high
+	Npp32s nL = 0; // bit position low
+	// Encode all three components into pdScan; nScanLength receives the scan length.
+	if (NPP_SUCCESS != (nppiEncodeHuffmanScan_JPEG_8u16s_P3R(apdDCT, aDCTStep,
+		0, nSs, nSe, nH, nL, // leading 0 is the restart interval
+		pdScan, &nScanLength,
+		apHuffmanDCTable,
+		apHuffmanACTable,
+		aSrcSize,
+		pJpegEncoderTemp)))
+	{
+		printf("nppiEncodeHuffmanScan_JPEG_8u16s_P3R Failed!\n");
+		return EXIT_FAILURE; // NOTE(review): returns without freeing pJpegEncoderTemp, the Huffman specs or the per-call buffers
+	}
+
+	for (int i = 0; i < 3; ++i)
+	{
+		nppiEncodeHuffmanSpecFree_JPEG(apHuffmanDCTable[i]);
+		nppiEncodeHuffmanSpecFree_JPEG(apHuffmanACTable[i]);
+	}
+	// Write JPEG: assemble the file in a zero-initialized 4 MiB host buffer.
+	pDstJpeg = new unsigned char[4 << 20]{};
+	pDstOutput = pDstJpeg; // write cursor; presumably advanced by the write* helpers (TODO confirm)
+
+	writeMarker(0x0D8, pDstOutput); // 0xD8 = SOI (start of image)
+	writeJFIFTag(pDstOutput);
+	writeQuantizationTable(aQuantizationTables[0], pDstOutput);
+	writeQuantizationTable(aQuantizationTables[1], pDstOutput);
+	writeHuffmanTable(pHuffmanDCTables[0], pDstOutput);
+	writeHuffmanTable(pHuffmanACTables[0], pDstOutput);
+	writeHuffmanTable(pHuffmanDCTables[1], pDstOutput);
+	writeHuffmanTable(pHuffmanACTables[1], pDstOutput);
+	writeFrameHeader(oFrameHeader, pDstOutput);
+	writeScanHeader(oScanHeader, pDstOutput);
+
+	// Copy the encoded scan from pdScan to the current cursor position in the host buffer.
+	NPP_CHECK_CUDA(cudaMemcpy(pDstOutput, pdScan, nScanLength, cudaMemcpyDeviceToHost));
+	// NOTE(review): nothing checks that the headers plus nScanLength bytes fit into the 4 MiB host buffer.
+	pDstOutput += nScanLength;
+	writeMarker(0x0D9, pDstOutput); // 0xD9 = EOI (end of image)
+
+	{
+		// Write result to file. The stream state is not checked after opening or writing.
+		std::ofstream outputFile(szOutputFile, ios::out | ios::binary);
+		outputFile.write(reinterpret_cast<const char *>(pDstJpeg), static_cast<int>(pDstOutput - pDstJpeg));
+	}
+
+	// Cleanup
+	cudaFree(pJpegEncoderTemp);
+	delete[] pDstJpeg;
+	for (int i = 0; i < 3; ++i) // release the per-call plane and DCT buffers
+	{
+		cudaFree(apdDCT[i]);
+		cudaFree(apSrcImage[i]);
+	}
+
+	return EXIT_SUCCESS;
+}
+
+
+int jpegNPP(const char *szOutputFile, unsigned char* d_srcRGB, int img_width, int img_height)
+{ // Encodes an img_width x img_height image from d_srcRGB (8-bit samples; presumably a device pointer - TODO confirm) into a JPEG file, allocating the plane and DCT buffers on every call; returns EXIT_SUCCESS or EXIT_FAILURE.
+	NppiSize aSrcSize[3]; // these locals hide the file-scope arrays of the same names
+	Npp16s *apdDCT[3] = { 0, 0, 0 };
+	Npp32s aDCTStep[3];
+
+	Npp8u *apSrcImage[3] = { 0, 0, 0 };
+	Npp32s aSrcImageStep[3];
+	size_t aSrcPitch[3];
+
+
+	// Update the frame header size (the original comment text on this line was garbled); its other fields are set outside this function.
+	oFrameHeader.nWidth = img_width;
+	oFrameHeader.nHeight = img_height;
+
+	for (int i = 0; i < oFrameHeader.nComponents; ++i) // size and allocate the buffers of each component
+	{
+		NppiSize oBlocks;
+		NppiSize oBlocksPerMCU = { oFrameHeader.aSamplingFactors[i] >> 4, oFrameHeader.aSamplingFactors[i] & 0x0f }; // high nibble = horizontal factor, low nibble = vertical factor
+		// Width in 8x8 blocks: blocks across the image scaled by this component's factor relative to nMCUBlocksH.
+		oBlocks.width = (int)ceil((oFrameHeader.nWidth + 7) / 8 *
+			static_cast<float>(oBlocksPerMCU.width) / nMCUBlocksH);
+		oBlocks.width = DivUp(oBlocks.width, oBlocksPerMCU.width) * oBlocksPerMCU.width; // pad to a multiple of blocks-per-MCU (assumes DivUp is a ceiling division - TODO confirm)
+		// Height in 8x8 blocks, computed the same way from the vertical factors.
+		oBlocks.height = (int)ceil((oFrameHeader.nHeight + 7) / 8 *
+			static_cast<float>(oBlocksPerMCU.height) / nMCUBlocksV);
+		oBlocks.height = DivUp(oBlocks.height, oBlocksPerMCU.height) * oBlocksPerMCU.height;
+
+		aSrcSize[i].width = oBlocks.width * 8; // plane size in pixels (8 pixels per block side)
+		aSrcSize[i].height = oBlocks.height * 8;
+
+		// Allocate Memory
+		size_t nPitch;
+		// DCT coefficient buffer: 64 Npp16s coefficients per block, one row of blocks per pitched row.
+		NPP_CHECK_CUDA(cudaMallocPitch(&apdDCT[i], &nPitch, oBlocks.width * 64 * sizeof(Npp16s), oBlocks.height));
+		aDCTStep[i] = static_cast<Npp32s>(nPitch);
+
+		// 8-bit image plane, one byte per pixel.
+		NPP_CHECK_CUDA(cudaMallocPitch(&apSrcImage[i], &nPitch, aSrcSize[i].width, aSrcSize[i].height));
+
+		aSrcPitch[i] = nPitch; // the same pitch is kept both as size_t and as Npp32s
+		aSrcImageStep[i] = static_cast<Npp32s>(nPitch);
+	}
+
+	// Convert the RGB input into the three component planes apSrcImage[0..2] (presumably Y, U, V).
+	cudaError_t cudaStatus;
+	cudaStatus = cuda_common::RGB2YUV(d_srcRGB, img_width, img_height,
+		apSrcImage[0], aSrcPitch[0], aSrcSize[0].width, aSrcSize[0].height,
+		apSrcImage[1], aSrcPitch[1], aSrcSize[1].width, aSrcSize[1].height,
+		apSrcImage[2], aSrcPitch[2], aSrcSize[2].width, aSrcSize[2].height);
+	// NOTE(review): cudaStatus is never checked afterwards, so a failed conversion goes unnoticed here.
+	/**
+	* Forward DCT, quantization and level shift part of the JPEG encoding.
+	* Input is expected in 8x8 macro blocks and output is expected to be in 64x1
+	* macro blocks. The new version of the primitive takes the ROI in image pixel size and
+	* works with DCT coefficients that are in zig-zag order.
+	*/
+	int k = 0; // index of the 64-entry quantization table inside pdQuantizationTables
+	// Component 0 is quantized with table 0. NOTE(review): every EXIT_FAILURE return below leaves apdDCT/apSrcImage allocated.
+	if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[0], aSrcImageStep[0],
+		apdDCT[0], aDCTStep[0],
+		pdQuantizationTables + k * 64,
+		aSrcSize[0],
+		pDCTState)))
+	{
+		printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n");
+		return EXIT_FAILURE;
+	}
+	k = 1; // components 1 and 2 both use table 1
+
+	// Component 1.
+	if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[1], aSrcImageStep[1],
+		apdDCT[1], aDCTStep[1],
+		pdQuantizationTables + k * 64,
+		aSrcSize[1],
+		pDCTState)))
+	{
+		printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n");
+		return EXIT_FAILURE;
+	}
+
+	// Component 2 (k is still 1).
+	if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[2], aSrcImageStep[2],
+		apdDCT[2], aDCTStep[2],
+		pdQuantizationTables + k * 64,
+		aSrcSize[2],
+		pDCTState)))
+	{
+		printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n");
+		return EXIT_FAILURE;
+	}
+
+	// Huffman Encoding
+
+	Npp32s nScanLength; // byte length of the encoded scan, written by the Huffman encoder below
+	Npp8u *pJpegEncoderTemp; // scratch buffer for the Huffman encoder, allocated below
+
+#if (CUDA_VERSION == 8000)
+	Npp32s nTempSize; // CUDA 8 build: size is an Npp32s
+#else
+	size_t nTempSize; // every other CUDA version (the original comment said CUDA 9)
+#endif
+					  //modified by Junlin 190221
+
+	// Ask NPP how large the scratch buffer must be for 3 components.
+	if (NPP_SUCCESS != (nppiEncodeHuffmanGetSize(aSrcSize[0], 3, &nTempSize)))
+	{
+		printf("nppiEncodeHuffmanGetSize Failed!\n");
+		return EXIT_FAILURE;
+	}
+
+	// NOTE(review): this buffer is released only at the end of the success path; the EXIT_FAILURE return below skips the cudaFree.
+	NPP_CHECK_CUDA(cudaMalloc(&pJpegEncoderTemp, nTempSize));
+
+	/**
+	* Allocates memory and creates a Huffman table in a format that is suitable for the encoder.
+	*/
+	NppStatus t_status; // reassigned by every call below and never read
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[0].aCodes, nppiDCTable, &apHuffmanDCTable[0]);
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[0].aCodes, nppiACTable, &apHuffmanACTable[0]);
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[1]);
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[1]);
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[2]); // component 2 reuses table 1
+	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[2]);
+
+	/**
+	* Huffman Encoding of the JPEG Encoding.
+	* Input is expected to be 64x1 macro blocks and output is expected as byte stuffed huffman encoded JPEG scan.
+	*/
+	Npp32s nSs = 0; // first coefficient of the scan
+	Npp32s nSe = 63; // last coefficient of the scan
+	Npp32s nH = 0; // bit position high
+	Npp32s nL = 0; // bit position low
+	// Encode all three components into pdScan; nScanLength receives the scan length.
+	if (NPP_SUCCESS != (nppiEncodeHuffmanScan_JPEG_8u16s_P3R(apdDCT, aDCTStep,
+		0, nSs, nSe, nH, nL, // leading 0 is the restart interval
+		pdScan, &nScanLength,
+		apHuffmanDCTable,
+		apHuffmanACTable,
+		aSrcSize,
+		pJpegEncoderTemp)))
+	{
+		printf("nppiEncodeHuffmanScan_JPEG_8u16s_P3R Failed!\n");
+		return EXIT_FAILURE; // NOTE(review): returns without freeing pJpegEncoderTemp, the Huffman specs or the per-call buffers
+	}
+
+	for (int i = 0; i < 3; ++i)
+	{
+		nppiEncodeHuffmanSpecFree_JPEG(apHuffmanDCTable[i]);
+		nppiEncodeHuffmanSpecFree_JPEG(apHuffmanACTable[i]);
+	}
+	// Write JPEG: assemble the file in a zero-initialized 4 MiB host buffer.
+	pDstJpeg = new unsigned char[4 << 20]{};
+	pDstOutput = pDstJpeg; // write cursor; presumably advanced by the write* helpers (TODO confirm)
+
+	writeMarker(0x0D8, pDstOutput); // 0xD8 = SOI (start of image)
+	writeJFIFTag(pDstOutput);
+	writeQuantizationTable(aQuantizationTables[0], pDstOutput);
+	writeQuantizationTable(aQuantizationTables[1], pDstOutput);
+	writeHuffmanTable(pHuffmanDCTables[0], pDstOutput);
+	writeHuffmanTable(pHuffmanACTables[0], pDstOutput);
+	writeHuffmanTable(pHuffmanDCTables[1], pDstOutput);
+	writeHuffmanTable(pHuffmanACTables[1], pDstOutput);
+	writeFrameHeader(oFrameHeader, pDstOutput);
+	writeScanHeader(oScanHeader, pDstOutput);
+
+	// Copy the encoded scan from pdScan to the current cursor position in the host buffer.
+	NPP_CHECK_CUDA(cudaMemcpy(pDstOutput, pdScan, nScanLength, cudaMemcpyDeviceToHost));
+	// NOTE(review): nothing checks that the headers plus nScanLength bytes fit into the 4 MiB host buffer.
+	pDstOutput += nScanLength;
+	writeMarker(0x0D9, pDstOutput); // 0xD9 = EOI (end of image)
+
+	{
+		// Write result to file. The stream state is not checked after opening or writing.
+		std::ofstream outputFile(szOutputFile, ios::out | ios::binary);
+		outputFile.write(reinterpret_cast<const char *>(pDstJpeg), static_cast<int>(pDstOutput - pDstJpeg));
+	}
+
+	// Cleanup
+	cudaFree(pJpegEncoderTemp);
+	delete[] pDstJpeg;
+	for (int i = 0; i < 3; ++i) // release the per-call plane and DCT buffers
+	{
+		cudaFree(apdDCT[i]);
+		cudaFree(apSrcImage[i]);
+	}
+
+	return EXIT_SUCCESS;
+}
diff --git a/src/nvdecoder/DrawImageOnGPU.cu b/src/nvdecoder/DrawImageOnGPU.cu
deleted file mode 100644
index 1fa99dc..0000000
--- a/src/nvdecoder/DrawImageOnGPU.cu
+++ /dev/null
@@ -1,126 +0,0 @@
-#include "cuda_kernels.h"
-
-#include "../interface/logger.hpp"
-
-typedef unsigned char   uchar;
-typedef unsigned int    uint32;
-typedef int             int32;
-
-namespace cuda_common
-{
-	__global__ void kernel_drawPixel(float* d_srcRGB, int src_width, int src_height,
-		int left, int top, int right, int bottom)
-	{
-		const int x = blockIdx.x * blockDim.x + threadIdx.x;
-		const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-		if (((x == left || x == right) && y >= top && y <= bottom) || ((y == top || y == bottom) && x >= left && x <= right))
-		{
-			d_srcRGB[(y*src_width) + x] = 0;
-			d_srcRGB[(src_width*src_height) + (y*src_width) + x] = 255;
-			d_srcRGB[(2 * src_width*src_height) + (y*src_width) + x] = 0;
-		}
-	}
-
-	cudaError_t DrawImage(float* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom)
-	{
-		dim3 block(32, 16, 1);
-		dim3 grid((src_width + (block.x - 1)) / block.x, (src_height + (block.y - 1)) / block.y, 1);
-
-		kernel_drawPixel << < grid, block >> >(d_srcRGB, src_width, src_height, left, top, right, bottom);
-
-		cudaError_t cudaStatus = cudaGetLastError();
-		if (cudaStatus != cudaSuccess) {
-			LOG_ERROR("Draw 32 kernel_memcopy launch failed:{}",cudaGetErrorString(cudaStatus));
-			return cudaStatus;
-		}
-
-		cudaStatus = cudaDeviceSynchronize();
-		if (cudaStatus != cudaSuccess) {
-			LOG_ERROR("cudaDeviceSynchronize returned error code {} after launching kernel_bilinear!", cudaStatus);
-			return cudaStatus;
-		}
-
-		return cudaStatus;
-	}
-
-	__global__ void kernel_drawPixel(unsigned char* d_srcRGB, int src_width, int src_height,
-		int left, int top, int right, int bottom)
-	{
-		const int x = blockIdx.x * blockDim.x + threadIdx.x;
-		const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-		if (((x == left || x == right) && y >= top && y <= bottom) || ((y == top || y == bottom) && x >= left && x <= right))
-		{
-			d_srcRGB[(y*src_width) + x] = 0;
-			d_srcRGB[(src_width*src_height) + (y*src_width) + x] = 255;
-			d_srcRGB[(2 * src_width*src_height) + (y*src_width) + x] = 0;
-		}
-	}
-
-	cudaError_t DrawImage(unsigned char* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom)
-	{
-		dim3 block(32, 16, 1);
-		dim3 grid((src_width + (block.x - 1)) / block.x, (src_height + (block.y - 1)) / block.y, 1);
-
-		kernel_drawPixel << < grid, block >> >(d_srcRGB, src_width, src_height, left, top, right, bottom);
-
-		cudaError_t cudaStatus = cudaGetLastError();
-		if (cudaStatus != cudaSuccess) {
-			LOG_ERROR("Draw 68 kernel_memcopy launch failed: {}",cudaGetErrorString(cudaStatus));
-			return cudaStatus;
-		}
-
-		cudaStatus = cudaDeviceSynchronize();
-		if (cudaStatus != cudaSuccess) {
-			LOG_ERROR("cudaDeviceSynchronize returned error code {} after launching kernel_bilinear!", cudaStatus);
-			return cudaStatus;
-		}
-
-		return cudaStatus;
-	}
-
-	__global__ void kernel_drawLine(float* d_srcRGB, int src_width, int src_height,
-		int begin_x, int begin_y, int end_x, int end_y)
-	{
-		int min_x = end_x < begin_x ? end_x : begin_x;
-		int max_x = end_x < begin_x ? begin_x : end_x;
-
-		int min_y = end_y < begin_y ? end_y : begin_y;
-		int max_y = end_y < begin_y ? begin_y : end_y;
-
-		const int x = blockIdx.x * blockDim.x + threadIdx.x;
-		const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-		if ((x - begin_x) * (end_y - begin_y) == (end_x - begin_x) * (y - begin_y)
-			&& min_x <= x && x <= max_x
-			&& min_y <= y && y <= max_y)
-		{
-			d_srcRGB[(y*src_width) + x] = 0;
-			d_srcRGB[(src_width*src_height) + (y*src_width) + x] = 255;
-			d_srcRGB[(2 * src_width*src_height) + (y*src_width) + x] = 0;
-		}
-	}
-
-	cudaError_t DrawLine(float* d_srcRGB, int src_width, int src_height, int begin_x, int begin_y, int end_x, int end_y)
-	{
-		dim3 block(32, 16, 1);
-		dim3 grid((src_width + (block.x - 1)) / block.x, (src_height + (block.y - 1)) / block.y, 1);
-
-		kernel_drawLine << < grid, block >> >(d_srcRGB, src_width, src_height, begin_x, begin_y, end_x, end_y);
-
-		cudaError_t cudaStatus = cudaGetLastError();
-		if (cudaStatus != cudaSuccess) {
-			LOG_ERROR("Draw 112 kernel_memcopy launch failed: {}",cudaGetErrorString(cudaStatus));
-			return cudaStatus;
-		}
-
-		cudaStatus = cudaDeviceSynchronize();
-		if (cudaStatus != cudaSuccess) {
-			LOG_ERROR("cudaDeviceSynchronize returned error code {} after launching kernel_bilinear!", cudaStatus);
-			return cudaStatus;
-		}
-
-		return cudaStatus;
-	}
-}
\ No newline at end of file
diff --git a/src/nvdecoder/FFCuContextManager.cpp b/src/nvdecoder/FFCuContextManager.cpp
deleted file mode 100644
index 382c4d8..0000000
--- a/src/nvdecoder/FFCuContextManager.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-#include "FFCuContextManager.h"
-
-#include "common_header.h"
-
-using namespace std;
-
-extern "C"
-{
-	#include <libavcodec/avcodec.h> 
-	#include <libavdevice/avdevice.h> 
-	#include <libavformat/avformat.h> 
-	#include <libavfilter/avfilter.h> 
-	#include <libavutil/avutil.h> 
-    #include <libavutil/pixdesc.h> 
-	#include <libswscale/swscale.h>
-    #include <libavutil/imgutils.h>
-}
-
-FFCuContextManager::~FFCuContextManager()
-{
-    for(auto iter = ctxMap.begin(); iter != ctxMap.end(); iter++){
-        av_buffer_unref(&iter->second);
-    }
-    ctxMap.clear();
-}
-
-AVBufferRef *FFCuContextManager::getCuCtx(string gpuid)
-{
-     AVBufferRef *hw_device_ctx = ctxMap[gpuid];
-     if (nullptr == hw_device_ctx)
-     {
-        // 初始化硬件解码器
-        if (av_hwdevice_ctx_create(&hw_device_ctx, AV_HWDEVICE_TYPE_CUDA, gpuid.c_str(), nullptr, 0) < 0) 
-        {
-            LOG_ERROR("Failed to create specified HW device.");
-            return nullptr;
-        }
-        ctxMap[gpuid] = hw_device_ctx;
-     }
-     return hw_device_ctx;
-}
\ No newline at end of file
diff --git a/src/nvdecoder/FFCuContextManager.h b/src/nvdecoder/FFCuContextManager.h
deleted file mode 100644
index 758167c..0000000
--- a/src/nvdecoder/FFCuContextManager.h
+++ /dev/null
@@ -1,28 +0,0 @@
-
-#include<map>
-#include<string>
-
-using namespace std;
-
-struct AVBufferRef;
-
-class FFCuContextManager{
-public:
-    static FFCuContextManager* getInstance(){
-		static FFCuContextManager* singleton = nullptr;
-		if (singleton == nullptr){
-			singleton = new FFCuContextManager();
-		}
-		return singleton;
-	}
-
-    AVBufferRef *getCuCtx(string gpuid);
-
-private:
-    FFCuContextManager(){}
-	~FFCuContextManager();
-
-private:
-    map<string,AVBufferRef *> ctxMap;
-
-};
\ No newline at end of file
diff --git a/src/nvdecoder/FFNvDecoder.cpp b/src/nvdecoder/FFNvDecoder.cpp
deleted file mode 100644
index e64e2a5..0000000
--- a/src/nvdecoder/FFNvDecoder.cpp
+++ /dev/null
@@ -1,513 +0,0 @@
-#include "FFNvDecoder.h"
-
-#include <chrono>
-#include <thread>
-#include <fstream>
-
-#include <chrono>
-
-#include "FFCuContextManager.h"
-
-#include "common_header.h"
-
-#include "GpuRgbMemory.hpp"
-#include "cuda_kernels.h"
-
-using namespace std;
-
-// 参考博客: https://blog.csdn.net/qq_40116098/article/details/120704340
-
-static AVPixelFormat get_hw_format(AVCodecContext *avctx, const AVPixelFormat *pix_fmts)
-{
-	FFNvDecoder* _this = (FFNvDecoder*)avctx->opaque;
-
-	const AVPixelFormat *p;
-
-	for (p = pix_fmts; *p != -1; p++) {
-		if (*p == _this->getHwPixFmt())
-			return *p;
-	}
-
-	LOG_ERROR("Failed to get HW surface format");
-	return AV_PIX_FMT_NONE;
-}
-
-FFNvDecoder::FFNvDecoder()
-{
-	// 初始化解码对象
-	fmt_ctx = nullptr;
-	avctx = nullptr;
-	m_bRunning = false;
-
-	stream = nullptr;
-    stream_index = -1;
-    hw_pix_fmt = AV_PIX_FMT_NONE;
-    m_dec_name = "";
-
-	m_bPause = false;
-	m_bReal = true;
-
-	m_decode_thread = 0;
-	m_post_decode_thread = 0;
-
-	m_bFinished = false;
-	m_dec_keyframe = false;
-	m_fps = 0.0;
-}
-
-FFNvDecoder::~FFNvDecoder()
-{
-	m_dec_keyframe = false;
-}
-
-bool FFNvDecoder::init(FFDecConfig& cfg)
-{
-	m_cfg = cfg;
-	m_dec_name = cfg.dec_name;
-
-	fstream infile(cfg.uri);
-	if (infile.is_open()){
-		m_bReal = false;
-		infile.close();
-	}else {
-		m_bReal = true;
-	}
-
-	post_decoded_cbk = cfg.post_decoded_cbk;
-    decode_finished_cbk = cfg.decode_finished_cbk;
-
-	return init(cfg.uri.c_str(), cfg.gpuid.c_str(),cfg.force_tcp);
-}
-
-bool FFNvDecoder::init(const char* uri, const char* gpuid, bool force_tcp)
-{
-	// av_log_set_level(AV_LOG_DEBUG);
-
-	avformat_network_init();
-
-	// 打开输入视频文件
-	AVDictionary *options = nullptr;
-	av_dict_set( &options, "bufsize", "655360", 0 );
-	av_dict_set( &options, "rtsp_transport", force_tcp ? "tcp" : "udp", 0 );
-	// av_dict_set( &options, "listen_timeout", "30", 0 ); // 单位为s
-	av_dict_set( &options, "stimeout", "30000000", 0 ); // 单位为 百万分之一秒
-	
-	fmt_ctx = avformat_alloc_context();
-	const char* input_file = uri;
-	if (avformat_open_input(&fmt_ctx, input_file, nullptr, &options) != 0) {
-		LOG_ERROR("Cannot open input file:{}",input_file);
-		return false;
-	}
-
-	// 查找流信息
-	if (avformat_find_stream_info(fmt_ctx, nullptr) < 0) {
-		LOG_ERROR("Cannot find input stream information");
-		return false;
-	}
-
-	// 查找视频流信息
-	AVCodec *decoder = nullptr;
-	stream_index = av_find_best_stream(fmt_ctx, AVMEDIA_TYPE_VIDEO, -1, -1, &decoder, 0);
-	if (stream_index < 0) {
-		LOG_ERROR("Cannot find a video stream in the input file");
-		return false;
-	}
-
-	string cuvid_dec_name = string(decoder->name) + "_cuvid";
-	AVCodec *vcodec = avcodec_find_decoder_by_name(cuvid_dec_name.c_str());
-	if (!(avctx = avcodec_alloc_context3(vcodec)))
-		return (bool)AVERROR(ENOMEM);
-
-	// 得到视频流对象
-	stream = fmt_ctx->streams[stream_index];
-	if (avcodec_parameters_to_context(avctx, stream->codecpar) < 0)
-		return false;
-
-	m_fps = av_q2d(stream ->avg_frame_rate);
-
-	avctx->opaque = this;
-	// 设置解码器管理器的像素格式回调函数
-	avctx->get_format = get_hw_format;
-
-	hw_pix_fmt = AV_PIX_FMT_CUDA;
-
-	FFCuContextManager* pCtxMgr = FFCuContextManager::getInstance();
-
-	AVBufferRef *hw_device_ctx = pCtxMgr->getCuCtx(gpuid);
-	if(nullptr == hw_device_ctx){
-		av_log(nullptr, AV_LOG_ERROR, "create CUDA context failed ! \n");
-		return false;
-	}
-	avctx->hw_device_ctx = av_buffer_ref(hw_device_ctx);
-	if (nullptr == avctx->hw_device_ctx)
-	{
-		return false;
-	}
-
-	// 打开解码器流
-	AVDictionary *op = nullptr;
-	av_dict_set( &op, "gpu", gpuid, 0 );
-	// av_dict_set( &op, "surfaces", "5", 0 );
-	if (avcodec_open2(avctx, vcodec, &op) < 0) {
-		LOG_ERROR("Failed to open codec for stream");
-		return false;
-	}
-	
-	return true;
-}
-
-bool FFNvDecoder::isSurport(FFDecConfig& cfg)
-{
-	bool bRet = init(cfg);
-    decode_finished();
-    return bRet;
-}
-
-bool FFNvDecoder::start(){
-
-	m_bRunning = true;
-
-	pthread_create(&m_decode_thread,0,
-        [](void* arg)
-        {
-            FFNvDecoder* a=(FFNvDecoder*)arg;
-            a->decode_thread();
-            return (void*)0;
-        }
-    ,this);
-
-	return true;
-}
-
-void FFNvDecoder::decode_thread()
-{
-	AVPacket* pkt ;
-	pkt = av_packet_alloc();
-	av_init_packet( pkt );
-
-	pthread_create(&m_post_decode_thread,0,
-        [](void* arg)
-        {
-            FFNvDecoder* a=(FFNvDecoder*)arg;
-            a->post_decode_thread();
-            return (void*)0;
-        }
-    ,this);
-
-	// long start_time = UtilTools::get_cur_time_ms();
-
-	while (m_bRunning)
-	{
-		if (!m_bReal)
-		{
-			if (m_bPause)
-			{
-				std::this_thread::sleep_for(std::chrono::milliseconds(3));
-				continue;
-			}
-		}
-		
-		int result = av_read_frame(fmt_ctx, pkt);
-		if (result == AVERROR_EOF || result < 0)
-		{
-			LOG_ERROR("Failed to read frame!");
-			break;
-		}
-
-		if (m_dec_keyframe && !(pkt->flags & AV_PKT_FLAG_KEY)) {
-			av_packet_unref(pkt);
-			continue;
-		}
-
-		if (stream_index == pkt->stream_index){
-			result = avcodec_send_packet(avctx, pkt);
-			if (result < 0){
-				av_packet_unref(pkt);
-				LOG_ERROR("{} - Failed to send pkt: {}", m_dec_name, result);
-				continue;
-			}
-
-			AVFrame* gpuFrame = av_frame_alloc();
-			result = avcodec_receive_frame(avctx, gpuFrame);
-			if ((result == AVERROR(EAGAIN) || result == AVERROR_EOF) || result < 0){
-				LOG_ERROR("{} - Failed to receive frame: {}", m_dec_name, result);
-				av_frame_free(&gpuFrame); 
-				av_packet_unref(pkt);
-				continue;
-			}
-			av_packet_unref(pkt);
-
-			if (m_bReal){
-				if (m_bPause){
-					av_frame_free(&gpuFrame); 
-					std::this_thread::sleep_for(std::chrono::milliseconds(3));
-					continue;
-				}
-			}
-
-			if(gpuFrame != nullptr){
-				m_queue_mutex.lock();
-				if(mFrameQueue.size() <= 10){
-					mFrameQueue.push(gpuFrame);
-				}else{
-					av_frame_free(&gpuFrame); 
-				}
-				m_queue_mutex.unlock();
-			}
-		}
-		av_packet_unref(pkt);
-	}
-
-	m_bRunning = false;
-	av_packet_free(&pkt);
-
-	// long end_time = UtilTools::get_cur_time_ms();
-	// cout << "解码用时:" << end_time - start_time << endl;
-
-	if (m_post_decode_thread != 0)
-	{
-		pthread_join(m_post_decode_thread,0);
-	}
-
-	decode_finished_cbk(m_finishedDecArg);
-
-	decode_finished();
-
-	// 清空队列
-	while(mFrameQueue.size() > 0){
-		AVFrame * gpuFrame = mFrameQueue.front();
-		av_frame_free(&gpuFrame); 
-		mFrameQueue.pop();
-	}
-
-	LOG_INFO("{} - decode thread exited.", m_dec_name);
-}
-
-void FFNvDecoder::decode_finished(){
-	if (avctx)
-	{
-		avcodec_free_context(&avctx);
-	}
-	
-	if (fmt_ctx)
-	{
-		avformat_close_input(&fmt_ctx);
-	}
-
-	m_bFinished = true;
-	m_dec_keyframe = false;
-}
-
-void FFNvDecoder::post_decode_thread(){
-	int skip_frame = m_cfg.skip_frame;
-	if (skip_frame <= 0){
-		skip_frame = 1;
-	}
-	
-	int index = 0;
-	while (m_bRunning)
-	{
-		if(mFrameQueue.size() > 0){
-			std::lock_guard<std::mutex> l(m_snapshot_mutex);
-			// 取队头数据
-			m_queue_mutex.lock();
-			AVFrame * gpuFrame = mFrameQueue.front();
-			mFrameQueue.pop();
-			m_queue_mutex.unlock();
-			// 跳帧
-			if (skip_frame == 1 || index % skip_frame == 0){
-				post_decoded_cbk(m_postDecArg, convert2bgr(gpuFrame));
-				index = 0;
-			}
-
-			av_frame_free(&gpuFrame); 
-
-			index++;
-		}
-	}
-
-	LOG_INFO("post decode thread exited.");
-}
-
-void FFNvDecoder::close(){
-	m_bRunning=false;
-	if(m_decode_thread != 0){
-		pthread_join(m_decode_thread,0);
-	}
-	m_dec_keyframe = false;
-}
-
-AVPixelFormat FFNvDecoder::getHwPixFmt(){
-	return hw_pix_fmt;
-}
-
-bool FFNvDecoder::isRunning(){
-	return m_bRunning;
-}
-
-bool FFNvDecoder::isFinished(){
-	return m_bFinished;
-}
-
-bool FFNvDecoder::isPausing(){
-	return m_bPause;
-}
-
-bool FFNvDecoder::getResolution( int &width, int &height ){
-	if (avctx != nullptr)
-	{
-		width = avctx->width;
-		height = avctx->height;
-		return true;
-	}
-	
-	return false;
-}
-
-void FFNvDecoder::pause(){
-	m_bPause = true;
-}
-
-void FFNvDecoder::resume(){
-	m_bPause = false;
-}
-
-void FFNvDecoder::setDecKeyframe(bool bKeyframe)
-{
-	m_dec_keyframe = bKeyframe;
-}
-
-int FFNvDecoder::getCachedQueueLength(){
-	m_queue_mutex.lock();
-	int queue_size = mFrameQueue.size(); 
-	m_queue_mutex.lock();
-	return queue_size;
-}
-
-float FFNvDecoder::fps(){
-	return m_fps;
-}
-
-void FFNvDecoder::setPostDecArg(const void* postDecArg){
-	m_postDecArg = postDecArg;
-}
-
-void FFNvDecoder::setFinishedDecArg(const void* finishedDecArg){
-	m_finishedDecArg = finishedDecArg;
-}
-
-DeviceRgbMemory* FFNvDecoder::convert2bgr(AVFrame * gpuFrame){
-	if (gpuFrame != nullptr && gpuFrame->format == AV_PIX_FMT_CUDA ){
-		LOG_DEBUG("decode task: gpuid: {}  width: {} height: {}", m_cfg.gpuid, gpuFrame->width, gpuFrame->height);
-		GpuRgbMemory* gpuMem = new GpuRgbMemory(3, gpuFrame->width, gpuFrame->height, getName(), m_cfg.gpuid, false, true);
-
-		do{
-			if (gpuMem->getMem() == nullptr){
-				LOG_ERROR("new GpuRgbMemory failed !!!");
-				break;
-			}
-			
-			cudaSetDevice(atoi(m_cfg.gpuid.c_str()));
-			cuda_common::setColorSpace( ITU_709, 0 );
-			cudaError_t cudaStatus = cuda_common::CUDAToBGR((CUdeviceptr)gpuFrame->data[0],(CUdeviceptr)gpuFrame->data[1], gpuFrame->linesize[0], gpuFrame->linesize[1], gpuMem->getMem(), gpuFrame->width, gpuFrame->height);
-			cudaDeviceSynchronize();
-			if (cudaStatus != cudaSuccess) {
-				LOG_ERROR("CUDAToBGR failed failed !!!");
-				break;
-			}
-
-			return gpuMem;
-		}while(0);
-
-		delete gpuMem;
-		gpuMem = nullptr;
-	}
-
-	return nullptr;
-}
-
-FFImgInfo* FFNvDecoder::snapshot(){
-
-	// 锁住停止队列消耗
-	std::lock_guard<std::mutex> l(m_snapshot_mutex);
-
-	AVFrame * gpuFrame = nullptr;
-
-	bool bFirst = true;
-	while(true){
-		m_queue_mutex.lock();
-		if(mFrameQueue.size() <= 0){
-			m_queue_mutex.unlock();
-			if(bFirst){
-				std::this_thread::sleep_for(std::chrono::milliseconds(100));
-				bFirst = false;
-				continue;
-			}else{
-				// 再进来说明前面已经等了 100 ms
-				// 100 ms都没有等到解码数据,则退出
-				return nullptr;
-			}
-		}
-
-		// 队列中数据大于1 
-		gpuFrame = mFrameQueue.front();
-		m_queue_mutex.unlock();
-		break;
-	}
-
-	if (gpuFrame != nullptr && gpuFrame->format == AV_PIX_FMT_CUDA ){
-		LOG_DEBUG("decode task: gpuid: {}  width: {} height: {}", m_cfg.gpuid, gpuFrame->width, gpuFrame->height);
-		GpuRgbMemory* gpuMem = new GpuRgbMemory(3, gpuFrame->width, gpuFrame->height, getName(), m_cfg.gpuid, false, true);
-
-		if (gpuMem->getMem() == nullptr){
-			LOG_ERROR("new GpuRgbMemory failed !!!");
-			return nullptr;
-		}
-		
-		cudaSetDevice(atoi(m_cfg.gpuid.c_str()));
-		cuda_common::setColorSpace( ITU_709, 0 );
-		cudaError_t cudaStatus = cuda_common::CUDAToBGR((CUdeviceptr)gpuFrame->data[0],(CUdeviceptr)gpuFrame->data[1], gpuFrame->linesize[0], gpuFrame->linesize[1], gpuMem->getMem(), gpuFrame->width, gpuFrame->height);
-		cudaDeviceSynchronize();
-		if (cudaStatus != cudaSuccess) {
-			LOG_ERROR("CUDAToBGR failed failed !!!");
-			return nullptr;
-		}
-
-		unsigned char * pHwRgb = gpuMem->getMem();
-		int channel = gpuMem->getChannel();
-		int width = gpuMem->getWidth();
-		int height = gpuMem->getHeight();
-
-		if (pHwRgb != nullptr && channel > 0 && width > 0 && height > 0){
-			int nSize = channel * height * width;
-
-			LOG_INFO("channel:{} height:{} width:{}", channel, height, width);
-			// unsigned char* cpu_data = new unsigned char[nSize];
-
-            unsigned char* cpu_data = (unsigned char *)av_malloc(nSize * sizeof(unsigned char));
-
-			cudaMemcpy(cpu_data, pHwRgb, nSize * sizeof(unsigned char), cudaMemcpyDeviceToHost);
-			cudaDeviceSynchronize();
-
-			delete gpuMem;
-			gpuMem = nullptr;
-
-			FFImgInfo* imgInfo = new FFImgInfo();
-			imgInfo->dec_name = m_dec_name;
-			imgInfo->pData = cpu_data;
-			imgInfo->height = height;
-			imgInfo->width = width;
-			imgInfo->timestamp = UtilTools::get_cur_time_ms();
-			imgInfo->index = m_index;
-
-			m_index++;
-
-			return imgInfo;
-		}
-
-		delete gpuMem;
-		gpuMem = nullptr;
-	}
-
-	return nullptr;
-}
\ No newline at end of file
diff --git a/src/nvdecoder/FFNvDecoder.h b/src/nvdecoder/FFNvDecoder.h
deleted file mode 100644
index 4784ab6..0000000
--- a/src/nvdecoder/FFNvDecoder.h
+++ /dev/null
@@ -1,107 +0,0 @@
-#include<string>
-#include <pthread.h>
-
-#include <mutex>
-
-extern "C"
-{
-	#include <libavcodec/avcodec.h> 
-	#include <libavdevice/avdevice.h> 
-	#include <libavformat/avformat.h> 
-	#include <libavfilter/avfilter.h> 
-	#include <libavutil/avutil.h> 
-    #include <libavutil/pixdesc.h> 
-	#include <libswscale/swscale.h>
-    #include <libavutil/imgutils.h>
-}
-
-#include "common_header.h"
-
-#include "../interface/AbstractDecoder.h"
-
-using namespace std;
-
-class FFNvDecoder : public AbstractDecoder {
-public:
-    FFNvDecoder();
-    ~FFNvDecoder();
-    bool init(FFDecConfig& cfg);
-    void close();
-    bool start();
-    void pause();
-    void resume();
-
-    void setDecKeyframe(bool bKeyframe);
-
-    bool isRunning();
-    bool isFinished();
-    bool isPausing();
-    bool getResolution( int &width, int &height );
-
-    bool isSurport(FFDecConfig& cfg);
-
-    int getCachedQueueLength();
-
-    float fps();
-
-    DECODER_TYPE getDecoderType(){ return DECODER_TYPE_FFMPEG; }
-
-    FFImgInfo* snapshot();
-
-    void setName(string nm){
-        m_dec_name = nm;
-    }
-
-    string getName(){
-        return m_dec_name;
-    }
-
-    void setPostDecArg(const void* postDecArg);
-    void setFinishedDecArg(const void* finishedDecArg);
-
-public:
-    AVPixelFormat getHwPixFmt();
-
-private:
-    void decode_thread();
-    void post_decode_thread();
-    bool init(const char* uri, const char* gpuid, bool force_tcp);
-    void decode_finished();
-
-    DeviceRgbMemory* convert2bgr(AVFrame * gpuFrame);
-
-private:
-    string m_dec_name;
-    FFDecConfig m_cfg;
-
-    AVStream* stream;
-    AVCodecContext *avctx;
-    int stream_index;
-    AVFormatContext *fmt_ctx;
-    AVPixelFormat hw_pix_fmt;
-
-    pthread_t m_decode_thread;
-    pthread_t m_post_decode_thread;
-    
-    bool m_bRunning;
-    bool m_bFinished;
-
-    bool m_bPause;
-
-    bool m_bReal; // 是否实时流
-
-    float m_fps;
-
-    queue<AVFrame*> mFrameQueue;
-    mutex m_queue_mutex;
-    mutex m_snapshot_mutex;
-    long m_index{0};
-
-    bool m_dec_keyframe;
-
-    const void * m_postDecArg;
-    POST_DECODE_CALLBACK post_decoded_cbk;  // 解码数据回调接口
-
-    const void * m_finishedDecArg;
-    DECODE_FINISHED_CALLBACK decode_finished_cbk;
-};
\ No newline at end of file
diff --git a/src/nvdecoder/GpuRgbMemory.hpp b/src/nvdecoder/GpuRgbMemory.hpp
deleted file mode 100644
index 35eac65..0000000
--- a/src/nvdecoder/GpuRgbMemory.hpp
+++ /dev/null
@@ -1,34 +0,0 @@
-#include<string>
-
-#include "../interface/DeviceRgbMemory.hpp"
-#include "cuda_kernels.h"
-#include "define.hpp"
-#include "common_header.h"
-
-using namespace std;
-
-class GpuRgbMemory : public DeviceRgbMemory{
-
-public:
-     GpuRgbMemory(int _channel, int _width, int _height, string _id, string _gpuid, bool _key_frame, bool _isused)
-     :DeviceRgbMemory(_channel, _width, _height, _id, _gpuid, _key_frame, _isused){
-        gpuid = _gpuid;
-        cudaSetDevice(atoi(gpuid.c_str()));
-        CHECK_CUDA(cudaMalloc((void **)&pHwRgb, data_size * sizeof(unsigned char)));
-    }
-
-    ~GpuRgbMemory(){
-        if (pHwRgb) {
-            cudaSetDevice(atoi(gpuid.c_str()));
-            CHECK_CUDA(cudaFree(pHwRgb));
-            pHwRgb = nullptr;
-        }
-    }
-
-    string getGpuId() {
-        return gpuid;
-    }
-
-private:
-    string gpuid;
-};
\ No newline at end of file
diff --git a/src/nvdecoder/ImageSaveGPU.cpp b/src/nvdecoder/ImageSaveGPU.cpp
deleted file mode 100644
index dde9b64..0000000
--- a/src/nvdecoder/ImageSaveGPU.cpp
+++ /dev/null
@@ -1,123 +0,0 @@
-#include "cuda_kernels.h"
-
-#include "common_header.h"
-
-
-//int saveJPEG(const char *szOutputFile, float* d_srcRGB, int img_width, int img_height)
-//{
-//	return jpegNPP(szOutputFile, d_srcRGB, img_width, img_height);
-//	//return 0;
-//}
-//
-//int saveJPEG(const char *szOutputFile, unsigned char* d_srcRGB, int img_width, int img_height)
-//{
-//	return jpegNPP(szOutputFile, d_srcRGB, img_width, img_height);
-//	//return 0;
-//}
-//
-//int saveJPEG(const char *szOutputFile, unsigned char* d_srcRGB)
-//{
-//	return jpegNPP(szOutputFile, d_srcRGB);
-//}
-//
-//int saveJPEG(const char *szOutputFile, float* d_srcRGB)
-//{
-//	return jpegNPP(szOutputFile, d_srcRGB);
-//}
-
-int resizeFrame(float* d_srcRGB, int src_width, int src_height, float* d_dstRGB, int dst_width, int dst_height)
-{
-	cudaError_t cudaStatus = cuda_common::ResizeImage(d_srcRGB, src_width, src_height, d_dstRGB, dst_width, dst_height);
-	if (cudaStatus != cudaSuccess) {
-		LOG_ERROR("cuda_common::ResizeImage failed: {}",cudaGetErrorString(cudaStatus));
-		return -1;
-	}
-
-	return 0;
-}
-
-//int initTables()
-//{
-//	initTable();
-//	return 0;
-//}
-//
-//int initTables(int flag, int width, int height)
-//{
-//	initTable(0, width, height);
-//	return 0;
-//}
-
-int drawImageOnGPU(float* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom)
-{
-	cuda_common::DrawImage(d_srcRGB, src_width, src_height, left, top, right, bottom);
-	return 0;
-}
-
-int drawImageOnGPU(unsigned char* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom)
-{
-	cuda_common::DrawImage(d_srcRGB, src_width, src_height, left, top, right, bottom);
-	return 0;
-}
-
-int drawLineOnGPU(float* d_srcRGB, int src_width, int src_height, int begin_x, int begin_y, int end_x, int end_y)
-{
-	cuda_common::DrawLine(d_srcRGB, src_width, src_height, begin_x, begin_y, end_x, end_y);
-	return 0;
-}
-
-//int releaseJpegSaver()
-//{
-//	releaseJpegNPP();
-//	return 0;
-//}
-
-int partMemCopy(unsigned char* d_srcRGB, int src_width, int src_height, unsigned char* d_dstRGB, int left, int top, int right, int bottom)
-{
-	cudaError_t cudaStatus = cuda_common::PartMemCopy(d_srcRGB, src_width, src_height, d_dstRGB, left, top, right, bottom);
-	if (cudaStatus != cudaSuccess) {
-		LOG_ERROR("cuda_common::77 PartMemCopy failed: {} {} {} {} {} {} {}",cudaGetErrorString(cudaStatus), left, top, right, bottom, src_height, d_dstRGB);
-		return -1;
-	}
-
-	return 0;
-}
-//#include <fstream>
-//extern std::ofstream g_os;
-int PartMemResizeBatch(unsigned char * d_srcRGB, int src_width, int src_height, unsigned char** d_dstRGB,
-	int count, int* vleft, int * vtop, int* vright, int* vbottom, int *dst_w, int *dst_h,
-	float submeanb, float submeang, float submeanr,
-	float varianceb, float varianceg, float variancer)
-{
-	//g_os << "cudaMemcpyHostToDevice begin 9" << std::endl;
-	cudaError_t cudaStatus = cuda_common::PartMemResizeBatch(
-		d_srcRGB, src_width, src_height, d_dstRGB, count, vleft, vtop, vright, vbottom, dst_w, dst_h,
-		submeanb, submeang, submeanr,
-		varianceb, varianceg, variancer);
-	//g_os << "cudaMemcpyHostToDevice end 9" << std::endl;
-	if (cudaStatus != cudaSuccess) {
-		LOG_ERROR("cuda_common::PartMemResizeBatch failed: {}",cudaGetErrorString(cudaStatus));
-		return -1;
-	}
-
-	return 0;
-}
-
-
-//int PartMemResizeBatch(float * d_srcRGB, int src_width, int src_height, unsigned char* d_dstRGB, 
-//	int count, int* vleft, int * vtop, int* vright, int* vbottom, int dst_w, int dst_h,
-//	float submeanb, float submeang, float submeanr,
-//	float varianceb, float varianceg, float variancer)
-//
-//{
-//	cudaError_t cudaStatus = cuda_common::PartMemResizeBatch(
-//		d_srcRGB, src_width, src_height, d_dstRGB, count, vleft, vtop, vright, vbottom, dst_w, dst_h,
-//		submeanb, submeang, submeanr,
-//		varianceb, varianceg, variancer);
-//	if (cudaStatus != cudaSuccess) {
-//		fprintf(stderr, "cuda_common::PartMemCopy failed: %s\n", cudaGetErrorString(cudaStatus));
-//		return -1;
-//	}
-//
-//	return 0;
-//}
\ No newline at end of file
diff --git a/src/nvdecoder/ImageSaveGPU.h b/src/nvdecoder/ImageSaveGPU.h
deleted file mode 100644
index 272a6d2..0000000
--- a/src/nvdecoder/ImageSaveGPU.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/*******************************************************************************************
-* Version: VPT_x64_V2.0.0_20170904
-* CopyRight: 中科院自动化研究所模式识别实验室图像视频组
-* UpdateDate: 20170904
-* Content: 人车物监测跟踪
-********************************************************************************************/
-
-#ifndef IMAGESAVEGPU_H_
-#define IMAGESAVEGPU_H_
-
-#ifdef _MSC_VER
-	#ifdef IMAGESAVEGPU_EXPORTS
-		#define IMAGESAVEGPU_API __declspec(dllexport)
-	#else
-		#define IMAGESAVEGPU_API __declspec(dllimport)
-	#endif
-#else
-#define IMAGESAVEGPU_API __attribute__((visibility ("default")))
-#endif
-// 功能:保存成jpeg文件
-// szOutputFile		输出图片路径,如D:\\out.jpg
-// d_srcRGB			输入RGB数据,由cudaMalloc分配的显存空间,数据排列形式为:BBBBBB......GGGGGG......RRRRRRRR......
-// img_width		RGB数据图片的宽度
-// img_height		RGB数据图片的高度
-//
-//IMAGESAVEGPU_API int saveJPEG(const char *szOutputFile, float* d_srcRGB, int img_width, int img_height);
-//IMAGESAVEGPU_API int saveJPEG(const char *szOutputFile, float* d_srcRGB);
-//
-//IMAGESAVEGPU_API int saveJPEG(const char *szOutputFile, unsigned char* d_srcRGB, int img_width, int img_height);
-//IMAGESAVEGPU_API int saveJPEG(const char *szOutputFile, unsigned char* d_srcRGB);
-
-// 功能:防缩图像
-IMAGESAVEGPU_API int resizeFrame(float* d_srcRGB, int src_width, int src_height, float* d_dstRGB, int dst_width, int dst_height);
-
-// 功能:部分拷贝数据
-IMAGESAVEGPU_API int partMemCopy(unsigned char* d_srcRGB, int src_width, int src_height, unsigned char* d_dstRGB, int left, int top, int right, int bottom);
-
-//IMAGESAVEGPU_API int partMemResizeImage(float * d_srcRGB, int src_width, int src_height, unsigned char** d_dstRGB,
-//	int* vleft, int * vtop, int* vright, int* vbottom, int *dst_w, int *dst_h,
-//	float submeanb, float submeang, float submeanr,
-//	float varianceb, float varianceg, float variancer);
-
-
-IMAGESAVEGPU_API int PartMemResizeBatch(unsigned char * d_srcRGB, int src_width, int src_height, unsigned char** d_dstRGB,
-	int count, int* vleft, int * vtop, int* vright, int* vbottom, int *dst_w, int *dst_h,
-	float submeanb, float submeang, float submeanr,
-	float varianceb, float varianceg, float variancer);
-
-
-//// 功能:初始化GPU保存图像的各种量化表
-//IMAGESAVEGPU_API int initTables();
-//IMAGESAVEGPU_API int initTables(int falg, int width, int height);
-//
-//// 功能:释放资源
-//IMAGESAVEGPU_API int releaseJpegSaver();
-
-// 功能:在GPU中绘制快照包围框
-IMAGESAVEGPU_API int drawImageOnGPU(float* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom);
-
-IMAGESAVEGPU_API int drawImageOnGPU(unsigned char* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom);
-
-// 功能:在GPU中绘制直线
-IMAGESAVEGPU_API int drawLineOnGPU(float* d_srcRGB, int src_width, int src_height, int begin_x, int begin_y, int end_x, int end_y);
-
-#endif
diff --git a/src/nvdecoder/Makefile b/src/nvdecoder/Makefile
deleted file mode 100644
index 8b6ceff..0000000
--- a/src/nvdecoder/Makefile
+++ /dev/null
@@ -1,102 +0,0 @@
-# 各项目录
-LIB_DIR:=$(BUILD_DIR)/$(MODULE)/lib
-DEP_DIR:=$(BUILD_DIR)/$(MODULE)/.dep
-OBJ_DIR:=$(BUILD_DIR)/$(MODULE)/obj
-SRC_DIR:=$(TOP_DIR)/$(MODULE)
-
-# 源文件以及中间目标文件和依赖文件
-SRCS:=$(notdir $(wildcard $(SRC_DIR)/*.cpp))
-OBJS:=$(addprefix $(OBJ_DIR)/, $(patsubst %.cpp, %.o, $(SRCS)))
-DEPS:=$(addprefix $(DEP_DIR)/, $(patsubst %.cpp, %.d,a $(SRCS)))
-
-CUDA_ROOT = /usr/local/cuda-11.1
-NVCC = $(CUDA_ROOT)/bin/nvcc
-
-# 自动生成头文件依赖选项
-DEPFLAGS=-MT $@ -MMD -MP -MF $(DEP_DIR)/$*.d
-
-DEFS = -DENABLE_DVPP_INTERFACE
-
-# 最终目标文件
-TARGET:=$(LIB_DIR)/$(MODULE).a
-
-
-PROJECT_ROOT= /mnt/data/cmhu/FFNvDecoder
-
-DEPEND_DIR = $(PROJECT_ROOT)/bin
-THIRDPARTY_ROOT = $(PROJECT_ROOT)/3rdparty
-SPDLOG_ROOT = $(THIRDPARTY_ROOT)/spdlog-1.9.2/release
-JRTP_ROOT = $(THIRDPARTY_ROOT)/jrtp_export
-
-
-INCLUDE= -I $(DEPEND_DIR)/include \
-  -I $(CUDA_ROOT)/include \
-  -I $(TOP_DIR)/common/inc \
-  -I $(TOP_DIR)/common/UtilNPP \
-  -I $(TOP_DIR)/ \
-  -I $(SPDLOG_ROOT)/include \
-  -I $(JRTP_ROOT)/jrtplib/include/jrtplib3 \
-  -I $(JRTP_ROOT)/jthread/include/jthread
-
-LIBSPATH= -L $(DEPEND_DIR)/lib -lavformat -lavcodec -lswscale -lavutil -lavfilter -lswresample -lavdevice \
-   -L $(CUDA_ROOT)/lib64 -lcuda -lcudart -lnvcuvid -lcurand -lcublas -lnvjpeg \
-   -L $(SPDLOG_ROOT) -l:libspdlog.a \
-   -L $(JRTP_ROOT)/jthread/lib -l:libjthread.a \
-   -L $(JRTP_ROOT)/jrtplib/lib -l:libjrtp.a
-
-
-CXXFLAGS= -g -O0 -fPIC $(INCLUDE) $(DEFS) -lpthread -lrt -lz -fexceptions -std=c++11 -fvisibility=hidden -Wl,-Bsymbolic -ldl -Wwrite-strings
-# CFLAGS= -g -fPIC -O0 $(INCLUDE) -pthread -lrt -lz -std=c++11 -fvisibility=hidden -Wl,-Bsymbolic -ldl
-	# -DUNICODE -D_UNICODE
-
-NFLAGS_LIB=-g -c -shared -Xcompiler -fPIC -Xcompiler -fvisibility=hidden
-NFLAGS = $(NFLAGS_LIB) $(INCLUDE) -std=c++11
-
-# CU_SOURCES = $(wildcard ${SRC_DIR}/*.cu)
-# CU_OBJS = $(patsubst %.cu, %.o, $(notdir $(CU_SOURCES)))
-
-CU_SOURCES:=$(notdir $(wildcard $(SRC_DIR)/*.cu))
-CU_OBJS:=$(addprefix $(OBJ_DIR)/, $(patsubst %.cu, %.o, $(CU_SOURCES)))
-
-
-# 默认最终目标
-.PHONY:all
-all:$(TARGET)
-
-# 生成最终目标
-$(TARGET):$(OBJS) $(CU_OBJS) | $(LIB_DIR)
-	@echo -e "\e[32m""Linking static library $(TARGET)""\e[0m"
-# @ar -rc $@ $^
-
-# 若没有lib目录则自动生成
-$(LIB_DIR):
-	@mkdir -p $@
-
-# 生成中间目标文件
-$(OBJ_DIR)/%.o:$(SRC_DIR)/%.cpp $(DEP_DIR)/%.d | $(OBJ_DIR) $(DEP_DIR)
-# @echo -e "\e[33m""Building object $@""\e[0m"
-	@$(CXX) -c $(DEPFLAGS) $(CXXFLAGS) $(INCS) $(LIBSPATH) $(MACROS) -o $@ $<
-
-$(OBJ_DIR)%.o:$(SRC_DIR)/%.cu
-	@echo "#######################CU_OBJS:$@###############"
-	$(NVCC) $(NFLAGS) -o $@ $< 
-
-
-# 若没有obj目录则自动生成
-$(OBJ_DIR):
-	@mkdir -p $@
-
-# 若没有.dep目录则自动生成
-$(DEP_DIR):
-	@mkdir -p $@
-
-# 依赖文件会在生成中间文件的时候自动生成,这里只是为了防止报错
-$(DEPS):
-
-# 引入中间目标文件头文件依赖关系
-include $(wildcard $(DEPS))
-
-# 直接删除组件build目录
-.PHONY:clean
-clean:
-	@rm -rf $(BUILD_DIR)/$(MODULE)
diff --git a/src/nvdecoder/NV12ToRGB.cu b/src/nvdecoder/NV12ToRGB.cu
deleted file mode 100644
index 68e54ac..0000000
--- a/src/nvdecoder/NV12ToRGB.cu
+++ /dev/null
@@ -1,345 +0,0 @@
-
-#include "cuda_kernels.h"
-
-#include <builtin_types.h>
-#include "helper_cuda_drvapi.h"
-
-typedef unsigned char   uint8;
-typedef unsigned int    uint32;
-typedef int             int32;
-
-#define COLOR_COMPONENT_MASK            0x3FF
-#define COLOR_COMPONENT_BIT_SIZE        10
-
-namespace cuda_common
-{
-
-#define MUL(x,y)    ((x)*(y))
-
-	__constant__ float  constHueColorSpaceMat2[9];  //默认分配到0卡上,未找到分配到指定卡上设置方法,当前也未用到,先注释掉
-
-	__device__ void YUV2RGB2(uint32 *yuvi, float *red, float *green, float *blue)
-	{
-		float luma, chromaCb, chromaCr;
-
-		// Prepare for hue adjustment
-		luma = (float)yuvi[0];
-		chromaCb = (float)((int32)yuvi[1] - 512.0f);
-		chromaCr = (float)((int32)yuvi[2] - 512.0f);
-
-
-		// Convert YUV To RGB with hue adjustment
-		*red = MUL(luma, constHueColorSpaceMat2[0]) +
-			MUL(chromaCb, constHueColorSpaceMat2[1]) +
-			MUL(chromaCr, constHueColorSpaceMat2[2]);
-		*green = MUL(luma, constHueColorSpaceMat2[3]) +
-			MUL(chromaCb, constHueColorSpaceMat2[4]) +
-			MUL(chromaCr, constHueColorSpaceMat2[5]);
-		*blue = MUL(luma, constHueColorSpaceMat2[6]) +
-			MUL(chromaCb, constHueColorSpaceMat2[7]) +
-			MUL(chromaCr, constHueColorSpaceMat2[8]);
-
-	}
-
-	__device__ unsigned char clip_v(int x, int min_val, int  max_val) {
-		if (x>max_val) {
-			return max_val;
-		}
-		else if (x<min_val) {
-			return min_val;
-		}
-		else {
-			return x;
-		}
-	}
-	// CUDA kernel for outputing the final RGB output from NV12;
-	extern "C"
-		__global__ void NV12ToRGB_drvapi2(uint32 *srcImage, size_t nSourcePitch, unsigned char *dstImage, int width, int height)
-	{
-
-		int32 x, y;
-		uint32 yuv101010Pel[2];
-		uint32 processingPitch = ((width)+63) & ~63;
-		uint8 *srcImageU8 = (uint8 *)srcImage;
-
-		processingPitch = nSourcePitch;
-
-		// Pad borders with duplicate pixels, and we multiply by 2 because we process 2 pixels per thread
-		x = blockIdx.x * (blockDim.x << 1) + (threadIdx.x << 1);
-		y = blockIdx.y *  blockDim.y + threadIdx.y;
-
-		if (x >= width)
-		{
-			//printf("x >= width\n");
-			//*flag = -1;
-			return; //x = width - 1;
-		}
-			//return; //x = width - 1;
-
-		if (y >= height)
-		{
-			//printf("y >= height\n");
-			//*flag = -1;
-			return; // y = height - 1;
-		}
-
-		// Read 2 Luma components at a time, so we don't waste processing since CbCr are decimated this way.
-		// if we move to texture we could read 4 luminance values
-		yuv101010Pel[0] = (srcImageU8[y * processingPitch + x]) << 2;
-		yuv101010Pel[1] = (srcImageU8[y * processingPitch + x + 1]) << 2;
-
-		uint32 chromaOffset = processingPitch * height;
-		int32 y_chroma = y >> 1;
-
-		if (y & 1)  // odd scanline ?
-		{
-			uint32 chromaCb;
-			uint32 chromaCr;
-
-			chromaCb = srcImageU8[chromaOffset + y_chroma * processingPitch + x];
-			chromaCr = srcImageU8[chromaOffset + y_chroma * processingPitch + x + 1];
-
-			if (y_chroma < ((height >> 1) - 1)) // interpolate chroma vertically
-			{
-				chromaCb = (chromaCb + srcImageU8[chromaOffset + (y_chroma + 1) * processingPitch + x] + 1) >> 1;
-				chromaCr = (chromaCr + srcImageU8[chromaOffset + (y_chroma + 1) * processingPitch + x + 1] + 1) >> 1;
-			}
-
-			yuv101010Pel[0] |= (chromaCb << (COLOR_COMPONENT_BIT_SIZE + 2));
-			yuv101010Pel[0] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
-
-			yuv101010Pel[1] |= (chromaCb << (COLOR_COMPONENT_BIT_SIZE + 2));
-			yuv101010Pel[1] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
-		}
-		else
-		{
-			yuv101010Pel[0] |= ((uint32)srcImageU8[chromaOffset + y_chroma * processingPitch + x] << (COLOR_COMPONENT_BIT_SIZE + 2));
-			yuv101010Pel[0] |= ((uint32)srcImageU8[chromaOffset + y_chroma * processingPitch + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
-
-			yuv101010Pel[1] |= ((uint32)srcImageU8[chromaOffset + y_chroma * processingPitch + x] << (COLOR_COMPONENT_BIT_SIZE + 2));
-			yuv101010Pel[1] |= ((uint32)srcImageU8[chromaOffset + y_chroma * processingPitch + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
-		}
-
-		// this steps performs the color conversion
-		uint32 yuvi[6];
-		float red[2], green[2], blue[2];
-
-		yuvi[0] = (yuv101010Pel[0] & COLOR_COMPONENT_MASK);
-		yuvi[1] = ((yuv101010Pel[0] >> COLOR_COMPONENT_BIT_SIZE)       & COLOR_COMPONENT_MASK);
-		yuvi[2] = ((yuv101010Pel[0] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK);
-
-		yuvi[3] = (yuv101010Pel[1] & COLOR_COMPONENT_MASK);
-		yuvi[4] = ((yuv101010Pel[1] >> COLOR_COMPONENT_BIT_SIZE)       & COLOR_COMPONENT_MASK);
-		yuvi[5] = ((yuv101010Pel[1] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK);
-
-		// YUV to RGB Transformation conversion
-		YUV2RGB2(&yuvi[0], &red[0], &green[0], &blue[0]);
-		YUV2RGB2(&yuvi[3], &red[1], &green[1], &blue[1]);
-
-
-		dstImage[y * width * 3 + x * 3] = clip_v(blue[0] * 0.25,0 ,255);
-		dstImage[y * width * 3 + x * 3 + 3] = clip_v(blue[1] * 0.25,0, 255);
-
-		dstImage[width * y * 3 + x * 3 + 1] = clip_v(green[0] * 0.25,0 ,255);
-		dstImage[width * y * 3 + x * 3 + 4] = clip_v(green[1] * 0.25,0, 255);
-
-		dstImage[width * y * 3 + x * 3 + 2] = clip_v(red[0] * 0.25, 0, 255);
-		dstImage[width * y * 3 + x * 3 + 5] = clip_v(red[1] * 0.25,0 ,255);
-
-
-		//dstImage[y * width * 3 + x * 3] = blue[0] * 0.25;
-		//dstImage[y * width * 3 + x * 3 + 3] = blue[1] * 0.25;
-
-		//dstImage[width * y * 3 + x * 3 + 1] =green[0] * 0.25;
-		//dstImage[width * y * 3 + x * 3 + 4] = green[1] * 0.25;
-
-		//dstImage[width * y * 3 + x * 3 + 2] = red[0] * 0.25;
-		//dstImage[width * y * 3 + x * 3 + 5] = red[1] * 0.25;
-
-		// Clamp the results to BBBBBB....GGGGGGG.......RRRRRRR....
-		//              dstImage[y * width + x] = blue[0] * 0.25;
-		//              dstImage[y * width + x + 1] = blue[1] * 0.25;
-
-		//              dstImage[width * height + y * width + x] = green[0] * 0.25;
-		//              dstImage[width * height + y * width + x + 1] = green[1] * 0.25;
-
-		//              dstImage[width * height * 2 + y * width + x] = red[0] * 0.25;
-		//              dstImage[width * height * 2 + y * width + x + 1] = red[1] * 0.25;
-		return;
-
-	}
-
-		// CUDA kernel for outputing the final RGB output from NV12;
-	extern "C"
-		__global__ void CUDAToBGR_drvapi(uint32 *dataY, uint32 *dataUV, size_t pitchY, size_t pitchUV, unsigned char *dstImage, int width, int height)
-	{
-
-		int32 x, y;
-
-		// Pad borders with duplicate pixels, and we multiply by 2 because we process 2 pixels per thread
-		x = blockIdx.x * (blockDim.x << 1) + (threadIdx.x << 1);
-		y = blockIdx.y *  blockDim.y + threadIdx.y;
-
-		if (x >= width)
-		{
-			return; 
-		}
-
-		if (y >= height)
-		{
-			return; 
-		}
-
-		uint32 yuv101010Pel[2];
-		uint8 *srcImageU8_Y = (uint8 *)dataY;
-		uint8 *srcImageU8_UV = (uint8 *)dataUV;
-
-		// Read 2 Luma components at a time, so we don't waste processing since CbCr are decimated this way.
-		// if we move to texture we could read 4 luminance values
-		yuv101010Pel[0] = (srcImageU8_Y[y * pitchY + x]) << 2;
-		yuv101010Pel[1] = (srcImageU8_Y[y * pitchY + x + 1]) << 2;
-
-		int32 y_chroma = y >> 1;
-
-		if (y & 1)  // odd scanline ?
-		{
-			uint32 chromaCb;
-			uint32 chromaCr;
-
-			chromaCb = srcImageU8_UV[y_chroma * pitchUV + x];
-			chromaCr = srcImageU8_UV[y_chroma * pitchUV + x + 1];
-
-			if (y_chroma < ((height >> 1) - 1)) // interpolate chroma vertically
-			{
-				chromaCb = (chromaCb + srcImageU8_UV[(y_chroma + 1) * pitchUV + x] + 1) >> 1;
-				chromaCr = (chromaCr + srcImageU8_UV[(y_chroma + 1) * pitchUV + x + 1] + 1) >> 1;
-			}
-
-			yuv101010Pel[0] |= (chromaCb << (COLOR_COMPONENT_BIT_SIZE + 2));
-			yuv101010Pel[0] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
-
-			yuv101010Pel[1] |= (chromaCb << (COLOR_COMPONENT_BIT_SIZE + 2));
-			yuv101010Pel[1] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
-		}
-		else
-		{
-			yuv101010Pel[0] |= ((uint32)srcImageU8_UV[y_chroma * pitchUV + x] << (COLOR_COMPONENT_BIT_SIZE + 2));
-			yuv101010Pel[0] |= ((uint32)srcImageU8_UV[y_chroma * pitchUV + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
-
-			yuv101010Pel[1] |= ((uint32)srcImageU8_UV[y_chroma * pitchUV + x] << (COLOR_COMPONENT_BIT_SIZE + 2));
-			yuv101010Pel[1] |= ((uint32)srcImageU8_UV[y_chroma * pitchUV + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
-		}
-
-		// this steps performs the color conversion
-		uint32 yuvi[6];
-		float red[2], green[2], blue[2];
-
-		yuvi[0] = (yuv101010Pel[0] & COLOR_COMPONENT_MASK);
-		yuvi[1] = ((yuv101010Pel[0] >> COLOR_COMPONENT_BIT_SIZE)       & COLOR_COMPONENT_MASK);
-		yuvi[2] = ((yuv101010Pel[0] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK);
-
-		yuvi[3] = (yuv101010Pel[1] & COLOR_COMPONENT_MASK);
-		yuvi[4] = ((yuv101010Pel[1] >> COLOR_COMPONENT_BIT_SIZE)       & COLOR_COMPONENT_MASK);
-		yuvi[5] = ((yuv101010Pel[1] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK);
-
-		// YUV to RGB Transformation conversion
-		YUV2RGB2(&yuvi[0], &red[0], &green[0], &blue[0]);
-		YUV2RGB2(&yuvi[3], &red[1], &green[1], &blue[1]);
-
-
-		dstImage[y * width * 3 + x * 3] = clip_v(blue[0] * 0.25,0 ,255);
-		dstImage[y * width * 3 + x * 3 + 3] = clip_v(blue[1] * 0.25,0, 255);
-
-		dstImage[width * y * 3 + x * 3 + 1] = clip_v(green[0] * 0.25,0 ,255);
-		dstImage[width * y * 3 + x * 3 + 4] = clip_v(green[1] * 0.25,0, 255);
-
-		dstImage[width * y * 3 + x * 3 + 2] = clip_v(red[0] * 0.25, 0, 255);
-		dstImage[width * y * 3 + x * 3 + 5] = clip_v(red[1] * 0.25,0 ,255);
-	}
-
-	cudaError_t setColorSpace(FF_ColorSpace CSC, float hue)
-	{
-		float hueSin = sin(hue);
-		float hueCos = cos(hue);
-
-		float hueCSC[9];
-		if (CSC == ITU_601)
-		{
-			//CCIR 601
-			hueCSC[0] = 1.1644f;
-			hueCSC[1] = hueSin * 1.5960f;
-			hueCSC[2] = hueCos * 1.5960f;
-			hueCSC[3] = 1.1644f;
-			hueCSC[4] = (hueCos * -0.3918f) - (hueSin * 0.8130f);
-			hueCSC[5] = (hueSin *  0.3918f) - (hueCos * 0.8130f);
-			hueCSC[6] = 1.1644f;
-			hueCSC[7] = hueCos *  2.0172f;
-			hueCSC[8] = hueSin * -2.0172f;
-		}
-		else if (CSC == ITU_709)
-		{
-			//CCIR 709
-			hueCSC[0] = 1.0f;
-			hueCSC[1] = hueSin * 1.57480f;
-			hueCSC[2] = hueCos * 1.57480f;
-			hueCSC[3] = 1.0;
-			hueCSC[4] = (hueCos * -0.18732f) - (hueSin * 0.46812f);
-			hueCSC[5] = (hueSin *  0.18732f) - (hueCos * 0.46812f);
-			hueCSC[6] = 1.0f;
-			hueCSC[7] = hueCos *  1.85560f;
-			hueCSC[8] = hueSin * -1.85560f;
-		}
-
-		cudaError_t cudaStatus = cudaMemcpyToSymbol(constHueColorSpaceMat2, hueCSC, 9 * sizeof(float), 0, cudaMemcpyHostToDevice);
-		float tmpf[9];
-		memset(tmpf, 0, 9 * sizeof(float));
-		cudaMemcpyFromSymbol(tmpf, constHueColorSpaceMat2, 9 * sizeof(float), 0, ::cudaMemcpyDefault);
-		cudaDeviceSynchronize();
-
-		if (cudaStatus != cudaSuccess) {
-			fprintf(stderr, "cudaMemcpyToSymbol failed: %s\n", cudaGetErrorString(cudaStatus));
-		}
-
-		return cudaStatus;
-	}
-
-	cudaError_t NV12ToRGBnot(CUdeviceptr d_srcNV12, size_t nSourcePitch, unsigned char* d_dstRGB, int width, int height)
-	{
-		dim3 block(32, 16, 1);
-		dim3 grid((width + (2 * block.x - 1)) / (2 * block.x), (height + (block.y - 1)) / block.y, 1);
-		NV12ToRGB_drvapi2 << < grid, block >> >((uint32 *)d_srcNV12, nSourcePitch, d_dstRGB, width, height);
-		cudaError_t cudaStatus = cudaGetLastError();
-		if (cudaStatus != cudaSuccess) {
-			fprintf(stderr, "NV12ToRGB_drvapi launch failed: %s\n", cudaGetErrorString(cudaStatus));
-			return cudaStatus;
-		}
-
-		cudaStatus = cudaDeviceSynchronize();
-		if (cudaStatus != cudaSuccess) {
-			fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching NV12ToRGB_drvapi !\n", cudaStatus);
-			return cudaStatus;
-		}
-
-		return cudaStatus;
-	}
-
-	cudaError_t CUDAToBGR(CUdeviceptr dataY, CUdeviceptr dataUV, size_t pitchY, size_t pitchUV, unsigned char* d_dstRGB, int width, int height)
-	{
-		dim3 block(32, 16, 1);
-		dim3 grid((width + (2 * block.x - 1)) / (2 * block.x), (height + (block.y - 1)) / block.y, 1);
-		CUDAToBGR_drvapi << < grid, block >> >((uint32 *)dataY, (uint32 *)dataUV, pitchY, pitchUV, d_dstRGB, width, height);
-		cudaError_t cudaStatus = cudaGetLastError();
-		if (cudaStatus != cudaSuccess) {
-			fprintf(stderr, "NV12ToRGB_drvapi launch failed: %s\n", cudaGetErrorString(cudaStatus));
-			return cudaStatus;
-		}
-
-		cudaStatus = cudaDeviceSynchronize();
-		if (cudaStatus != cudaSuccess) {
-			fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching NV12ToRGB_drvapi !\n", cudaStatus);
-			return cudaStatus;
-		}
-
-		return cudaStatus;
-	}
-}
\ No newline at end of file
diff --git a/src/nvdecoder/NvDecoderApi.cpp b/src/nvdecoder/NvDecoderApi.cpp
deleted file mode 100644
index efb63cd..0000000
--- a/src/nvdecoder/NvDecoderApi.cpp
+++ /dev/null
@@ -1,133 +0,0 @@
-#include "NvDecoderApi.h"
-#include "FFNvDecoder.h"
-
-NvDecoderApi::NvDecoderApi(){
-    m_pDecoder = nullptr;
-}
-
-NvDecoderApi::~NvDecoderApi(){
-    if(m_pDecoder != nullptr){
-        delete m_pDecoder;
-        m_pDecoder = nullptr;
-    }
-}
-
-bool NvDecoderApi::init(FFDecConfig& cfg){
-    m_pDecoder = new FFNvDecoder();
-    if(m_pDecoder != nullptr){
-        return m_pDecoder->init(cfg);
-    }
-    return false;
-}
-
-void NvDecoderApi::close(){
-    if(m_pDecoder != nullptr){
-        return m_pDecoder->close();
-    }
-}
-
-bool NvDecoderApi::start(){
-    if(m_pDecoder != nullptr){
-        return m_pDecoder->start();
-    }
-    return false;
-}
-
-void NvDecoderApi::pause(){
-    if(m_pDecoder != nullptr){
-        return m_pDecoder->pause();
-    }
-}
-
-void NvDecoderApi::resume(){
-    if(m_pDecoder != nullptr){
-        return m_pDecoder->resume();
-    }
-}
-
-void NvDecoderApi::setDecKeyframe(bool bKeyframe){
-    if(m_pDecoder != nullptr){
-        return m_pDecoder->setDecKeyframe(bKeyframe);
-    }
-}
-
-bool NvDecoderApi::isRunning(){
-    if(m_pDecoder != nullptr){
-        return m_pDecoder->isRunning();
-    }
-    return false;
-}
-
-bool NvDecoderApi::isFinished(){
-    if(m_pDecoder != nullptr){
-        return m_pDecoder->isFinished();
-    }
-    return false;
-}
-
-bool NvDecoderApi::isPausing(){
-    if(m_pDecoder != nullptr){
-        return m_pDecoder->isPausing();
-    }
-    return false;
-}
-
-bool NvDecoderApi::getResolution(int &width, int &height){
-    if(m_pDecoder != nullptr){
-        return m_pDecoder->getResolution(width, height);
-    }
-    return false;
-}
-
-bool NvDecoderApi::isSurport(FFDecConfig& cfg){
-    if(m_pDecoder != nullptr){
-        return m_pDecoder->isSurport(cfg);
-    }
-    return false;
-}
-
-float NvDecoderApi::fps(){
-    if(m_pDecoder != nullptr){
-        return m_pDecoder->fps();
-    }
-    return 0.0;
-}
-
-int NvDecoderApi::getCachedQueueLength(){
-    if(m_pDecoder != nullptr){
-        return m_pDecoder->getCachedQueueLength();
-    }
-    return 0;
-}
-
-void NvDecoderApi::setName(string nm){
-    if(m_pDecoder != nullptr){
-        return m_pDecoder->setName(nm);
-    }
-}
-
-string NvDecoderApi::getName(){
-    if(m_pDecoder != nullptr){
-        return m_pDecoder->getName();
-    }
-    return nullptr;
-}
-
-FFImgInfo* NvDecoderApi::snapshot(){
-    if(m_pDecoder != nullptr){
-        return m_pDecoder->snapshot();
-    }
-    return nullptr;
-}
-
-void NvDecoderApi::setPostDecArg(const void* postDecArg){
-    if(m_pDecoder != nullptr){
-        return m_pDecoder->setPostDecArg(postDecArg);
-    }
-}
-
-void NvDecoderApi::setFinishedDecArg(const void* finishedDecArg){
-    if(m_pDecoder != nullptr){
-        return m_pDecoder->setFinishedDecArg(finishedDecArg);
-    }
-}
\ No newline at end of file
diff --git a/src/nvdecoder/NvDecoderApi.h b/src/nvdecoder/NvDecoderApi.h
deleted file mode 100644
index f742dd8..0000000
--- a/src/nvdecoder/NvDecoderApi.h
+++ /dev/null
@@ -1,44 +0,0 @@
-#include<string>
-#include <pthread.h>
-
-#include "common_header.h"
-#include "../interface/AbstractDecoder.h"
-
-using namespace std;
-
-class FFNvDecoder;
-
-class NvDecoderApi : public AbstractDecoder{
-public:
-    NvDecoderApi();
-    ~NvDecoderApi();
-    bool init(FFDecConfig& cfg);
-    void close();
-    bool start();
-    void pause();
-    void resume();
-
-    void setDecKeyframe(bool bKeyframe);
-
-    bool isRunning();
-    bool isFinished();
-    bool isPausing();
-    bool getResolution( int &width, int &height );
-
-    bool isSurport(FFDecConfig& cfg);
-
-    int getCachedQueueLength();
-
-    float fps();
-
-    FFImgInfo* snapshot();
-
-    DECODER_TYPE getDecoderType(){ return DECODER_TYPE_DVPP; }
-    void setName(string nm);
-    string getName();
-
-    void setPostDecArg(const void* postDecArg);
-    void setFinishedDecArg(const void* finishedDecArg);
-private:
-    FFNvDecoder* m_pDecoder;
-};
\ No newline at end of file
diff --git a/src/nvdecoder/NvJpegEncoder.cpp b/src/nvdecoder/NvJpegEncoder.cpp
deleted file mode 100644
index 7ee0727..0000000
--- a/src/nvdecoder/NvJpegEncoder.cpp
+++ /dev/null
@@ -1,90 +0,0 @@
-#include "NvJpegEncoder.h"
-
-#include <fstream>
-#include <vector>
-#include <iostream>
-
-
-#define CHECK_NVJPEG(S) do {nvjpegStatus_t  status; \
-        status = S; \
-        if (status != NVJPEG_STATUS_SUCCESS ) std::cout << __LINE__ <<" CHECK_NVJPEG - status = " << status << std::endl; \
-        } while (false)
-
-
-int saveJpeg(const char * filepath, unsigned char* d_srcBGR, int width, int height, cudaStream_t stream)
-{
-    nvjpegHandle_t nvjpeg_handle;
-    nvjpegEncoderState_t encoder_state;
-    nvjpegEncoderParams_t encoder_params;
-
-    cudaEvent_t ev_start, ev_end;
-    cudaEventCreate(&ev_start);
-    cudaEventCreate(&ev_end);
-
-    nvjpegImage_t input;
-    nvjpegInputFormat_t input_format = NVJPEG_INPUT_BGRI;
-    int image_width = width;
-    int image_height = height;
-
-    // int channel_size = image_width * image_height;
-    // for (int i = 0; i < 3; i++)
-    // {
-    //     input.pitch[i] = image_width;
-    //     (cudaMalloc((void**)&(input.channel[i]), channel_size));
-    //     (cudaMemset(input.channel[i], 50 * 40 * i, channel_size));
-    // }
-
-    input.channel[0] = d_srcBGR;
-    input.pitch[0] = image_width * 3;
-
-    nvjpegBackend_t backend = NVJPEG_BACKEND_DEFAULT;
-
-    CHECK_NVJPEG(nvjpegCreate(backend, nullptr, &nvjpeg_handle));
-    
-    CHECK_NVJPEG(nvjpegEncoderParamsCreate(nvjpeg_handle, &encoder_params, stream));
-    CHECK_NVJPEG(nvjpegEncoderStateCreate(nvjpeg_handle, &encoder_state, stream));
-
-    // set params
-    CHECK_NVJPEG(nvjpegEncoderParamsSetEncoding(encoder_params, nvjpegJpegEncoding_t::NVJPEG_ENCODING_PROGRESSIVE_DCT_HUFFMAN, stream));
-    CHECK_NVJPEG(nvjpegEncoderParamsSetOptimizedHuffman(encoder_params, 1, stream));
-    CHECK_NVJPEG(nvjpegEncoderParamsSetQuality(encoder_params, 70, stream));
-    CHECK_NVJPEG(nvjpegEncoderParamsSetSamplingFactors(encoder_params, nvjpegChromaSubsampling_t::NVJPEG_CSS_420, stream));
-
-    cudaEventRecord(ev_start);
-    CHECK_NVJPEG(nvjpegEncodeImage(nvjpeg_handle, encoder_state, encoder_params, &input, input_format, image_width, image_height, stream));
-    cudaEventRecord(ev_end);
-
-    std::vector<unsigned char> obuffer;
-    size_t length;
-    CHECK_NVJPEG(nvjpegEncodeRetrieveBitstream(
-        nvjpeg_handle,
-        encoder_state,
-        NULL,
-        &length,
-        stream));
-
-    obuffer.resize(length);
-    CHECK_NVJPEG(nvjpegEncodeRetrieveBitstream(
-        nvjpeg_handle,
-        encoder_state,
-        obuffer.data(),
-        &length,
-        stream));
-
-    cudaEventSynchronize(ev_end);
-
-    // 用完销毁,避免显存泄露
-    nvjpegEncoderParamsDestroy(encoder_params);
-    nvjpegEncoderStateDestroy(encoder_state);
-    nvjpegDestroy(nvjpeg_handle);
-
-    float ms;
-    cudaEventElapsedTime(&ms, ev_start, ev_end);
-    // std::cout << "time spend " << ms << " ms" << std::endl;
-
-    std::ofstream outputFile(filepath, std::ios::out | std::ios::binary);
-    outputFile.write(reinterpret_cast<const char *>(obuffer.data()), static_cast<int>(length));
-    outputFile.close();
-    
-    return 0;
-}
\ No newline at end of file
diff --git a/src/nvdecoder/NvJpegEncoder.h b/src/nvdecoder/NvJpegEncoder.h
deleted file mode 100644
index 3c27ba8..0000000
--- a/src/nvdecoder/NvJpegEncoder.h
+++ /dev/null
@@ -1,3 +0,0 @@
-#include <nvjpeg.h>
-
-int saveJpeg(const char * filepath, unsigned char* d_srcBGR, int width, int height, cudaStream_t stream);
\ No newline at end of file
diff --git a/src/nvdecoder/PartMemCopy.cu b/src/nvdecoder/PartMemCopy.cu
deleted file mode 100644
index 396765b..0000000
--- a/src/nvdecoder/PartMemCopy.cu
+++ /dev/null
@@ -1,289 +0,0 @@
-#include "cuda_kernels.h"
-#include <algorithm>
-typedef unsigned char   uchar;
-typedef unsigned int    uint32;
-typedef int             int32;
-
-#define MAX_SNAPSHOT_WIDTH 320
-#define MAX_SNAPSHOT_HEIGHT 320
-
-namespace cuda_common
-{
-	__global__ void kernel_memcopy(unsigned char* d_srcRGB, int src_width, int src_height,
-		unsigned char* d_dstRGB, int left, int top, int right, int bottom)
-	{
-		const int dst_x = blockIdx.x * blockDim.x + threadIdx.x;
-		const int dst_y = blockIdx.y * blockDim.y + threadIdx.y;
-		const int dst_width = right - left;
-		const int dst_height = bottom - top;
-		if (dst_x < dst_width && dst_y < dst_height)
-		{
-			int src_x = left + dst_x;
-			int src_y = top + dst_y;
-
-			//bgr...bgr...bgr...
-			d_dstRGB[(dst_y*dst_width + dst_x) * 3] = (unsigned char)d_srcRGB[(src_y*src_width + src_x) * 3];
-			d_dstRGB[(dst_y*dst_width + dst_x)
-				* 3 + 1] = (unsigned char)d_srcRGB[(src_y*src_width + src_x) * 3 + 1];
-			d_dstRGB[(dst_y*dst_width + dst_x) * 3 + 2] = (unsigned char)d_srcRGB[(src_y*src_width + src_x) * 3 + 2];
-
-			//bbb...ggg...rrr...
-			//d_dstRGB[(dst_y*dst_width) + dst_x] = (unsigned char)d_srcRGB[(src_y*src_width) + src_x];
-			//d_dstRGB[(dst_width*dst_height) + (dst_y*dst_width) + dst_x] = (unsigned char)d_srcRGB[(src_width*src_height) + (src_y*src_width) + src_x];
-			//d_dstRGB[(2 * dst_width*dst_height) + (dst_y*dst_width) + dst_x] = (unsigned char)d_srcRGB[(2 * src_width*src_height) + (src_y*src_width) + src_x];
-
-			/*	memcpy(d_dstRGB + (dst_y*src_width) + dst_x, d_srcRGB + (src_y*src_width) + src_x, sizeof(float));
-			memcpy(d_dstRGB + (src_width*src_height) + (dst_y*src_width) + dst_x, d_srcRGB + (src_width*src_height) + (src_y*src_width) + src_x, sizeof(float));
-			memcpy(d_dstRGB + (2 * src_width*src_height) + (dst_y*src_width) + dst_x, d_srcRGB + (2 * src_width*src_height) + (src_y*src_width) + src_x, sizeof(float));*/
-		}
-	}
-
-	cudaError_t PartMemCopy(unsigned char* d_srcRGB, int src_width, int src_height, unsigned char* d_dstRGB, int left, int top, int right, int bottom)
-	{
-		dim3 block(32, 16, 1);
-		dim3 grid(((right - left) + (block.x - 1)) / block.x, ((bottom - top) + (block.y - 1)) / block.y, 1);
-
-		kernel_memcopy << < grid, block >> > (d_srcRGB, src_width, src_height, d_dstRGB, left, top, right, bottom);
-
-		cudaError_t cudaStatus = cudaGetLastError();
-		if (cudaStatus != cudaSuccess) {
-			fprintf(stderr, "Part 50 kernel_memcopy launch failed: %s\n", cudaGetErrorString(cudaStatus));
-			return cudaStatus;
-		}
-		cudaStatus = cudaDeviceSynchronize();
-		if (cudaStatus != cudaSuccess) {
-			fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_bilinear!\n", cudaStatus);
-			return cudaStatus;
-		}
-		return cudaStatus;
-	}
-
-
-	//    __global__ void kernel_memcopy_mean_variance(float* d_srcRGB, int src_width, int src_height, 
-	//            unsigned char* vd_dstRGB, int count, int * vleft, int* vtop, int* vright, int * vbottom, float submeanb,float submeang, float submeanr, float varianceb,float varianceg, float variancer)
-	//    {
-	//        const int dst_x = blockIdx.x * blockDim.x + threadIdx.x;
-	//        const int dst_y = blockIdx.y * blockDim.y + threadIdx.y;
-	//        for (int i=0;i<count;i++)
-	//        {
-	//                const int left = vleft[i];
-	//                const int right = vright[i];
-	//                const int top = vtop[i];
-	//                const int bottom = vbottom[i];
-	//        
-	//                const int dst_width = right - left;
-	//                const int dst_height = bottom - top;
-	//
-	//
-	//                unsigned char * d_dstRGB = vd_dstRGB + i *   ;
-	//
-	//                if (dst_x < dst_width && dst_y < dst_height)
-	//                {
-	//                    int src_x = left + dst_x;
-	//                    int src_y = top + dst_y;
-	//        
-	//                    d_dstRGB[(dst_y*dst_width) + dst_x] = (d_srcRGB[(src_y*src_width) + src_x] - submeanb)*varianceb;
-	//                    d_dstRGB[(dst_width*dst_height) + (dst_y*dst_width) + dst_x] = (d_srcRGB[(src_width*src_height) + (src_y*src_width) + src_x] -submeang)*varianceg;
-	//                    d_dstRGB[(2 * dst_width*dst_height) + (dst_y*dst_width) + dst_x] = (d_srcRGB[(2 * src_width*src_height) + (src_y*src_width) + src_x] - submeanr) * variancer;
-	//        
-	//                }
-	//        }
-	//    }
-	__global__ void PartCopy_ResizeImgBilinearBGR_Mean_Variance_CUDAKernel(
-		unsigned char * d_srcRGB, int srcimg_width, int srcimg_height,
-		int* vleft, int* vtop, int* vright, int * vbottom,
-		unsigned char** vd_dstRGB, int count, int *dst_width, int *dst_height,
-		float submeanb, float submeang, float submeanr,
-		float varianceb, float varianceg, float variancer)
-	{
-		int i = blockIdx.z;
-
-		//for (int i = 0; i<count; i++)
-		{
-			const int left = vleft[i];
-			const int right = vright[i];
-			const int top = vtop[i];
-			const int bottom = vbottom[i];
-			const int cur_dst_width = dst_width[i];
-			const int cur_dst_height = dst_height[i];
-
-			unsigned char* d_dstRGB =  vd_dstRGB[i];
-
-			const int src_width = right - left;
-			const int src_height = bottom - top;
-			const int x = blockIdx.x * blockDim.x + threadIdx.x;// + left;
-			const int y = blockIdx.y * blockDim.y + threadIdx.y;//+ top;
-			const int dst_x = blockIdx.x * blockDim.x + threadIdx.x;
-			const int dst_y = blockIdx.y * blockDim.y + threadIdx.y;
-
-			/*if (dst_x == 0 && dst_y == 0)
-				printf("%d %d %d %d %d\n", i, vleft[i], vright[i], cur_dst_width, cur_dst_height);*/
-
-			unsigned char * src_img = d_srcRGB;
-			unsigned char * dst_img = d_dstRGB;
-			if (dst_x < cur_dst_width && dst_y < cur_dst_height)
-			{
-				float fx = (x + 0.5)*src_width / (float)cur_dst_width - 0.5 + left;
-				float fy = (y + 0.5)*src_height / (float)cur_dst_height - 0.5 + top;
-				int ax = floor(fx);
-				int ay = floor(fy);
-				if (ax < 0)
-				{
-					ax = 0;
-				}
-				if (ax > srcimg_width - 2)
-				{
-					ax = srcimg_width - 2;
-				}
-				if (ay < 0) {
-					ay = 0;
-				}
-				if (ay > srcimg_height - 2)
-				{
-					ay = srcimg_height - 2;
-				}
-
-				int A = ax + ay*srcimg_width;
-				int B = ax + ay*srcimg_width + 1;
-				int C = ax + ay*srcimg_width + srcimg_width;
-				int D = ax + ay*srcimg_width + srcimg_width + 1;
-
-				float w1, w2, w3, w4;
-				w1 = fx - ax;
-				w2 = 1 - w1;
-				w3 = fy - ay;
-				w4 = 1 - w3;
-				float blue = src_img[A * 3] * w2*w4 + src_img[B * 3] * w1*w4 + src_img[C * 3] * w2*w3 + src_img[D * 3] * w1*w3;
-				float green = src_img[A * 3 + 1] * w2*w4 + src_img[B * 3 + 1] * w1*w4
-					+ src_img[C * 3 + 1] * w2*w3 + src_img[D * 3 + 1] * w1*w3;
-				float red = src_img[A * 3 + 2] * w2*w4 + src_img[B * 3 + 2] * w1*w4
-					+ src_img[C * 3 + 2] * w2*w3 + src_img[D * 3 + 2] * w1*w3;
-
-				/*dst_img[(dst_y * dst_width + dst_x) * 3] = (unsigned char)(blue - submeanb)*varianceb;
-				dst_img[(dst_y * dst_width + dst_x) * 3 + 1] =(unsigned char) (green - submeang)*varianceg;
-				dst_img[(dst_y * dst_width + dst_x) * 3 + 2] = (unsigned char) (red - submeanr)*variancer;*/
-
-				if (blue < 0)
-					blue = 0;
-				else if (blue > 255)
-					blue = 255;
-
-				if (green < 0)
-					green = 0;
-				else if (green > 255)
-					green = 255;
-
-				if (red < 0)
-					red = 0;
-				else if (red > 255)
-					red = 255;
-
-				dst_img[(dst_y * cur_dst_width + dst_x) * 3] = (unsigned char)blue;
-				dst_img[(dst_y * cur_dst_width + dst_x) * 3 + 1] = (unsigned char)green;
-				dst_img[(dst_y * cur_dst_width + dst_x) * 3 + 2] = (unsigned char)red;
-
-
-				/*if (src_img[(dst_y * dst_width + dst_x) * 3] < 0)
-					src_img[(dst_y * dst_width + dst_x) * 3] = 0;
-				else if (src_img[(dst_y * dst_width + dst_x) * 3] > 255)
-					src_img[(dst_y * dst_width + dst_x) * 3] = 255;
-
-				if (src_img[(dst_y * dst_width + dst_x) * 3 + 1] < 0)
-					src_img[(dst_y * dst_width + dst_x) * 3 + 1] = 0;
-				else if (src_img[(dst_y * dst_width + dst_x) * 3 + 1] > 255)
-					src_img[(dst_y * dst_width + dst_x) * 3 + 1] = 255;
-
-				if (src_img[(dst_y * dst_width + dst_x) * 3 + 2] < 0)
-					src_img[(dst_y * dst_width + dst_x) * 3 + 2] = 0;
-				else if (src_img[(dst_y * dst_width + dst_x) * 3 + 2] > 255)
-					src_img[(dst_y * dst_width + dst_x) * 3 + 2] = 255;
-
-
-				dst_img[(dst_y * dst_width + dst_x) * 3] = (unsigned char)src_img[(dst_y * dst_width + dst_x) * 3];
-				dst_img[(dst_y * dst_width + dst_x) * 3 + 1] = (unsigned char)src_img[(dst_y * dst_width + dst_x) * 3 + 1];
-				dst_img[(dst_y * dst_width + dst_x) * 3 + 2] = (unsigned char)src_img[(dst_y * dst_width + dst_x) * 3 + 2];*/
-			}
-		}
-	}
-
-	cudaError_t PartMemResizeBatch(unsigned char* d_srcRGB, int src_width, int src_height, unsigned char** d_dstRGB, int count, int* left, int* top, int* right, int* bottom, int *dst_w, int *dst_h, float submeanb, float submeang, float submeanr,
-		float varianceb, float varianceg, float variancer)
-	{
-	/*	cudaEvent_t start, stop;
-		float time;
-		cudaEventCreate(&start);
-		cudaEventCreate(&stop);
-		cudaEventRecord(start, 0);*/
-
-		dim3 block(32, 16, 1);
-		dim3 grid((*std::max_element(dst_w, dst_w+ count) + (block.x - 1)) / block.x, (*std::max_element(dst_h, dst_h + count) + (block.y - 1)) / block.y, count);
-
-		int * gpu_left;
-		cudaMalloc(&gpu_left, 1000 * sizeof(int));
-		cudaMemcpy(gpu_left, left, count * sizeof(int), cudaMemcpyHostToDevice);
-
-		int * gpu_right;
-		cudaMalloc(&gpu_right, 1000 * sizeof(int));
-		cudaMemcpy(gpu_right, right, count * sizeof(int), cudaMemcpyHostToDevice);
-
-		int * gpu_top;
-		cudaMalloc(&gpu_top, 1000 * sizeof(int));
-		cudaMemcpy(gpu_top, top, count * sizeof(int), cudaMemcpyHostToDevice);
-
-		int * gpu_bottom;
-		cudaMalloc(&gpu_bottom, 1000 * sizeof(int));
-		cudaMemcpy(gpu_bottom, bottom, count * sizeof(int), cudaMemcpyHostToDevice);
-
-		int * gpu_dst_w;
-		cudaMalloc(&gpu_dst_w, 1000 * sizeof(int));
-		cudaMemcpy(gpu_dst_w, dst_w, count * sizeof(int), cudaMemcpyHostToDevice);
-
-		int * gpu_dst_h;
-		cudaMalloc(&gpu_dst_h, 1000 * sizeof(int));
-		cudaMemcpy(gpu_dst_h, dst_h, count * sizeof(int), cudaMemcpyHostToDevice);
-
-		unsigned char** gpu_dst_rgb;
-		cudaMalloc(&gpu_dst_rgb, 1000 * sizeof(unsigned char*));
-		cudaMemcpy(gpu_dst_rgb, d_dstRGB, count * sizeof(unsigned char*), cudaMemcpyHostToDevice);
-
-		//cudaMemcpy(cpu_personfloat, d_srcRGB, 112*224*2*sizeof(float), cudaMemcpyDeviceToHost);
-		//            for(int i=0;i<100;i++)
-		//            {
-		//                  printf("the score is %f\t",cpu_personfloat[i]);
-		//            }
-		PartCopy_ResizeImgBilinearBGR_Mean_Variance_CUDAKernel << < grid, block >> > (
-			d_srcRGB, src_width, src_height,
-			gpu_left, gpu_top, gpu_right, gpu_bottom,
-			gpu_dst_rgb, count, gpu_dst_w, gpu_dst_h,
-			submeanb, submeang, submeanr,
-			varianceb, varianceg, variancer);
-		cudaFree(gpu_top);
-		cudaFree(gpu_bottom);
-		cudaFree(gpu_left);
-		cudaFree(gpu_right);
-		cudaFree(gpu_dst_w);
-		cudaFree(gpu_dst_h);
-		cudaFree(gpu_dst_rgb);
-	
-		cudaError_t cudaStatus = cudaGetLastError();
-		if (cudaStatus != cudaSuccess) {
-			fprintf(stderr, "Part 270 kernel_memcopy launch failed: %s\n", cudaGetErrorString(cudaStatus));
-			return cudaStatus;
-		}
-		cudaStatus = cudaDeviceSynchronize();
-		if (cudaStatus != cudaSuccess) {
-			fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_bilinear!\n", cudaStatus);
-			return cudaStatus;
-		}
-
-		/*cudaEventRecord(stop, 0);
-		cudaEventSynchronize(stop);
-		cudaEventElapsedTime(&time, start, stop);
-		cudaEventDestroy(start);
-		cudaEventDestroy(stop);
-		printf("�˺�������ʱ��:%f\n", time);*/
-
-		return cudaStatus;
-	}
-
-}
\ No newline at end of file
diff --git a/src/nvdecoder/RGB2YUV.cu b/src/nvdecoder/RGB2YUV.cu
deleted file mode 100644
index 7202c3a..0000000
--- a/src/nvdecoder/RGB2YUV.cu
+++ /dev/null
@@ -1,263 +0,0 @@
-
-
-#include "cuda_kernels.h"
-
-typedef unsigned char   uint8;
-typedef unsigned int    uint32;
-typedef int             int32;
-
-namespace cuda_common
-{
-	__device__ unsigned char clip_value(unsigned char x, unsigned char min_val, unsigned char  max_val){
-		if (x>max_val){
-			return max_val;
-		}
-		else if (x<min_val){
-			return min_val;
-		}
-		else{
-			return x;
-		}
-	}
-
-	__global__ void kernel_rgb2yuv(unsigned char *src_img, unsigned char* Y, unsigned char* u, unsigned char* v,
-		int src_width, int src_height, size_t yPitch)
-	{
-		const int x = blockIdx.x * blockDim.x + threadIdx.x;
-		const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-		if (x >= src_width)
-			return; //x = width - 1;
-
-		if (y >= src_height)
-			return; // y = height - 1;
-		
-		int B = src_img[y * src_width * 3 + x * 3];
-		int G = src_img[y * src_width * 3 + x * 3 + 1];
-		int R = src_img[y * src_width * 3 + x * 3 + 2];
-
-		/*int B = src_img[y * src_width + x];
-		int G = src_img[src_width * src_height + y * src_width + x];
-		int R = src_img[src_width * src_height * 2 + y * src_width + x];*/
-
-		Y[y * yPitch + x] = clip_value((unsigned char)(0.299 * R + 0.587 * G + 0.114 * B), 0, 255);
-		u[y * src_width + x] = clip_value((unsigned char)(-0.147 * R - 0.289 * G + 0.436 * B + 128), 0, 255);
-		v[y * src_width + x] = clip_value((unsigned char)(0.615 * R - 0.515 * G - 0.100 * B + 128), 0, 255);
-
-		//Y[y * yPitch + x] = clip_value((unsigned char)(0.257 * R + 0.504 * G + 0.098 * B + 16), 0, 255);
-		//u[y * src_width + x] = clip_value((unsigned char)(-0.148 * R - 0.291 * G + 0.439 * B + 128), 0, 255);
-		//v[y * src_width + x] = clip_value((unsigned char)(0.439 * R - 0.368 * G - 0.071 * B + 128), 0, 255);
-	}
-
-	__global__ void kernel_rgb2yuv(float *src_img, unsigned char* Y, unsigned char* u, unsigned char* v,
-		int src_width, int src_height, size_t yPitch)
-	{
-		const int x = blockIdx.x * blockDim.x + threadIdx.x;
-		const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-		if (x >= src_width)
-			return; //x = width - 1;
-
-		if (y >= src_height)
-			return; // y = height - 1;
-
-		float B = src_img[y * src_width + x];
-		float G = src_img[src_width * src_height + y * src_width + x];
-		float R = src_img[src_width * src_height * 2 + y * src_width + x];
-
-		Y[y * yPitch + x] = clip_value((unsigned char)(0.299 * R + 0.587 * G + 0.114 * B), 0, 255);
-		u[y * src_width + x] = clip_value((unsigned char)(-0.147 * R - 0.289 * G + 0.436 * B + 128), 0, 255);
-		v[y * src_width + x] = clip_value((unsigned char)(0.615 * R - 0.515 * G - 0.100 * B + 128), 0, 255);
-
-		//Y[y * yPitch + x] = clip_value((unsigned char)(0.257 * R + 0.504 * G + 0.098 * B + 16), 0, 255);
-		//u[y * src_width + x] = clip_value((unsigned char)(-0.148 * R - 0.291 * G + 0.439 * B + 128), 0, 255);
-		//v[y * src_width + x] = clip_value((unsigned char)(0.439 * R - 0.368 * G - 0.071 * B + 128), 0, 255);
-	}
-
-	extern "C"
-	__global__ void kernel_resize_UV(unsigned char* src_img, unsigned char *dst_img,
-		int src_width, int src_height, int dst_width, int dst_height, int nPitch)
-	{
-		const int x = blockIdx.x * blockDim.x + threadIdx.x;
-		const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-		if (x >= dst_width)
-			return; //x = width - 1;
-
-		if (y >= dst_height)
-			return; // y = height - 1;
-
-		float fx = (x + 0.5)*src_width / (float)dst_width - 0.5;
-		float fy = (y + 0.5)*src_height / (float)dst_height - 0.5;
-		int ax = floor(fx);
-		int ay = floor(fy);
-		if (ax < 0)
-		{
-			ax = 0;
-		}
-		else if (ax > src_width - 2)
-		{
-			ax = src_width - 2;
-		}
-
-		if (ay < 0){
-			ay = 0;
-		}
-		else if (ay > src_height - 2)
-		{
-			ay = src_height - 2;
-		}
-
-		int A = ax + ay*src_width;
-		int B = ax + ay*src_width + 1;
-		int C = ax + ay*src_width + src_width;
-		int D = ax + ay*src_width + src_width + 1;
-
-		float w1, w2, w3, w4;
-		w1 = fx - ax;
-		w2 = 1 - w1;
-		w3 = fy - ay;
-		w4 = 1 - w3;
-
-		unsigned char val = src_img[A] * w2*w4 + src_img[B] * w1*w4 + src_img[C] * w2*w3 + src_img[D] * w1*w3;
-
-		dst_img[y * nPitch + x] = clip_value(val,0,255);
-	}
-
-	cudaError_t RGB2YUV(float* d_srcRGB, int src_width, int src_height,
-						unsigned char* Y, size_t yPitch, int yWidth, int yHeight,
-						unsigned char* U, size_t uPitch, int uWidth, int uHeight,
-						unsigned char* V, size_t vPitch, int vWidth, int vHeight)
-	{
-		unsigned char * u ;
-		unsigned char * v ;
-
-		cudaError_t cudaStatus;
-
-		cudaStatus = cudaMalloc((void**)&u, src_width * src_height * sizeof(unsigned char));
-		cudaStatus = cudaMalloc((void**)&v, src_width * src_height * sizeof(unsigned char));
-
-		dim3 block(32, 16, 1);
-		dim3 grid((src_width + (block.x - 1)) / block.x, (src_height + (block.y - 1)) / block.y, 1);
-		dim3 grid1((uWidth + (block.x - 1)) / block.x, (uHeight + (block.y - 1)) / block.y, 1);
-		dim3 grid2((vWidth + (block.x - 1)) / block.x, (vHeight + (block.y - 1)) / block.y, 1);
-
-		kernel_rgb2yuv << < grid, block >> >(d_srcRGB, Y, u, v, src_width, src_height, yPitch);
-
-		cudaStatus = cudaGetLastError();
-		if (cudaStatus != cudaSuccess) {
-			fprintf(stderr, "kernel_rgb2yuv launch failed: %s\n", cudaGetErrorString(cudaStatus));
-			goto Error;
-		}
-
-		cudaStatus = cudaDeviceSynchronize();
-		if (cudaStatus != cudaSuccess) {
-			fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_rgb2yuv!\n", cudaStatus);
-			goto Error;
-		}
-
-		kernel_resize_UV << < grid1, block >> >(u, U, src_width, src_height, uWidth, uHeight, uPitch);
-
-		cudaStatus = cudaGetLastError();
-		if (cudaStatus != cudaSuccess) {
-			fprintf(stderr, "kernel_resize_UV launch failed: %s\n", cudaGetErrorString(cudaStatus));
-			goto Error;
-		}
-
-		cudaStatus = cudaDeviceSynchronize();
-		if (cudaStatus != cudaSuccess) {
-			fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_resize_UV!\n", cudaStatus);
-			goto Error;
-		}
-
-		kernel_resize_UV << < grid2, block >> >(v, V, src_width, src_height, vWidth, vHeight, vPitch);
-
-		cudaStatus = cudaGetLastError();
-		if (cudaStatus != cudaSuccess) {
-			fprintf(stderr, "kernel_resize_UV launch failed: %s\n", cudaGetErrorString(cudaStatus));
-			goto Error;
-		}
-
-		cudaStatus = cudaDeviceSynchronize();
-		if (cudaStatus != cudaSuccess) {
-			fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_resize_UV!\n", cudaStatus);
-			goto Error;
-		}
-
-Error :
-		cudaFree(u);
-		cudaFree(v);
-
-		return cudaStatus;
-	}
-
-
-
-	cudaError_t RGB2YUV(unsigned char* d_srcRGB, int src_width, int src_height,
-		unsigned char* Y, size_t yPitch, int yWidth, int yHeight,
-		unsigned char* U, size_t uPitch, int uWidth, int uHeight,
-		unsigned char* V, size_t vPitch, int vWidth, int vHeight)
-	{
-		unsigned char * u;
-		unsigned char * v;
-
-		cudaError_t cudaStatus;
-
-		cudaStatus = cudaMalloc((void**)&u, src_width * src_height * sizeof(unsigned char));
-		cudaStatus = cudaMalloc((void**)&v, src_width * src_height * sizeof(unsigned char));
-
-		dim3 block(32, 16, 1);
-		dim3 grid((src_width + (block.x - 1)) / block.x, (src_height + (block.y - 1)) / block.y, 1);
-		dim3 grid1((uWidth + (block.x - 1)) / block.x, (uHeight + (block.y - 1)) / block.y, 1);
-		dim3 grid2((vWidth + (block.x - 1)) / block.x, (vHeight + (block.y - 1)) / block.y, 1);
-
-		kernel_rgb2yuv << < grid, block >> >(d_srcRGB, Y, u, v, src_width, src_height, yPitch);
-
-		cudaStatus = cudaGetLastError();
-		if (cudaStatus != cudaSuccess) {
-			fprintf(stderr, "kernel_rgb2yuv launch failed: %s\n", cudaGetErrorString(cudaStatus));
-			goto Error;
-		}
-
-		cudaStatus = cudaDeviceSynchronize();
-		if (cudaStatus != cudaSuccess) {
-			fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_rgb2yuv!\n", cudaStatus);
-			goto Error;
-		}
-
-		kernel_resize_UV << < grid1, block >> >(u, U, src_width, src_height, uWidth, uHeight, uPitch);
-
-		cudaStatus = cudaGetLastError();
-		if (cudaStatus != cudaSuccess) {
-			fprintf(stderr, "kernel_resize_UV launch failed: %s\n", cudaGetErrorString(cudaStatus));
-			goto Error;
-		}
-
-		cudaStatus = cudaDeviceSynchronize();
-		if (cudaStatus != cudaSuccess) {
-			fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_resize_UV!\n", cudaStatus);
-			goto Error;
-		}
-
-		kernel_resize_UV << < grid2, block >> >(v, V, src_width, src_height, vWidth, vHeight, vPitch);
-
-		cudaStatus = cudaGetLastError();
-		if (cudaStatus != cudaSuccess) {
-			fprintf(stderr, "kernel_resize_UV launch failed: %s\n", cudaGetErrorString(cudaStatus));
-			goto Error;
-		}
-
-		cudaStatus = cudaDeviceSynchronize();
-		if (cudaStatus != cudaSuccess) {
-			fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_resize_UV!\n", cudaStatus);
-			goto Error;
-		}
-
-	Error:
-		cudaFree(u);
-		cudaFree(v);
-
-		return cudaStatus;
-	}
-}
-
diff --git a/src/nvdecoder/ResizeImage.cu b/src/nvdecoder/ResizeImage.cu
deleted file mode 100644
index fdc6961..0000000
--- a/src/nvdecoder/ResizeImage.cu
+++ /dev/null
@@ -1,84 +0,0 @@
-#include "cuda_kernels.h"
-
-typedef unsigned char   uchar;
-typedef unsigned int    uint32;
-typedef int             int32;
-
-namespace cuda_common
-{
-	__global__ void kernel_bilinear(float *src_img, float *dst_img,
-		int src_width, int src_height, int dst_width, int dst_height)
-	{
-		const int x = blockIdx.x * blockDim.x + threadIdx.x;
-		const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-		if (x < dst_width && y < dst_height)
-		{
-			float fx = (x + 0.5)*src_width / (float)dst_width - 0.5;
-			float fy = (y + 0.5)*src_height / (float)dst_height - 0.5;
-			int ax = floor(fx);
-			int ay = floor(fy);
-			if (ax < 0)
-			{
-				ax = 0;
-			}
-			else if (ax > src_width - 2)
-			{
-				ax = src_width - 2;
-			}
-
-			if (ay < 0){
-				ay = 0;
-			}
-			else if (ay > src_height - 2)
-			{
-				ay = src_height - 2;
-			}
-
-			int A = ax + ay*src_width;
-			int B = ax + ay*src_width + 1;
-			int C = ax + ay*src_width + src_width;
-			int D = ax + ay*src_width + src_width + 1;
-
-			float w1, w2, w3, w4;
-			w1 = fx - ax;
-			w2 = 1 - w1;
-			w3 = fy - ay;
-			w4 = 1 - w3;
-
-			float blue = src_img[A] * w2*w4 + src_img[B] * w1*w4 + src_img[C] * w2*w3 + src_img[D] * w1*w3;
-
-			float green = src_img[src_width * src_height + A] * w2*w4 + src_img[src_width * src_height + B] * w1*w4 
-				+ src_img[src_width * src_height + C] * w2*w3 + src_img[src_width * src_height + D] * w1*w3;
-
-			float red = src_img[src_width * src_height * 2 + A] * w2*w4 + src_img[src_width * src_height * 2 + B] * w1*w4 
-				+ src_img[src_width * src_height * 2 + C] * w2*w3 + src_img[src_width * src_height * 2 + D] * w1*w3;
-
-			dst_img[y * dst_width + x] = blue;
-			dst_img[dst_width * dst_height + y * dst_width + x] = green;
-			dst_img[dst_width * dst_height * 2 + y * dst_width + x] = red;
-		}
-	}
-
-	cudaError_t ResizeImage(float* d_srcRGB, int src_width, int src_height, float* d_dstRGB, int dst_width, int dst_height)
-	{
-		dim3 block(32, 16, 1);
-		dim3 grid((dst_width + (block.x - 1)) / block.x, (dst_height + (block.y - 1)) / block.y, 1);
-
-		kernel_bilinear << < grid, block >> >(d_srcRGB, d_dstRGB, src_width, src_height, dst_width, dst_height);
-
-		cudaError_t cudaStatus = cudaGetLastError();
-		if (cudaStatus != cudaSuccess) {
-			fprintf(stderr, "kernel_bilinear launch failed: %s\n", cudaGetErrorString(cudaStatus));
-			return cudaStatus;
-		}
-
-		cudaStatus = cudaDeviceSynchronize();
-		if (cudaStatus != cudaSuccess) {
-			fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_bilinear!\n", cudaStatus);
-			return cudaStatus;
-		}
-
-		return cudaStatus;
-	}
-}
\ No newline at end of file
diff --git a/src/nvdecoder/common_header.h b/src/nvdecoder/common_header.h
deleted file mode 100644
index cf45c91..0000000
--- a/src/nvdecoder/common_header.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#ifndef _COMMON_HEADER_H_
-#define _COMMON_HEADER_H_
-
-
-#include "../interface/logger.hpp"
-#include "../interface/utiltools.hpp"
-#include "../interface/interface_headers.h"
-
-#endif
\ No newline at end of file
diff --git a/src/nvdecoder/cuda_kernels.h b/src/nvdecoder/cuda_kernels.h
deleted file mode 100644
index cd1eb00..0000000
--- a/src/nvdecoder/cuda_kernels.h
+++ /dev/null
@@ -1,63 +0,0 @@
-#pragma once
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#include <string.h>
-#include <math.h>
-
-#include <cuda.h>
-
-typedef enum
-{
-	ITU_601 = 1,
-	ITU_709 = 2
-} FF_ColorSpace;
-
-namespace cuda_common
-{
-	cudaError_t setColorSpace(FF_ColorSpace CSC, float hue);
-
-	cudaError_t NV12ToRGBnot(CUdeviceptr d_srcNV12, size_t nSourcePitch, unsigned char* d_dstRGB, int width, int height);
-	cudaError_t CUDAToBGR(CUdeviceptr dataY, CUdeviceptr dataUV, size_t pitchY, size_t pitchUV, unsigned char* d_dstRGB, int width, int height);
-
-	
-	cudaError_t ResizeImage(float* d_srcRGB, int src_width, int src_height, float* d_dstRGB, int dst_width, int dst_height);
-
-	cudaError_t RGB2YUV(float* d_srcRGB, int src_width, int src_height,
-		unsigned char* Y, size_t yPitch, int yWidth, int yHeight,
-		unsigned char* U, size_t uPitch, int uWidth, int uHeight,
-		unsigned char* V, size_t vPitch, int vWidth, int vHeight);
-
-	cudaError_t RGB2YUV(unsigned char* d_srcRGB, int src_width, int src_height,
-		unsigned char* Y, size_t yPitch, int yWidth, int yHeight,
-		unsigned char* U, size_t uPitch, int uWidth, int uHeight,
-		unsigned char* V, size_t vPitch, int vWidth, int vHeight);
-
-	cudaError_t PartMemCopy(unsigned char* d_srcRGB, int src_width, int src_height, unsigned char* d_dstRGB, int left, int top, int right, int bottom);
-	//	cudaError_t PartMemResize(float* d_srcRGB, int src_width, int src_height, float* d_dstRGB, int left, int top, int right, int bottom);
-
-	cudaError_t PartMemResizeBatch(unsigned char* d_srcRGB, int srcimg_width, int srcimg_height, unsigned char** d_dstRGB, int count,
-		int* left, int* top, int* right, int* bottom, int *dst_w, int *dst_h,
-		float submeanb, float submeang, float submeanr,
-		float varianceb, float varianceg, float variancer);
-
-	cudaError_t DrawImage(float* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom);
-	cudaError_t DrawImage(unsigned char* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom);
-
-	cudaError_t DrawLine(float* d_srcRGB, int src_width, int src_height, int begin_x, int begin_y, int end_x, int end_y);
-}
-
-
-int jpegNPP(const char *szOutputFile, float* d_srcRGB, int img_width, int img_height);
-int jpegNPP(const char *szOutputFile, unsigned char* d_srcRGB, int img_width, int img_height);
-
-int jpegNPP(const char *szOutputFile, float* d_srcRGB);
-int jpegNPP(const char *szOutputFile, unsigned char* d_srcRGB);
-
-int initTable();
-int initTable(int flag, int width, int height);
-int releaseJpegNPP();
-
diff --git a/src/nvdecoder/define.hpp b/src/nvdecoder/define.hpp
deleted file mode 100644
index 2eaafe0..0000000
--- a/src/nvdecoder/define.hpp
+++ /dev/null
@@ -1,11 +0,0 @@
-#pragma once
-
-#include <string>
-
-
-#define CHECK_CUDA(call) \
-{\
-    const cudaError_t error_code = call;\
-    if (cudaSuccess != error_code)\
-        LOG_ERROR("CUDA error, code: {} reason: {}", error_code, cudaGetErrorString(error_code));\
-}
\ No newline at end of file
diff --git a/src/nvdecoder/jpegNPP.cpp-1 b/src/nvdecoder/jpegNPP.cpp-1
deleted file mode 100644
index f0bf2e6..0000000
--- a/src/nvdecoder/jpegNPP.cpp-1
+++ /dev/null
@@ -1,1193 +0,0 @@
-/*
-* Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
-*
-* NOTICE TO USER:
-*
-* This source code is subject to NVIDIA ownership rights under U.S. and
-* international Copyright laws.
-*
-* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
-* CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
-* IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
-* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
-* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
-* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
-* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
-* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
-* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
-* OR PERFORMANCE OF THIS SOURCE CODE.
-*
-* U.S. Government End Users.  This source code is a "commercial item" as
-* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting  of
-* "commercial computer software" and "commercial computer software
-* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
-* and is provided to the U.S. Government only as a commercial end item.
-* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
-* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
-* source code with only those rights set forth herein.
-*/
-
-// This sample needs at least CUDA 5.5 and a GPU that has at least Compute Capability 2.0
-
-// This sample demonstrates a simple image processing pipeline.
-// First, a JPEG file is huffman decoded and inverse DCT transformed and dequantized.
-// Then the different planes are resized. Finally, the resized image is quantized, forward
-// DCT transformed and huffman encoded.
-
-#include "cuda_kernels.h"
-
-#include <npp.h>
-#include <cuda_runtime.h>
-#include "common/UtilNPP/Exceptions.h"
-
-#include "Endianess.h"
-#include <math.h>
-
-#include <string.h>
-#include <fstream>
-#include <iostream>
-
-#include "common/inc/helper_string.h"
-#include "common/inc/helper_cuda.h"
-//#include "MacroDef.h"
-#include "cuda.h"
-
-using namespace std;
-
-struct FrameHeader
-{
-	unsigned char nSamplePrecision;
-	unsigned short nHeight;
-	unsigned short nWidth;
-	unsigned char nComponents;
-	unsigned char aComponentIdentifier[3];
-	unsigned char aSamplingFactors[3];
-	unsigned char aQuantizationTableSelector[3];
-};
-
-struct ScanHeader
-{
-	unsigned char nComponents;
-	unsigned char aComponentSelector[3];
-	unsigned char aHuffmanTablesSelector[3];
-	unsigned char nSs;
-	unsigned char nSe;
-	unsigned char nA;
-};
-
-struct QuantizationTable
-{
-	unsigned char nPrecisionAndIdentifier;
-	unsigned char aTable[64];
-};
-
-struct HuffmanTable
-{
-	unsigned char nClassAndIdentifier;
-	unsigned char aCodes[16];
-	unsigned char aTable[256];
-};
-
-//??准?炼??藕?量??模??
-//unsigned char std_Y_QT[64] =
-//{
-//	16, 11, 10, 16, 24, 40, 51, 61,
-//	12, 12, 14, 19, 26, 58, 60, 55,
-//	14, 13, 16, 24, 40, 57, 69, 56,
-//	14, 17, 22, 29, 51, 87, 80, 62,
-//	18, 22, 37, 56, 68, 109, 103, 77,
-//	24, 35, 55, 64, 81, 104, 113, 92,
-//	49, 64, 78, 87, 103, 121, 120, 101,
-//	72, 92, 95, 98, 112, 100, 103, 99
-//};
-//
-////??准色???藕?量??模??
-//unsigned char std_UV_QT[64] =
-//{
-//	17, 18, 24, 47, 99, 99, 99, 99,
-//	18, 21, 26, 66, 99, 99, 99, 99,
-//	24, 26, 56, 99, 99, 99, 99, 99,
-//	47, 66, 99, 99, 99, 99, 99, 99,
-//	99, 99, 99, 99, 99, 99, 99, 99,
-//	99, 99, 99, 99, 99, 99, 99, 99,
-//	99, 99, 99, 99, 99, 99, 99, 99,
-//	99, 99, 99, 99, 99, 99, 99, 99
-//};
-
-////?炼??藕?量??模??
-//unsigned char std_Y_QT[64] =
-//{
-//	6, 4, 5, 6, 5, 4, 6, 6,
-//	5, 6, 7, 7, 6, 8, 10, 16,
-//	10, 10, 9, 9, 10, 20, 14, 15,
-//	12, 16, 23, 20, 24, 24, 23, 20,
-//	22, 22, 26, 29, 37, 31, 26, 27,
-//	35, 28, 22, 22, 32, 44, 32, 35,
-//	38, 39, 41, 42, 41, 25, 31, 45,
-//	48, 45, 40, 48, 37, 40, 41, 40
-//};
-//
-////色???藕?量??模??
-//unsigned char std_UV_QT[64] =
-//{
-//	7, 7, 7, 10, 8, 10, 19, 10,
-//	10, 19, 40, 26, 22, 26, 40, 40,
-//	40, 40, 40, 40, 40, 40, 40, 40,
-//	40, 40, 40, 40, 40, 40, 40, 40,
-//	40, 40, 40, 40, 40, 40, 40, 40,
-//	40, 40, 40, 40, 40, 40, 40, 40,
-//	40, 40, 40, 40, 40, 40, 40, 40,
-//	40, 40, 40, 40, 40, 40, 40, 40
-//};
-
-//?炼??藕?量??模??
-unsigned char std_Y_QT[64] =
-{
-	0.75 * 6, 0.75 * 4, 0.75 * 5, 0.75 * 6, 0.75 * 5, 0.75 * 4, 0.75 * 6, 0.75 * 6,
-	0.75 * 5, 0.75 * 6, 0.75 * 7, 0.75 * 7, 0.75 * 6, 0.75 * 8, 0.75 * 10, 0.75 * 16,
-	0.75 * 10, 0.75 * 10, 0.75 * 9, 0.75 * 9, 0.75 * 10, 0.75 * 20, 0.75 * 14, 0.75 * 15,
-	0.75 * 12, 0.75 * 16, 0.75 * 23, 0.75 * 20, 0.75 * 24, 0.75 * 24, 0.75 * 23, 0.75 * 20,
-	0.75 * 22, 0.75 * 22, 0.75 * 26, 0.75 * 29, 0.75 * 37, 0.75 * 31, 0.75 * 26, 0.75 * 27,
-	0.75 * 35, 0.75 * 28, 0.75 * 22, 0.75 * 22, 0.75 * 32, 0.75 * 44, 0.75 * 32, 0.75 * 35,
-	0.75 * 38, 0.75 * 39, 0.75 * 41, 0.75 * 42, 0.75 * 41, 0.75 * 25, 0.75 * 31, 0.75 * 45,
-	0.75 * 48, 0.75 * 45, 0.75 * 40, 0.75 * 48, 0.75 * 37, 0.75 * 40, 0.75 * 41, 0.75 * 40
-};
-
-//色???藕?量??模??
-unsigned char std_UV_QT[64] =
-{
-	0.75 * 7, 0.75 * 7, 0.75 * 7, 0.75 * 10, 0.75 * 8, 0.75 * 10, 0.75 * 19, 0.75 * 10,
-	0.75 * 10, 0.75 * 19, 0.75 * 40, 0.75 * 26, 0.75 * 22, 0.75 * 26, 0.75 * 40, 0.75 * 40,
-	30, 30, 30, 30, 30, 30, 30, 30,
-	30, 30, 30, 30, 30, 30, 30, 30,
-	30, 30, 30, 30, 30, 30, 30, 30,
-	30, 30, 30, 30, 30, 30, 30, 30,
-	30, 30, 30, 30, 30, 30, 30, 30,
-	30, 30, 30, 30, 30, 30, 30, 30
-};
-
-unsigned char STD_DC_Y_NRCODES[16] = { 0, 1, 5, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0 };
-unsigned char STD_DC_Y_VALUES[12] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };
-
-unsigned char STD_DC_UV_NRCODES[16] = { 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0 };
-unsigned char STD_DC_UV_VALUES[12] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };
-
-unsigned char STD_AC_Y_NRCODES[16] = { 0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 0X7D };
-unsigned char STD_AC_Y_VALUES[162] =
-{
-	0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12,
-	0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07,
-	0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08,
-	0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0,
-	0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0a, 0x16,
-	0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28,
-	0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
-	0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
-	0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
-	0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
-	0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
-	0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
-	0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
-	0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
-	0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6,
-	0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5,
-	0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4,
-	0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2,
-	0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea,
-	0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
-	0xf9, 0xfa
-};
-
-unsigned char STD_AC_UV_NRCODES[16] = { 0, 2, 1, 2, 4, 4, 3, 4, 7, 5, 4, 4, 0, 1, 2, 0X77 };
-unsigned char STD_AC_UV_VALUES[162] =
-{
-	0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21,
-	0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71,
-	0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91,
-	0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0,
-	0x15, 0x62, 0x72, 0xd1, 0x0a, 0x16, 0x24, 0x34,
-	0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26,
-	0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38,
-	0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
-	0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
-	0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
-	0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
-	0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
-	0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96,
-	0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5,
-	0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4,
-	0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3,
-	0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2,
-	0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda,
-	0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9,
-	0xea, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
-	0xf9, 0xfa
-};
-
-int DivUp(int x, int d)
-{
-	return (x + d - 1) / d;
-}
-
-template<typename T>
-void writeAndAdvance(unsigned char *&pData, T nElement)
-{
-	writeBigEndian<T>(pData, nElement);
-	pData += sizeof(T);
-}
-
-void writeMarker(unsigned char nMarker, unsigned char *&pData)
-{
-	*pData++ = 0x0ff;
-	*pData++ = nMarker;
-}
-
-void writeJFIFTag(unsigned char *&pData)
-{
-	const char JFIF_TAG[] =
-	{
-		0x4a, 0x46, 0x49, 0x46, 0x00,
-		0x01, 0x02,
-		0x00,
-		0x00, 0x01, 0x00, 0x01,
-		0x00, 0x00
-	};
-
-	writeMarker(0x0e0, pData);
-	writeAndAdvance<unsigned short>(pData, sizeof(JFIF_TAG) + sizeof(unsigned short));
-	memcpy(pData, JFIF_TAG, sizeof(JFIF_TAG));
-	pData += sizeof(JFIF_TAG);
-}
-
-void writeFrameHeader(const FrameHeader &header, unsigned char *&pData)
-{
-	unsigned char aTemp[128];
-	unsigned char *pTemp = aTemp;
-
-	writeAndAdvance<unsigned char>(pTemp, header.nSamplePrecision);
-	writeAndAdvance<unsigned short>(pTemp, header.nHeight);
-	writeAndAdvance<unsigned short>(pTemp, header.nWidth);
-	writeAndAdvance<unsigned char>(pTemp, header.nComponents);
-
-	for (int c = 0; c<header.nComponents; ++c)
-	{
-		writeAndAdvance<unsigned char>(pTemp, header.aComponentIdentifier[c]);
-		writeAndAdvance<unsigned char>(pTemp, header.aSamplingFactors[c]);
-		writeAndAdvance<unsigned char>(pTemp, header.aQuantizationTableSelector[c]);
-	}
-
-	unsigned short nLength = (unsigned short)(pTemp - aTemp);
-
-	writeMarker(0x0C0, pData);
-	writeAndAdvance<unsigned short>(pData, nLength + 2);
-	memcpy(pData, aTemp, nLength);
-	pData += nLength;
-}
-
-void writeScanHeader(const ScanHeader &header, unsigned char *&pData)
-{
-	unsigned char aTemp[128];
-	unsigned char *pTemp = aTemp;
-
-	writeAndAdvance<unsigned char>(pTemp, header.nComponents);
-
-	for (int c = 0; c<header.nComponents; ++c)
-	{
-		writeAndAdvance<unsigned char>(pTemp, header.aComponentSelector[c]);
-		writeAndAdvance<unsigned char>(pTemp, header.aHuffmanTablesSelector[c]);
-	}
-
-	writeAndAdvance<unsigned char>(pTemp, header.nSs);
-	writeAndAdvance<unsigned char>(pTemp, header.nSe);
-	writeAndAdvance<unsigned char>(pTemp, header.nA);
-
-	unsigned short nLength = (unsigned short)(pTemp - aTemp);
-
-	writeMarker(0x0DA, pData);
-	writeAndAdvance<unsigned short>(pData, nLength + 2);
-	memcpy(pData, aTemp, nLength);
-	pData += nLength;
-}
-
-void writeQuantizationTable(const QuantizationTable &table, unsigned char *&pData)
-{
-	writeMarker(0x0DB, pData);
-	writeAndAdvance<unsigned short>(pData, sizeof(QuantizationTable) + 2);
-	memcpy(pData, &table, sizeof(QuantizationTable));
-	pData += sizeof(QuantizationTable);
-}
-
-void writeHuffmanTable(const HuffmanTable &table, unsigned char *&pData)
-{
-	writeMarker(0x0C4, pData);
-
-	// Number of Codes for Bit Lengths [1..16]
-	int nCodeCount = 0;
-
-	for (int i = 0; i < 16; ++i)
-	{
-		nCodeCount += table.aCodes[i];
-	}
-
-	writeAndAdvance<unsigned short>(pData, 17 + nCodeCount + 2);
-	memcpy(pData, &table, 17 + nCodeCount);
-	pData += 17 + nCodeCount;
-}
-
-bool printfNPPinfo(int cudaVerMajor, int cudaVerMinor)
-{
-	const NppLibraryVersion *libVer = nppGetLibVersion();
-
-	printf("NPP Library Version %d.%d.%d\n", libVer->major, libVer->minor, libVer->build);
-
-	int driverVersion, runtimeVersion;
-	cudaDriverGetVersion(&driverVersion);
-	cudaRuntimeGetVersion(&runtimeVersion);
-
-	printf("  CUDA Driver  Version: %d.%d\n", driverVersion / 1000, (driverVersion % 100) / 10);
-	printf("  CUDA Runtime Version: %d.%d\n", runtimeVersion / 1000, (runtimeVersion % 100) / 10);
-
-	bool bVal = checkCudaCapabilities(cudaVerMajor, cudaVerMinor);
-	return bVal;
-}
-
-NppiDCTState *pDCTState;
-FrameHeader oFrameHeader;
-FrameHeader oFrameHeaderFixedSize;
-ScanHeader oScanHeader;
-QuantizationTable aQuantizationTables[4];
-Npp8u *pdQuantizationTables;
-HuffmanTable aHuffmanTables[4];
-HuffmanTable *pHuffmanDCTables;
-HuffmanTable *pHuffmanACTables;
-int nMCUBlocksH;
-int nMCUBlocksV;
-int nMCUBlocksHFixedSize;
-int nMCUBlocksVFixedSize;
-Npp8u *pdScan;
-NppiEncodeHuffmanSpec *apHuffmanDCTable[3];
-NppiEncodeHuffmanSpec *apHuffmanACTable[3];
-unsigned char *pDstJpeg;
-unsigned char *pDstOutput;
-int nRestartInterval;
-
-int initTable()
-{
-	NPP_CHECK_NPP(nppiDCTInitAlloc(&pDCTState));
-
-	nRestartInterval = -1;
-
-	cudaMalloc(&pdQuantizationTables, 64 * 4);
-	pHuffmanDCTables = aHuffmanTables;
-	pHuffmanACTables = &aHuffmanTables[2];
-	memset(aQuantizationTables, 0, 4 * sizeof(QuantizationTable));
-	memset(aHuffmanTables, 0, 4 * sizeof(HuffmanTable));
-	memset(&oFrameHeader, 0, sizeof(FrameHeader));
-
-
-	//????Huffman??
-	aHuffmanTables[0].nClassAndIdentifier = 0;
-	memcpy(aHuffmanTables[0].aCodes, STD_DC_Y_NRCODES, 16);
-	memcpy(aHuffmanTables[0].aTable, STD_DC_Y_VALUES, 12);
-
-	aHuffmanTables[1].nClassAndIdentifier = 1;
-	memcpy(aHuffmanTables[1].aCodes, STD_DC_UV_NRCODES, 16);
-	memcpy(aHuffmanTables[1].aTable, STD_DC_UV_VALUES, 12);
-
-	aHuffmanTables[2].nClassAndIdentifier = 16;
-	memcpy(aHuffmanTables[2].aCodes, STD_AC_Y_NRCODES, 16);
-	memcpy(aHuffmanTables[2].aTable, STD_AC_Y_VALUES, 162);
-
-	aHuffmanTables[3].nClassAndIdentifier = 17;
-	memcpy(aHuffmanTables[3].aCodes, STD_AC_UV_NRCODES, 16);
-	memcpy(aHuffmanTables[3].aTable, STD_AC_UV_VALUES, 162);
-
-
-	//????量????
-	aQuantizationTables[0].nPrecisionAndIdentifier = 0;
-	memcpy(aQuantizationTables[0].aTable, std_Y_QT, 64);
-	aQuantizationTables[1].nPrecisionAndIdentifier = 1;
-	memcpy(aQuantizationTables[1].aTable, std_UV_QT, 64);
-
-	NPP_CHECK_CUDA(cudaMemcpyAsync(pdQuantizationTables, aQuantizationTables[0].aTable, 64, cudaMemcpyHostToDevice));
-	NPP_CHECK_CUDA(cudaMemcpyAsync(pdQuantizationTables + 64, aQuantizationTables[1].aTable, 64, cudaMemcpyHostToDevice));
-
-	oFrameHeader.nSamplePrecision = 8;
-	oFrameHeader.nComponents = 3;
-	oFrameHeader.aComponentIdentifier[0] = 1;
-	oFrameHeader.aComponentIdentifier[1] = 2;
-	oFrameHeader.aComponentIdentifier[2] = 3;
-	oFrameHeader.aSamplingFactors[0] = 34;
-	oFrameHeader.aSamplingFactors[1] = 17;
-	oFrameHeader.aSamplingFactors[2] = 17;
-	oFrameHeader.aQuantizationTableSelector[0] = 0;
-	oFrameHeader.aQuantizationTableSelector[1] = 1;
-	oFrameHeader.aQuantizationTableSelector[2] = 1;
-
-	for (int i = 0; i < oFrameHeader.nComponents; ++i)
-	{
-		nMCUBlocksV = max(nMCUBlocksV, oFrameHeader.aSamplingFactors[i] & 0x0f);
-		nMCUBlocksH = max(nMCUBlocksH, oFrameHeader.aSamplingFactors[i] >> 4);
-	}
-	NPP_CHECK_CUDA(cudaMalloc(&pdScan, 4 << 20));
-
-
-
-	oScanHeader.nComponents = 3;
-	oScanHeader.aComponentSelector[0] = 1;
-	oScanHeader.aComponentSelector[1] = 2;
-	oScanHeader.aComponentSelector[2] = 3;
-	oScanHeader.aHuffmanTablesSelector[0] = 0;
-	oScanHeader.aHuffmanTablesSelector[1] = 17;
-	oScanHeader.aHuffmanTablesSelector[2] = 17;
-	oScanHeader.nSs = 0;
-	oScanHeader.nSe = 63;
-	oScanHeader.nA = 0;
-
-
-	return 0;
-}
-
-NppiSize aSrcSize[3];
-Npp16s *apdDCT[3];// = { 0, 0, 0 };
-Npp32s aDCTStep[3];
-
-Npp8u *apSrcImage[3];// = { 0, 0, 0 };
-Npp32s aSrcImageStep[3];
-size_t aSrcPitch[3];
-
-
-int releaseJpegNPP()
-{
-	nppiDCTFree(pDCTState);
-	cudaFree(pdQuantizationTables);
-	cudaFree(pdScan);
-	for (int i = 0; i < 3; ++i)
-	{
-		cudaFree(apdDCT[i]);
-		cudaFree(apSrcImage[i]);
-	}
-	return 0;
-}
-
-
-int initTable(int flag, int width, int height)
-{
-	//????帧头
-	oFrameHeaderFixedSize.nSamplePrecision = 8;
-	oFrameHeaderFixedSize.nComponents = 3;
-	oFrameHeaderFixedSize.aComponentIdentifier[0] = 1;
-	oFrameHeaderFixedSize.aComponentIdentifier[1] = 2;
-	oFrameHeaderFixedSize.aComponentIdentifier[2] = 3;
-	oFrameHeaderFixedSize.aSamplingFactors[0] = 34;
-	oFrameHeaderFixedSize.aSamplingFactors[1] = 17;
-	oFrameHeaderFixedSize.aSamplingFactors[2] = 17;
-	oFrameHeaderFixedSize.aQuantizationTableSelector[0] = 0;
-	oFrameHeaderFixedSize.aQuantizationTableSelector[1] = 1;
-	oFrameHeaderFixedSize.aQuantizationTableSelector[2] = 1;
-	oFrameHeaderFixedSize.nWidth = width;
-	oFrameHeaderFixedSize.nHeight = height;
-
-	for (int i = 0; i < oFrameHeaderFixedSize.nComponents; ++i)
-	{
-		nMCUBlocksVFixedSize = max(nMCUBlocksVFixedSize, oFrameHeaderFixedSize.aSamplingFactors[i] & 0x0f);
-		nMCUBlocksHFixedSize = max(nMCUBlocksHFixedSize, oFrameHeaderFixedSize.aSamplingFactors[i] >> 4);
-	}
-
-	for (int i = 0; i < oFrameHeaderFixedSize.nComponents; ++i)
-	{
-		NppiSize oBlocks;
-		NppiSize oBlocksPerMCU = { oFrameHeaderFixedSize.aSamplingFactors[i] >> 4, oFrameHeaderFixedSize.aSamplingFactors[i] & 0x0f };
-
-		oBlocks.width = (int)ceil((oFrameHeaderFixedSize.nWidth + 7) / 8 *
-			static_cast<float>(oBlocksPerMCU.width) / nMCUBlocksHFixedSize);
-		oBlocks.width = DivUp(oBlocks.width, oBlocksPerMCU.width) * oBlocksPerMCU.width;
-
-		oBlocks.height = (int)ceil((oFrameHeaderFixedSize.nHeight + 7) / 8 *
-			static_cast<float>(oBlocksPerMCU.height) / nMCUBlocksVFixedSize);
-		oBlocks.height = DivUp(oBlocks.height, oBlocksPerMCU.height) * oBlocksPerMCU.height;
-
-		aSrcSize[i].width = oBlocks.width * 8;
-		aSrcSize[i].height = oBlocks.height * 8;
-
-		// Allocate Memory
-		size_t nPitch;
-		NPP_CHECK_CUDA(cudaMallocPitch(&apdDCT[i], &nPitch, oBlocks.width * 64 * sizeof(Npp16s), oBlocks.height));
-		aDCTStep[i] = static_cast<Npp32s>(nPitch);
-
-		NPP_CHECK_CUDA(cudaMallocPitch(&apSrcImage[i], &nPitch, aSrcSize[i].width, aSrcSize[i].height));
-
-		aSrcPitch[i] = nPitch;
-		aSrcImageStep[i] = static_cast<Npp32s>(nPitch);
-	}
-
-	return 0;
-}
-
-int jpegNPP(const char *szOutputFile, float* d_srcRGB)
-{
-	//RGB2YUV
-	cudaError_t cudaStatus;
-	cudaStatus = cuda_common::RGB2YUV(d_srcRGB, oFrameHeaderFixedSize.nWidth, oFrameHeaderFixedSize.nHeight,
-		apSrcImage[0], aSrcPitch[0], aSrcSize[0].width, aSrcSize[0].height,
-		apSrcImage[1], aSrcPitch[1], aSrcSize[1].width, aSrcSize[1].height,
-		apSrcImage[2], aSrcPitch[2], aSrcSize[2].width, aSrcSize[2].height);
-
-	/**
-	* Forward DCT, quantization and level shift part of the JPEG encoding.
-	* Input is expected in 8x8 macro blocks and output is expected to be in 64x1
-	* macro blocks. The new version of the primitive takes the ROI in image pixel size and
-	* works with DCT coefficients that are in zig-zag order.
-	*/
-	int k = 0;
-	//LOG_INFO("NPP_CHECK_NPP:%d", 1);
-	if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[0], aSrcImageStep[0],
-		apdDCT[0], aDCTStep[0],
-		pdQuantizationTables + k * 64,
-		aSrcSize[0],
-		pDCTState)))
-	{
-		printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n");
-		return EXIT_FAILURE;
-	}
-
-	k = 1;
-	//LOG_INFO("NPP_CHECK_NPP:%d", 2);
-	if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[1], aSrcImageStep[1],
-		apdDCT[1], aDCTStep[1],
-		pdQuantizationTables + k * 64,
-		aSrcSize[1],
-		pDCTState)))
-	{
-		printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n");
-		return EXIT_FAILURE;
-	}
-
-	//LOG_INFO("NPP_CHECK_NPP:%d", 3);
-	if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[2], aSrcImageStep[2],
-		apdDCT[2], aDCTStep[2],
-		pdQuantizationTables + k * 64,
-		aSrcSize[2],
-		pDCTState)))
-	{
-		printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n");
-		return EXIT_FAILURE;
-	}
-
-	// Huffman Encoding
-
-	Npp32s nScanLength;
-	Npp8u *pJpegEncoderTemp;
-
-#if (CUDA_VERSION == 8000)
-		Npp32s nTempSize; //when using CUDA8
-#else
-		size_t nTempSize; //when using CUDA9
-#endif
-	//modified by Junlin 190221
-
-	//LOG_INFO("NPP_CHECK_NPP:%d",4);
-	if (NPP_SUCCESS != (nppiEncodeHuffmanGetSize(aSrcSize[0], 3, &nTempSize)))
-	{
-		printf("nppiEncodeHuffmanGetSize Failed!\n");
-		return EXIT_FAILURE;
-	}
-
-	//LOG_INFO("NPP_CHECK_CUDA:%d",5);
-	NPP_CHECK_CUDA(cudaMalloc(&pJpegEncoderTemp, nTempSize));
-
-	/**
-	* Allocates memory and creates a Huffman table in a format that is suitable for the encoder.
-	*/
-	NppStatus t_status;
-	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[0].aCodes, nppiDCTable, &apHuffmanDCTable[0]);
-	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[0].aCodes, nppiACTable, &apHuffmanACTable[0]);
-	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[1]);
-	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[1]);
-	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[2]);
-	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[2]);
-
-	/**
-	* Huffman Encoding of the JPEG Encoding.
-	* Input is expected to be 64x1 macro blocks and output is expected as byte stuffed huffman encoded JPEG scan.
-	*/
-	Npp32s nSs = 0;
-	Npp32s nSe = 63;
-	Npp32s nH = 0;
-	Npp32s nL = 0;
-	//LOG_INFO("NPP_CHECK_NPP:%d",6);
-	if (NPP_SUCCESS != (nppiEncodeHuffmanScan_JPEG_8u16s_P3R(apdDCT, aDCTStep,
-		0, nSs, nSe, nH, nL,
-		pdScan, &nScanLength,
-		apHuffmanDCTable,
-		apHuffmanACTable,
-		aSrcSize,
-		pJpegEncoderTemp)))
-	{
-		printf("nppiEncodeHuffmanScan_JPEG_8u16s_P3R Failed!\n");
-		return EXIT_FAILURE;
-	}
-
-	for (int i = 0; i < 3; ++i)
-	{
-		nppiEncodeHuffmanSpecFree_JPEG(apHuffmanDCTable[i]);
-		nppiEncodeHuffmanSpecFree_JPEG(apHuffmanACTable[i]);
-	}
-	// Write JPEG
-	pDstJpeg = new unsigned char[4 << 20]{};
-	pDstOutput = pDstJpeg;
-
-	writeMarker(0x0D8, pDstOutput);
-	writeJFIFTag(pDstOutput);
-	writeQuantizationTable(aQuantizationTables[0], pDstOutput);
-	writeQuantizationTable(aQuantizationTables[1], pDstOutput);
-	writeHuffmanTable(pHuffmanDCTables[0], pDstOutput);
-	writeHuffmanTable(pHuffmanACTables[0], pDstOutput);
-	writeHuffmanTable(pHuffmanDCTables[1], pDstOutput);
-	writeHuffmanTable(pHuffmanACTables[1], pDstOutput);
-	writeFrameHeader(oFrameHeaderFixedSize, pDstOutput);
-	writeScanHeader(oScanHeader, pDstOutput);
-
-	//LOG_INFO("NPP_CHECK_CUDA:%d",7);
-	NPP_CHECK_CUDA(cudaMemcpy(pDstOutput, pdScan, nScanLength, cudaMemcpyDeviceToHost));
-
-	pDstOutput += nScanLength;
-	writeMarker(0x0D9, pDstOutput);
-	{
-		// Write result to file.
-		std::ofstream outputFile(szOutputFile, ios::out | ios::binary);
-		outputFile.write(reinterpret_cast<const char *>(pDstJpeg), static_cast<int>(pDstOutput - pDstJpeg));
-	}
-
-	// Cleanup
-	cudaFree(pJpegEncoderTemp);
-	delete[] pDstJpeg;
-
-
-	return EXIT_SUCCESS;
-}
-
-int jpegNPP(const char *szOutputFile, unsigned char* d_srcRGB)
-{
-	//RGB2YUV
-	cudaError_t cudaStatus;
-	cudaStatus = cuda_common::RGB2YUV(d_srcRGB, oFrameHeaderFixedSize.nWidth, oFrameHeaderFixedSize.nHeight,
-		apSrcImage[0], aSrcPitch[0], aSrcSize[0].width, aSrcSize[0].height,
-		apSrcImage[1], aSrcPitch[1], aSrcSize[1].width, aSrcSize[1].height,
-		apSrcImage[2], aSrcPitch[2], aSrcSize[2].width, aSrcSize[2].height);
-
-	/**
-	* Forward DCT, quantization and level shift part of the JPEG encoding.
-	* Input is expected in 8x8 macro blocks and output is expected to be in 64x1
-	* macro blocks. The new version of the primitive takes the ROI in image pixel size and
-	* works with DCT coefficients that are in zig-zag order.
-	*/
-	int k = 0;
-	//LOG_INFO("NPP_CHECK_NPP:%d", 1);
-	if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[0], aSrcImageStep[0],
-		apdDCT[0], aDCTStep[0],
-		pdQuantizationTables + k * 64,
-		aSrcSize[0],
-		pDCTState)))
-	{
-		printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n");
-		return EXIT_FAILURE;
-	}
-
-	k = 1;
-	//LOG_INFO("NPP_CHECK_NPP:%d", 2);
-	if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[1], aSrcImageStep[1],
-		apdDCT[1], aDCTStep[1],
-		pdQuantizationTables + k * 64,
-		aSrcSize[1],
-		pDCTState)))
-	{
-		printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n");
-		return EXIT_FAILURE;
-	}
-
-	//LOG_INFO("NPP_CHECK_NPP:%d", 3);
-	if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[2], aSrcImageStep[2],
-		apdDCT[2], aDCTStep[2],
-		pdQuantizationTables + k * 64,
-		aSrcSize[2],
-		pDCTState)))
-	{
-		printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n");
-		return EXIT_FAILURE;
-	}
-
-	// Huffman Encoding
-
-	Npp32s nScanLength;
-	Npp8u *pJpegEncoderTemp;
-
-#if (CUDA_VERSION == 8000)
-	Npp32s nTempSize; //when using CUDA8
-#else
-	size_t nTempSize; //when using CUDA9
-#endif
-					  //modified by Junlin 190221
-
-					  //LOG_INFO("NPP_CHECK_NPP:%d",4);
-	if (NPP_SUCCESS != (nppiEncodeHuffmanGetSize(aSrcSize[0], 3, &nTempSize)))
-	{
-		printf("nppiEncodeHuffmanGetSize Failed!\n");
-		return EXIT_FAILURE;
-	}
-
-	//LOG_INFO("NPP_CHECK_CUDA:%d",5);
-	NPP_CHECK_CUDA(cudaMalloc(&pJpegEncoderTemp, nTempSize));
-
-	/**
-	* Allocates memory and creates a Huffman table in a format that is suitable for the encoder.
-	*/
-	NppStatus t_status;
-	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[0].aCodes, nppiDCTable, &apHuffmanDCTable[0]);
-	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[0].aCodes, nppiACTable, &apHuffmanACTable[0]);
-	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[1]);
-	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[1]);
-	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[2]);
-	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[2]);
-
-	/**
-	* Huffman Encoding of the JPEG Encoding.
-	* Input is expected to be 64x1 macro blocks and output is expected as byte stuffed huffman encoded JPEG scan.
-	*/
-	Npp32s nSs = 0;
-	Npp32s nSe = 63;
-	Npp32s nH = 0;
-	Npp32s nL = 0;
-	//LOG_INFO("NPP_CHECK_NPP:%d",6);
-	if (NPP_SUCCESS != (nppiEncodeHuffmanScan_JPEG_8u16s_P3R(apdDCT, aDCTStep,
-		0, nSs, nSe, nH, nL,
-		pdScan, &nScanLength,
-		apHuffmanDCTable,
-		apHuffmanACTable,
-		aSrcSize,
-		pJpegEncoderTemp)))
-	{
-		printf("nppiEncodeHuffmanScan_JPEG_8u16s_P3R Failed!\n");
-		return EXIT_FAILURE;
-	}
-
-	for (int i = 0; i < 3; ++i)
-	{
-		nppiEncodeHuffmanSpecFree_JPEG(apHuffmanDCTable[i]);
-		nppiEncodeHuffmanSpecFree_JPEG(apHuffmanACTable[i]);
-	}
-	// Write JPEG
-	pDstJpeg = new unsigned char[4 << 20]{};
-	pDstOutput = pDstJpeg;
-
-	writeMarker(0x0D8, pDstOutput);
-	writeJFIFTag(pDstOutput);
-	writeQuantizationTable(aQuantizationTables[0], pDstOutput);
-	writeQuantizationTable(aQuantizationTables[1], pDstOutput);
-	writeHuffmanTable(pHuffmanDCTables[0], pDstOutput);
-	writeHuffmanTable(pHuffmanACTables[0], pDstOutput);
-	writeHuffmanTable(pHuffmanDCTables[1], pDstOutput);
-	writeHuffmanTable(pHuffmanACTables[1], pDstOutput);
-	writeFrameHeader(oFrameHeaderFixedSize, pDstOutput);
-	writeScanHeader(oScanHeader, pDstOutput);
-
-	//LOG_INFO("NPP_CHECK_CUDA:%d",7);
-	NPP_CHECK_CUDA(cudaMemcpy(pDstOutput, pdScan, nScanLength, cudaMemcpyDeviceToHost));
-
-	pDstOutput += nScanLength;
-	writeMarker(0x0D9, pDstOutput);
-	{
-		// Write result to file.
-		std::ofstream outputFile(szOutputFile, ios::out | ios::binary);
-		outputFile.write(reinterpret_cast<const char *>(pDstJpeg), static_cast<int>(pDstOutput - pDstJpeg));
-	}
-
-	// Cleanup
-	cudaFree(pJpegEncoderTemp);
-	delete[] pDstJpeg;
-
-
-	return EXIT_SUCCESS;
-}
-
-
-int jpegNPP(const char *szOutputFile, float* d_srcRGB, int img_width, int img_height)
-{
-	NppiSize aSrcSize[3];
-	Npp16s *apdDCT[3] = { 0, 0, 0 };
-	Npp32s aDCTStep[3];
-
-	Npp8u *apSrcImage[3] = { 0, 0, 0 };
-	Npp32s aSrcImageStep[3];
-	size_t aSrcPitch[3];
-
-
-	//????帧头
-	oFrameHeader.nWidth = img_width;
-	oFrameHeader.nHeight = img_height;
-
-	for (int i = 0; i < oFrameHeader.nComponents; ++i)
-	{
-		NppiSize oBlocks;
-		NppiSize oBlocksPerMCU = { oFrameHeader.aSamplingFactors[i] >> 4, oFrameHeader.aSamplingFactors[i] & 0x0f };
-
-		oBlocks.width = (int)ceil((oFrameHeader.nWidth + 7) / 8 *
-			static_cast<float>(oBlocksPerMCU.width) / nMCUBlocksH);
-		oBlocks.width = DivUp(oBlocks.width, oBlocksPerMCU.width) * oBlocksPerMCU.width;
-
-		oBlocks.height = (int)ceil((oFrameHeader.nHeight + 7) / 8 *
-			static_cast<float>(oBlocksPerMCU.height) / nMCUBlocksV);
-		oBlocks.height = DivUp(oBlocks.height, oBlocksPerMCU.height) * oBlocksPerMCU.height;
-
-		aSrcSize[i].width = oBlocks.width * 8;
-		aSrcSize[i].height = oBlocks.height * 8;
-
-		// Allocate Memory
-		size_t nPitch;
-		//LOG_INFO("NPP_CHECK_CUDA:%d",1);
-		NPP_CHECK_CUDA(cudaMallocPitch(&apdDCT[i], &nPitch, oBlocks.width * 64 * sizeof(Npp16s), oBlocks.height));
-		aDCTStep[i] = static_cast<Npp32s>(nPitch);
-
-		//LOG_INFO("NPP_CHECK_CUDA:%d",2);
-		NPP_CHECK_CUDA(cudaMallocPitch(&apSrcImage[i], &nPitch, aSrcSize[i].width, aSrcSize[i].height));
-
-		aSrcPitch[i] = nPitch;
-		aSrcImageStep[i] = static_cast<Npp32s>(nPitch);
-	}
-
-	//RGB2YUV
-	cudaError_t cudaStatus;
-	cudaStatus = cuda_common::RGB2YUV(d_srcRGB, img_width, img_height,
-		apSrcImage[0], aSrcPitch[0], aSrcSize[0].width, aSrcSize[0].height,
-		apSrcImage[1], aSrcPitch[1], aSrcSize[1].width, aSrcSize[1].height,
-		apSrcImage[2], aSrcPitch[2], aSrcSize[2].width, aSrcSize[2].height);
-
-	/**
-	* Forward DCT, quantization and level shift part of the JPEG encoding.
-	* Input is expected in 8x8 macro blocks and output is expected to be in 64x1
-	* macro blocks. The new version of the primitive takes the ROI in image pixel size and
-	* works with DCT coefficients that are in zig-zag order.
-	*/
-	int k = 0;
-	//LOG_INFO("NPP_CHECK_CUDA:%d",3);
-	if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[0], aSrcImageStep[0],
-		apdDCT[0], aDCTStep[0],
-		pdQuantizationTables + k * 64,
-		aSrcSize[0],
-		pDCTState)))
-	{
-		printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n");
-		return EXIT_FAILURE;
-	}
-	k = 1;
-
-	//LOG_INFO("NPP_CHECK_CUDA:%d",4);
-	if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[1], aSrcImageStep[1],
-		apdDCT[1], aDCTStep[1],
-		pdQuantizationTables + k * 64,
-		aSrcSize[1],
-		pDCTState)))
-	{
-		printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n");
-		return EXIT_FAILURE;
-	}
-
-	//LOG_INFO("NPP_CHECK_CUDA:%d",5);
-	if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[2], aSrcImageStep[2],
-		apdDCT[2], aDCTStep[2],
-		pdQuantizationTables + k * 64,
-		aSrcSize[2],
-		pDCTState)))
-	{
-		printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n");
-		return EXIT_FAILURE;
-	}
-
-	// Huffman Encoding
-
-	Npp32s nScanLength;
-	Npp8u *pJpegEncoderTemp;
-
-#if (CUDA_VERSION == 8000)
-	Npp32s nTempSize; //when using CUDA8
-#else
-	size_t nTempSize; //when using CUDA9
-#endif
-					  //modified by Junlin 190221
-
-	//LOG_INFO("NPP_CHECK_CUDA:%d",6);
-	if (NPP_SUCCESS != (nppiEncodeHuffmanGetSize(aSrcSize[0], 3, &nTempSize)))
-	{
-		printf("nppiEncodeHuffmanGetSize Failed!\n");
-		return EXIT_FAILURE;
-	}
-
-	//LOG_INFO("NPP_CHECK_CUDA:%d",7);
-	NPP_CHECK_CUDA(cudaMalloc(&pJpegEncoderTemp, nTempSize));
-
-	/**
-	* Allocates memory and creates a Huffman table in a format that is suitable for the encoder.
-	*/
-	NppStatus t_status;
-	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[0].aCodes, nppiDCTable, &apHuffmanDCTable[0]);
-	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[0].aCodes, nppiACTable, &apHuffmanACTable[0]);
-	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[1]);
-	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[1]);
-	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[2]);
-	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[2]);
-
-	/**
-	* Huffman Encoding of the JPEG Encoding.
-	* Input is expected to be 64x1 macro blocks and output is expected as byte stuffed huffman encoded JPEG scan.
-	*/
-	Npp32s nSs = 0;
-	Npp32s nSe = 63;
-	Npp32s nH = 0;
-	Npp32s nL = 0;
-	//LOG_INFO("NPP_CHECK_CUDA:%d",8);
-	if (NPP_SUCCESS != (nppiEncodeHuffmanScan_JPEG_8u16s_P3R(apdDCT, aDCTStep,
-		0, nSs, nSe, nH, nL,
-		pdScan, &nScanLength,
-		apHuffmanDCTable,
-		apHuffmanACTable,
-		aSrcSize,
-		pJpegEncoderTemp)))
-	{
-		printf("nppiEncodeHuffmanScan_JPEG_8u16s_P3R Failed!\n");
-		return EXIT_FAILURE;
-	}
-
-	for (int i = 0; i < 3; ++i)
-	{
-		nppiEncodeHuffmanSpecFree_JPEG(apHuffmanDCTable[i]);
-		nppiEncodeHuffmanSpecFree_JPEG(apHuffmanACTable[i]);
-	}
-	// Write JPEG
-	pDstJpeg = new unsigned char[4 << 20]{};
-	pDstOutput = pDstJpeg;
-
-	writeMarker(0x0D8, pDstOutput);
-	writeJFIFTag(pDstOutput);
-	writeQuantizationTable(aQuantizationTables[0], pDstOutput);
-	writeQuantizationTable(aQuantizationTables[1], pDstOutput);
-	writeHuffmanTable(pHuffmanDCTables[0], pDstOutput);
-	writeHuffmanTable(pHuffmanACTables[0], pDstOutput);
-	writeHuffmanTable(pHuffmanDCTables[1], pDstOutput);
-	writeHuffmanTable(pHuffmanACTables[1], pDstOutput);
-	writeFrameHeader(oFrameHeader, pDstOutput);
-	writeScanHeader(oScanHeader, pDstOutput);
-
-	//LOG_INFO("NPP_CHECK_CUDA:%d",9);
-	NPP_CHECK_CUDA(cudaMemcpy(pDstOutput, pdScan, nScanLength, cudaMemcpyDeviceToHost));
-
-	pDstOutput += nScanLength;
-	writeMarker(0x0D9, pDstOutput);
-
-	{
-		// Write result to file.
-		std::ofstream outputFile(szOutputFile, ios::out | ios::binary);
-		outputFile.write(reinterpret_cast<const char *>(pDstJpeg), static_cast<int>(pDstOutput - pDstJpeg));
-	}
-
-	// Cleanup
-	cudaFree(pJpegEncoderTemp);
-	delete[] pDstJpeg;
-	for (int i = 0; i < 3; ++i)
-	{
-		cudaFree(apdDCT[i]);
-		cudaFree(apSrcImage[i]);
-	}
-
-	return EXIT_SUCCESS;
-}
-
-
-int jpegNPP(const char *szOutputFile, unsigned char* d_srcRGB, int img_width, int img_height)
-{
-	NppiSize aSrcSize[3];
-	Npp16s *apdDCT[3] = { 0, 0, 0 };
-	Npp32s aDCTStep[3];
-
-	Npp8u *apSrcImage[3] = { 0, 0, 0 };
-	Npp32s aSrcImageStep[3];
-	size_t aSrcPitch[3];
-
-
-	//????帧头
-	oFrameHeader.nWidth = img_width;
-	oFrameHeader.nHeight = img_height;
-
-	for (int i = 0; i < oFrameHeader.nComponents; ++i)
-	{
-		NppiSize oBlocks;
-		NppiSize oBlocksPerMCU = { oFrameHeader.aSamplingFactors[i] >> 4, oFrameHeader.aSamplingFactors[i] & 0x0f };
-
-		oBlocks.width = (int)ceil((oFrameHeader.nWidth + 7) / 8 *
-			static_cast<float>(oBlocksPerMCU.width) / nMCUBlocksH);
-		oBlocks.width = DivUp(oBlocks.width, oBlocksPerMCU.width) * oBlocksPerMCU.width;
-
-		oBlocks.height = (int)ceil((oFrameHeader.nHeight + 7) / 8 *
-			static_cast<float>(oBlocksPerMCU.height) / nMCUBlocksV);
-		oBlocks.height = DivUp(oBlocks.height, oBlocksPerMCU.height) * oBlocksPerMCU.height;
-
-		aSrcSize[i].width = oBlocks.width * 8;
-		aSrcSize[i].height = oBlocks.height * 8;
-
-		// Allocate Memory
-		size_t nPitch;
-		//LOG_INFO("NPP_CHECK_CUDA:%d",1);
-		NPP_CHECK_CUDA(cudaMallocPitch(&apdDCT[i], &nPitch, oBlocks.width * 64 * sizeof(Npp16s), oBlocks.height));
-		aDCTStep[i] = static_cast<Npp32s>(nPitch);
-
-		//LOG_INFO("NPP_CHECK_CUDA:%d",2);
-		NPP_CHECK_CUDA(cudaMallocPitch(&apSrcImage[i], &nPitch, aSrcSize[i].width, aSrcSize[i].height));
-
-		aSrcPitch[i] = nPitch;
-		aSrcImageStep[i] = static_cast<Npp32s>(nPitch);
-	}
-
-	//RGB2YUV
-	cudaError_t cudaStatus;
-	cudaStatus = cuda_common::RGB2YUV(d_srcRGB, img_width, img_height,
-		apSrcImage[0], aSrcPitch[0], aSrcSize[0].width, aSrcSize[0].height,
-		apSrcImage[1], aSrcPitch[1], aSrcSize[1].width, aSrcSize[1].height,
-		apSrcImage[2], aSrcPitch[2], aSrcSize[2].width, aSrcSize[2].height);
-
-	/**
-	* Forward DCT, quantization and level shift part of the JPEG encoding.
-	* Input is expected in 8x8 macro blocks and output is expected to be in 64x1
-	* macro blocks. The new version of the primitive takes the ROI in image pixel size and
-	* works with DCT coefficients that are in zig-zag order.
-	*/
-	int k = 0;
-	//LOG_INFO("NPP_CHECK_CUDA:%d",3);
-	if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[0], aSrcImageStep[0],
-		apdDCT[0], aDCTStep[0],
-		pdQuantizationTables + k * 64,
-		aSrcSize[0],
-		pDCTState)))
-	{
-		printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n");
-		return EXIT_FAILURE;
-	}
-	k = 1;
-
-	//LOG_INFO("NPP_CHECK_CUDA:%d",4);
-	if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[1], aSrcImageStep[1],
-		apdDCT[1], aDCTStep[1],
-		pdQuantizationTables + k * 64,
-		aSrcSize[1],
-		pDCTState)))
-	{
-		printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n");
-		return EXIT_FAILURE;
-	}
-
-	//LOG_INFO("NPP_CHECK_CUDA:%d",5);
-	if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[2], aSrcImageStep[2],
-		apdDCT[2], aDCTStep[2],
-		pdQuantizationTables + k * 64,
-		aSrcSize[2],
-		pDCTState)))
-	{
-		printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n");
-		return EXIT_FAILURE;
-	}
-
-	// Huffman Encoding
-
-	Npp32s nScanLength;
-	Npp8u *pJpegEncoderTemp;
-
-#if (CUDA_VERSION == 8000)
-	Npp32s nTempSize; //when using CUDA8
-#else
-	size_t nTempSize; //when using CUDA9
-#endif
-					  //modified by Junlin 190221
-
-					  //LOG_INFO("NPP_CHECK_CUDA:%d",6);
-	if (NPP_SUCCESS != (nppiEncodeHuffmanGetSize(aSrcSize[0], 3, &nTempSize)))
-	{
-		printf("nppiEncodeHuffmanGetSize Failed!\n");
-		return EXIT_FAILURE;
-	}
-
-	//LOG_INFO("NPP_CHECK_CUDA:%d",7);
-	NPP_CHECK_CUDA(cudaMalloc(&pJpegEncoderTemp, nTempSize));
-
-	/**
-	* Allocates memory and creates a Huffman table in a format that is suitable for the encoder.
-	*/
-	NppStatus t_status;
-	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[0].aCodes, nppiDCTable, &apHuffmanDCTable[0]);
-	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[0].aCodes, nppiACTable, &apHuffmanACTable[0]);
-	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[1]);
-	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[1]);
-	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[2]);
-	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[2]);
-
-	/**
-	* Huffman Encoding of the JPEG Encoding.
-	* Input is expected to be 64x1 macro blocks and output is expected as byte stuffed huffman encoded JPEG scan.
-	*/
-	Npp32s nSs = 0;
-	Npp32s nSe = 63;
-	Npp32s nH = 0;
-	Npp32s nL = 0;
-	//LOG_INFO("NPP_CHECK_CUDA:%d",8);
-	if (NPP_SUCCESS != (nppiEncodeHuffmanScan_JPEG_8u16s_P3R(apdDCT, aDCTStep,
-		0, nSs, nSe, nH, nL,
-		pdScan, &nScanLength,
-		apHuffmanDCTable,
-		apHuffmanACTable,
-		aSrcSize,
-		pJpegEncoderTemp)))
-	{
-		printf("nppiEncodeHuffmanScan_JPEG_8u16s_P3R Failed!\n");
-		return EXIT_FAILURE;
-	}
-
-	for (int i = 0; i < 3; ++i)
-	{
-		nppiEncodeHuffmanSpecFree_JPEG(apHuffmanDCTable[i]);
-		nppiEncodeHuffmanSpecFree_JPEG(apHuffmanACTable[i]);
-	}
-	// Write JPEG
-	pDstJpeg = new unsigned char[4 << 20]{};
-	pDstOutput = pDstJpeg;
-
-	writeMarker(0x0D8, pDstOutput);
-	writeJFIFTag(pDstOutput);
-	writeQuantizationTable(aQuantizationTables[0], pDstOutput);
-	writeQuantizationTable(aQuantizationTables[1], pDstOutput);
-	writeHuffmanTable(pHuffmanDCTables[0], pDstOutput);
-	writeHuffmanTable(pHuffmanACTables[0], pDstOutput);
-	writeHuffmanTable(pHuffmanDCTables[1], pDstOutput);
-	writeHuffmanTable(pHuffmanACTables[1], pDstOutput);
-	writeFrameHeader(oFrameHeader, pDstOutput);
-	writeScanHeader(oScanHeader, pDstOutput);
-
-	//LOG_INFO("NPP_CHECK_CUDA:%d",9);
-	NPP_CHECK_CUDA(cudaMemcpy(pDstOutput, pdScan, nScanLength, cudaMemcpyDeviceToHost));
-
-	pDstOutput += nScanLength;
-	writeMarker(0x0D9, pDstOutput);
-
-	{
-		// Write result to file.
-		std::ofstream outputFile(szOutputFile, ios::out | ios::binary);
-		outputFile.write(reinterpret_cast<const char *>(pDstJpeg), static_cast<int>(pDstOutput - pDstJpeg));
-	}
-
-	// Cleanup
-	cudaFree(pJpegEncoderTemp);
-	delete[] pDstJpeg;
-	for (int i = 0; i < 3; ++i)
-	{
-		cudaFree(apdDCT[i]);
-		cudaFree(apSrcImage[i]);
-	}
-
-	return EXIT_SUCCESS;
-}
--
libgit2 0.21.4