From 5686354a692a7987f1bb58f2dbe10f39dcbcbd1e Mon Sep 17 00:00:00 2001 From: fiss <2657262686@qq.com> Date: Mon, 20 Mar 2023 23:30:34 -0400 Subject: [PATCH] 初步编译成功cuvid部分的 --- src/Makefile | 8 ++++---- src/Makefile.bak | 4 ++-- src/demo/Makefile | 104 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------------------------- src/demo/Makefile.o.nvdec | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/demo/main_nvdec.cpp | 101 ++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------------------------------- src/gb28181/FFGB28181Decoder.cpp | 7 ++++--- src/gb28181/Makefile | 4 +++- src/interface/FFNvDecoderManager.cpp | 2 +- src/interface/Makefile | 10 ++++++---- src/nvdec/DrawImageOnGPU.cu | 126 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/nvdec/FFCuContextManager.cpp | 41 +++++++++++++++++++++++++++++++++++++++++ src/nvdec/FFCuContextManager.h | 28 ++++++++++++++++++++++++++++ src/nvdec/FFNvDecoder.cpp | 513 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/nvdec/FFNvDecoder.h | 107 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/nvdec/GpuRgbMemory.hpp | 34 ++++++++++++++++++++++++++++++++++ src/nvdec/ImageSaveGPU.cpp | 123 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/nvdec/ImageSaveGPU.h | 65 
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/nvdec/Makefile | 101 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/nvdec/NV12ToRGB.cu | 345 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/nvdec/NvDecoderApi.cpp | 133 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/nvdec/NvDecoderApi.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ src/nvdec/NvJpegEncoder.cpp | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/nvdec/NvJpegEncoder.h | 3 +++ src/nvdec/PartMemCopy.cu | 289 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/nvdec/RGB2YUV.cu | 263 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/nvdec/ResizeImage.cu | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/nvdec/common_header.h | 9 +++++++++ src/nvdec/cuda_kernels.h | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/nvdec/define.hpp | 11 +++++++++++ src/nvdec/jpegNPP.cpp-1 | 1193 
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/nvdecoder/DrawImageOnGPU.cu | 126 ------------------------------------------------------------------------------------------------------------------------------ src/nvdecoder/FFCuContextManager.cpp | 41 ----------------------------------------- src/nvdecoder/FFCuContextManager.h | 28 ---------------------------- src/nvdecoder/FFNvDecoder.cpp | 513 
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- src/nvdecoder/FFNvDecoder.h | 107 ----------------------------------------------------------------------------------------------------------- src/nvdecoder/GpuRgbMemory.hpp | 34 ---------------------------------- src/nvdecoder/ImageSaveGPU.cpp | 123 --------------------------------------------------------------------------------------------------------------------------- src/nvdecoder/ImageSaveGPU.h | 65 ----------------------------------------------------------------- src/nvdecoder/Makefile | 102 ------------------------------------------------------------------------------------------------------ src/nvdecoder/NV12ToRGB.cu | 345 --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- src/nvdecoder/NvDecoderApi.cpp | 133 ------------------------------------------------------------------------------------------------------------------------------------- src/nvdecoder/NvDecoderApi.h | 44 -------------------------------------------- src/nvdecoder/NvJpegEncoder.cpp | 90 ------------------------------------------------------------------------------------------ src/nvdecoder/NvJpegEncoder.h | 3 --- src/nvdecoder/PartMemCopy.cu | 289 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- src/nvdecoder/RGB2YUV.cu | 263 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- src/nvdecoder/ResizeImage.cu | 84 ------------------------------------------------------------------------------------ src/nvdecoder/common_header.h | 9 --------- src/nvdecoder/cuda_kernels.h | 63 --------------------------------------------------------------- src/nvdecoder/define.hpp | 11 ----------- src/nvdecoder/jpegNPP.cpp-1 | 1193 ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------- 51 files changed, 3857 insertions(+), 3775 deletions(-) create mode 100644 src/demo/Makefile.o.nvdec create mode 100644 src/nvdec/DrawImageOnGPU.cu create mode 100644 src/nvdec/FFCuContextManager.cpp create mode 100644 src/nvdec/FFCuContextManager.h create mode 100644 src/nvdec/FFNvDecoder.cpp create mode 100644 src/nvdec/FFNvDecoder.h create mode 100644 src/nvdec/GpuRgbMemory.hpp create mode 100644 src/nvdec/ImageSaveGPU.cpp create mode 100644 src/nvdec/ImageSaveGPU.h create mode 100644 src/nvdec/Makefile create mode 100644 src/nvdec/NV12ToRGB.cu create mode 100644 src/nvdec/NvDecoderApi.cpp create mode 100644 src/nvdec/NvDecoderApi.h create mode 100644 src/nvdec/NvJpegEncoder.cpp create mode 100644 src/nvdec/NvJpegEncoder.h create mode 100644 src/nvdec/PartMemCopy.cu create mode 100644 src/nvdec/RGB2YUV.cu create mode 100644 src/nvdec/ResizeImage.cu create mode 100644 src/nvdec/common_header.h create mode 100644 src/nvdec/cuda_kernels.h create mode 100644 src/nvdec/define.hpp create mode 100644 src/nvdec/jpegNPP.cpp-1 delete mode 100644 src/nvdecoder/DrawImageOnGPU.cu delete mode 100644 src/nvdecoder/FFCuContextManager.cpp delete mode 100644 src/nvdecoder/FFCuContextManager.h delete mode 100644 src/nvdecoder/FFNvDecoder.cpp delete mode 100644 src/nvdecoder/FFNvDecoder.h delete mode 100644 src/nvdecoder/GpuRgbMemory.hpp delete mode 100644 src/nvdecoder/ImageSaveGPU.cpp delete mode 100644 src/nvdecoder/ImageSaveGPU.h delete mode 100644 src/nvdecoder/Makefile delete mode 100644 src/nvdecoder/NV12ToRGB.cu delete mode 100644 src/nvdecoder/NvDecoderApi.cpp delete mode 100644 src/nvdecoder/NvDecoderApi.h delete mode 100644 src/nvdecoder/NvJpegEncoder.cpp delete mode 100644 src/nvdecoder/NvJpegEncoder.h delete mode 100644 src/nvdecoder/PartMemCopy.cu delete mode 100644 src/nvdecoder/RGB2YUV.cu delete mode 100644 src/nvdecoder/ResizeImage.cu delete mode 
100644 src/nvdecoder/common_header.h delete mode 100644 src/nvdecoder/cuda_kernels.h delete mode 100644 src/nvdecoder/define.hpp delete mode 100644 src/nvdecoder/jpegNPP.cpp-1 diff --git a/src/Makefile b/src/Makefile index 2b6c8d8..994ba2a 100644 --- a/src/Makefile +++ b/src/Makefile @@ -29,11 +29,11 @@ LDFLAGS:= LIBS:= -L $(SPDLOG_ROOT)/lib -l:libspdlog.a \ # 各个模块 -MODULES:= nvdecoder gb28181 interface demo +MODULES:= nvdec gb28181 interface demo # 各个模块对应的库 -# MODULE_LIBS:=$(BUILD_DIR)/nvdecoder/lib/nvdecoder.a\ -# $(BUILD_DIR)/nvdecoder/lib/gb28181.a\ +# MODULE_LIBS:=$(BUILD_DIR)/nvdec/lib/nvdec.a\ +# $(BUILD_DIR)/nvdec/lib/gb28181.a\ # $(BUILD_DIR)/interface/lib/interface.a\ # 最终目标文件 @@ -46,7 +46,7 @@ all:$(TARGET) # 最终目标依赖关系 $(TARGET):FORCE | $(BIN_DIR) @for n in $(MODULES); do make -s -f $(TOP_DIR)/$$n/Makefile MODULE=$$n || exit "$$?"; done -# @echo -e "\e[32m""Linking executable $(TARGET)""\e[0m" + @echo -e "\e[32m""Linking executable $(TARGET)""\e[0m" #@$(LD) $(LDFLAGS) -o $@ $(MODULE_LIBS) $(LIBS) # 若没有bin目录则自动生成 diff --git a/src/Makefile.bak b/src/Makefile.bak index bddc482..af26493 100644 --- a/src/Makefile.bak +++ b/src/Makefile.bak @@ -38,7 +38,7 @@ CFLAGS= -g -fPIC -O0 $(INCLUDE) -pthread -lrt -lz -std=c++11 -fvisibility=hidden NFLAGS_LIB=-g -c -shared -Xcompiler -fPIC -Xcompiler -fvisibility=hidden NFLAGS = $(NFLAGS_LIB) $(INCLUDE) -std=c++11 -SRCS:=$(wildcard $(SRC_ROOT)/nvdecoder/*.cpp) \ +SRCS:=$(wildcard $(SRC_ROOT)/nvdec/*.cpp) \ $(wildcard $(SRC_ROOT)/gb28181/*.cpp) \ $(wildcard $(SRC_ROOT)/dvpp/*.cpp) OBJS = $(patsubst %.cpp, %.o, $(notdir $(SRCS))) @@ -52,7 +52,7 @@ $(TARGET):$(OBJS) $(CU_OBJS) $(XX) -o $@ $^ $(CFLAGS) $(LIBSPATH) $(LIBS) -Wwrite-strings rm -f *.o -# %.o:$(SRC_ROOT)/nvdecoder/%.cpp +# %.o:$(SRC_ROOT)/nvdec/%.cpp # $(XX) $(CFLAGS) -c $< %.o:$(SRC_ROOT)/gb28181/%.cpp diff --git a/src/demo/Makefile b/src/demo/Makefile index b19cdb2..e608a63 100644 --- a/src/demo/Makefile +++ b/src/demo/Makefile @@ -1,25 +1,41 @@ -XX = g++ +# 
各项目录 +LIB_DIR:=$(BUILD_DIR)/$(MODULE)/lib +DEP_DIR:=$(BUILD_DIR)/$(MODULE)/.dep +OBJ_DIR:=$(BUILD_DIR)/$(MODULE)/obj +SRC_DIR:=$(TOP_DIR)/$(MODULE) +# 源文件以及中间目标文件和依赖文件 +SRCS:=$(notdir $(wildcard $(SRC_DIR)/*.cpp)) +OBJS:=$(addprefix $(OBJ_DIR)/, $(patsubst %.cpp, %.o, $(SRCS))) +DEPS:=$(addprefix $(DEP_DIR)/, $(patsubst %.cpp, %.d,a $(SRCS))) -PROJECT_ROOT= /mnt/data/cmhu/FFNvDecoder +# 自动生成头文件依赖选项 +DEPFLAGS=-MT $@ -MMD -MP -MF $(DEP_DIR)/$*.d -CUDA_ROOT = /usr/local/cuda-11.1 +# 最终目标文件 +TARGET:=/mnt/data/cmhu/FFNvDecoder/bin/lib/demo + + + +PROJECT_ROOT= /mnt/data/cmhu/FFNvDecoder DEPEND_DIR = $(PROJECT_ROOT)/bin THIRDPARTY_ROOT = $(PROJECT_ROOT)/3rdparty SPDLOG_ROOT = $(THIRDPARTY_ROOT)/spdlog-1.9.2/release JRTP_ROOT = $(THIRDPARTY_ROOT)/jrtp_export -SRC_ROOT = $(PROJECT_ROOT)/src +CUDA_ROOT = /usr/local/cuda-11.1 -TARGET= $(PROJECT_ROOT)/bin/lib/demo +LIBSPATH= -L $(DEPEND_DIR)/lib -lavformat -lavcodec -lswscale -lavutil -lavfilter -lswresample -lavdevice \ + -L $(CUDA_ROOT)/lib64 -lcuda -lcudart -lnvcuvid -lcurand -lcublas -lnvjpeg \ + -L $(SPDLOG_ROOT) -l:libspdlog.a \ + -L $(JRTP_ROOT)/jthread/lib -l:libjthread.a \ + -L $(JRTP_ROOT)/jrtplib/lib -l:libjrtp.a -DEFS = -DENABLE_DVPP_INTERFACE -INCLUDE= -I $(SRC_ROOT)/interface \ - -I $(SRC_ROOT)/dvpp \ - -I $(SRC_ROOT)/gb28181 \ - -I $(SRC_ROOT)/nvdecoder \ +INCLUDE= -I $(TOP_DIR)/interface \ + -I $(TOP_DIR)/nvdec \ + -I $(TOP_DIR)/gb28181 \ -I $(DEPEND_DIR)/include \ -I $(CUDA_ROOT)/include \ -I $(TOP_DIR)/common/inc \ @@ -29,51 +45,53 @@ INCLUDE= -I $(SRC_ROOT)/interface \ -I $(JRTP_ROOT)/jrtplib/include/jrtplib3 \ -I $(JRTP_ROOT)/jthread/include/jthread -LIBSPATH= -L $(DEPEND_DIR)/lib -lavformat -lavcodec -lswscale -lavutil -lavfilter -lswresample -lavdevice \ +LIBSPATH= -L $(BUILD_DIR)/interface/lib -l:interface.a \ + -L $(BUILD_DIR)/nvdec/lib -l:nvdec.a \ + -L $(BUILD_DIR)/gb28181/lib -l:gb28181.a \ + -L $(DEPEND_DIR)/lib -lavformat -lavcodec -lswscale -lavutil -lavfilter -lswresample -lavdevice \ -L 
$(CUDA_ROOT)/lib64 -lcuda -lcudart -lnvcuvid -lcurand -lcublas -lnvjpeg \ -L $(SPDLOG_ROOT) -l:libspdlog.a \ -L $(JRTP_ROOT)/jthread/lib -l:libjthread.a \ -L $(JRTP_ROOT)/jrtplib/lib -l:libjrtp.a +CXXFLAGS= -g -O0 -fPIC $(INCLUDE) -lpthread -lrt -lz -fexceptions -std=c++11 -fvisibility=hidden -Wl,-Bsymbolic -ldl -Wwrite-strings + # -DUNICODE -D_UNICODE -# include_dir=-I/usr/local/Ascend/ascend-toolkit/6.3.RC1.alpha001/runtime/include - -# lib_dir=-L/usr/lib \ -# -L/usr/local/lib \ -# -L/usr/local/Ascend/driver/lib64 \ -# -L/usr/local/Ascend/ascend-toolkit/6.3.RC1.alpha001/atc/lib64\ -# -L/usr/local/Ascend/ascend-toolkit/6.3.RC1.alpha001/runtime/lib64 \ -# -L/usr/local/Ascend/ascend-toolkit/6.3.RC1.alpha001/runtime/lib64/stub \ -# -L/usr/local/Ascend/ascend-toolkit/6.3.RC1.alpha001/lib64 \ -# -L/usr/local/Ascend/driver/lib64/driver - -# lib=-lacl_dvpp -lascendcl -lmmpa -lglog -lgflags -lpthread -lz -lacl_dvpp_mpi -lruntime -lascendalog -lc_sec -lmsprofiler -lgert -lge_executor -lge_common \ -# -lgraph -lascend_protobuf -lprofapi -lerror_manager -lexe_graph -lregister -lplatform -# LIBS= -lavformat -lavcodec -lswscale -lavutil -lavfilter -lswresample -lavdevice -# CXXFLAGS= -g -O0 -fPIC $(INCLUDE) $(include_dir) $(DEFS) -lpthread -lrt -lz -fexceptions -std=c++11 -fvisibility=hidden -Wl,-Bsymbolic -ldl +# 默认最终目标 +.PHONY:all +all:$(TARGET) -CXXFLAGS= -g -O0 -fPIC $(INCLUDE) $(DEFS) -lpthread -lrt -lz -fexceptions -std=c++11 -fvisibility=hidden -Wl,-Bsymbolic -ldl +# 生成最终目标 +$(TARGET): $(OBJS) | $(LIB_DIR) + @echo -e "\e[32m""Linking static library $(TARGET)""\e[0m" + @echo -e "$(CXX) -o $@ $^ $(DEPFLAGS) $(CXXFLAGS) $(LIBSPATH) $(MACROS)" + $(CXX) -o $@ $^ $(DEPFLAGS) $(CXXFLAGS) $(LIBSPATH) $(MACROS) -SRCS:=$(wildcard $(SRC_ROOT)/demo/*.cpp) -OBJS = $(patsubst %.cpp, %.o, $(notdir $(SRCS))) +# 若没有lib目录则自动生成 +$(LIB_DIR): + @mkdir -p $@ -OBJ_ROOT = $(PROJECT_ROOT)/src/build -# DVPP_SRCS:=$(wildcard $(OBJ_ROOT)/dvpp/obj/*.o) -INTEFACE_SRCS:=$(wildcard 
$(OBJ_ROOT)/interface/obj/*.o) -NVDECODER_SRCS:=$(wildcard $(OBJ_ROOT)/nvdecoder/obj/*.o) -GB28181_SRCS:=$(wildcard $(OBJ_ROOT)/gb28181/obj/*.o) +# 生成中间目标文件 +$(OBJ_DIR)/%.o:$(SRC_DIR)/%.cpp $(DEP_DIR)/%.d | $(OBJ_DIR) $(DEP_DIR) + @echo -e "\e[33m""Building object $@""\e[0m" + @echo -e "$(CXX) -c $(DEPFLAGS) $(CXXFLAGS) $(INCS) $(LIBSPATH) $(MACROS) -o $@ $<" + $(CXX) -c $(DEPFLAGS) $(CXXFLAGS) $(INCS) $(LIBSPATH) $(MACROS) -o $@ $< +# 若没有obj目录则自动生成 +$(OBJ_DIR): + @mkdir -p $@ -$(TARGET):$(OBJS) $(INTEFACE_SRCS) $(NVDECODER_SRCS) $(GB28181_SRCS) - rm -f $(TARGET) -# @echo -e "\e[33m""Building object $@""\e[0m" -# $(XX) -o $@ $^ $(CXXFLAGS) $(LIBS) $(lib_dir) $(lib) -Wwrite-strings - $(XX) -o $@ $^ $(CXXFLAGS) $(LIBSPATH) -Wwrite-strings - rm -f *.o +# 若没有.dep目录则自动生成 +$(DEP_DIR): + @mkdir -p $@ -%.o:$(SRC_ROOT)/demo/%.cpp - $(XX) $(CXXFLAGS) -c $< +# 依赖文件会在生成中间文件的时候自动生成,这里只是为了防止报错 +$(DEPS): +# 引入中间目标文件头文件依赖关系 +include $(wildcard $(DEPS)) +# 直接删除组件build目录 +.PHONY:clean clean: - rm -f *.o $(TARGET) \ No newline at end of file + @rm -rf $(BUILD_DIR)/$(MODULE) diff --git a/src/demo/Makefile.o.nvdec b/src/demo/Makefile.o.nvdec new file mode 100644 index 0000000..a40488b --- /dev/null +++ b/src/demo/Makefile.o.nvdec @@ -0,0 +1,61 @@ +XX = g++ + + +PROJECT_ROOT= /mnt/data/cmhu/FFNvDecoder + +CUDA_ROOT = /usr/local/cuda-11.1 + +DEPEND_DIR = $(PROJECT_ROOT)/bin +THIRDPARTY_ROOT = $(PROJECT_ROOT)/3rdparty +SPDLOG_ROOT = $(THIRDPARTY_ROOT)/spdlog-1.9.2/release +JRTP_ROOT = $(THIRDPARTY_ROOT)/jrtp_export + +SRC_ROOT = $(PROJECT_ROOT)/src + +TARGET= $(PROJECT_ROOT)/bin/lib/demo + +DEFS = -DENABLE_DVPP_INTERFACE + +INCLUDE= -I $(SRC_ROOT)/interface \ + -I $(SRC_ROOT)/dvpp \ + -I $(SRC_ROOT)/gb28181 \ + -I $(SRC_ROOT)/nvdec \ + -I $(DEPEND_DIR)/include \ + -I $(CUDA_ROOT)/include \ + -I $(TOP_DIR)/common/inc \ + -I $(TOP_DIR)/common/UtilNPP \ + -I $(TOP_DIR)/ \ + -I $(SPDLOG_ROOT)/include \ + -I $(JRTP_ROOT)/jrtplib/include/jrtplib3 \ + -I $(JRTP_ROOT)/jthread/include/jthread + 
+LIBSPATH= -L $(DEPEND_DIR)/lib -lavformat -lavcodec -lswscale -lavutil -lavfilter -lswresample -lavdevice \ + -L $(CUDA_ROOT)/lib64 -lcuda -lcudart -lnvcuvid -lcurand -lcublas -lnvjpeg \ + -L $(SPDLOG_ROOT) -l:libspdlog.a \ + -L $(JRTP_ROOT)/jthread/lib -l:libjthread.a \ + -L $(JRTP_ROOT)/jrtplib/lib -l:libjrtp.a + + +CXXFLAGS= -g -O0 -fPIC $(INCLUDE) $(DEFS) -lpthread -lrt -lz -fexceptions -std=c++11 -fvisibility=hidden -Wl,-Bsymbolic -ldl + +SRCS:=$(wildcard $(SRC_ROOT)/demo/*.cpp) +OBJS = $(patsubst %.cpp, %.o, $(notdir $(SRCS))) + +OBJ_ROOT = $(PROJECT_ROOT)/src/build +INTEFACE_SRCS:=$(wildcard $(OBJ_ROOT)/interface/obj/*.o) +NVDECODER_SRCS:=$(wildcard $(OBJ_ROOT)/nvdec/obj/*.o) +GB28181_SRCS:=$(wildcard $(OBJ_ROOT)/gb28181/obj/*.o) + + +$(TARGET):$(OBJS) $(INTEFACE_SRCS) $(NVDECODER_SRCS) $(GB28181_SRCS) + rm -f $(TARGET) + @echo -e "\e[33m""Building object $@""\e[0m" + $(XX) -o $@ $^ $(CXXFLAGS) $(LIBSPATH) -Wwrite-strings + rm -f *.o + +%.o:$(SRC_ROOT)/demo/%.cpp + $(XX) $(CXXFLAGS) -c $< + + +clean: + rm -f *.o $(TARGET) \ No newline at end of file diff --git a/src/demo/main_nvdec.cpp b/src/demo/main_nvdec.cpp index db2da61..227bc8a 100644 --- a/src/demo/main_nvdec.cpp +++ b/src/demo/main_nvdec.cpp @@ -1,9 +1,8 @@ #include "FFNvDecoderManager.h" #include <iostream> -#include "cuda_kernels.h" - -#include "NvJpegEncoder.h" +// #include "cuda_kernels.h" +// #include "NvJpegEncoder.h" #include <pthread.h> #include <thread> @@ -85,75 +84,75 @@ unsigned char *pHwRgb[2] = {nullptr, nullptr}; int sum1 = 0; int sum2 = 0; -cudaStream_t stream[2]; +// cudaStream_t stream[2]; string data_home = "/mnt/data/cmhu/tmp/"; -#define checkCudaErrors(S) do {CUresult status; \ - status = S; \ - if (status != CUDA_SUCCESS ) std::cout << __LINE__ <<" checkCudaErrors - status = " << status << std::endl; \ - } while (false) +// #define checkCudaErrors(S) do {CUresult status; \ +// status = S; \ +// if (status != CUDA_SUCCESS ) std::cout << __LINE__ <<" checkCudaErrors - status = " 
<< status << std::endl; \ +// } while (false) -static void gpu_helper(int gpuid) -{ - cudaSetDevice(gpuid); +// static void gpu_helper(int gpuid) +// { +// cudaSetDevice(gpuid); - // int *dn; - // cudaMalloc((void **)&dn, 1 * sizeof(int)); +// // int *dn; +// // cudaMalloc((void **)&dn, 1 * sizeof(int)); - size_t free_byte; - size_t total_byte; +// size_t free_byte; +// size_t total_byte; - CUresult cuda_status = cuMemGetInfo(&free_byte, &total_byte); +// CUresult cuda_status = cuMemGetInfo(&free_byte, &total_byte); - const char *pStr = nullptr; - if (CUDA_SUCCESS != cuda_status) { - cuGetErrorString(cuda_status, &pStr); - printf("Error: cudaMemGetInfo fails, %s \n", pStr); - return; - } +// const char *pStr = nullptr; +// if (CUDA_SUCCESS != cuda_status) { +// cuGetErrorString(cuda_status, &pStr); +// printf("Error: cudaMemGetInfo fails, %s \n", pStr); +// return; +// } - double free_db = (double)free_byte; - double total_db = (double)total_byte; - double used_db_1 = (total_db - free_db) / 1024.0 / 1024.0; +// double free_db = (double)free_byte; +// double total_db = (double)total_byte; +// double used_db_1 = (total_db - free_db) / 1024.0 / 1024.0; - std::cout <<"显存已使用 " << used_db_1 << " MB\n"; +// std::cout <<"显存已使用 " << used_db_1 << " MB\n"; - // cudaFree(dn); -} +// // cudaFree(dn); +// } -int CheckCUDAProperty( int devId ) -{ - cuInit(0); +// int CheckCUDAProperty( int devId ) +// { +// cuInit(0); - CUdevice dev = devId; - size_t memSize = 0; - char devName[256] = {0}; - int major = 0, minor = 0; - CUresult rlt = CUDA_SUCCESS; +// CUdevice dev = devId; +// size_t memSize = 0; +// char devName[256] = {0}; +// int major = 0, minor = 0; +// CUresult rlt = CUDA_SUCCESS; - rlt = cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev); - checkCudaErrors( rlt ); +// rlt = cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev); +// checkCudaErrors( rlt ); - rlt = cuDeviceGetAttribute(&minor, 
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev); - checkCudaErrors( rlt ); +// rlt = cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev); +// checkCudaErrors( rlt ); - rlt = cuDeviceGetName( devName, sizeof( devName ), dev ); - checkCudaErrors( rlt ); +// rlt = cuDeviceGetName( devName, sizeof( devName ), dev ); +// checkCudaErrors( rlt ); - printf( "Using GPU Device %d: %s has SM %d.%d compute capability\n", - dev, devName, major, minor ); +// printf( "Using GPU Device %d: %s has SM %d.%d compute capability\n", +// dev, devName, major, minor ); - rlt = cuDeviceTotalMem( &memSize, dev ); - checkCudaErrors( rlt ); +// rlt = cuDeviceTotalMem( &memSize, dev ); +// checkCudaErrors( rlt ); - printf( "Total amount of global memory: %4.4f MB\n", - (float)memSize / ( 1024 * 1024 ) ); +// printf( "Total amount of global memory: %4.4f MB\n", +// (float)memSize / ( 1024 * 1024 ) ); - return 0; -} +// return 0; +// } /** * 注意: gpuFrame 在解码器设置的显卡上,后续操作要十分注意这一点,尤其是多线程情况 @@ -378,7 +377,7 @@ int main(int argc, char* argv[]){ // av_log_set_callback(&logFF); - CheckCUDAProperty(atoi(gpuid)); + // CheckCUDAProperty(atoi(gpuid)); pthread_t m_decode_thread; pthread_create(&m_decode_thread,0, diff --git a/src/gb28181/FFGB28181Decoder.cpp b/src/gb28181/FFGB28181Decoder.cpp index a4ea5da..3ff9edd 100644 --- a/src/gb28181/FFGB28181Decoder.cpp +++ b/src/gb28181/FFGB28181Decoder.cpp @@ -2,7 +2,7 @@ #include <iostream> #include "FFGB28181Decoder.h" -#include "../nvdecoder/FFCuContextManager.h" + extern "C" { #include "libavutil/avstring.h" @@ -17,8 +17,9 @@ extern "C" { #include "common_header.h" -#include "../nvdecoder/GpuRgbMemory.hpp" -#include "../nvdecoder/cuda_kernels.h" +#include "../nvdec/FFCuContextManager.h" +#include "../nvdec/GpuRgbMemory.hpp" +#include "../nvdec/cuda_kernels.h" #define ECLOSED 0 #define ECLOSING 1 diff --git a/src/gb28181/Makefile b/src/gb28181/Makefile index d15ead3..d154115 100644 --- a/src/gb28181/Makefile +++ 
b/src/gb28181/Makefile @@ -50,7 +50,8 @@ all:$(TARGET) # 生成最终目标 $(TARGET):$(OBJS) | $(LIB_DIR) @echo -e "\e[32m""Linking static library $(TARGET)""\e[0m" -# @ar -rc $@ $^ + @echo -e "ar -rc $@ $^" + @ar -rc $@ $^ # 若没有lib目录则自动生成 $(LIB_DIR): @@ -59,6 +60,7 @@ $(LIB_DIR): # 生成中间目标文件 $(OBJ_DIR)/%.o:$(SRC_DIR)/%.cpp $(DEP_DIR)/%.d | $(OBJ_DIR) $(DEP_DIR) @echo -e "\e[33m""Building object $@""\e[0m" + @echo -e "$(CXX) -c $(DEPFLAGS) $(CXXFLAGS) $(INCS) $(LIBSPATH) $(MACROS) -o $@ $<" @$(CXX) -c $(DEPFLAGS) $(CXXFLAGS) $(INCS) $(LIBSPATH) $(MACROS) -o $@ $< # 若没有obj目录则自动生成 diff --git a/src/interface/FFNvDecoderManager.cpp b/src/interface/FFNvDecoderManager.cpp index b274b12..bd05873 100644 --- a/src/interface/FFNvDecoderManager.cpp +++ b/src/interface/FFNvDecoderManager.cpp @@ -1,7 +1,7 @@ #include "FFNvDecoderManager.h" #ifdef USE_NVDEC -#include "../nvdecoder/FFNvDecoder.h" +#include "../nvdec/FFNvDecoder.h" #include "../gb28181/FFGB28181Decoder.h" #endif diff --git a/src/interface/Makefile b/src/interface/Makefile index 2b9ab32..5b38ce4 100644 --- a/src/interface/Makefile +++ b/src/interface/Makefile @@ -30,7 +30,7 @@ INCLUDE= -I $(DEPEND_DIR)/include \ -I $(JRTP_ROOT)/jrtplib/include/jrtplib3 \ -I $(JRTP_ROOT)/jthread/include/jthread \ -I $(TOP_DIR)/src/gb28181 \ - -I $(TOP_DIR)/src/nvdecoder \ + -I $(TOP_DIR)/src/nvdec \ LIBSPATH= -L $(DEPEND_DIR)/lib -lavformat -lavcodec -lswscale -lavutil -lavfilter -lswresample -lavdevice \ -L $(CUDA_ROOT)/lib64 -lcuda -lcudart -lnvcuvid -lcurand -lcublas -lnvjpeg \ @@ -45,7 +45,7 @@ CXXFLAGS= -g -O0 -fPIC $(INCLUDE) $(DEFS) -lpthread -lrt -lz -fexceptions -std=c # 最终目标文件 TARGET:=$(LIB_DIR)/$(MODULE).a -# MODULE_LIBS:=$(BUILD_DIR)/nvdecoder/lib/nvdecoder.a \ +# MODULE_LIBS:=$(BUILD_DIR)/nvdec/lib/nvdec.a \ # $(BUILD_DIR)/gb28181/lib/gb28181.a\ # 默认最终目标 @@ -54,8 +54,9 @@ all:$(TARGET) # 生成最终目标 $(TARGET):$(OBJS) | $(LIB_DIR) - # @echo -e "\e[32m""Linking static library $(TARGET)""\e[0m" -# @ar -rc $@ $^ + @echo -e "\e[32m""Linking 
static library $(TARGET)""\e[0m" + @echo -e "ar -rc $@ $^" + @ar -rc $@ $^ # 若没有lib目录则自动生成 $(LIB_DIR): @@ -64,6 +65,7 @@ $(LIB_DIR): # 生成中间目标文件 $(OBJ_DIR)/%.o:$(SRC_DIR)/%.cpp $(DEP_DIR)/%.d | $(OBJ_DIR) $(DEP_DIR) @echo -e "\e[33m""Building object $@""\e[0m" + @echo -e "$(CXX) -c $(DEPFLAGS) $(CXXFLAGS) $(INCS) $(LIBSPATH) $(MACROS) -o $@ $<" # @$(CXX) -c $(DEPFLAGS) $(CXXFLAGS) $(INCS) $(LIBSPATH) $(MACROS) -o $@ $(MODULE_LIBS) $< @$(CXX) -c $(DEPFLAGS) $(CXXFLAGS) $(INCS) $(LIBSPATH) $(MACROS) -o $@ $<
diff --git a/src/nvdec/DrawImageOnGPU.cu b/src/nvdec/DrawImageOnGPU.cu
new file mode 100644
index 0000000..1fa99dc
--- /dev/null
+++ b/src/nvdec/DrawImageOnGPU.cu
@@ -0,0 +1,126 @@
+#include "cuda_kernels.h"
+
+#include "../interface/logger.hpp"
+
+typedef unsigned char uchar;
+typedef unsigned int uint32;
+typedef int int32;
+
+namespace cuda_common
+{
+    // Draws the one-pixel-wide outline of the rectangle [left, right] x [top, bottom]
+    // into a planar three-plane float image (plane stride = src_width * src_height):
+    // planes 0 and 2 are set to 0 and plane 1 to 255 on the outline pixels.
+    // One thread handles one pixel; DrawImage launches a 2D grid covering the image.
+    __global__ void kernel_drawPixel(float* d_srcRGB, int src_width, int src_height,
+        int left, int top, int right, int bottom)
+    {
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        // The grid is rounded up to whole blocks and the rectangle may reach past the
+        // image, so threads outside the image must not write (they would wrap into the
+        // next row or run off the end of the buffer).
+        if (x >= src_width || y >= src_height)
+        {
+            return;
+        }
+
+        if (((x == left || x == right) && y >= top && y <= bottom) || ((y == top || y == bottom) && x >= left && x <= right))
+        {
+            d_srcRGB[(y*src_width) + x] = 0;
+            d_srcRGB[(src_width*src_height) + (y*src_width) + x] = 255;
+            d_srcRGB[(2 * src_width*src_height) + (y*src_width) + x] = 0;
+        }
+    }
+
+    // Host wrapper: launches kernel_drawPixel over the whole image with 32x16 blocks,
+    // then synchronizes the device and returns the first CUDA error seen.
+    cudaError_t DrawImage(float* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom)
+    {
+        dim3 block(32, 16, 1);
+        dim3 grid((src_width + (block.x - 1)) / block.x, (src_height + (block.y - 1)) / block.y, 1);
+
+        kernel_drawPixel << < grid, block >> >(d_srcRGB, src_width, src_height, left, top, right, bottom);
+
+        cudaError_t cudaStatus = cudaGetLastError();
+        if (cudaStatus != cudaSuccess) {
+            LOG_ERROR("kernel_drawPixel(float) launch failed: {}",cudaGetErrorString(cudaStatus));
+            return cudaStatus;
+        }
+
+        cudaStatus = cudaDeviceSynchronize();
+        if 
(cudaStatus != cudaSuccess) {
+            LOG_ERROR("cudaDeviceSynchronize returned error code {} after launching kernel_drawPixel!", cudaStatus);
+            return cudaStatus;
+        }
+
+        return cudaStatus;
+    }
+
+    // 8-bit overload: draws the one-pixel-wide outline of the rectangle
+    // [left, right] x [top, bottom] into a planar three-plane image
+    // (plane stride = src_width * src_height); planes 0 and 2 get 0, plane 1 gets 255.
+    // One thread handles one pixel.
+    __global__ void kernel_drawPixel(unsigned char* d_srcRGB, int src_width, int src_height,
+        int left, int top, int right, int bottom)
+    {
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        // Threads of the rounded-up grid that fall outside the image must not write,
+        // even when the requested rectangle extends past the image.
+        if (x >= src_width || y >= src_height)
+        {
+            return;
+        }
+
+        if (((x == left || x == right) && y >= top && y <= bottom) || ((y == top || y == bottom) && x >= left && x <= right))
+        {
+            d_srcRGB[(y*src_width) + x] = 0;
+            d_srcRGB[(src_width*src_height) + (y*src_width) + x] = 255;
+            d_srcRGB[(2 * src_width*src_height) + (y*src_width) + x] = 0;
+        }
+    }
+
+    // Host wrapper for the 8-bit kernel: 32x16 blocks covering the image, followed by a
+    // device synchronize. Returns the launch error or the synchronize error, if any.
+    cudaError_t DrawImage(unsigned char* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom)
+    {
+        dim3 block(32, 16, 1);
+        dim3 grid((src_width + (block.x - 1)) / block.x, (src_height + (block.y - 1)) / block.y, 1);
+
+        kernel_drawPixel << < grid, block >> >(d_srcRGB, src_width, src_height, left, top, right, bottom);
+
+        cudaError_t cudaStatus = cudaGetLastError();
+        if (cudaStatus != cudaSuccess) {
+            LOG_ERROR("kernel_drawPixel(uchar) launch failed: {}",cudaGetErrorString(cudaStatus));
+            return cudaStatus;
+        }
+
+        cudaStatus = cudaDeviceSynchronize();
+        if (cudaStatus != cudaSuccess) {
+            LOG_ERROR("cudaDeviceSynchronize returned error code {} after launching kernel_drawPixel!", cudaStatus);
+            return cudaStatus;
+        }
+
+        return cudaStatus;
+    }
+
+    // Marks every pixel that lies exactly on the segment (begin_x, begin_y)-(end_x, end_y)
+    // (integer collinearity test inside the segment's bounding box) in a planar
+    // three-plane float image.
+    __global__ void kernel_drawLine(float* d_srcRGB, int src_width, int src_height,
+        int begin_x, int begin_y, int end_x, int end_y)
+    {
+        int min_x = end_x < begin_x ? end_x : begin_x;
+        int max_x = end_x < begin_x ? begin_x : end_x;
+
+        int min_y = end_y < begin_y ? end_y : begin_y;
+        int max_y = end_y < begin_y ? 
begin_y : end_y;
+
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        // The grid is rounded up to whole blocks and the end points may lie outside the
+        // image; never write for a pixel coordinate that is not inside the image.
+        if (x >= src_width || y >= src_height)
+        {
+            return;
+        }
+
+        // Cross-product test: (x, y) is collinear with the two end points and lies
+        // inside their bounding box.
+        if ((x - begin_x) * (end_y - begin_y) == (end_x - begin_x) * (y - begin_y)
+            && min_x <= x && x <= max_x
+            && min_y <= y && y <= max_y)
+        {
+            d_srcRGB[(y*src_width) + x] = 0;
+            d_srcRGB[(src_width*src_height) + (y*src_width) + x] = 255;
+            d_srcRGB[(2 * src_width*src_height) + (y*src_width) + x] = 0;
+        }
+    }
+
+    // Host wrapper: launches kernel_drawLine over the whole image with 32x16 blocks,
+    // then synchronizes the device and returns the first CUDA error seen.
+    cudaError_t DrawLine(float* d_srcRGB, int src_width, int src_height, int begin_x, int begin_y, int end_x, int end_y)
+    {
+        dim3 block(32, 16, 1);
+        dim3 grid((src_width + (block.x - 1)) / block.x, (src_height + (block.y - 1)) / block.y, 1);
+
+        kernel_drawLine << < grid, block >> >(d_srcRGB, src_width, src_height, begin_x, begin_y, end_x, end_y);
+
+        cudaError_t cudaStatus = cudaGetLastError();
+        if (cudaStatus != cudaSuccess) {
+            LOG_ERROR("kernel_drawLine launch failed: {}",cudaGetErrorString(cudaStatus));
+            return cudaStatus;
+        }
+
+        cudaStatus = cudaDeviceSynchronize();
+        if (cudaStatus != cudaSuccess) {
+            LOG_ERROR("cudaDeviceSynchronize returned error code {} after launching kernel_drawLine!", cudaStatus);
+            return cudaStatus;
+        }
+
+        return cudaStatus;
+    }
+}
\ No newline at end of file
diff --git a/src/nvdec/FFCuContextManager.cpp b/src/nvdec/FFCuContextManager.cpp
new file mode 100644
index 0000000..382c4d8
--- /dev/null
+++ b/src/nvdec/FFCuContextManager.cpp
@@ -0,0 +1,41 @@
+#include "FFCuContextManager.h"
+
+#include "common_header.h"
+
+using namespace std;
+
+extern "C"
+{
+    #include <libavcodec/avcodec.h>
+    #include <libavdevice/avdevice.h>
+    #include <libavformat/avformat.h>
+    #include <libavfilter/avfilter.h>
+    #include <libavutil/avutil.h>
+    #include <libavutil/pixdesc.h>
+    #include <libswscale/swscale.h>
+    #include <libavutil/imgutils.h>
+}
+
+FFCuContextManager::~FFCuContextManager()
+{
+    for(auto iter = ctxMap.begin(); iter != ctxMap.end(); iter++){
+        av_buffer_unref(&iter->second);
+    }
+    
ctxMap.clear(); +} + +AVBufferRef *FFCuContextManager::getCuCtx(string gpuid) +{ + AVBufferRef *hw_device_ctx = ctxMap[gpuid]; + if (nullptr == hw_device_ctx) + { + // 初始化硬件解码器 + if (av_hwdevice_ctx_create(&hw_device_ctx, AV_HWDEVICE_TYPE_CUDA, gpuid.c_str(), nullptr, 0) < 0) + { + LOG_ERROR("Failed to create specified HW device."); + return nullptr; + } + ctxMap[gpuid] = hw_device_ctx; + } + return hw_device_ctx; +} \ No newline at end of file diff --git a/src/nvdec/FFCuContextManager.h b/src/nvdec/FFCuContextManager.h new file mode 100644 index 0000000..758167c --- /dev/null +++ b/src/nvdec/FFCuContextManager.h @@ -0,0 +1,28 @@ + +#include<map> +#include<string> + +using namespace std; + +struct AVBufferRef; + +class FFCuContextManager{ +public: + static FFCuContextManager* getInstance(){ + static FFCuContextManager* singleton = nullptr; + if (singleton == nullptr){ + singleton = new FFCuContextManager(); + } + return singleton; + } + + AVBufferRef *getCuCtx(string gpuid); + +private: + FFCuContextManager(){} + ~FFCuContextManager(); + +private: + map<string,AVBufferRef *> ctxMap; + +}; \ No newline at end of file diff --git a/src/nvdec/FFNvDecoder.cpp b/src/nvdec/FFNvDecoder.cpp new file mode 100644 index 0000000..e64e2a5 --- /dev/null +++ b/src/nvdec/FFNvDecoder.cpp @@ -0,0 +1,513 @@ +#include "FFNvDecoder.h" + +#include <chrono> +#include <thread> +#include <fstream> + +#include <chrono> + +#include "FFCuContextManager.h" + +#include "common_header.h" + +#include "GpuRgbMemory.hpp" +#include "cuda_kernels.h" + +using namespace std; + +// 参考博客: https://blog.csdn.net/qq_40116098/article/details/120704340 + +static AVPixelFormat get_hw_format(AVCodecContext *avctx, const AVPixelFormat *pix_fmts) +{ + FFNvDecoder* _this = (FFNvDecoder*)avctx->opaque; + + const AVPixelFormat *p; + + for (p = pix_fmts; *p != -1; p++) { + if (*p == _this->getHwPixFmt()) + return *p; + } + + LOG_ERROR("Failed to get HW surface format"); + return AV_PIX_FMT_NONE; +} + 
+FFNvDecoder::FFNvDecoder()
+{
+    // Initialise the decoder state
+    fmt_ctx = nullptr;
+    avctx = nullptr;
+    m_bRunning = false;
+
+    stream = nullptr;
+    stream_index = -1;
+    hw_pix_fmt = AV_PIX_FMT_NONE;
+    m_dec_name = "";
+
+    m_bPause = false;
+    m_bReal = true;
+
+    m_decode_thread = 0;
+    m_post_decode_thread = 0;
+
+    m_bFinished = false;
+    m_dec_keyframe = false;
+    m_fps = 0.0;
+
+    // These were previously left indeterminate until init(cfg) / the setters ran.
+    m_postDecArg = nullptr;
+    m_finishedDecArg = nullptr;
+    post_decoded_cbk = nullptr;
+    decode_finished_cbk = nullptr;
+}
+
+FFNvDecoder::~FFNvDecoder()
+{
+    m_dec_keyframe = false;
+}
+
+// Stores the configuration, decides whether the source is a live stream, and opens the
+// input and the hardware decoder. Returns false if any step fails.
+bool FFNvDecoder::init(FFDecConfig& cfg)
+{
+    m_cfg = cfg;
+    m_dec_name = cfg.dec_name;
+
+    // A URI that can be opened as a local file is treated as a non-realtime source.
+    // Open it read-only: std::fstream defaults to in|out and therefore fails on
+    // read-only files, which were then misclassified as realtime streams.
+    ifstream infile(cfg.uri);
+    if (infile.is_open()){
+        m_bReal = false;
+        infile.close();
+    }else {
+        m_bReal = true;
+    }
+
+    post_decoded_cbk = cfg.post_decoded_cbk;
+    decode_finished_cbk = cfg.decode_finished_cbk;
+
+    return init(cfg.uri.c_str(), cfg.gpuid.c_str(),cfg.force_tcp);
+}
+
+// Opens `uri` with FFmpeg, selects the best video stream, creates the matching
+// "<codec>_cuvid" decoder context bound to the shared CUDA device context for `gpuid`,
+// and opens the decoder. Returns true on success, false on any failure.
+bool FFNvDecoder::init(const char* uri, const char* gpuid, bool force_tcp)
+{
+    // av_log_set_level(AV_LOG_DEBUG);
+
+    avformat_network_init();
+
+    // Open the input
+    AVDictionary *options = nullptr;
+    av_dict_set( &options, "bufsize", "655360", 0 );
+    av_dict_set( &options, "rtsp_transport", force_tcp ? "tcp" : "udp", 0 );
+    // av_dict_set( &options, "listen_timeout", "30", 0 ); // unit: seconds
+    av_dict_set( &options, "stimeout", "30000000", 0 ); // unit: microseconds
+
+    fmt_ctx = avformat_alloc_context();
+    const char* input_file = uri;
+    const int open_ret = avformat_open_input(&fmt_ctx, input_file, nullptr, &options);
+    // After the call `options` holds the entries that were not consumed and is still
+    // owned by the caller, so it has to be released on both paths.
+    av_dict_free(&options);
+    if (open_ret != 0) {
+        LOG_ERROR("Cannot open input file:{}",input_file);
+        return false;
+    }
+
+    // Probe the stream information
+    if (avformat_find_stream_info(fmt_ctx, nullptr) < 0) {
+        LOG_ERROR("Cannot find input stream information");
+        return false;
+    }
+
+    // Locate the video stream
+    AVCodec *decoder = nullptr;
+    stream_index = av_find_best_stream(fmt_ctx, AVMEDIA_TYPE_VIDEO, -1, -1, &decoder, 0);
+    if (stream_index < 0) {
+        LOG_ERROR("Cannot find a video stream in the input file");
+        return false;
+    }
+
+    string cuvid_dec_name = string(decoder->name) + "_cuvid";
+    AVCodec *vcodec = avcodec_find_decoder_by_name(cuvid_dec_name.c_str());
+    if (vcodec == nullptr) {
+        LOG_ERROR("Cannot find decoder: {}", cuvid_dec_name);
+        return false;
+    }
+    // The old `return (bool)AVERROR(ENOMEM);` converted a non-zero error code to
+    // `true`, i.e. reported success on allocation failure.
+    if (!(avctx = avcodec_alloc_context3(vcodec))) {
+        LOG_ERROR("Failed to allocate codec context");
+        return false;
+    }
+
+    // Fetch the video stream object
+    stream = fmt_ctx->streams[stream_index];
+    if (avcodec_parameters_to_context(avctx, stream->codecpar) < 0)
+        return false;
+
+    m_fps = av_q2d(stream ->avg_frame_rate);
+
+    avctx->opaque = this;
+    // Pixel-format negotiation callback; it reads hw_pix_fmt through avctx->opaque
+    avctx->get_format = get_hw_format;
+
+    hw_pix_fmt = AV_PIX_FMT_CUDA;
+
+    FFCuContextManager* pCtxMgr = FFCuContextManager::getInstance();
+
+    AVBufferRef *hw_device_ctx = pCtxMgr->getCuCtx(gpuid);
+    if(nullptr == hw_device_ctx){
+        av_log(nullptr, AV_LOG_ERROR, "create CUDA context failed ! 
\n"); + return false; + } + avctx->hw_device_ctx = av_buffer_ref(hw_device_ctx); + if (nullptr == avctx->hw_device_ctx) + { + return false; + } + + // 打开解码器流 + AVDictionary *op = nullptr; + av_dict_set( &op, "gpu", gpuid, 0 ); + // av_dict_set( &op, "surfaces", "5", 0 ); + if (avcodec_open2(avctx, vcodec, &op) < 0) { + LOG_ERROR("Failed to open codec for stream"); + return false; + } + + return true; +} + +bool FFNvDecoder::isSurport(FFDecConfig& cfg) +{ + bool bRet = init(cfg); + decode_finished(); + return bRet; +} + +bool FFNvDecoder::start(){ + + m_bRunning = true; + + pthread_create(&m_decode_thread,0, + [](void* arg) + { + FFNvDecoder* a=(FFNvDecoder*)arg; + a->decode_thread(); + return (void*)0; + } + ,this); + + return true; +} + +void FFNvDecoder::decode_thread() +{ + AVPacket* pkt ; + pkt = av_packet_alloc(); + av_init_packet( pkt ); + + pthread_create(&m_post_decode_thread,0, + [](void* arg) + { + FFNvDecoder* a=(FFNvDecoder*)arg; + a->post_decode_thread(); + return (void*)0; + } + ,this); + + // long start_time = UtilTools::get_cur_time_ms(); + + while (m_bRunning) + { + if (!m_bReal) + { + if (m_bPause) + { + std::this_thread::sleep_for(std::chrono::milliseconds(3)); + continue; + } + } + + int result = av_read_frame(fmt_ctx, pkt); + if (result == AVERROR_EOF || result < 0) + { + LOG_ERROR("Failed to read frame!"); + break; + } + + if (m_dec_keyframe && !(pkt->flags & AV_PKT_FLAG_KEY)) { + av_packet_unref(pkt); + continue; + } + + if (stream_index == pkt->stream_index){ + result = avcodec_send_packet(avctx, pkt); + if (result < 0){ + av_packet_unref(pkt); + LOG_ERROR("{} - Failed to send pkt: {}", m_dec_name, result); + continue; + } + + AVFrame* gpuFrame = av_frame_alloc(); + result = avcodec_receive_frame(avctx, gpuFrame); + if ((result == AVERROR(EAGAIN) || result == AVERROR_EOF) || result < 0){ + LOG_ERROR("{} - Failed to receive frame: {}", m_dec_name, result); + av_frame_free(&gpuFrame); + av_packet_unref(pkt); + continue; + } + 
av_packet_unref(pkt);
+
+            if (m_bReal){
+                if (m_bPause){
+                    av_frame_free(&gpuFrame);
+                    std::this_thread::sleep_for(std::chrono::milliseconds(3));
+                    continue;
+                }
+            }
+
+            if(gpuFrame != nullptr){
+                m_queue_mutex.lock();
+                // Bounded queue: drop the new frame when the consumer is behind
+                if(mFrameQueue.size() <= 10){
+                    mFrameQueue.push(gpuFrame);
+                }else{
+                    av_frame_free(&gpuFrame);
+                }
+                m_queue_mutex.unlock();
+            }
+        }
+        av_packet_unref(pkt);
+    }
+
+    m_bRunning = false;
+    av_packet_free(&pkt);
+
+    // long end_time = UtilTools::get_cur_time_ms();
+    // cout << "解码用时:" << end_time - start_time << endl;
+
+    if (m_post_decode_thread != 0)
+    {
+        pthread_join(m_post_decode_thread,0);
+    }
+
+    // The callback comes from the configuration and may not have been provided.
+    if (decode_finished_cbk)
+    {
+        decode_finished_cbk(m_finishedDecArg);
+    }
+
+    decode_finished();
+
+    // Drain the queue. snapshot() may still be reading the front frame from another
+    // thread, so take the same locks (snapshot mutex first, then queue mutex) before
+    // freeing anything.
+    {
+        std::lock_guard<std::mutex> snapshotLock(m_snapshot_mutex);
+        std::lock_guard<std::mutex> queueLock(m_queue_mutex);
+        while(mFrameQueue.size() > 0){
+            AVFrame * gpuFrame = mFrameQueue.front();
+            av_frame_free(&gpuFrame);
+            mFrameQueue.pop();
+        }
+    }
+
+    LOG_INFO("{} - decode thread exited.", m_dec_name);
+}
+
+// Releases the codec and format contexts and marks the decoder as finished.
+void FFNvDecoder::decode_finished(){
+    if (avctx)
+    {
+        avcodec_free_context(&avctx);
+    }
+
+    if (fmt_ctx)
+    {
+        avformat_close_input(&fmt_ctx);
+    }
+
+    m_bFinished = true;
+    m_dec_keyframe = false;
+}
+
+// Consumer loop: pops decoded frames, converts every skip_frame-th one to BGR and hands
+// it to post_decoded_cbk. Runs until m_bRunning is cleared.
+void FFNvDecoder::post_decode_thread(){
+    int skip_frame = m_cfg.skip_frame;
+    if (skip_frame <= 0){
+        skip_frame = 1;
+    }
+
+    int index = 0;
+    while (m_bRunning)
+    {
+        bool bHandled = false;
+        {
+            // Held while the frame is consumed so that snapshot() never looks at a
+            // frame that is being popped and freed here.
+            std::lock_guard<std::mutex> l(m_snapshot_mutex);
+
+            // Take the head of the queue; the emptiness test is done under the queue
+            // mutex because the decode thread pushes concurrently.
+            AVFrame * gpuFrame = nullptr;
+            m_queue_mutex.lock();
+            if (!mFrameQueue.empty()){
+                gpuFrame = mFrameQueue.front();
+                mFrameQueue.pop();
+            }
+            m_queue_mutex.unlock();
+
+            if (gpuFrame != nullptr){
+                // Frame skipping
+                if (skip_frame == 1 || index % skip_frame == 0){
+                    if (post_decoded_cbk){
+                        post_decoded_cbk(m_postDecArg, convert2bgr(gpuFrame));
+                    }
+                    index = 0;
+                }
+
+                av_frame_free(&gpuFrame);
+
+                index++;
+                bHandled = true;
+            }
+        }
+
+        if (!bHandled){
+            // Nothing queued: yield instead of spinning at full CPU load.
+            std::this_thread::sleep_for(std::chrono::milliseconds(1));
+        }
+    }
+
+    LOG_INFO("post decode thread exited.");
+}
+
+// Asks the decode thread to stop and waits for it to exit.
+void FFNvDecoder::close(){
+    m_bRunning=false;
+    if(m_decode_thread != 0){
+        pthread_join(m_decode_thread,0);
+    }
+    m_dec_keyframe = false;
+}
+
+AVPixelFormat FFNvDecoder::getHwPixFmt(){
+    return hw_pix_fmt;
+}
+
+bool FFNvDecoder::isRunning(){
+    return m_bRunning;
+}
+
+bool 
FFNvDecoder::isFinished(){
+    return m_bFinished;
+}
+
+bool FFNvDecoder::isPausing(){
+    return m_bPause;
+}
+
+// Reports the coded width/height of the open decoder context.
+// Returns false when no codec context exists.
+bool FFNvDecoder::getResolution( int &width, int &height ){
+    if (avctx != nullptr)
+    {
+        width = avctx->width;
+        height = avctx->height;
+        return true;
+    }
+
+    return false;
+}
+
+void FFNvDecoder::pause(){
+    m_bPause = true;
+}
+
+void FFNvDecoder::resume(){
+    m_bPause = false;
+}
+
+void FFNvDecoder::setDecKeyframe(bool bKeyframe)
+{
+    m_dec_keyframe = bKeyframe;
+}
+
+// Returns the number of decoded frames currently waiting in the queue.
+int FFNvDecoder::getCachedQueueLength(){
+    // The previous code called m_queue_mutex.lock() twice and never unlocked, which
+    // deadlocks (or is undefined) on a non-recursive std::mutex. A scoped guard locks
+    // once and always releases.
+    std::lock_guard<std::mutex> l(m_queue_mutex);
+    int queue_size = mFrameQueue.size();
+    return queue_size;
+}
+
+float FFNvDecoder::fps(){
+    return m_fps;
+}
+
+void FFNvDecoder::setPostDecArg(const void* postDecArg){
+    m_postDecArg = postDecArg;
+}
+
+void FFNvDecoder::setFinishedDecArg(const void* finishedDecArg){
+    m_finishedDecArg = finishedDecArg;
+}
+
+// Converts a decoded CUDA frame to a newly allocated 3-channel device buffer via
+// cuda_common::CUDAToBGR. Returns nullptr if the frame is null, is not
+// AV_PIX_FMT_CUDA, or the allocation/conversion fails; on success the returned
+// object is handed to the caller.
+DeviceRgbMemory* FFNvDecoder::convert2bgr(AVFrame * gpuFrame){
+    if (gpuFrame != nullptr && gpuFrame->format == AV_PIX_FMT_CUDA ){
+        LOG_DEBUG("decode task: gpuid: {} width: {} height: {}", m_cfg.gpuid, gpuFrame->width, gpuFrame->height);
+        GpuRgbMemory* gpuMem = new GpuRgbMemory(3, gpuFrame->width, gpuFrame->height, getName(), m_cfg.gpuid, false, true);
+
+        // do { ... } while(0) so that every failure path falls through to the cleanup
+        do{
+            if (gpuMem->getMem() == nullptr){
+                LOG_ERROR("new GpuRgbMemory failed !!!");
+                break;
+            }
+
+            cudaSetDevice(atoi(m_cfg.gpuid.c_str()));
+            cuda_common::setColorSpace( ITU_709, 0 );
+            cudaError_t cudaStatus = cuda_common::CUDAToBGR((CUdeviceptr)gpuFrame->data[0],(CUdeviceptr)gpuFrame->data[1], gpuFrame->linesize[0], gpuFrame->linesize[1], gpuMem->getMem(), gpuFrame->width, gpuFrame->height);
+            cudaDeviceSynchronize();
+            if (cudaStatus != cudaSuccess) {
+                LOG_ERROR("CUDAToBGR failed failed !!!");
+                break;
+            }
+
+            return gpuMem;
+        }while(0);
+
+        delete gpuMem;
+        gpuMem = nullptr;
+    }
+
+    return nullptr;
+}
+
+FFImgInfo* FFNvDecoder::snapshot(){
+
+    // Hold the snapshot mutex so the consumer thread stops popping frames
+    std::lock_guard<std::mutex> l(m_snapshot_mutex);
+
+    AVFrame * gpuFrame = 
nullptr;
+
+    bool bFirst = true;
+    while(true){
+        m_queue_mutex.lock();
+        if(mFrameQueue.size() <= 0){
+            m_queue_mutex.unlock();
+            if(bFirst){
+                std::this_thread::sleep_for(std::chrono::milliseconds(100));
+                bFirst = false;
+                continue;
+            }else{
+                // Second pass: we already waited 100 ms and still have no decoded
+                // frame, so give up.
+                return nullptr;
+            }
+        }
+
+        // The queue holds at least one frame; peek at the head without popping it
+        gpuFrame = mFrameQueue.front();
+        m_queue_mutex.unlock();
+        break;
+    }
+
+    if (gpuFrame != nullptr && gpuFrame->format == AV_PIX_FMT_CUDA ){
+        LOG_DEBUG("decode task: gpuid: {} width: {} height: {}", m_cfg.gpuid, gpuFrame->width, gpuFrame->height);
+        GpuRgbMemory* gpuMem = new GpuRgbMemory(3, gpuFrame->width, gpuFrame->height, getName(), m_cfg.gpuid, false, true);
+
+        if (gpuMem->getMem() == nullptr){
+            LOG_ERROR("new GpuRgbMemory failed !!!");
+            delete gpuMem;  // was leaked on this path
+            return nullptr;
+        }
+
+        cudaSetDevice(atoi(m_cfg.gpuid.c_str()));
+        cuda_common::setColorSpace( ITU_709, 0 );
+        cudaError_t cudaStatus = cuda_common::CUDAToBGR((CUdeviceptr)gpuFrame->data[0],(CUdeviceptr)gpuFrame->data[1], gpuFrame->linesize[0], gpuFrame->linesize[1], gpuMem->getMem(), gpuFrame->width, gpuFrame->height);
+        cudaDeviceSynchronize();
+        if (cudaStatus != cudaSuccess) {
+            LOG_ERROR("CUDAToBGR failed failed !!!");
+            delete gpuMem;  // was leaked on this path
+            return nullptr;
+        }
+
+        unsigned char * pHwRgb = gpuMem->getMem();
+        int channel = gpuMem->getChannel();
+        int width = gpuMem->getWidth();
+        int height = gpuMem->getHeight();
+
+        if (pHwRgb != nullptr && channel > 0 && width > 0 && height > 0){
+            int nSize = channel * height * width;
+
+            LOG_INFO("channel:{} height:{} width:{}", channel, height, width);
+            // unsigned char* cpu_data = new unsigned char[nSize];
+
+            unsigned char* cpu_data = (unsigned char *)av_malloc(nSize * sizeof(unsigned char));
+            if (cpu_data == nullptr){
+                LOG_ERROR("av_malloc failed for snapshot buffer, size: {}", nSize);
+                delete gpuMem;
+                return nullptr;
+            }
+
+            // Copy the converted image to host memory and verify the copy succeeded
+            cudaStatus = cudaMemcpy(cpu_data, pHwRgb, nSize * sizeof(unsigned char), cudaMemcpyDeviceToHost);
+            cudaDeviceSynchronize();
+
+            delete gpuMem;
+            gpuMem = nullptr;
+
+            if (cudaStatus != cudaSuccess){
+                LOG_ERROR("snapshot cudaMemcpy failed: {}", cudaGetErrorString(cudaStatus));
+                av_free(cpu_data);
+                return nullptr;
+            }
+
+            FFImgInfo* imgInfo = new FFImgInfo();
+            imgInfo->dec_name = m_dec_name;
+            imgInfo->pData = cpu_data;
+            imgInfo->height 
= height; + imgInfo->width = width; + imgInfo->timestamp = UtilTools::get_cur_time_ms(); + imgInfo->index = m_index; + + m_index++; + + return imgInfo; + } + + delete gpuMem; + gpuMem = nullptr; + } + + return nullptr; +} \ No newline at end of file diff --git a/src/nvdec/FFNvDecoder.h b/src/nvdec/FFNvDecoder.h new file mode 100644 index 0000000..4784ab6 --- /dev/null +++ b/src/nvdec/FFNvDecoder.h @@ -0,0 +1,107 @@ +#include<string> +#include <pthread.h> + +#include <mutex> + +extern "C" +{ + #include <libavcodec/avcodec.h> + #include <libavdevice/avdevice.h> + #include <libavformat/avformat.h> + #include <libavfilter/avfilter.h> + #include <libavutil/avutil.h> + #include <libavutil/pixdesc.h> + #include <libswscale/swscale.h> + #include <libavutil/imgutils.h> +} + +#include "common_header.h" + +#include "../interface/AbstractDecoder.h" + +using namespace std; + +class FFNvDecoder : public AbstractDecoder { +public: + FFNvDecoder(); + ~FFNvDecoder(); + bool init(FFDecConfig& cfg); + void close(); + bool start(); + void pause(); + void resume(); + + void setDecKeyframe(bool bKeyframe); + + bool isRunning(); + bool isFinished(); + bool isPausing(); + bool getResolution( int &width, int &height ); + + bool isSurport(FFDecConfig& cfg); + + int getCachedQueueLength(); + + float fps(); + + DECODER_TYPE getDecoderType(){ return DECODER_TYPE_FFMPEG; } + + FFImgInfo* snapshot(); + + void setName(string nm){ + m_dec_name = nm; + } + + string getName(){ + return m_dec_name; + } + + void setPostDecArg(const void* postDecArg); + void setFinishedDecArg(const void* finishedDecArg); + +public: + AVPixelFormat getHwPixFmt(); + +private: + void decode_thread(); + void post_decode_thread(); + bool init(const char* uri, const char* gpuid, bool force_tcp); + void decode_finished(); + + DeviceRgbMemory* convert2bgr(AVFrame * gpuFrame); + +private: + string m_dec_name; + FFDecConfig m_cfg; + + AVStream* stream; + AVCodecContext *avctx; + int stream_index; + AVFormatContext *fmt_ctx; + 
AVPixelFormat hw_pix_fmt; + + pthread_t m_decode_thread; + pthread_t m_post_decode_thread; + + bool m_bRunning; + bool m_bFinished; + + bool m_bPause; + + bool m_bReal; // 是否实时流 + + float m_fps; + + queue<AVFrame*> mFrameQueue; + mutex m_queue_mutex; + mutex m_snapshot_mutex; + long m_index{0}; + + bool m_dec_keyframe; + + const void * m_postDecArg; + POST_DECODE_CALLBACK post_decoded_cbk; // 解码数据回调接口 + + const void * m_finishedDecArg; + DECODE_FINISHED_CALLBACK decode_finished_cbk; +}; \ No newline at end of file diff --git a/src/nvdec/GpuRgbMemory.hpp b/src/nvdec/GpuRgbMemory.hpp new file mode 100644 index 0000000..35eac65 --- /dev/null +++ b/src/nvdec/GpuRgbMemory.hpp @@ -0,0 +1,34 @@ +#include<string> + +#include "../interface/DeviceRgbMemory.hpp" +#include "cuda_kernels.h" +#include "define.hpp" +#include "common_header.h" + +using namespace std; + +class GpuRgbMemory : public DeviceRgbMemory{ + +public: + GpuRgbMemory(int _channel, int _width, int _height, string _id, string _gpuid, bool _key_frame, bool _isused) + :DeviceRgbMemory(_channel, _width, _height, _id, _gpuid, _key_frame, _isused){ + gpuid = _gpuid; + cudaSetDevice(atoi(gpuid.c_str())); + CHECK_CUDA(cudaMalloc((void **)&pHwRgb, data_size * sizeof(unsigned char))); + } + + ~GpuRgbMemory(){ + if (pHwRgb) { + cudaSetDevice(atoi(gpuid.c_str())); + CHECK_CUDA(cudaFree(pHwRgb)); + pHwRgb = nullptr; + } + } + + string getGpuId() { + return gpuid; + } + +private: + string gpuid; +}; \ No newline at end of file diff --git a/src/nvdec/ImageSaveGPU.cpp b/src/nvdec/ImageSaveGPU.cpp new file mode 100644 index 0000000..dde9b64 --- /dev/null +++ b/src/nvdec/ImageSaveGPU.cpp @@ -0,0 +1,123 @@ +#include "cuda_kernels.h" + +#include "common_header.h" + + +//int saveJPEG(const char *szOutputFile, float* d_srcRGB, int img_width, int img_height) +//{ +// return jpegNPP(szOutputFile, d_srcRGB, img_width, img_height); +// //return 0; +//} +// +//int saveJPEG(const char *szOutputFile, unsigned char* d_srcRGB, int 
img_width, int img_height)
+//{
+//    return jpegNPP(szOutputFile, d_srcRGB, img_width, img_height);
+//    //return 0;
+//}
+//
+//int saveJPEG(const char *szOutputFile, unsigned char* d_srcRGB)
+//{
+//    return jpegNPP(szOutputFile, d_srcRGB);
+//}
+//
+//int saveJPEG(const char *szOutputFile, float* d_srcRGB)
+//{
+//    return jpegNPP(szOutputFile, d_srcRGB);
+//}
+
+// Resizes a float image on the GPU via cuda_common::ResizeImage.
+// Returns 0 on success, -1 if the CUDA call reports an error.
+int resizeFrame(float* d_srcRGB, int src_width, int src_height, float* d_dstRGB, int dst_width, int dst_height)
+{
+    cudaError_t cudaStatus = cuda_common::ResizeImage(d_srcRGB, src_width, src_height, d_dstRGB, dst_width, dst_height);
+    if (cudaStatus != cudaSuccess) {
+        LOG_ERROR("cuda_common::ResizeImage failed: {}",cudaGetErrorString(cudaStatus));
+        return -1;
+    }
+
+    return 0;
+}
+
+//int initTables()
+//{
+//    initTable();
+//    return 0;
+//}
+//
+//int initTables(int flag, int width, int height)
+//{
+//    initTable(0, width, height);
+//    return 0;
+//}
+
+// Draws a rectangle outline on a float image held in device memory.
+// Returns 0 on success, -1 if cuda_common::DrawImage reports an error (it logs the
+// details itself); the status used to be discarded and 0 returned unconditionally.
+int drawImageOnGPU(float* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom)
+{
+    cudaError_t cudaStatus = cuda_common::DrawImage(d_srcRGB, src_width, src_height, left, top, right, bottom);
+    return cudaStatus == cudaSuccess ? 0 : -1;
+}
+
+// 8-bit overload of drawImageOnGPU; same return convention.
+int drawImageOnGPU(unsigned char* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom)
+{
+    cudaError_t cudaStatus = cuda_common::DrawImage(d_srcRGB, src_width, src_height, left, top, right, bottom);
+    return cudaStatus == cudaSuccess ? 0 : -1;
+}
+
+// Draws a line segment on a float image held in device memory.
+// Returns 0 on success, -1 if cuda_common::DrawLine reports an error.
+int drawLineOnGPU(float* d_srcRGB, int src_width, int src_height, int begin_x, int begin_y, int end_x, int end_y)
+{
+    cudaError_t cudaStatus = cuda_common::DrawLine(d_srcRGB, src_width, src_height, begin_x, begin_y, end_x, end_y);
+    return cudaStatus == cudaSuccess ? 0 : -1;
+}
+
+//int releaseJpegSaver()
+//{
+//    releaseJpegNPP();
+//    return 0;
+//}
+
+int partMemCopy(unsigned char* d_srcRGB, int src_width, int src_height, unsigned char* d_dstRGB, int left, int top, int right, int bottom)
+{
+    cudaError_t cudaStatus = cuda_common::PartMemCopy(d_srcRGB, src_width, src_height, d_dstRGB, left, top, right, bottom);
+    if (cudaStatus != cudaSuccess) {
+        LOG_ERROR("cuda_common::77 
PartMemCopy failed: {} {} {} {} {} {} {}",cudaGetErrorString(cudaStatus), left, top, right, bottom, src_height, static_cast<void*>(d_dstRGB)); // log the address only: an unsigned char* may be formatted as a C string by the logger, which would dereference device memory on the host (NOTE(review): depends on the bundled fmt version - confirm)
+        return -1;
+    }
+
+    return 0;
+}
+//#include <fstream>
+//extern std::ofstream g_os;
+// Forwards to cuda_common::PartMemResizeBatch with the same arguments (`count` regions
+// described by the vleft/vtop/vright/vbottom arrays, per-region destination sizes in
+// dst_w/dst_h, and per-channel submean/variance values).
+// Returns 0 on success, -1 if the CUDA call reports an error.
+int PartMemResizeBatch(unsigned char * d_srcRGB, int src_width, int src_height, unsigned char** d_dstRGB,
+    int count, int* vleft, int * vtop, int* vright, int* vbottom, int *dst_w, int *dst_h,
+    float submeanb, float submeang, float submeanr,
+    float varianceb, float varianceg, float variancer)
+{
+    //g_os << "cudaMemcpyHostToDevice begin 9" << std::endl;
+    cudaError_t cudaStatus = cuda_common::PartMemResizeBatch(
+        d_srcRGB, src_width, src_height, d_dstRGB, count, vleft, vtop, vright, vbottom, dst_w, dst_h,
+        submeanb, submeang, submeanr,
+        varianceb, varianceg, variancer);
+    //g_os << "cudaMemcpyHostToDevice end 9" << std::endl;
+    if (cudaStatus != cudaSuccess) {
+        LOG_ERROR("cuda_common::PartMemResizeBatch failed: {}",cudaGetErrorString(cudaStatus));
+        return -1;
+    }
+
+    return 0;
+}
+
+
+//int PartMemResizeBatch(float * d_srcRGB, int src_width, int src_height, unsigned char* d_dstRGB,
+//    int count, int* vleft, int * vtop, int* vright, int* vbottom, int dst_w, int dst_h,
+//    float submeanb, float submeang, float submeanr,
+//    float varianceb, float varianceg, float variancer)
+//
+//{
+//    cudaError_t cudaStatus = cuda_common::PartMemResizeBatch(
+//        d_srcRGB, src_width, src_height, d_dstRGB, count, vleft, vtop, vright, vbottom, dst_w, dst_h,
+//        submeanb, submeang, submeanr,
+//        varianceb, varianceg, variancer);
+//    if (cudaStatus != cudaSuccess) {
+//        fprintf(stderr, "cuda_common::PartMemCopy failed: %s\n", cudaGetErrorString(cudaStatus));
+//        return -1;
+//    }
+//
+//    return 0;
+//}
\ No newline at end of file
diff --git a/src/nvdec/ImageSaveGPU.h b/src/nvdec/ImageSaveGPU.h
new file mode 100644
index 0000000..272a6d2
--- /dev/null
+++ b/src/nvdec/ImageSaveGPU.h
@@ -0,0 +1,65 @@
+/******************************************************************************************* +* Version: VPT_x64_V2.0.0_20170904 +* CopyRight: 中科院自动化研究所模式识别实验室图像视频组 +* UpdateDate: 20170904 +* Content: 人车物监测跟踪 +********************************************************************************************/ + +#ifndef IMAGESAVEGPU_H_ +#define IMAGESAVEGPU_H_ + +#ifdef _MSC_VER + #ifdef IMAGESAVEGPU_EXPORTS + #define IMAGESAVEGPU_API __declspec(dllexport) + #else + #define IMAGESAVEGPU_API __declspec(dllimport) + #endif +#else +#define IMAGESAVEGPU_API __attribute__((visibility ("default"))) +#endif +// 功能:保存成jpeg文件 +// szOutputFile 输出图片路径,如D:\\out.jpg +// d_srcRGB 输入RGB数据,由cudaMalloc分配的显存空间,数据排列形式为:BBBBBB......GGGGGG......RRRRRRRR...... +// img_width RGB数据图片的宽度 +// img_height RGB数据图片的高度 +// +//IMAGESAVEGPU_API int saveJPEG(const char *szOutputFile, float* d_srcRGB, int img_width, int img_height); +//IMAGESAVEGPU_API int saveJPEG(const char *szOutputFile, float* d_srcRGB); +// +//IMAGESAVEGPU_API int saveJPEG(const char *szOutputFile, unsigned char* d_srcRGB, int img_width, int img_height); +//IMAGESAVEGPU_API int saveJPEG(const char *szOutputFile, unsigned char* d_srcRGB); + +// 功能:防缩图像 +IMAGESAVEGPU_API int resizeFrame(float* d_srcRGB, int src_width, int src_height, float* d_dstRGB, int dst_width, int dst_height); + +// 功能:部分拷贝数据 +IMAGESAVEGPU_API int partMemCopy(unsigned char* d_srcRGB, int src_width, int src_height, unsigned char* d_dstRGB, int left, int top, int right, int bottom); + +//IMAGESAVEGPU_API int partMemResizeImage(float * d_srcRGB, int src_width, int src_height, unsigned char** d_dstRGB, +// int* vleft, int * vtop, int* vright, int* vbottom, int *dst_w, int *dst_h, +// float submeanb, float submeang, float submeanr, +// float varianceb, float varianceg, float variancer); + + +IMAGESAVEGPU_API int PartMemResizeBatch(unsigned char * d_srcRGB, int src_width, int src_height, unsigned char** d_dstRGB, + int count, int* vleft, int * vtop, int* vright, int* 
vbottom, int *dst_w, int *dst_h, + float submeanb, float submeang, float submeanr, + float varianceb, float varianceg, float variancer); + + +//// 功能:初始化GPU保存图像的各种量化表 +//IMAGESAVEGPU_API int initTables(); +//IMAGESAVEGPU_API int initTables(int falg, int width, int height); +// +//// 功能:释放资源 +//IMAGESAVEGPU_API int releaseJpegSaver(); + +// 功能:在GPU中绘制快照包围框 +IMAGESAVEGPU_API int drawImageOnGPU(float* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom); + +IMAGESAVEGPU_API int drawImageOnGPU(unsigned char* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom); + +// 功能:在GPU中绘制直线 +IMAGESAVEGPU_API int drawLineOnGPU(float* d_srcRGB, int src_width, int src_height, int begin_x, int begin_y, int end_x, int end_y); + +#endif diff --git a/src/nvdec/Makefile b/src/nvdec/Makefile new file mode 100644 index 0000000..1b49ca4 --- /dev/null +++ b/src/nvdec/Makefile @@ -0,0 +1,101 @@ +# 各项目录 +LIB_DIR:=$(BUILD_DIR)/$(MODULE)/lib +DEP_DIR:=$(BUILD_DIR)/$(MODULE)/.dep +OBJ_DIR:=$(BUILD_DIR)/$(MODULE)/obj +SRC_DIR:=$(TOP_DIR)/$(MODULE) + +# 源文件以及中间目标文件和依赖文件 +SRCS:=$(notdir $(wildcard $(SRC_DIR)/*.cpp)) +OBJS:=$(addprefix $(OBJ_DIR)/, $(patsubst %.cpp, %.o, $(SRCS))) +DEPS:=$(addprefix $(DEP_DIR)/, $(patsubst %.cpp, %.d,a $(SRCS))) + +CUDA_ROOT = /usr/local/cuda-11.1 +NVCC = $(CUDA_ROOT)/bin/nvcc + +# 自动生成头文件依赖选项 +DEPFLAGS=-MT $@ -MMD -MP -MF $(DEP_DIR)/$*.d + +DEFS = -DENABLE_DVPP_INTERFACE + +# 最终目标文件 +TARGET:=$(LIB_DIR)/$(MODULE).a + + +PROJECT_ROOT= /mnt/data/cmhu/FFNvDecoder + +DEPEND_DIR = $(PROJECT_ROOT)/bin +THIRDPARTY_ROOT = $(PROJECT_ROOT)/3rdparty +SPDLOG_ROOT = $(THIRDPARTY_ROOT)/spdlog-1.9.2/release +JRTP_ROOT = $(THIRDPARTY_ROOT)/jrtp_export + + +INCLUDE= -I $(DEPEND_DIR)/include \ + -I $(CUDA_ROOT)/include \ + -I $(TOP_DIR)/common/inc \ + -I $(TOP_DIR)/common/UtilNPP \ + -I $(TOP_DIR)/ \ + -I $(SPDLOG_ROOT)/include \ + -I $(JRTP_ROOT)/jrtplib/include/jrtplib3 \ + -I $(JRTP_ROOT)/jthread/include/jthread + 
+LIBSPATH= -L $(DEPEND_DIR)/lib -lavformat -lavcodec -lswscale -lavutil -lavfilter -lswresample -lavdevice \
+    -L $(CUDA_ROOT)/lib64 -lcuda -lcudart -lnvcuvid -lcurand -lcublas -lnvjpeg \
+    -L $(SPDLOG_ROOT) -l:libspdlog.a \
+    -L $(JRTP_ROOT)/jthread/lib -l:libjthread.a \
+    -L $(JRTP_ROOT)/jrtplib/lib -l:libjrtp.a
+
+
+CXXFLAGS= -g -O0 -fPIC $(INCLUDE) $(DEFS) -lpthread -lrt -lz -fexceptions -std=c++11 -fvisibility=hidden -Wl,-Bsymbolic -ldl -Wwrite-strings
+    # -DUNICODE -D_UNICODE
+
+NFLAGS_LIB=-g -c -shared -Xcompiler -fPIC -Xcompiler -fvisibility=hidden
+NFLAGS = $(NFLAGS_LIB) $(INCLUDE) -std=c++11
+
+CU_SOURCES:=$(notdir $(wildcard $(SRC_DIR)/*.cu))
+CU_OBJS:=$(addprefix $(OBJ_DIR)/, $(patsubst %.cu, %.o, $(CU_SOURCES)))
+
+
+# Default goal
+.PHONY:all
+all:$(TARGET)
+
+# Build the final target (static library from the C++ and CUDA objects)
+$(TARGET):$(OBJS) $(CU_OBJS) | $(LIB_DIR)
+	@echo -e "\e[32m""Linking static library $(TARGET)""\e[0m"
+	@echo -e "ar -rc $@ $^"
+	@ar -rc $@ $^
+
+# Create the lib directory if it does not exist
+$(LIB_DIR):
+	@mkdir -p $@
+
+# Build intermediate object files from C++ sources
+$(OBJ_DIR)/%.o:$(SRC_DIR)/%.cpp $(DEP_DIR)/%.d | $(OBJ_DIR) $(DEP_DIR)
+	@echo -e "\e[33m""Building object $@""\e[0m"
+	@echo "$(CXX) -c $(DEPFLAGS) $(CXXFLAGS) $(INCS) $(LIBSPATH) $(MACROS) -o $@ $<"
+	@$(CXX) -c $(DEPFLAGS) $(CXXFLAGS) $(INCS) $(LIBSPATH) $(MACROS) -o $@ $<
+
+# Build intermediate object files from CUDA sources.
+# The target pattern needs the '/' after $(OBJ_DIR) (it previously matched only because
+# the stem absorbed the slash), and the object directory must exist before nvcc writes
+# into it, hence the order-only prerequisite.
+$(OBJ_DIR)/%.o:$(SRC_DIR)/%.cu | $(OBJ_DIR)
+	@echo -e "\e[33m""Building object $@""\e[0m"
+	@echo "$(NVCC) $(NFLAGS) -o $@ $<"
+	$(NVCC) $(NFLAGS) -o $@ $<
+
+
+# Create the obj directory if it does not exist
+$(OBJ_DIR):
+	@mkdir -p $@
+
+# Create the .dep directory if it does not exist
+$(DEP_DIR):
+	@mkdir -p $@
+
+# Dependency files are produced while compiling; this empty rule only prevents errors
+$(DEPS):
+
+# Pull in the generated header dependencies
+include $(wildcard $(DEPS))
+
+# Remove the module's build directory
+.PHONY:clean
+clean:
+	@rm -rf $(BUILD_DIR)/$(MODULE)
diff --git a/src/nvdec/NV12ToRGB.cu b/src/nvdec/NV12ToRGB.cu
new file mode 100644
index 0000000..68e54ac
--- /dev/null
+++ b/src/nvdec/NV12ToRGB.cu
@@ -0,0 +1,345 @@
+
+#include "cuda_kernels.h"
+
+#include <builtin_types.h>
+#include "helper_cuda_drvapi.h"
+
+typedef unsigned char uint8;
+typedef unsigned int 
uint32; +typedef int int32; + +#define COLOR_COMPONENT_MASK 0x3FF +#define COLOR_COMPONENT_BIT_SIZE 10 + +namespace cuda_common +{ + +#define MUL(x,y) ((x)*(y)) + + __constant__ float constHueColorSpaceMat2[9]; //默认分配到0卡上,未找到分配到指定卡上设置方法,当前也未用到,先注释掉 + + __device__ void YUV2RGB2(uint32 *yuvi, float *red, float *green, float *blue) + { + float luma, chromaCb, chromaCr; + + // Prepare for hue adjustment + luma = (float)yuvi[0]; + chromaCb = (float)((int32)yuvi[1] - 512.0f); + chromaCr = (float)((int32)yuvi[2] - 512.0f); + + + // Convert YUV To RGB with hue adjustment + *red = MUL(luma, constHueColorSpaceMat2[0]) + + MUL(chromaCb, constHueColorSpaceMat2[1]) + + MUL(chromaCr, constHueColorSpaceMat2[2]); + *green = MUL(luma, constHueColorSpaceMat2[3]) + + MUL(chromaCb, constHueColorSpaceMat2[4]) + + MUL(chromaCr, constHueColorSpaceMat2[5]); + *blue = MUL(luma, constHueColorSpaceMat2[6]) + + MUL(chromaCb, constHueColorSpaceMat2[7]) + + MUL(chromaCr, constHueColorSpaceMat2[8]); + + } + + __device__ unsigned char clip_v(int x, int min_val, int max_val) { + if (x>max_val) { + return max_val; + } + else if (x<min_val) { + return min_val; + } + else { + return x; + } + } + // CUDA kernel for outputing the final RGB output from NV12; + extern "C" + __global__ void NV12ToRGB_drvapi2(uint32 *srcImage, size_t nSourcePitch, unsigned char *dstImage, int width, int height) + { + + int32 x, y; + uint32 yuv101010Pel[2]; + uint32 processingPitch = ((width)+63) & ~63; + uint8 *srcImageU8 = (uint8 *)srcImage; + + processingPitch = nSourcePitch; + + // Pad borders with duplicate pixels, and we multiply by 2 because we process 2 pixels per thread + x = blockIdx.x * (blockDim.x << 1) + (threadIdx.x << 1); + y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= width) + { + //printf("x >= width\n"); + //*flag = -1; + return; //x = width - 1; + } + //return; //x = width - 1; + + if (y >= height) + { + //printf("y >= height\n"); + //*flag = -1; + return; // y = height - 1; + } + + // Read 2 
Luma components at a time, so we don't waste processing since CbCr are decimated this way. + // if we move to texture we could read 4 luminance values + yuv101010Pel[0] = (srcImageU8[y * processingPitch + x]) << 2; + yuv101010Pel[1] = (srcImageU8[y * processingPitch + x + 1]) << 2; + + uint32 chromaOffset = processingPitch * height; + int32 y_chroma = y >> 1; + + if (y & 1) // odd scanline ? + { + uint32 chromaCb; + uint32 chromaCr; + + chromaCb = srcImageU8[chromaOffset + y_chroma * processingPitch + x]; + chromaCr = srcImageU8[chromaOffset + y_chroma * processingPitch + x + 1]; + + if (y_chroma < ((height >> 1) - 1)) // interpolate chroma vertically + { + chromaCb = (chromaCb + srcImageU8[chromaOffset + (y_chroma + 1) * processingPitch + x] + 1) >> 1; + chromaCr = (chromaCr + srcImageU8[chromaOffset + (y_chroma + 1) * processingPitch + x + 1] + 1) >> 1; + } + + yuv101010Pel[0] |= (chromaCb << (COLOR_COMPONENT_BIT_SIZE + 2)); + yuv101010Pel[0] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2)); + + yuv101010Pel[1] |= (chromaCb << (COLOR_COMPONENT_BIT_SIZE + 2)); + yuv101010Pel[1] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2)); + } + else + { + yuv101010Pel[0] |= ((uint32)srcImageU8[chromaOffset + y_chroma * processingPitch + x] << (COLOR_COMPONENT_BIT_SIZE + 2)); + yuv101010Pel[0] |= ((uint32)srcImageU8[chromaOffset + y_chroma * processingPitch + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2)); + + yuv101010Pel[1] |= ((uint32)srcImageU8[chromaOffset + y_chroma * processingPitch + x] << (COLOR_COMPONENT_BIT_SIZE + 2)); + yuv101010Pel[1] |= ((uint32)srcImageU8[chromaOffset + y_chroma * processingPitch + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2)); + } + + // this steps performs the color conversion + uint32 yuvi[6]; + float red[2], green[2], blue[2]; + + yuvi[0] = (yuv101010Pel[0] & COLOR_COMPONENT_MASK); + yuvi[1] = ((yuv101010Pel[0] >> COLOR_COMPONENT_BIT_SIZE) & COLOR_COMPONENT_MASK); + yuvi[2] = ((yuv101010Pel[0] >> (COLOR_COMPONENT_BIT_SIZE << 
1)) & COLOR_COMPONENT_MASK); + + yuvi[3] = (yuv101010Pel[1] & COLOR_COMPONENT_MASK); + yuvi[4] = ((yuv101010Pel[1] >> COLOR_COMPONENT_BIT_SIZE) & COLOR_COMPONENT_MASK); + yuvi[5] = ((yuv101010Pel[1] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK); + + // YUV to RGB Transformation conversion + YUV2RGB2(&yuvi[0], &red[0], &green[0], &blue[0]); + YUV2RGB2(&yuvi[3], &red[1], &green[1], &blue[1]); + + + dstImage[y * width * 3 + x * 3] = clip_v(blue[0] * 0.25,0 ,255); + dstImage[y * width * 3 + x * 3 + 3] = clip_v(blue[1] * 0.25,0, 255); + + dstImage[width * y * 3 + x * 3 + 1] = clip_v(green[0] * 0.25,0 ,255); + dstImage[width * y * 3 + x * 3 + 4] = clip_v(green[1] * 0.25,0, 255); + + dstImage[width * y * 3 + x * 3 + 2] = clip_v(red[0] * 0.25, 0, 255); + dstImage[width * y * 3 + x * 3 + 5] = clip_v(red[1] * 0.25,0 ,255); + + + //dstImage[y * width * 3 + x * 3] = blue[0] * 0.25; + //dstImage[y * width * 3 + x * 3 + 3] = blue[1] * 0.25; + + //dstImage[width * y * 3 + x * 3 + 1] =green[0] * 0.25; + //dstImage[width * y * 3 + x * 3 + 4] = green[1] * 0.25; + + //dstImage[width * y * 3 + x * 3 + 2] = red[0] * 0.25; + //dstImage[width * y * 3 + x * 3 + 5] = red[1] * 0.25; + + // Clamp the results to BBBBBB....GGGGGGG.......RRRRRRR.... 
+ // dstImage[y * width + x] = blue[0] * 0.25; + // dstImage[y * width + x + 1] = blue[1] * 0.25; + + // dstImage[width * height + y * width + x] = green[0] * 0.25; + // dstImage[width * height + y * width + x + 1] = green[1] * 0.25; + + // dstImage[width * height * 2 + y * width + x] = red[0] * 0.25; + // dstImage[width * height * 2 + y * width + x + 1] = red[1] * 0.25; + return; + + } + + // CUDA kernel for outputing the final RGB output from NV12; + extern "C" + __global__ void CUDAToBGR_drvapi(uint32 *dataY, uint32 *dataUV, size_t pitchY, size_t pitchUV, unsigned char *dstImage, int width, int height) + { + + int32 x, y; + + // Pad borders with duplicate pixels, and we multiply by 2 because we process 2 pixels per thread + x = blockIdx.x * (blockDim.x << 1) + (threadIdx.x << 1); + y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= width) + { + return; + } + + if (y >= height) + { + return; + } + + uint32 yuv101010Pel[2]; + uint8 *srcImageU8_Y = (uint8 *)dataY; + uint8 *srcImageU8_UV = (uint8 *)dataUV; + + // Read 2 Luma components at a time, so we don't waste processing since CbCr are decimated this way. + // if we move to texture we could read 4 luminance values + yuv101010Pel[0] = (srcImageU8_Y[y * pitchY + x]) << 2; + yuv101010Pel[1] = (srcImageU8_Y[y * pitchY + x + 1]) << 2; + + int32 y_chroma = y >> 1; + + if (y & 1) // odd scanline ? 
+ { + uint32 chromaCb; + uint32 chromaCr; + + chromaCb = srcImageU8_UV[y_chroma * pitchUV + x]; + chromaCr = srcImageU8_UV[y_chroma * pitchUV + x + 1]; + + if (y_chroma < ((height >> 1) - 1)) // interpolate chroma vertically + { + chromaCb = (chromaCb + srcImageU8_UV[(y_chroma + 1) * pitchUV + x] + 1) >> 1; + chromaCr = (chromaCr + srcImageU8_UV[(y_chroma + 1) * pitchUV + x + 1] + 1) >> 1; + } + + yuv101010Pel[0] |= (chromaCb << (COLOR_COMPONENT_BIT_SIZE + 2)); + yuv101010Pel[0] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2)); + + yuv101010Pel[1] |= (chromaCb << (COLOR_COMPONENT_BIT_SIZE + 2)); + yuv101010Pel[1] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2)); + } + else + { + yuv101010Pel[0] |= ((uint32)srcImageU8_UV[y_chroma * pitchUV + x] << (COLOR_COMPONENT_BIT_SIZE + 2)); + yuv101010Pel[0] |= ((uint32)srcImageU8_UV[y_chroma * pitchUV + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2)); + + yuv101010Pel[1] |= ((uint32)srcImageU8_UV[y_chroma * pitchUV + x] << (COLOR_COMPONENT_BIT_SIZE + 2)); + yuv101010Pel[1] |= ((uint32)srcImageU8_UV[y_chroma * pitchUV + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2)); + } + + // this steps performs the color conversion + uint32 yuvi[6]; + float red[2], green[2], blue[2]; + + yuvi[0] = (yuv101010Pel[0] & COLOR_COMPONENT_MASK); + yuvi[1] = ((yuv101010Pel[0] >> COLOR_COMPONENT_BIT_SIZE) & COLOR_COMPONENT_MASK); + yuvi[2] = ((yuv101010Pel[0] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK); + + yuvi[3] = (yuv101010Pel[1] & COLOR_COMPONENT_MASK); + yuvi[4] = ((yuv101010Pel[1] >> COLOR_COMPONENT_BIT_SIZE) & COLOR_COMPONENT_MASK); + yuvi[5] = ((yuv101010Pel[1] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK); + + // YUV to RGB Transformation conversion + YUV2RGB2(&yuvi[0], &red[0], &green[0], &blue[0]); + YUV2RGB2(&yuvi[3], &red[1], &green[1], &blue[1]); + + + dstImage[y * width * 3 + x * 3] = clip_v(blue[0] * 0.25,0 ,255); + dstImage[y * width * 3 + x * 3 + 3] = clip_v(blue[1] * 0.25,0, 255); + + 
dstImage[width * y * 3 + x * 3 + 1] = clip_v(green[0] * 0.25,0 ,255);
+        dstImage[width * y * 3 + x * 3 + 4] = clip_v(green[1] * 0.25,0, 255);
+
+        dstImage[width * y * 3 + x * 3 + 2] = clip_v(red[0] * 0.25, 0, 255);
+        dstImage[width * y * 3 + x * 3 + 5] = clip_v(red[1] * 0.25,0 ,255);
+        // NOTE(review): the +4/+5 stores write the pixel at column x + 1 and are
+        // only protected by the `x >= width` check; for an odd `width` they land
+        // outside the current row. Confirm callers only pass even widths.
+    }
+
+    /**
+     * Uploads the 3x3 YUV->RGB matrix (with hue rotation) into the
+     * `constHueColorSpaceMat2` constant used by the conversion kernels.
+     *
+     * @param CSC  colour standard selecting the coefficient set (ITU_601 or ITU_709).
+     * @param hue  hue rotation angle in radians (passed to sinf/cosf).
+     * @return cudaSuccess on success, cudaErrorInvalidValue for an unsupported
+     *         `CSC`, otherwise the error reported by the CUDA runtime.
+     */
+    cudaError_t setColorSpace(FF_ColorSpace CSC, float hue)
+    {
+        const float hueSin = sinf(hue);
+        const float hueCos = cosf(hue);
+
+        float hueCSC[9];
+        if (CSC == ITU_601)
+        {
+            //CCIR 601
+            hueCSC[0] = 1.1644f;
+            hueCSC[1] = hueSin * 1.5960f;
+            hueCSC[2] = hueCos * 1.5960f;
+            hueCSC[3] = 1.1644f;
+            hueCSC[4] = (hueCos * -0.3918f) - (hueSin * 0.8130f);
+            hueCSC[5] = (hueSin * 0.3918f) - (hueCos * 0.8130f);
+            hueCSC[6] = 1.1644f;
+            hueCSC[7] = hueCos * 2.0172f;
+            hueCSC[8] = hueSin * -2.0172f;
+        }
+        else if (CSC == ITU_709)
+        {
+            //CCIR 709
+            hueCSC[0] = 1.0f;
+            hueCSC[1] = hueSin * 1.57480f;
+            hueCSC[2] = hueCos * 1.57480f;
+            hueCSC[3] = 1.0f;
+            hueCSC[4] = (hueCos * -0.18732f) - (hueSin * 0.46812f);
+            hueCSC[5] = (hueSin * 0.18732f) - (hueCos * 0.46812f);
+            hueCSC[6] = 1.0f;
+            hueCSC[7] = hueCos * 1.85560f;
+            hueCSC[8] = hueSin * -1.85560f;
+        }
+        else
+        {
+            // Previously an unknown value fell through and uploaded an
+            // uninitialised matrix; refuse it instead.
+            fprintf(stderr, "setColorSpace: unsupported colour space %d\n", (int)CSC);
+            return cudaErrorInvalidValue;
+        }
+
+        cudaError_t cudaStatus = cudaMemcpyToSymbol(constHueColorSpaceMat2, hueCSC, 9 * sizeof(float), 0, cudaMemcpyHostToDevice);
+        if (cudaStatus != cudaSuccess) {
+            fprintf(stderr, "cudaMemcpyToSymbol failed: %s\n", cudaGetErrorString(cudaStatus));
+            return cudaStatus;
+        }
+
+        // Keep the original device-wide sync, but surface its result instead of discarding it.
+        cudaStatus = cudaDeviceSynchronize();
+        if (cudaStatus != cudaSuccess) {
+            fprintf(stderr, "cudaDeviceSynchronize failed after cudaMemcpyToSymbol: %s\n", cudaGetErrorString(cudaStatus));
+        }
+
+        return cudaStatus;
+    }
+
+    /**
+     * Launches NV12ToRGB_drvapi2 on a single pitched NV12 buffer and waits for it.
+     * Each thread handles two horizontally adjacent pixels, hence the grid is
+     * sized for width / 2 columns.
+     *
+     * @return cudaSuccess, or the launch / execution error (also logged to stderr).
+     */
+    cudaError_t NV12ToRGBnot(CUdeviceptr d_srcNV12, size_t nSourcePitch, unsigned char* d_dstRGB, int width, int height)
+    {
+        dim3 block(32, 16, 1);
+        dim3 grid((width + (2 * block.x - 1)) / (2 * block.x), (height + (block.y - 1)) / block.y, 1);
+        NV12ToRGB_drvapi2<<<grid, block>>>((uint32 *)d_srcNV12, nSourcePitch, d_dstRGB, width, height);
+        cudaError_t cudaStatus = cudaGetLastError();
+        if (cudaStatus != cudaSuccess) {
+            fprintf(stderr, "NV12ToRGB_drvapi launch failed: %s\n", cudaGetErrorString(cudaStatus));
+            return cudaStatus;
+        }
+
+        cudaStatus = cudaDeviceSynchronize();
+        if (cudaStatus != cudaSuccess) {
+            fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching NV12ToRGB_drvapi !\n", cudaStatus);
+            return cudaStatus;
+        }
+
+        return cudaStatus;
+    }
+
+    /**
+     * Launches CUDAToBGR_drvapi on separate Y and interleaved UV planes and waits for it.
+     * Same two-pixels-per-thread launch geometry as NV12ToRGBnot.
+     *
+     * @return cudaSuccess, or the launch / execution error (also logged to stderr).
+     */
+    cudaError_t CUDAToBGR(CUdeviceptr dataY, CUdeviceptr dataUV, size_t pitchY, size_t pitchUV, unsigned char* d_dstRGB, int width, int height)
+    {
+        dim3 block(32, 16, 1);
+        dim3 grid((width + (2 * block.x - 1)) / (2 * block.x), (height + (block.y - 1)) / block.y, 1);
+        CUDAToBGR_drvapi<<<grid, block>>>((uint32 *)dataY, (uint32 *)dataUV, pitchY, pitchUV, d_dstRGB, width, height);
+        cudaError_t cudaStatus = cudaGetLastError();
+        if (cudaStatus != cudaSuccess) {
+            // The message used to name NV12ToRGB_drvapi, which is not the kernel launched here.
+            fprintf(stderr, "CUDAToBGR_drvapi launch failed: %s\n", cudaGetErrorString(cudaStatus));
+            return cudaStatus;
+        }
+
+        cudaStatus = cudaDeviceSynchronize();
+        if (cudaStatus != cudaSuccess) {
+            fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching CUDAToBGR_drvapi !\n", cudaStatus);
+            return cudaStatus;
+        }
+
+        return cudaStatus;
+    }
+}
\ No newline at end of file
diff --git a/src/nvdec/NvDecoderApi.cpp b/src/nvdec/NvDecoderApi.cpp
new file mode 100644
index 0000000..efb63cd
--- /dev/null
+++ b/src/nvdec/NvDecoderApi.cpp
@@ -0,0 +1,133 @@
+#include "NvDecoderApi.h"
+#include "FFNvDecoder.h"
+
+// NvDecoderApi owns one FFNvDecoder and forwards every call to it.
+NvDecoderApi::NvDecoderApi(){
+    m_pDecoder = nullptr;
+}
+
+NvDecoderApi::~NvDecoderApi(){
+    if(m_pDecoder != nullptr){
+        delete m_pDecoder;
+        m_pDecoder = nullptr;
+    }
+}
+
+// Creates the wrapped decoder and initialises it with `cfg`.
+bool NvDecoderApi::init(FFDecConfig& cfg){
+    // A second init() used to overwrite the pointer and leak the first decoder;
+    // release it the same way the destructor does before creating a new one.
+    if(m_pDecoder != nullptr){
+        delete m_pDecoder;
+        m_pDecoder = nullptr;
+    }
+    m_pDecoder = new FFNvDecoder();
+    if(m_pDecoder != nullptr){
+        return m_pDecoder->init(cfg);
+    }
+    return false;
+}
+
+void NvDecoderApi::close(){
+    if(m_pDecoder != nullptr){
+        return m_pDecoder->close();
+    }
+}
+
+bool NvDecoderApi::start(){
+    if(m_pDecoder != nullptr){
+        return
m_pDecoder->start();
+    }
+    return false;
+}
+
+// The methods below are thin forwarders: when a decoder exists the call is
+// passed straight through, otherwise a neutral default (false / 0 / empty /
+// nullptr) is returned.
+
+void NvDecoderApi::pause(){
+    if(m_pDecoder != nullptr){
+        return m_pDecoder->pause();
+    }
+}
+
+void NvDecoderApi::resume(){
+    if(m_pDecoder != nullptr){
+        return m_pDecoder->resume();
+    }
+}
+
+void NvDecoderApi::setDecKeyframe(bool bKeyframe){
+    if(m_pDecoder != nullptr){
+        return m_pDecoder->setDecKeyframe(bKeyframe);
+    }
+}
+
+bool NvDecoderApi::isRunning(){
+    if(m_pDecoder != nullptr){
+        return m_pDecoder->isRunning();
+    }
+    return false;
+}
+
+bool NvDecoderApi::isFinished(){
+    if(m_pDecoder != nullptr){
+        return m_pDecoder->isFinished();
+    }
+    return false;
+}
+
+bool NvDecoderApi::isPausing(){
+    if(m_pDecoder != nullptr){
+        return m_pDecoder->isPausing();
+    }
+    return false;
+}
+
+bool NvDecoderApi::getResolution(int &width, int &height){
+    if(m_pDecoder != nullptr){
+        return m_pDecoder->getResolution(width, height);
+    }
+    return false;
+}
+
+bool NvDecoderApi::isSurport(FFDecConfig& cfg){
+    if(m_pDecoder != nullptr){
+        return m_pDecoder->isSurport(cfg);
+    }
+    return false;
+}
+
+float NvDecoderApi::fps(){
+    if(m_pDecoder != nullptr){
+        return m_pDecoder->fps();
+    }
+    return 0.0;
+}
+
+int NvDecoderApi::getCachedQueueLength(){
+    if(m_pDecoder != nullptr){
+        return m_pDecoder->getCachedQueueLength();
+    }
+    return 0;
+}
+
+void NvDecoderApi::setName(string nm){
+    if(m_pDecoder != nullptr){
+        return m_pDecoder->setName(nm);
+    }
+}
+
+// Returns the decoder's name, or an empty string when no decoder exists.
+string NvDecoderApi::getName(){
+    if(m_pDecoder != nullptr){
+        return m_pDecoder->getName();
+    }
+    // `return nullptr;` built a std::string from a null const char*, which is
+    // undefined behaviour (libstdc++ throws std::logic_error). Return "" instead.
+    return string();
+}
+
+FFImgInfo* NvDecoderApi::snapshot(){
+    if(m_pDecoder != nullptr){
+        return m_pDecoder->snapshot();
+    }
+    return nullptr;
+}
+
+void NvDecoderApi::setPostDecArg(const void* postDecArg){
+    if(m_pDecoder != nullptr){
+        return m_pDecoder->setPostDecArg(postDecArg);
+    }
+}
+
+void NvDecoderApi::setFinishedDecArg(const void* finishedDecArg){
+    if(m_pDecoder != nullptr){
+        return m_pDecoder->setFinishedDecArg(finishedDecArg);
+    }
+}
\ No newline at end of file
diff --git
a/src/nvdec/NvDecoderApi.h b/src/nvdec/NvDecoderApi.h
new file mode 100644
index 0000000..f742dd8
--- /dev/null
+++ b/src/nvdec/NvDecoderApi.h
@@ -0,0 +1,44 @@
+#pragma once  // the header had no include guard; a second inclusion redefined the class
+
+#include<string>
+#include <pthread.h>
+
+#include "common_header.h"
+#include "../interface/AbstractDecoder.h"
+
+using namespace std;
+
+class FFNvDecoder;
+
+// AbstractDecoder implementation that owns a single FFNvDecoder and forwards to it.
+class NvDecoderApi : public AbstractDecoder{
+public:
+    NvDecoderApi();
+    ~NvDecoderApi();
+    bool init(FFDecConfig& cfg);
+    void close();
+    bool start();
+    void pause();
+    void resume();
+
+    void setDecKeyframe(bool bKeyframe);
+
+    bool isRunning();
+    bool isFinished();
+    bool isPausing();
+    bool getResolution( int &width, int &height );
+
+    bool isSurport(FFDecConfig& cfg);
+
+    int getCachedQueueLength();
+
+    float fps();
+
+    FFImgInfo* snapshot();
+
+    // NOTE(review): this wrapper drives FFNvDecoder yet reports DECODER_TYPE_DVPP --
+    // confirm whether DECODER_TYPE has a dedicated NVDEC/CUVID value that should be used.
+    DECODER_TYPE getDecoderType(){ return DECODER_TYPE_DVPP; }
+    void setName(string nm);
+    string getName();
+
+    void setPostDecArg(const void* postDecArg);
+    void setFinishedDecArg(const void* finishedDecArg);
+private:
+    FFNvDecoder* m_pDecoder;  // owned; created in init(), deleted in the destructor
+};
\ No newline at end of file
diff --git a/src/nvdec/NvJpegEncoder.cpp b/src/nvdec/NvJpegEncoder.cpp
new file mode 100644
index 0000000..7ee0727
--- /dev/null
+++ b/src/nvdec/NvJpegEncoder.cpp
@@ -0,0 +1,90 @@
+#include "NvJpegEncoder.h"
+
+#include <fstream>
+#include <vector>
+#include <iostream>
+
+
+#define CHECK_NVJPEG(S) do {nvjpegStatus_t status; \
+        status = S; \
+        if (status != NVJPEG_STATUS_SUCCESS ) std::cout << __LINE__ <<" CHECK_NVJPEG - status = " << status << std::endl; \
+        } while (false)
+
+
+/**
+ * Encodes an interleaved BGR image that lives in device memory as a
+ * progressive JPEG (quality 70, 4:2:0 subsampling) and writes it to `filepath`.
+ *
+ * @param filepath  destination file; overwritten if it exists.
+ * @param d_srcBGR  device pointer to width*height packed BGR pixels (row pitch = width*3).
+ * @param width     image width in pixels, must be > 0.
+ * @param height    image height in pixels, must be > 0.
+ * @param stream    CUDA stream the nvJPEG work is issued on.
+ * @return 0 on success, -1 if an argument is invalid, an nvJPEG/CUDA call
+ *         fails, or the file cannot be written.
+ */
+int saveJpeg(const char * filepath, unsigned char* d_srcBGR, int width, int height, cudaStream_t stream)
+{
+    if (filepath == nullptr || d_srcBGR == nullptr || width <= 0 || height <= 0) {
+        std::cout << __LINE__ << " saveJpeg - invalid argument" << std::endl;
+        return -1;
+    }
+
+    // Same diagnostic as CHECK_NVJPEG, but reports the outcome so the caller can stop
+    // at the first failure instead of carrying on with half-initialised state.
+    auto succeeded = [](nvjpegStatus_t status, int line) -> bool {
+        if (status != NVJPEG_STATUS_SUCCESS) {
+            std::cout << line << " CHECK_NVJPEG - status = " << status << std::endl;
+            return false;
+        }
+        return true;
+    };
+
+    nvjpegHandle_t nvjpeg_handle = nullptr;
+    nvjpegEncoderState_t encoder_state = nullptr;
+    nvjpegEncoderParams_t encoder_params = nullptr;
+
+    // Interleaved input only uses plane 0; zero the rest so no garbage pointers are handed to nvJPEG.
+    nvjpegImage_t input = {};
+    input.channel[0] = d_srcBGR;
+    input.pitch[0] = width * 3;
+
+    std::vector<unsigned char> obuffer;
+    size_t length = 0;  // was uninitialised and then used to size obuffer even when encoding failed
+
+    bool ok = succeeded(nvjpegCreate(NVJPEG_BACKEND_DEFAULT, nullptr, &nvjpeg_handle), __LINE__)
+        && succeeded(nvjpegEncoderParamsCreate(nvjpeg_handle, &encoder_params, stream), __LINE__)
+        && succeeded(nvjpegEncoderStateCreate(nvjpeg_handle, &encoder_state, stream), __LINE__)
+        // set params
+        && succeeded(nvjpegEncoderParamsSetEncoding(encoder_params, nvjpegJpegEncoding_t::NVJPEG_ENCODING_PROGRESSIVE_DCT_HUFFMAN, stream), __LINE__)
+        && succeeded(nvjpegEncoderParamsSetOptimizedHuffman(encoder_params, 1, stream), __LINE__)
+        && succeeded(nvjpegEncoderParamsSetQuality(encoder_params, 70, stream), __LINE__)
+        && succeeded(nvjpegEncoderParamsSetSamplingFactors(encoder_params, nvjpegChromaSubsampling_t::NVJPEG_CSS_420, stream), __LINE__)
+        && succeeded(nvjpegEncodeImage(nvjpeg_handle, encoder_state, encoder_params, &input, NVJPEG_INPUT_BGRI, width, height, stream), __LINE__)
+        // first call with a NULL buffer only queries the compressed size
+        && succeeded(nvjpegEncodeRetrieveBitstream(nvjpeg_handle, encoder_state, NULL, &length, stream), __LINE__);
+
+    if (ok) {
+        obuffer.resize(length);
+        ok = succeeded(nvjpegEncodeRetrieveBitstream(nvjpeg_handle, encoder_state, obuffer.data(), &length, stream), __LINE__)
+            // The bitstream must be complete on the host before it is written out. This replaces
+            // the two timing events that were created on every call and never destroyed.
+            && cudaStreamSynchronize(stream) == cudaSuccess;
+    }
+
+    // Destroy after use to avoid leaking device memory (on the failure paths as well).
+    if (encoder_params != nullptr) {
+        nvjpegEncoderParamsDestroy(encoder_params);
+    }
+    if (encoder_state != nullptr) {
+        nvjpegEncoderStateDestroy(encoder_state);
+    }
+    if (nvjpeg_handle != nullptr) {
+        nvjpegDestroy(nvjpeg_handle);
+    }
+
+    if (!ok) {
+        return -1;
+    }
+
+    std::ofstream outputFile(filepath, std::ios::out | std::ios::binary);
+    if (!outputFile) {
+        std::cout << __LINE__ << " saveJpeg - cannot open " << filepath << std::endl;
+        return -1;
+    }
+    outputFile.write(reinterpret_cast<const char *>(obuffer.data()), static_cast<std::streamsize>(length));
+    outputFile.close();
+
+    return outputFile.good() ? 0 : -1;
+}
\ No newline at end of file
diff --git a/src/nvdec/NvJpegEncoder.h b/src/nvdec/NvJpegEncoder.h
new file mode 100644
index 0000000..3c27ba8
--- /dev/null
+++ b/src/nvdec/NvJpegEncoder.h
@@ -0,0 +1,3 @@
+#pragma once
+
+#include <nvjpeg.h>
+
+// Encodes a packed BGR device image as JPEG and writes it to `filepath`; returns 0 on success.
+int saveJpeg(const char * filepath, unsigned char* d_srcBGR, int width, int height, cudaStream_t stream);
\ No newline at end of file
diff --git a/src/nvdec/PartMemCopy.cu b/src/nvdec/PartMemCopy.cu
new file mode 100644
index 0000000..396765b
--- /dev/null
+++ b/src/nvdec/PartMemCopy.cu
@@ -0,0 +1,289 @@
+#include "cuda_kernels.h"
+#include <algorithm>
+typedef unsigned char uchar;
+typedef unsigned int uint32;
+typedef int int32;
+
+#define MAX_SNAPSHOT_WIDTH 320
+#define MAX_SNAPSHOT_HEIGHT 320
+
+namespace cuda_common
+{
+    // Copies the rectangle [left, right) x [top, bottom) of a packed BGR image into a
+    // tightly packed destination of size (right-left) x (bottom-top). One thread per
+    // destination pixel; threads outside the rectangle do nothing.
+    __global__ void kernel_memcopy(unsigned char* d_srcRGB, int src_width, int src_height,
+        unsigned char* d_dstRGB, int left, int top, int right, int bottom)
+    {
+        const int dst_x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int dst_y = blockIdx.y * blockDim.y + threadIdx.y;
+        const int dst_width = right - left;
+        const int dst_height = bottom - top;
+        if (dst_x < dst_width && dst_y < dst_height)
+        {
+            int src_x = left + dst_x;
+            int src_y = top + dst_y;
+
+            //bgr...bgr...bgr...
+            d_dstRGB[(dst_y*dst_width + dst_x) * 3] = (unsigned char)d_srcRGB[(src_y*src_width + src_x) * 3];
+            d_dstRGB[(dst_y*dst_width + dst_x) * 3 + 1] = (unsigned char)d_srcRGB[(src_y*src_width + src_x) * 3 + 1];
+            d_dstRGB[(dst_y*dst_width + dst_x) * 3 + 2] = (unsigned char)d_srcRGB[(src_y*src_width + src_x) * 3 + 2];
+
+            //bbb...ggg...rrr...
+ //d_dstRGB[(dst_y*dst_width) + dst_x] = (unsigned char)d_srcRGB[(src_y*src_width) + src_x]; + //d_dstRGB[(dst_width*dst_height) + (dst_y*dst_width) + dst_x] = (unsigned char)d_srcRGB[(src_width*src_height) + (src_y*src_width) + src_x]; + //d_dstRGB[(2 * dst_width*dst_height) + (dst_y*dst_width) + dst_x] = (unsigned char)d_srcRGB[(2 * src_width*src_height) + (src_y*src_width) + src_x]; + + /* memcpy(d_dstRGB + (dst_y*src_width) + dst_x, d_srcRGB + (src_y*src_width) + src_x, sizeof(float)); + memcpy(d_dstRGB + (src_width*src_height) + (dst_y*src_width) + dst_x, d_srcRGB + (src_width*src_height) + (src_y*src_width) + src_x, sizeof(float)); + memcpy(d_dstRGB + (2 * src_width*src_height) + (dst_y*src_width) + dst_x, d_srcRGB + (2 * src_width*src_height) + (src_y*src_width) + src_x, sizeof(float));*/ + } + } + + cudaError_t PartMemCopy(unsigned char* d_srcRGB, int src_width, int src_height, unsigned char* d_dstRGB, int left, int top, int right, int bottom) + { + dim3 block(32, 16, 1); + dim3 grid(((right - left) + (block.x - 1)) / block.x, ((bottom - top) + (block.y - 1)) / block.y, 1); + + kernel_memcopy << < grid, block >> > (d_srcRGB, src_width, src_height, d_dstRGB, left, top, right, bottom); + + cudaError_t cudaStatus = cudaGetLastError(); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "Part 50 kernel_memcopy launch failed: %s\n", cudaGetErrorString(cudaStatus)); + return cudaStatus; + } + cudaStatus = cudaDeviceSynchronize(); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_bilinear!\n", cudaStatus); + return cudaStatus; + } + return cudaStatus; + } + + + // __global__ void kernel_memcopy_mean_variance(float* d_srcRGB, int src_width, int src_height, + // unsigned char* vd_dstRGB, int count, int * vleft, int* vtop, int* vright, int * vbottom, float submeanb,float submeang, float submeanr, float varianceb,float varianceg, float variancer) + // { + // const int dst_x = blockIdx.x * 
blockDim.x + threadIdx.x; + // const int dst_y = blockIdx.y * blockDim.y + threadIdx.y; + // for (int i=0;i<count;i++) + // { + // const int left = vleft[i]; + // const int right = vright[i]; + // const int top = vtop[i]; + // const int bottom = vbottom[i]; + // + // const int dst_width = right - left; + // const int dst_height = bottom - top; + // + // + // unsigned char * d_dstRGB = vd_dstRGB + i * ; + // + // if (dst_x < dst_width && dst_y < dst_height) + // { + // int src_x = left + dst_x; + // int src_y = top + dst_y; + // + // d_dstRGB[(dst_y*dst_width) + dst_x] = (d_srcRGB[(src_y*src_width) + src_x] - submeanb)*varianceb; + // d_dstRGB[(dst_width*dst_height) + (dst_y*dst_width) + dst_x] = (d_srcRGB[(src_width*src_height) + (src_y*src_width) + src_x] -submeang)*varianceg; + // d_dstRGB[(2 * dst_width*dst_height) + (dst_y*dst_width) + dst_x] = (d_srcRGB[(2 * src_width*src_height) + (src_y*src_width) + src_x] - submeanr) * variancer; + // + // } + // } + // } + __global__ void PartCopy_ResizeImgBilinearBGR_Mean_Variance_CUDAKernel( + unsigned char * d_srcRGB, int srcimg_width, int srcimg_height, + int* vleft, int* vtop, int* vright, int * vbottom, + unsigned char** vd_dstRGB, int count, int *dst_width, int *dst_height, + float submeanb, float submeang, float submeanr, + float varianceb, float varianceg, float variancer) + { + int i = blockIdx.z; + + //for (int i = 0; i<count; i++) + { + const int left = vleft[i]; + const int right = vright[i]; + const int top = vtop[i]; + const int bottom = vbottom[i]; + const int cur_dst_width = dst_width[i]; + const int cur_dst_height = dst_height[i]; + + unsigned char* d_dstRGB = vd_dstRGB[i]; + + const int src_width = right - left; + const int src_height = bottom - top; + const int x = blockIdx.x * blockDim.x + threadIdx.x;// + left; + const int y = blockIdx.y * blockDim.y + threadIdx.y;//+ top; + const int dst_x = blockIdx.x * blockDim.x + threadIdx.x; + const int dst_y = blockIdx.y * blockDim.y + threadIdx.y; + + /*if 
(dst_x == 0 && dst_y == 0) + printf("%d %d %d %d %d\n", i, vleft[i], vright[i], cur_dst_width, cur_dst_height);*/ + + unsigned char * src_img = d_srcRGB; + unsigned char * dst_img = d_dstRGB; + if (dst_x < cur_dst_width && dst_y < cur_dst_height) + { + float fx = (x + 0.5)*src_width / (float)cur_dst_width - 0.5 + left; + float fy = (y + 0.5)*src_height / (float)cur_dst_height - 0.5 + top; + int ax = floor(fx); + int ay = floor(fy); + if (ax < 0) + { + ax = 0; + } + if (ax > srcimg_width - 2) + { + ax = srcimg_width - 2; + } + if (ay < 0) { + ay = 0; + } + if (ay > srcimg_height - 2) + { + ay = srcimg_height - 2; + } + + int A = ax + ay*srcimg_width; + int B = ax + ay*srcimg_width + 1; + int C = ax + ay*srcimg_width + srcimg_width; + int D = ax + ay*srcimg_width + srcimg_width + 1; + + float w1, w2, w3, w4; + w1 = fx - ax; + w2 = 1 - w1; + w3 = fy - ay; + w4 = 1 - w3; + float blue = src_img[A * 3] * w2*w4 + src_img[B * 3] * w1*w4 + src_img[C * 3] * w2*w3 + src_img[D * 3] * w1*w3; + float green = src_img[A * 3 + 1] * w2*w4 + src_img[B * 3 + 1] * w1*w4 + + src_img[C * 3 + 1] * w2*w3 + src_img[D * 3 + 1] * w1*w3; + float red = src_img[A * 3 + 2] * w2*w4 + src_img[B * 3 + 2] * w1*w4 + + src_img[C * 3 + 2] * w2*w3 + src_img[D * 3 + 2] * w1*w3; + + /*dst_img[(dst_y * dst_width + dst_x) * 3] = (unsigned char)(blue - submeanb)*varianceb; + dst_img[(dst_y * dst_width + dst_x) * 3 + 1] =(unsigned char) (green - submeang)*varianceg; + dst_img[(dst_y * dst_width + dst_x) * 3 + 2] = (unsigned char) (red - submeanr)*variancer;*/ + + if (blue < 0) + blue = 0; + else if (blue > 255) + blue = 255; + + if (green < 0) + green = 0; + else if (green > 255) + green = 255; + + if (red < 0) + red = 0; + else if (red > 255) + red = 255; + + dst_img[(dst_y * cur_dst_width + dst_x) * 3] = (unsigned char)blue; + dst_img[(dst_y * cur_dst_width + dst_x) * 3 + 1] = (unsigned char)green; + dst_img[(dst_y * cur_dst_width + dst_x) * 3 + 2] = (unsigned char)red; + + + /*if (src_img[(dst_y * dst_width 
+ dst_x) * 3] < 0)
+                    src_img[(dst_y * dst_width + dst_x) * 3] = 0;
+                else if (src_img[(dst_y * dst_width + dst_x) * 3] > 255)
+                    src_img[(dst_y * dst_width + dst_x) * 3] = 255;
+
+                if (src_img[(dst_y * dst_width + dst_x) * 3 + 1] < 0)
+                    src_img[(dst_y * dst_width + dst_x) * 3 + 1] = 0;
+                else if (src_img[(dst_y * dst_width + dst_x) * 3 + 1] > 255)
+                    src_img[(dst_y * dst_width + dst_x) * 3 + 1] = 255;
+
+                if (src_img[(dst_y * dst_width + dst_x) * 3 + 2] < 0)
+                    src_img[(dst_y * dst_width + dst_x) * 3 + 2] = 0;
+                else if (src_img[(dst_y * dst_width + dst_x) * 3 + 2] > 255)
+                    src_img[(dst_y * dst_width + dst_x) * 3 + 2] = 255;
+
+
+                dst_img[(dst_y * dst_width + dst_x) * 3] = (unsigned char)src_img[(dst_y * dst_width + dst_x) * 3];
+                dst_img[(dst_y * dst_width + dst_x) * 3 + 1] = (unsigned char)src_img[(dst_y * dst_width + dst_x) * 3 + 1];
+                dst_img[(dst_y * dst_width + dst_x) * 3 + 2] = (unsigned char)src_img[(dst_y * dst_width + dst_x) * 3 + 2];*/
+            }
+        }
+    }
+
+    /**
+     * Crops `count` rectangles out of one packed BGR device image and bilinearly
+     * resizes each into its own device buffer, in a single kernel launch
+     * (grid.z indexes the rectangle).
+     *
+     * @param d_srcRGB   device pointer to the source image (packed BGR).
+     * @param src_width  source width in pixels.
+     * @param src_height source height in pixels.
+     * @param d_dstRGB   HOST array of `count` DEVICE pointers, one output buffer per crop.
+     * @param count      number of crops; 0 is a no-op.
+     * @param left,top,right,bottom  HOST arrays (length `count`) with each crop rectangle.
+     * @param dst_w,dst_h            HOST arrays (length `count`) with each output size.
+     * @param submean*, variance*    forwarded unchanged to the kernel.
+     * @return cudaSuccess, cudaErrorInvalidValue for a negative count or a null
+     *         array, otherwise the first CUDA error encountered.
+     */
+    cudaError_t PartMemResizeBatch(unsigned char* d_srcRGB, int src_width, int src_height, unsigned char** d_dstRGB, int count, int* left, int* top, int* right, int* bottom, int *dst_w, int *dst_h, float submeanb, float submeang, float submeanr,
+        float varianceb, float varianceg, float variancer)
+    {
+        // An empty batch used to dereference max_element's end iterator.
+        if (count == 0) {
+            return cudaSuccess;
+        }
+        if (count < 0 || d_srcRGB == nullptr || d_dstRGB == nullptr ||
+            left == nullptr || top == nullptr || right == nullptr || bottom == nullptr ||
+            dst_w == nullptr || dst_h == nullptr) {
+            return cudaErrorInvalidValue;
+        }
+
+        // The grid must cover the largest output; smaller crops exit early inside the kernel.
+        const int max_dst_w = *std::max_element(dst_w, dst_w + count);
+        const int max_dst_h = *std::max_element(dst_h, dst_h + count);
+
+        dim3 block(32, 16, 1);
+        dim3 grid((max_dst_w + (block.x - 1)) / block.x, (max_dst_h + (block.y - 1)) / block.y, count);
+
+        // Buffers are sized from `count`; the former fixed 1000-entry allocations
+        // overflowed as soon as a batch held more than 1000 crops.
+        const size_t int_bytes = static_cast<size_t>(count) * sizeof(int);
+        const size_t ptr_bytes = static_cast<size_t>(count) * sizeof(unsigned char*);
+
+        int* gpu_left = nullptr;
+        int* gpu_right = nullptr;
+        int* gpu_top = nullptr;
+        int* gpu_bottom = nullptr;
+        int* gpu_dst_w = nullptr;
+        int* gpu_dst_h = nullptr;
+        unsigned char** gpu_dst_rgb = nullptr;
+
+        // Allocates a device buffer and fills it from host memory, reporting the first failure.
+        auto upload = [](void** dev, const void* host, size_t bytes) -> cudaError_t {
+            cudaError_t status = cudaMalloc(dev, bytes);
+            if (status == cudaSuccess) {
+                status = cudaMemcpy(*dev, host, bytes, cudaMemcpyHostToDevice);
+            }
+            return status;
+        };
+
+        cudaError_t cudaStatus = upload((void**)&gpu_left, left, int_bytes);
+        if (cudaStatus == cudaSuccess) cudaStatus = upload((void**)&gpu_right, right, int_bytes);
+        if (cudaStatus == cudaSuccess) cudaStatus = upload((void**)&gpu_top, top, int_bytes);
+        if (cudaStatus == cudaSuccess) cudaStatus = upload((void**)&gpu_bottom, bottom, int_bytes);
+        if (cudaStatus == cudaSuccess) cudaStatus = upload((void**)&gpu_dst_w, dst_w, int_bytes);
+        if (cudaStatus == cudaSuccess) cudaStatus = upload((void**)&gpu_dst_h, dst_h, int_bytes);
+        if (cudaStatus == cudaSuccess) cudaStatus = upload((void**)&gpu_dst_rgb, d_dstRGB, ptr_bytes);
+
+        if (cudaStatus != cudaSuccess) {
+            fprintf(stderr, "PartMemResizeBatch: uploading batch parameters failed: %s\n", cudaGetErrorString(cudaStatus));
+        }
+        else {
+            PartCopy_ResizeImgBilinearBGR_Mean_Variance_CUDAKernel<<<grid, block>>>(
+                d_srcRGB, src_width, src_height,
+                gpu_left, gpu_top, gpu_right, gpu_bottom,
+                gpu_dst_rgb, count, gpu_dst_w, gpu_dst_h,
+                submeanb, submeang, submeanr,
+                varianceb, varianceg, variancer);
+
+            // Read the launch status before any other runtime call can overwrite it.
+            cudaStatus = cudaGetLastError();
+            if (cudaStatus != cudaSuccess) {
+                fprintf(stderr, "Part 270 kernel_memcopy launch failed: %s\n", cudaGetErrorString(cudaStatus));
+            }
+            else {
+                cudaStatus = cudaDeviceSynchronize();
+                if (cudaStatus != cudaSuccess) {
+                    fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_bilinear!\n", cudaStatus);
+                }
+            }
+        }
+
+        // Release on every path; cudaFree(nullptr) is a no-op for buffers never allocated.
+        cudaFree(gpu_top);
+        cudaFree(gpu_bottom);
+        cudaFree(gpu_left);
+        cudaFree(gpu_right);
+        cudaFree(gpu_dst_w);
+        cudaFree(gpu_dst_h);
+        cudaFree(gpu_dst_rgb);
+
+        return cudaStatus;
+    }
+
+}
\ No newline at end of file
diff --git a/src/nvdec/RGB2YUV.cu
b/src/nvdec/RGB2YUV.cu
new file mode 100644
index 0000000..7202c3a
--- /dev/null
+++ b/src/nvdec/RGB2YUV.cu
@@ -0,0 +1,263 @@
+
+
+#include "cuda_kernels.h"
+
+typedef unsigned char uint8;
+typedef unsigned int uint32;
+typedef int int32;
+
+namespace cuda_common
+{
+    // Clamps an 8-bit value to [min_val, max_val].
+    __device__ unsigned char clip_value(unsigned char x, unsigned char min_val, unsigned char max_val){
+        if (x>max_val){
+            return max_val;
+        }
+        else if (x<min_val){
+            return min_val;
+        }
+        else{
+            return x;
+        }
+    }
+
+    // Saturates a computed sample to [0, 255] BEFORE narrowing to 8 bits. Casting first and
+    // clamping afterwards (the previous code) cannot work: the cast has already wrapped the
+    // value, and V = 0.615R - 0.515G - 0.100B + 128 ranges from about -29 to 285.
+    // In-range values are truncated exactly as the former plain cast did.
+    static __device__ __forceinline__ unsigned char clamp_to_uchar(double v){
+        return (unsigned char)fmin(fmax(v, 0.0), 255.0);
+    }
+
+    /**
+     * Converts a packed BGR 8-bit image to full-resolution Y, U and V planes.
+     * One thread per pixel; threads outside the image return immediately.
+     *
+     * @param src_img    packed BGR source, row stride src_width * 3 bytes.
+     * @param Y          luma plane, row stride `yPitch` bytes.
+     * @param u, v       chroma planes, row stride `src_width` bytes.
+     */
+    __global__ void kernel_rgb2yuv(unsigned char *src_img, unsigned char* Y, unsigned char* u, unsigned char* v,
+        int src_width, int src_height, size_t yPitch)
+    {
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if (x >= src_width)
+            return; //x = width - 1;
+
+        if (y >= src_height)
+            return; // y = height - 1;
+
+        int B = src_img[y * src_width * 3 + x * 3];
+        int G = src_img[y * src_width * 3 + x * 3 + 1];
+        int R = src_img[y * src_width * 3 + x * 3 + 2];
+
+        /*int B = src_img[y * src_width + x];
+        int G = src_img[src_width * src_height + y * src_width + x];
+        int R = src_img[src_width * src_height * 2 + y * src_width + x];*/
+
+        Y[y * yPitch + x] = clamp_to_uchar(0.299 * R + 0.587 * G + 0.114 * B);
+        u[y * src_width + x] = clamp_to_uchar(-0.147 * R - 0.289 * G + 0.436 * B + 128);
+        v[y * src_width + x] = clamp_to_uchar(0.615 * R - 0.515 * G - 0.100 * B + 128);
+
+        //Y[y * yPitch + x] = clip_value((unsigned char)(0.257 * R + 0.504 * G + 0.098 * B + 16), 0, 255);
+        //u[y * src_width + x] = clip_value((unsigned char)(-0.148 * R - 0.291 * G + 0.439 * B + 128), 0, 255);
+        //v[y * src_width + x] = clip_value((unsigned char)(0.439 * R - 0.368 * G - 0.071 * B + 128), 0, 255);
+    }
+
+    /**
+     * Same conversion for a planar float source laid out as three consecutive
+     * src_width * src_height planes in B, G, R order.
+     * Out-of-range inputs are saturated rather than wrapped.
+     */
+    __global__ void kernel_rgb2yuv(float *src_img, unsigned char* Y, unsigned char* u, unsigned char* v,
+        int src_width, int src_height, size_t yPitch)
+    {
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if (x >= src_width)
+            return; //x = width - 1;
+
+        if (y >= src_height)
+            return; // y = height - 1;
+
+        float B = src_img[y * src_width + x];
+        float G = src_img[src_width * src_height + y * src_width + x];
+        float R = src_img[src_width * src_height * 2 + y * src_width + x];
+
+        Y[y * yPitch + x] = clamp_to_uchar(0.299 * R + 0.587 * G + 0.114 * B);
+        u[y * src_width + x] = clamp_to_uchar(-0.147 * R - 0.289 * G + 0.436 * B + 128);
+        v[y * src_width + x] = clamp_to_uchar(0.615 * R - 0.515 * G - 0.100 * B + 128);
+
+        //Y[y * yPitch + x] = clip_value((unsigned char)(0.257 * R + 0.504 * G + 0.098 * B + 16), 0, 255);
+        //u[y * src_width + x] = clip_value((unsigned char)(-0.148 * R - 0.291 * G + 0.439 * B + 128), 0, 255);
+        //v[y * src_width + x] = clip_value((unsigned char)(0.439 * R - 0.368 * G - 0.071 * B + 128), 0, 255);
+    }
+
+    // Bilinearly resizes one 8-bit plane from src_width x src_height to dst_width x dst_height.
+    // One thread per destination sample; destination rows are `nPitch` bytes apart.
+    extern "C"
+    __global__ void kernel_resize_UV(unsigned char* src_img, unsigned char *dst_img,
+        int src_width, int src_height, int dst_width, int dst_height, int nPitch)
+    {
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if (x >= dst_width)
+            return; //x = width - 1;
+
+        if (y >= dst_height)
+            return; // y = height - 1;
+
+        // Map the destination sample centre back into source coordinates.
+        float fx = (x + 0.5)*src_width / (float)dst_width - 0.5;
+        float fy = (y + 0.5)*src_height / (float)dst_height - 0.5;
+        int ax = floor(fx);
+        int ay = floor(fy);
+        // Keep the 2x2 neighbourhood (ax..ax+1, ay..ay+1) inside the source plane.
+        if (ax < 0)
+        {
+            ax = 0;
+        }
+        else if (ax > src_width - 2)
+        {
+            ax = src_width - 2;
+        }
+
+        if (ay < 0){
+            ay = 0;
+        }
+        else if (ay > src_height - 2)
+        {
+            ay = src_height - 2;
+        }
+
+        // Linear indices of the four neighbouring source samples.
+        int A = ax + ay*src_width;
+        int B = ax + ay*src_width + 1;
+        int C = ax + ay*src_width + src_width;
+        int D = ax + ay*src_width + src_width + 1;
+
+        // w1/w2 weight the right/left column, w3/w4 the bottom/top row.
+        float w1, w2, w3, w4;
+        w1 = fx - ax;
+        w2 = 1 - w1;
+        w3 = fy - ay;
+        w4 = 1 - w3;
+
+        unsigned char val =
src_img[A] * w2*w4 + src_img[B] * w1*w4 + src_img[C] * w2*w3 + src_img[D] * w1*w3; + + dst_img[y * nPitch + x] = clip_value(val,0,255); + } + + cudaError_t RGB2YUV(float* d_srcRGB, int src_width, int src_height, + unsigned char* Y, size_t yPitch, int yWidth, int yHeight, + unsigned char* U, size_t uPitch, int uWidth, int uHeight, + unsigned char* V, size_t vPitch, int vWidth, int vHeight) + { + unsigned char * u ; + unsigned char * v ; + + cudaError_t cudaStatus; + + cudaStatus = cudaMalloc((void**)&u, src_width * src_height * sizeof(unsigned char)); + cudaStatus = cudaMalloc((void**)&v, src_width * src_height * sizeof(unsigned char)); + + dim3 block(32, 16, 1); + dim3 grid((src_width + (block.x - 1)) / block.x, (src_height + (block.y - 1)) / block.y, 1); + dim3 grid1((uWidth + (block.x - 1)) / block.x, (uHeight + (block.y - 1)) / block.y, 1); + dim3 grid2((vWidth + (block.x - 1)) / block.x, (vHeight + (block.y - 1)) / block.y, 1); + + kernel_rgb2yuv << < grid, block >> >(d_srcRGB, Y, u, v, src_width, src_height, yPitch); + + cudaStatus = cudaGetLastError(); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "kernel_rgb2yuv launch failed: %s\n", cudaGetErrorString(cudaStatus)); + goto Error; + } + + cudaStatus = cudaDeviceSynchronize(); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_rgb2yuv!\n", cudaStatus); + goto Error; + } + + kernel_resize_UV << < grid1, block >> >(u, U, src_width, src_height, uWidth, uHeight, uPitch); + + cudaStatus = cudaGetLastError(); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "kernel_resize_UV launch failed: %s\n", cudaGetErrorString(cudaStatus)); + goto Error; + } + + cudaStatus = cudaDeviceSynchronize(); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_resize_UV!\n", cudaStatus); + goto Error; + } + + kernel_resize_UV << < grid2, block >> >(v, V, src_width, src_height, vWidth, 
vHeight, vPitch); + + cudaStatus = cudaGetLastError(); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "kernel_resize_UV launch failed: %s\n", cudaGetErrorString(cudaStatus)); + goto Error; + } + + cudaStatus = cudaDeviceSynchronize(); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_resize_UV!\n", cudaStatus); + goto Error; + } + +Error : + cudaFree(u); + cudaFree(v); + + return cudaStatus; + } + + + + cudaError_t RGB2YUV(unsigned char* d_srcRGB, int src_width, int src_height, + unsigned char* Y, size_t yPitch, int yWidth, int yHeight, + unsigned char* U, size_t uPitch, int uWidth, int uHeight, + unsigned char* V, size_t vPitch, int vWidth, int vHeight) + { + unsigned char * u; + unsigned char * v; + + cudaError_t cudaStatus; + + cudaStatus = cudaMalloc((void**)&u, src_width * src_height * sizeof(unsigned char)); + cudaStatus = cudaMalloc((void**)&v, src_width * src_height * sizeof(unsigned char)); + + dim3 block(32, 16, 1); + dim3 grid((src_width + (block.x - 1)) / block.x, (src_height + (block.y - 1)) / block.y, 1); + dim3 grid1((uWidth + (block.x - 1)) / block.x, (uHeight + (block.y - 1)) / block.y, 1); + dim3 grid2((vWidth + (block.x - 1)) / block.x, (vHeight + (block.y - 1)) / block.y, 1); + + kernel_rgb2yuv << < grid, block >> >(d_srcRGB, Y, u, v, src_width, src_height, yPitch); + + cudaStatus = cudaGetLastError(); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "kernel_rgb2yuv launch failed: %s\n", cudaGetErrorString(cudaStatus)); + goto Error; + } + + cudaStatus = cudaDeviceSynchronize(); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_rgb2yuv!\n", cudaStatus); + goto Error; + } + + kernel_resize_UV << < grid1, block >> >(u, U, src_width, src_height, uWidth, uHeight, uPitch); + + cudaStatus = cudaGetLastError(); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "kernel_resize_UV launch failed: 
%s\n", cudaGetErrorString(cudaStatus)); + goto Error; + } + + cudaStatus = cudaDeviceSynchronize(); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_resize_UV!\n", cudaStatus); + goto Error; + } + + kernel_resize_UV << < grid2, block >> >(v, V, src_width, src_height, vWidth, vHeight, vPitch); + + cudaStatus = cudaGetLastError(); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "kernel_resize_UV launch failed: %s\n", cudaGetErrorString(cudaStatus)); + goto Error; + } + + cudaStatus = cudaDeviceSynchronize(); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_resize_UV!\n", cudaStatus); + goto Error; + } + + Error: + cudaFree(u); + cudaFree(v); + + return cudaStatus; + } +} + diff --git a/src/nvdec/ResizeImage.cu b/src/nvdec/ResizeImage.cu new file mode 100644 index 0000000..fdc6961 --- /dev/null +++ b/src/nvdec/ResizeImage.cu @@ -0,0 +1,84 @@ +#include "cuda_kernels.h" + +typedef unsigned char uchar; +typedef unsigned int uint32; +typedef int int32; + +namespace cuda_common +{ + __global__ void kernel_bilinear(float *src_img, float *dst_img, + int src_width, int src_height, int dst_width, int dst_height) + { + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x < dst_width && y < dst_height) + { + float fx = (x + 0.5)*src_width / (float)dst_width - 0.5; + float fy = (y + 0.5)*src_height / (float)dst_height - 0.5; + int ax = floor(fx); + int ay = floor(fy); + if (ax < 0) + { + ax = 0; + } + else if (ax > src_width - 2) + { + ax = src_width - 2; + } + + if (ay < 0){ + ay = 0; + } + else if (ay > src_height - 2) + { + ay = src_height - 2; + } + + int A = ax + ay*src_width; + int B = ax + ay*src_width + 1; + int C = ax + ay*src_width + src_width; + int D = ax + ay*src_width + src_width + 1; + + float w1, w2, w3, w4; + w1 = fx - ax; + w2 = 1 - w1; + w3 = fy - 
/*
 * Bilinearly resize a planar 3-channel float device image.
 *
 * d_srcRGB holds three src_width * src_height planes, d_dstRGB receives three
 * dst_width * dst_height planes (layout as indexed by kernel_bilinear).
 *
 * Returns cudaErrorInvalidValue when a pointer is null, a destination size is
 * not positive, or a source dimension is below 2: kernel_bilinear clamps its
 * base index to src_width - 2 / src_height - 2, which is negative for a
 * 1-pixel source and would read out of bounds. Otherwise returns the launch or
 * execution status of the kernel. The call is synchronous.
 */
cudaError_t ResizeImage(float* d_srcRGB, int src_width, int src_height, float* d_dstRGB, int dst_width, int dst_height)
{
	if (d_srcRGB == NULL || d_dstRGB == NULL ||
		src_width < 2 || src_height < 2 ||
		dst_width <= 0 || dst_height <= 0) {
		fprintf(stderr, "ResizeImage: invalid argument\n");
		return cudaErrorInvalidValue;
	}

	// One thread per destination pixel; ceil-div so the grid covers the tail.
	const dim3 block(32, 16, 1);
	const dim3 grid((dst_width + (block.x - 1)) / block.x, (dst_height + (block.y - 1)) / block.y, 1);

	kernel_bilinear<<<grid, block>>>(d_srcRGB, d_dstRGB, src_width, src_height, dst_width, dst_height);

	cudaError_t cudaStatus = cudaGetLastError();
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "kernel_bilinear launch failed: %s\n", cudaGetErrorString(cudaStatus));
		return cudaStatus;
	}

	cudaStatus = cudaDeviceSynchronize();
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_bilinear!\n", (int)cudaStatus);
	}

	return cudaStatus;
}
// Color-space selector accepted by cuda_common::setColorSpace().
// NOTE(review): the names suggest the ITU-R BT.601 / BT.709 conversion
// matrices — confirm against the setColorSpace implementation.
typedef enum
{
	ITU_601 = 1,
	ITU_709 = 2
} FF_ColorSpace;
// Evaluate a CUDA runtime call once and log (without aborting) when it does
// not return cudaSuccess.
//
// Wrapped in do { } while (0) so the macro expands to a single statement and
// is safe inside un-braced if/else; invoke it with a trailing semicolon:
//     CHECK_CUDA(cudaMalloc(&p, n));
// The local uses a reserved-looking name so it cannot shadow a variable that
// appears inside `call`.
#define CHECK_CUDA(call) \
do {\
    const cudaError_t check_cuda_error_code_ = (call);\
    if (cudaSuccess != check_cuda_error_code_) {\
        LOG_ERROR("CUDA error, code: {} reason: {}", check_cuda_error_code_, cudaGetErrorString(check_cuda_error_code_));\
    }\
} while (0)
+* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, +* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE +* OR PERFORMANCE OF THIS SOURCE CODE. +* +* U.S. Government End Users. This source code is a "commercial item" as +* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of +* "commercial computer software" and "commercial computer software +* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) +* and is provided to the U.S. Government only as a commercial end item. +* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through +* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the +* source code with only those rights set forth herein. +*/ + +// This sample needs at least CUDA 5.5 and a GPU that has at least Compute Capability 2.0 + +// This sample demonstrates a simple image processing pipeline. +// First, a JPEG file is huffman decoded and inverse DCT transformed and dequantized. +// Then the different planes are resized. Finally, the resized image is quantized, forward +// DCT transformed and huffman encoded. 
// Frame header fields, serialised field by field by writeFrameHeader().
struct FrameHeader
{
	unsigned char nSamplePrecision;               // bits per sample
	unsigned short nHeight;                       // image height in pixels
	unsigned short nWidth;                        // image width in pixels
	unsigned char nComponents;                    // number of entries used in the arrays below
	unsigned char aComponentIdentifier[3];
	unsigned char aSamplingFactors[3];            // high nibble: horizontal factor, low nibble: vertical factor
	unsigned char aQuantizationTableSelector[3];  // index of the quantization table used per component
};

// Scan header fields, serialised field by field by writeScanHeader().
struct ScanHeader
{
	unsigned char nComponents;                    // number of entries used in the arrays below
	unsigned char aComponentSelector[3];
	unsigned char aHuffmanTablesSelector[3];
	unsigned char nSs;
	unsigned char nSe;
	unsigned char nA;
};

// One quantization table. writeQuantizationTable() copies the whole struct
// (sizeof bytes) straight into the output stream, so it must stay packed.
struct QuantizationTable
{
	unsigned char nPrecisionAndIdentifier;
	unsigned char aTable[64];
};

// One Huffman table. writeHuffmanTable() copies the first 17 + sum(aCodes)
// bytes of the struct into the output stream, so the member order and the
// absence of padding are part of the file format.
struct HuffmanTable
{
	unsigned char nClassAndIdentifier;
	unsigned char aCodes[16];                     // number of codes per bit length 1..16
	unsigned char aTable[256];                    // code values
};

// Guard the raw-memcpy serialisation above against accidental layout changes.
static_assert(sizeof(QuantizationTable) == 1 + 64, "QuantizationTable must be packed: it is memcpy'd into the JPEG stream");
static_assert(sizeof(HuffmanTable) == 1 + 16 + 256, "HuffmanTable must be packed: it is memcpy'd into the JPEG stream");
// Luminance (Y) quantization table.
//
// Each entry is the base table value scaled by 0.75 and truncated toward zero.
// The values are spelled out as integers: writing `0.75 * n` inside a braced
// unsigned char initialiser is a narrowing conversion, which is ill-formed
// since C++11.
unsigned char std_Y_QT[64] =
{
	 4,  3,  3,  4,  3,  3,  4,  4,	// 0.75 * { 6,  4,  5,  6,  5,  4,  6,  6}
	 3,  4,  5,  5,  4,  6,  7, 12,	// 0.75 * { 5,  6,  7,  7,  6,  8, 10, 16}
	 7,  7,  6,  6,  7, 15, 10, 11,	// 0.75 * {10, 10,  9,  9, 10, 20, 14, 15}
	 9, 12, 17, 15, 18, 18, 17, 15,	// 0.75 * {12, 16, 23, 20, 24, 24, 23, 20}
	16, 16, 19, 21, 27, 23, 19, 20,	// 0.75 * {22, 22, 26, 29, 37, 31, 26, 27}
	26, 21, 16, 16, 24, 33, 24, 26,	// 0.75 * {35, 28, 22, 22, 32, 44, 32, 35}
	28, 29, 30, 31, 30, 18, 23, 33,	// 0.75 * {38, 39, 41, 42, 41, 25, 31, 45}
	36, 33, 30, 36, 27, 30, 30, 30	// 0.75 * {48, 45, 40, 48, 37, 40, 41, 40}
};
// Chrominance (U/V) quantization table.
//
// The first two rows are the base values scaled by 0.75 and truncated toward
// zero; the remaining rows are the constant 30. The values are spelled out as
// integers: writing `0.75 * n` inside a braced unsigned char initialiser is a
// narrowing conversion, which is ill-formed since C++11.
unsigned char std_UV_QT[64] =
{
	 5,  5,  5,  7,  6,  7, 14,  7,	// 0.75 * { 7,  7,  7, 10,  8, 10, 19, 10}
	 7, 14, 30, 19, 16, 19, 30, 30,	// 0.75 * {10, 19, 40, 26, 22, 26, 40, 40}
	30, 30, 30, 30, 30, 30, 30, 30,
	30, 30, 30, 30, 30, 30, 30, 30,
	30, 30, 30, 30, 30, 30, 30, 30,
	30, 30, 30, 30, 30, 30, 30, 30,
	30, 30, 30, 30, 30, 30, 30, 30,
	30, 30, 30, 30, 30, 30, 30, 30
};
// Emit a two-byte marker (0xFF followed by nMarker) at pData and advance
// pData past it.
void writeMarker(unsigned char nMarker, unsigned char *&pData)
{
	pData[0] = 0x0ff;
	pData[1] = nMarker;
	pData += 2;
}
short>(pTemp, header.nHeight); + writeAndAdvance<unsigned short>(pTemp, header.nWidth); + writeAndAdvance<unsigned char>(pTemp, header.nComponents); + + for (int c = 0; c<header.nComponents; ++c) + { + writeAndAdvance<unsigned char>(pTemp, header.aComponentIdentifier[c]); + writeAndAdvance<unsigned char>(pTemp, header.aSamplingFactors[c]); + writeAndAdvance<unsigned char>(pTemp, header.aQuantizationTableSelector[c]); + } + + unsigned short nLength = (unsigned short)(pTemp - aTemp); + + writeMarker(0x0C0, pData); + writeAndAdvance<unsigned short>(pData, nLength + 2); + memcpy(pData, aTemp, nLength); + pData += nLength; +} + +void writeScanHeader(const ScanHeader &header, unsigned char *&pData) +{ + unsigned char aTemp[128]; + unsigned char *pTemp = aTemp; + + writeAndAdvance<unsigned char>(pTemp, header.nComponents); + + for (int c = 0; c<header.nComponents; ++c) + { + writeAndAdvance<unsigned char>(pTemp, header.aComponentSelector[c]); + writeAndAdvance<unsigned char>(pTemp, header.aHuffmanTablesSelector[c]); + } + + writeAndAdvance<unsigned char>(pTemp, header.nSs); + writeAndAdvance<unsigned char>(pTemp, header.nSe); + writeAndAdvance<unsigned char>(pTemp, header.nA); + + unsigned short nLength = (unsigned short)(pTemp - aTemp); + + writeMarker(0x0DA, pData); + writeAndAdvance<unsigned short>(pData, nLength + 2); + memcpy(pData, aTemp, nLength); + pData += nLength; +} + +void writeQuantizationTable(const QuantizationTable &table, unsigned char *&pData) +{ + writeMarker(0x0DB, pData); + writeAndAdvance<unsigned short>(pData, sizeof(QuantizationTable) + 2); + memcpy(pData, &table, sizeof(QuantizationTable)); + pData += sizeof(QuantizationTable); +} + +void writeHuffmanTable(const HuffmanTable &table, unsigned char *&pData) +{ + writeMarker(0x0C4, pData); + + // Number of Codes for Bit Lengths [1..16] + int nCodeCount = 0; + + for (int i = 0; i < 16; ++i) + { + nCodeCount += table.aCodes[i]; + } + + writeAndAdvance<unsigned short>(pData, 17 + nCodeCount + 2); + 
// One-time encoder setup that does not depend on the image size:
//   - allocates the NPP DCT state, the device quantization tables and the
//     device scan buffer (4 MiB),
//   - fills the host Huffman and quantization tables from the STD_* / std_*_QT
//     arrays and uploads the two quantization tables,
//   - fills oFrameHeader (sampling factors, table selectors) and oScanHeader.
// Returns 0. Failures are reported through NPP_CHECK_NPP / NPP_CHECK_CUDA.
int initTable()
{
	NPP_CHECK_NPP(nppiDCTInitAlloc(&pDCTState));

	nRestartInterval = -1;

	// Was an unchecked cudaMalloc; every other CUDA call here is checked.
	NPP_CHECK_CUDA(cudaMalloc(&pdQuantizationTables, 64 * 4));

	// aHuffmanTables[0..1] are the DC tables, [2..3] the AC tables.
	pHuffmanDCTables = aHuffmanTables;
	pHuffmanACTables = &aHuffmanTables[2];
	memset(aQuantizationTables, 0, 4 * sizeof(QuantizationTable));
	memset(aHuffmanTables, 0, 4 * sizeof(HuffmanTable));
	memset(&oFrameHeader, 0, sizeof(FrameHeader));

	// Huffman tables
	aHuffmanTables[0].nClassAndIdentifier = 0;
	memcpy(aHuffmanTables[0].aCodes, STD_DC_Y_NRCODES, sizeof(STD_DC_Y_NRCODES));
	memcpy(aHuffmanTables[0].aTable, STD_DC_Y_VALUES, sizeof(STD_DC_Y_VALUES));

	aHuffmanTables[1].nClassAndIdentifier = 1;
	memcpy(aHuffmanTables[1].aCodes, STD_DC_UV_NRCODES, sizeof(STD_DC_UV_NRCODES));
	memcpy(aHuffmanTables[1].aTable, STD_DC_UV_VALUES, sizeof(STD_DC_UV_VALUES));

	aHuffmanTables[2].nClassAndIdentifier = 16;
	memcpy(aHuffmanTables[2].aCodes, STD_AC_Y_NRCODES, sizeof(STD_AC_Y_NRCODES));
	memcpy(aHuffmanTables[2].aTable, STD_AC_Y_VALUES, sizeof(STD_AC_Y_VALUES));

	aHuffmanTables[3].nClassAndIdentifier = 17;
	memcpy(aHuffmanTables[3].aCodes, STD_AC_UV_NRCODES, sizeof(STD_AC_UV_NRCODES));
	memcpy(aHuffmanTables[3].aTable, STD_AC_UV_VALUES, sizeof(STD_AC_UV_VALUES));

	// Quantization tables: host copies for the file header, device copies for the DCT.
	aQuantizationTables[0].nPrecisionAndIdentifier = 0;
	memcpy(aQuantizationTables[0].aTable, std_Y_QT, sizeof(std_Y_QT));
	aQuantizationTables[1].nPrecisionAndIdentifier = 1;
	memcpy(aQuantizationTables[1].aTable, std_UV_QT, sizeof(std_UV_QT));

	NPP_CHECK_CUDA(cudaMemcpyAsync(pdQuantizationTables, aQuantizationTables[0].aTable, 64, cudaMemcpyHostToDevice));
	NPP_CHECK_CUDA(cudaMemcpyAsync(pdQuantizationTables + 64, aQuantizationTables[1].aTable, 64, cudaMemcpyHostToDevice));

	oFrameHeader.nSamplePrecision = 8;
	oFrameHeader.nComponents = 3;
	oFrameHeader.aComponentIdentifier[0] = 1;
	oFrameHeader.aComponentIdentifier[1] = 2;
	oFrameHeader.aComponentIdentifier[2] = 3;
	oFrameHeader.aSamplingFactors[0] = 34;	// 0x22
	oFrameHeader.aSamplingFactors[1] = 17;	// 0x11
	oFrameHeader.aSamplingFactors[2] = 17;	// 0x11
	oFrameHeader.aQuantizationTableSelector[0] = 0;
	oFrameHeader.aQuantizationTableSelector[1] = 1;
	oFrameHeader.aQuantizationTableSelector[2] = 1;

	// Largest vertical (low nibble) / horizontal (high nibble) sampling factor.
	for (int i = 0; i < oFrameHeader.nComponents; ++i)
	{
		nMCUBlocksV = max(nMCUBlocksV, oFrameHeader.aSamplingFactors[i] & 0x0f);
		nMCUBlocksH = max(nMCUBlocksH, oFrameHeader.aSamplingFactors[i] >> 4);
	}
	NPP_CHECK_CUDA(cudaMalloc(&pdScan, 4 << 20));

	oScanHeader.nComponents = 3;
	oScanHeader.aComponentSelector[0] = 1;
	oScanHeader.aComponentSelector[1] = 2;
	oScanHeader.aComponentSelector[2] = 3;
	oScanHeader.aHuffmanTablesSelector[0] = 0;
	oScanHeader.aHuffmanTablesSelector[1] = 17;
	oScanHeader.aHuffmanTablesSelector[2] = 17;
	oScanHeader.nSs = 0;
	oScanHeader.nSe = 63;
	oScanHeader.nA = 0;

	return 0;
}
// Encode a planar float B/G/R device image as a baseline JPEG file.
//
// Uses the fixed-size state prepared by initTable() and
// initTable(flag, width, height): plane buffers, DCT buffers, quantization
// tables, scan buffer and headers.
//
// Returns EXIT_SUCCESS when the file was written, EXIT_FAILURE when the colour
// conversion, an NPP stage or the file write fails. Failures of the CUDA calls
// wrapped in NPP_CHECK_CUDA are reported by that macro, as before.
int jpegNPP(const char *szOutputFile, float* d_srcRGB)
{
	// Stage 1: RGB -> Y/U/V planes in apSrcImage[0..2].
	const cudaError_t cudaStatus = cuda_common::RGB2YUV(d_srcRGB, oFrameHeaderFixedSize.nWidth, oFrameHeaderFixedSize.nHeight,
		apSrcImage[0], aSrcPitch[0], aSrcSize[0].width, aSrcSize[0].height,
		apSrcImage[1], aSrcPitch[1], aSrcSize[1].width, aSrcSize[1].height,
		apSrcImage[2], aSrcPitch[2], aSrcSize[2].width, aSrcSize[2].height);
	if (cudaStatus != cudaSuccess)
	{
		printf("RGB2YUV Failed: %s\n", cudaGetErrorString(cudaStatus));
		return EXIT_FAILURE;
	}

	/**
	* Stage 2: forward DCT, quantization and level shift.
	* Input is expected in 8x8 macro blocks and output is expected to be in 64x1
	* macro blocks. The new version of the primitive takes the ROI in image pixel size and
	* works with DCT coefficients that are in zig-zag order.
	* Plane 0 uses quantization table 0, planes 1 and 2 share table 1.
	*/
	for (int c = 0; c < 3; ++c)
	{
		const int k = (c == 0) ? 0 : 1;
		if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[c], aSrcImageStep[c],
			apdDCT[c], aDCTStep[c],
			pdQuantizationTables + k * 64,
			aSrcSize[c],
			pDCTState)))
		{
			printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n");
			return EXIT_FAILURE;
		}
	}

	// Stage 3: Huffman encoding.
	Npp32s nScanLength = 0;
	Npp8u *pJpegEncoderTemp = NULL;

#if (CUDA_VERSION == 8000)
	Npp32s nTempSize; //when using CUDA8
#else
	size_t nTempSize; //when using CUDA9
#endif

	if (NPP_SUCCESS != (nppiEncodeHuffmanGetSize(aSrcSize[0], 3, &nTempSize)))
	{
		printf("nppiEncodeHuffmanGetSize Failed!\n");
		return EXIT_FAILURE;
	}

	NPP_CHECK_CUDA(cudaMalloc(&pJpegEncoderTemp, nTempSize));

	int nResult = EXIT_SUCCESS;

	/**
	* Allocates memory and creates a Huffman table in a format that is suitable for the encoder.
	* Component 0 uses table pair 0, components 1 and 2 share table pair 1.
	* The pointers are cleared first so the cleanup loop frees only what was allocated.
	*/
	for (int c = 0; c < 3; ++c)
	{
		apHuffmanDCTable[c] = NULL;
		apHuffmanACTable[c] = NULL;
	}
	for (int c = 0; c < 3 && nResult == EXIT_SUCCESS; ++c)
	{
		const int t = (c == 0) ? 0 : 1;
		if (NPP_SUCCESS != nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[t].aCodes, nppiDCTable, &apHuffmanDCTable[c]) ||
			NPP_SUCCESS != nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[t].aCodes, nppiACTable, &apHuffmanACTable[c]))
		{
			printf("nppiEncodeHuffmanSpecInitAlloc_JPEG Failed!\n");
			nResult = EXIT_FAILURE;
		}
	}

	/**
	* Huffman Encoding of the JPEG Encoding.
	* Input is expected to be 64x1 macro blocks and output is expected as byte stuffed huffman encoded JPEG scan.
	*/
	if (nResult == EXIT_SUCCESS)
	{
		const Npp32s nSs = 0;
		const Npp32s nSe = 63;
		const Npp32s nH = 0;
		const Npp32s nL = 0;
		if (NPP_SUCCESS != (nppiEncodeHuffmanScan_JPEG_8u16s_P3R(apdDCT, aDCTStep,
			0, nSs, nSe, nH, nL,
			pdScan, &nScanLength,
			apHuffmanDCTable,
			apHuffmanACTable,
			aSrcSize,
			pJpegEncoderTemp)))
		{
			printf("nppiEncodeHuffmanScan_JPEG_8u16s_P3R Failed!\n");
			nResult = EXIT_FAILURE;
		}
		else if (nScanLength < 0)
		{
			printf("nppiEncodeHuffmanScan_JPEG_8u16s_P3R returned an invalid scan length!\n");
			nResult = EXIT_FAILURE;
		}
	}

	// The specs are per-call objects: release them on success and on failure.
	for (int c = 0; c < 3; ++c)
	{
		if (apHuffmanDCTable[c] != NULL)
		{
			nppiEncodeHuffmanSpecFree_JPEG(apHuffmanDCTable[c]);
			apHuffmanDCTable[c] = NULL;
		}
		if (apHuffmanACTable[c] != NULL)
		{
			nppiEncodeHuffmanSpecFree_JPEG(apHuffmanACTable[c]);
			apHuffmanACTable[c] = NULL;
		}
	}

	// Stage 4: assemble the JPEG stream on the host and write it out.
	if (nResult == EXIT_SUCCESS)
	{
		// Markers + JFIF tag + 2 quantization tables + 4 Huffman tables + frame
		// and scan headers stay well below this reserve; the scan itself can be
		// as large as the device scan buffer, so size the host buffer from it.
		const size_t kHeaderReserve = 4096;
		pDstJpeg = new unsigned char[kHeaderReserve + (size_t)nScanLength]{};
		pDstOutput = pDstJpeg;

		writeMarker(0x0D8, pDstOutput);
		writeJFIFTag(pDstOutput);
		writeQuantizationTable(aQuantizationTables[0], pDstOutput);
		writeQuantizationTable(aQuantizationTables[1], pDstOutput);
		writeHuffmanTable(pHuffmanDCTables[0], pDstOutput);
		writeHuffmanTable(pHuffmanACTables[0], pDstOutput);
		writeHuffmanTable(pHuffmanDCTables[1], pDstOutput);
		writeHuffmanTable(pHuffmanACTables[1], pDstOutput);
		writeFrameHeader(oFrameHeaderFixedSize, pDstOutput);
		writeScanHeader(oScanHeader, pDstOutput);

		try
		{
			NPP_CHECK_CUDA(cudaMemcpy(pDstOutput, pdScan, nScanLength, cudaMemcpyDeviceToHost));
		}
		catch (...)
		{
			// Keep whatever NPP_CHECK_CUDA reports, but do not leak the buffers.
			delete[] pDstJpeg;
			pDstJpeg = NULL;
			pDstOutput = NULL;
			cudaFree(pJpegEncoderTemp);
			throw;
		}

		pDstOutput += nScanLength;
		writeMarker(0x0D9, pDstOutput);
		{
			// Write result to file.
			std::ofstream outputFile(szOutputFile, std::ios::out | std::ios::binary);
			outputFile.write(reinterpret_cast<const char *>(pDstJpeg), static_cast<int>(pDstOutput - pDstJpeg));
			outputFile.flush();
			if (!outputFile)
			{
				printf("Failed to write JPEG file: %s\n", szOutputFile);
				nResult = EXIT_FAILURE;
			}
		}

		delete[] pDstJpeg;
		pDstJpeg = NULL;
		pDstOutput = NULL;
	}

	// Cleanup
	cudaFree(pJpegEncoderTemp);

	return nResult;
}
+ */ + int k = 0; + //LOG_INFO("NPP_CHECK_NPP:%d", 1); + if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[0], aSrcImageStep[0], + apdDCT[0], aDCTStep[0], + pdQuantizationTables + k * 64, + aSrcSize[0], + pDCTState))) + { + printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); + return EXIT_FAILURE; + } + + k = 1; + //LOG_INFO("NPP_CHECK_NPP:%d", 2); + if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[1], aSrcImageStep[1], + apdDCT[1], aDCTStep[1], + pdQuantizationTables + k * 64, + aSrcSize[1], + pDCTState))) + { + printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); + return EXIT_FAILURE; + } + + //LOG_INFO("NPP_CHECK_NPP:%d", 3); + if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[2], aSrcImageStep[2], + apdDCT[2], aDCTStep[2], + pdQuantizationTables + k * 64, + aSrcSize[2], + pDCTState))) + { + printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); + return EXIT_FAILURE; + } + + // Huffman Encoding + + Npp32s nScanLength; + Npp8u *pJpegEncoderTemp; + +#if (CUDA_VERSION == 8000) + Npp32s nTempSize; //when using CUDA8 +#else + size_t nTempSize; //when using CUDA9 +#endif + //modified by Junlin 190221 + + //LOG_INFO("NPP_CHECK_NPP:%d",4); + if (NPP_SUCCESS != (nppiEncodeHuffmanGetSize(aSrcSize[0], 3, &nTempSize))) + { + printf("nppiEncodeHuffmanGetSize Failed!\n"); + return EXIT_FAILURE; + } + + //LOG_INFO("NPP_CHECK_CUDA:%d",5); + NPP_CHECK_CUDA(cudaMalloc(&pJpegEncoderTemp, nTempSize)); + + /** + * Allocates memory and creates a Huffman table in a format that is suitable for the encoder. 
+ */ + NppStatus t_status; + t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[0].aCodes, nppiDCTable, &apHuffmanDCTable[0]); + t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[0].aCodes, nppiACTable, &apHuffmanACTable[0]); + t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[1]); + t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[1]); + t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[2]); + t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[2]); + + /** + * Huffman Encoding of the JPEG Encoding. + * Input is expected to be 64x1 macro blocks and output is expected as byte stuffed huffman encoded JPEG scan. + */ + Npp32s nSs = 0; + Npp32s nSe = 63; + Npp32s nH = 0; + Npp32s nL = 0; + //LOG_INFO("NPP_CHECK_NPP:%d",6); + if (NPP_SUCCESS != (nppiEncodeHuffmanScan_JPEG_8u16s_P3R(apdDCT, aDCTStep, + 0, nSs, nSe, nH, nL, + pdScan, &nScanLength, + apHuffmanDCTable, + apHuffmanACTable, + aSrcSize, + pJpegEncoderTemp))) + { + printf("nppiEncodeHuffmanScan_JPEG_8u16s_P3R Failed!\n"); + return EXIT_FAILURE; + } + + for (int i = 0; i < 3; ++i) + { + nppiEncodeHuffmanSpecFree_JPEG(apHuffmanDCTable[i]); + nppiEncodeHuffmanSpecFree_JPEG(apHuffmanACTable[i]); + } + // Write JPEG + pDstJpeg = new unsigned char[4 << 20]{}; + pDstOutput = pDstJpeg; + + writeMarker(0x0D8, pDstOutput); + writeJFIFTag(pDstOutput); + writeQuantizationTable(aQuantizationTables[0], pDstOutput); + writeQuantizationTable(aQuantizationTables[1], pDstOutput); + writeHuffmanTable(pHuffmanDCTables[0], pDstOutput); + writeHuffmanTable(pHuffmanACTables[0], pDstOutput); + writeHuffmanTable(pHuffmanDCTables[1], pDstOutput); + writeHuffmanTable(pHuffmanACTables[1], pDstOutput); + writeFrameHeader(oFrameHeaderFixedSize, pDstOutput); + writeScanHeader(oScanHeader, 
pDstOutput); + + //LOG_INFO("NPP_CHECK_CUDA:%d",7); + NPP_CHECK_CUDA(cudaMemcpy(pDstOutput, pdScan, nScanLength, cudaMemcpyDeviceToHost)); + + pDstOutput += nScanLength; + writeMarker(0x0D9, pDstOutput); + { + // Write result to file. + std::ofstream outputFile(szOutputFile, ios::out | ios::binary); + outputFile.write(reinterpret_cast<const char *>(pDstJpeg), static_cast<int>(pDstOutput - pDstJpeg)); + } + + // Cleanup + cudaFree(pJpegEncoderTemp); + delete[] pDstJpeg; + + + return EXIT_SUCCESS; +} + + +int jpegNPP(const char *szOutputFile, float* d_srcRGB, int img_width, int img_height) +{ + NppiSize aSrcSize[3]; + Npp16s *apdDCT[3] = { 0, 0, 0 }; + Npp32s aDCTStep[3]; + + Npp8u *apSrcImage[3] = { 0, 0, 0 }; + Npp32s aSrcImageStep[3]; + size_t aSrcPitch[3]; + + + //????帧头 + oFrameHeader.nWidth = img_width; + oFrameHeader.nHeight = img_height; + + for (int i = 0; i < oFrameHeader.nComponents; ++i) + { + NppiSize oBlocks; + NppiSize oBlocksPerMCU = { oFrameHeader.aSamplingFactors[i] >> 4, oFrameHeader.aSamplingFactors[i] & 0x0f }; + + oBlocks.width = (int)ceil((oFrameHeader.nWidth + 7) / 8 * + static_cast<float>(oBlocksPerMCU.width) / nMCUBlocksH); + oBlocks.width = DivUp(oBlocks.width, oBlocksPerMCU.width) * oBlocksPerMCU.width; + + oBlocks.height = (int)ceil((oFrameHeader.nHeight + 7) / 8 * + static_cast<float>(oBlocksPerMCU.height) / nMCUBlocksV); + oBlocks.height = DivUp(oBlocks.height, oBlocksPerMCU.height) * oBlocksPerMCU.height; + + aSrcSize[i].width = oBlocks.width * 8; + aSrcSize[i].height = oBlocks.height * 8; + + // Allocate Memory + size_t nPitch; + //LOG_INFO("NPP_CHECK_CUDA:%d",1); + NPP_CHECK_CUDA(cudaMallocPitch(&apdDCT[i], &nPitch, oBlocks.width * 64 * sizeof(Npp16s), oBlocks.height)); + aDCTStep[i] = static_cast<Npp32s>(nPitch); + + //LOG_INFO("NPP_CHECK_CUDA:%d",2); + NPP_CHECK_CUDA(cudaMallocPitch(&apSrcImage[i], &nPitch, aSrcSize[i].width, aSrcSize[i].height)); + + aSrcPitch[i] = nPitch; + aSrcImageStep[i] = static_cast<Npp32s>(nPitch); + } + + 
//RGB2YUV + cudaError_t cudaStatus; + cudaStatus = cuda_common::RGB2YUV(d_srcRGB, img_width, img_height, + apSrcImage[0], aSrcPitch[0], aSrcSize[0].width, aSrcSize[0].height, + apSrcImage[1], aSrcPitch[1], aSrcSize[1].width, aSrcSize[1].height, + apSrcImage[2], aSrcPitch[2], aSrcSize[2].width, aSrcSize[2].height); + + /** + * Forward DCT, quantization and level shift part of the JPEG encoding. + * Input is expected in 8x8 macro blocks and output is expected to be in 64x1 + * macro blocks. The new version of the primitive takes the ROI in image pixel size and + * works with DCT coefficients that are in zig-zag order. + */ + int k = 0; + //LOG_INFO("NPP_CHECK_CUDA:%d",3); + if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[0], aSrcImageStep[0], + apdDCT[0], aDCTStep[0], + pdQuantizationTables + k * 64, + aSrcSize[0], + pDCTState))) + { + printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); + return EXIT_FAILURE; + } + k = 1; + + //LOG_INFO("NPP_CHECK_CUDA:%d",4); + if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[1], aSrcImageStep[1], + apdDCT[1], aDCTStep[1], + pdQuantizationTables + k * 64, + aSrcSize[1], + pDCTState))) + { + printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); + return EXIT_FAILURE; + } + + //LOG_INFO("NPP_CHECK_CUDA:%d",5); + if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[2], aSrcImageStep[2], + apdDCT[2], aDCTStep[2], + pdQuantizationTables + k * 64, + aSrcSize[2], + pDCTState))) + { + printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); + return EXIT_FAILURE; + } + + // Huffman Encoding + + Npp32s nScanLength; + Npp8u *pJpegEncoderTemp; + +#if (CUDA_VERSION == 8000) + Npp32s nTempSize; //when using CUDA8 +#else + size_t nTempSize; //when using CUDA9 +#endif + //modified by Junlin 190221 + + //LOG_INFO("NPP_CHECK_CUDA:%d",6); + if (NPP_SUCCESS != (nppiEncodeHuffmanGetSize(aSrcSize[0], 3, &nTempSize))) + { + printf("nppiEncodeHuffmanGetSize Failed!\n"); + 
return EXIT_FAILURE; + } + + //LOG_INFO("NPP_CHECK_CUDA:%d",7); + NPP_CHECK_CUDA(cudaMalloc(&pJpegEncoderTemp, nTempSize)); + + /** + * Allocates memory and creates a Huffman table in a format that is suitable for the encoder. + */ + NppStatus t_status; + t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[0].aCodes, nppiDCTable, &apHuffmanDCTable[0]); + t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[0].aCodes, nppiACTable, &apHuffmanACTable[0]); + t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[1]); + t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[1]); + t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[2]); + t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[2]); + + /** + * Huffman Encoding of the JPEG Encoding. + * Input is expected to be 64x1 macro blocks and output is expected as byte stuffed huffman encoded JPEG scan. 
+ */ + Npp32s nSs = 0; + Npp32s nSe = 63; + Npp32s nH = 0; + Npp32s nL = 0; + //LOG_INFO("NPP_CHECK_CUDA:%d",8); + if (NPP_SUCCESS != (nppiEncodeHuffmanScan_JPEG_8u16s_P3R(apdDCT, aDCTStep, + 0, nSs, nSe, nH, nL, + pdScan, &nScanLength, + apHuffmanDCTable, + apHuffmanACTable, + aSrcSize, + pJpegEncoderTemp))) + { + printf("nppiEncodeHuffmanScan_JPEG_8u16s_P3R Failed!\n"); + return EXIT_FAILURE; + } + + for (int i = 0; i < 3; ++i) + { + nppiEncodeHuffmanSpecFree_JPEG(apHuffmanDCTable[i]); + nppiEncodeHuffmanSpecFree_JPEG(apHuffmanACTable[i]); + } + // Write JPEG + pDstJpeg = new unsigned char[4 << 20]{}; + pDstOutput = pDstJpeg; + + writeMarker(0x0D8, pDstOutput); + writeJFIFTag(pDstOutput); + writeQuantizationTable(aQuantizationTables[0], pDstOutput); + writeQuantizationTable(aQuantizationTables[1], pDstOutput); + writeHuffmanTable(pHuffmanDCTables[0], pDstOutput); + writeHuffmanTable(pHuffmanACTables[0], pDstOutput); + writeHuffmanTable(pHuffmanDCTables[1], pDstOutput); + writeHuffmanTable(pHuffmanACTables[1], pDstOutput); + writeFrameHeader(oFrameHeader, pDstOutput); + writeScanHeader(oScanHeader, pDstOutput); + + //LOG_INFO("NPP_CHECK_CUDA:%d",9); + NPP_CHECK_CUDA(cudaMemcpy(pDstOutput, pdScan, nScanLength, cudaMemcpyDeviceToHost)); + + pDstOutput += nScanLength; + writeMarker(0x0D9, pDstOutput); + + { + // Write result to file. 
+ std::ofstream outputFile(szOutputFile, ios::out | ios::binary); + outputFile.write(reinterpret_cast<const char *>(pDstJpeg), static_cast<int>(pDstOutput - pDstJpeg)); + } + + // Cleanup + cudaFree(pJpegEncoderTemp); + delete[] pDstJpeg; + for (int i = 0; i < 3; ++i) + { + cudaFree(apdDCT[i]); + cudaFree(apSrcImage[i]); + } + + return EXIT_SUCCESS; +} + + +int jpegNPP(const char *szOutputFile, unsigned char* d_srcRGB, int img_width, int img_height) +{ + NppiSize aSrcSize[3]; + Npp16s *apdDCT[3] = { 0, 0, 0 }; + Npp32s aDCTStep[3]; + + Npp8u *apSrcImage[3] = { 0, 0, 0 }; + Npp32s aSrcImageStep[3]; + size_t aSrcPitch[3]; + + + //????帧头 + oFrameHeader.nWidth = img_width; + oFrameHeader.nHeight = img_height; + + for (int i = 0; i < oFrameHeader.nComponents; ++i) + { + NppiSize oBlocks; + NppiSize oBlocksPerMCU = { oFrameHeader.aSamplingFactors[i] >> 4, oFrameHeader.aSamplingFactors[i] & 0x0f }; + + oBlocks.width = (int)ceil((oFrameHeader.nWidth + 7) / 8 * + static_cast<float>(oBlocksPerMCU.width) / nMCUBlocksH); + oBlocks.width = DivUp(oBlocks.width, oBlocksPerMCU.width) * oBlocksPerMCU.width; + + oBlocks.height = (int)ceil((oFrameHeader.nHeight + 7) / 8 * + static_cast<float>(oBlocksPerMCU.height) / nMCUBlocksV); + oBlocks.height = DivUp(oBlocks.height, oBlocksPerMCU.height) * oBlocksPerMCU.height; + + aSrcSize[i].width = oBlocks.width * 8; + aSrcSize[i].height = oBlocks.height * 8; + + // Allocate Memory + size_t nPitch; + //LOG_INFO("NPP_CHECK_CUDA:%d",1); + NPP_CHECK_CUDA(cudaMallocPitch(&apdDCT[i], &nPitch, oBlocks.width * 64 * sizeof(Npp16s), oBlocks.height)); + aDCTStep[i] = static_cast<Npp32s>(nPitch); + + //LOG_INFO("NPP_CHECK_CUDA:%d",2); + NPP_CHECK_CUDA(cudaMallocPitch(&apSrcImage[i], &nPitch, aSrcSize[i].width, aSrcSize[i].height)); + + aSrcPitch[i] = nPitch; + aSrcImageStep[i] = static_cast<Npp32s>(nPitch); + } + + //RGB2YUV + cudaError_t cudaStatus; + cudaStatus = cuda_common::RGB2YUV(d_srcRGB, img_width, img_height, + apSrcImage[0], aSrcPitch[0], 
aSrcSize[0].width, aSrcSize[0].height, + apSrcImage[1], aSrcPitch[1], aSrcSize[1].width, aSrcSize[1].height, + apSrcImage[2], aSrcPitch[2], aSrcSize[2].width, aSrcSize[2].height); + + /** + * Forward DCT, quantization and level shift part of the JPEG encoding. + * Input is expected in 8x8 macro blocks and output is expected to be in 64x1 + * macro blocks. The new version of the primitive takes the ROI in image pixel size and + * works with DCT coefficients that are in zig-zag order. + */ + int k = 0; + //LOG_INFO("NPP_CHECK_CUDA:%d",3); + if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[0], aSrcImageStep[0], + apdDCT[0], aDCTStep[0], + pdQuantizationTables + k * 64, + aSrcSize[0], + pDCTState))) + { + printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); + return EXIT_FAILURE; + } + k = 1; + + //LOG_INFO("NPP_CHECK_CUDA:%d",4); + if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[1], aSrcImageStep[1], + apdDCT[1], aDCTStep[1], + pdQuantizationTables + k * 64, + aSrcSize[1], + pDCTState))) + { + printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); + return EXIT_FAILURE; + } + + //LOG_INFO("NPP_CHECK_CUDA:%d",5); + if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[2], aSrcImageStep[2], + apdDCT[2], aDCTStep[2], + pdQuantizationTables + k * 64, + aSrcSize[2], + pDCTState))) + { + printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); + return EXIT_FAILURE; + } + + // Huffman Encoding + + Npp32s nScanLength; + Npp8u *pJpegEncoderTemp; + +#if (CUDA_VERSION == 8000) + Npp32s nTempSize; //when using CUDA8 +#else + size_t nTempSize; //when using CUDA9 +#endif + //modified by Junlin 190221 + + //LOG_INFO("NPP_CHECK_CUDA:%d",6); + if (NPP_SUCCESS != (nppiEncodeHuffmanGetSize(aSrcSize[0], 3, &nTempSize))) + { + printf("nppiEncodeHuffmanGetSize Failed!\n"); + return EXIT_FAILURE; + } + + //LOG_INFO("NPP_CHECK_CUDA:%d",7); + NPP_CHECK_CUDA(cudaMalloc(&pJpegEncoderTemp, nTempSize)); + + /** + * 
Allocates memory and creates a Huffman table in a format that is suitable for the encoder. + */ + NppStatus t_status; + t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[0].aCodes, nppiDCTable, &apHuffmanDCTable[0]); + t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[0].aCodes, nppiACTable, &apHuffmanACTable[0]); + t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[1]); + t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[1]); + t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[2]); + t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[2]); + + /** + * Huffman Encoding of the JPEG Encoding. + * Input is expected to be 64x1 macro blocks and output is expected as byte stuffed huffman encoded JPEG scan. + */ + Npp32s nSs = 0; + Npp32s nSe = 63; + Npp32s nH = 0; + Npp32s nL = 0; + //LOG_INFO("NPP_CHECK_CUDA:%d",8); + if (NPP_SUCCESS != (nppiEncodeHuffmanScan_JPEG_8u16s_P3R(apdDCT, aDCTStep, + 0, nSs, nSe, nH, nL, + pdScan, &nScanLength, + apHuffmanDCTable, + apHuffmanACTable, + aSrcSize, + pJpegEncoderTemp))) + { + printf("nppiEncodeHuffmanScan_JPEG_8u16s_P3R Failed!\n"); + return EXIT_FAILURE; + } + + for (int i = 0; i < 3; ++i) + { + nppiEncodeHuffmanSpecFree_JPEG(apHuffmanDCTable[i]); + nppiEncodeHuffmanSpecFree_JPEG(apHuffmanACTable[i]); + } + // Write JPEG + pDstJpeg = new unsigned char[4 << 20]{}; + pDstOutput = pDstJpeg; + + writeMarker(0x0D8, pDstOutput); + writeJFIFTag(pDstOutput); + writeQuantizationTable(aQuantizationTables[0], pDstOutput); + writeQuantizationTable(aQuantizationTables[1], pDstOutput); + writeHuffmanTable(pHuffmanDCTables[0], pDstOutput); + writeHuffmanTable(pHuffmanACTables[0], pDstOutput); + writeHuffmanTable(pHuffmanDCTables[1], pDstOutput); + writeHuffmanTable(pHuffmanACTables[1], pDstOutput); 
+ writeFrameHeader(oFrameHeader, pDstOutput); + writeScanHeader(oScanHeader, pDstOutput); + + //LOG_INFO("NPP_CHECK_CUDA:%d",9); + NPP_CHECK_CUDA(cudaMemcpy(pDstOutput, pdScan, nScanLength, cudaMemcpyDeviceToHost)); + + pDstOutput += nScanLength; + writeMarker(0x0D9, pDstOutput); + + { + // Write result to file. + std::ofstream outputFile(szOutputFile, ios::out | ios::binary); + outputFile.write(reinterpret_cast<const char *>(pDstJpeg), static_cast<int>(pDstOutput - pDstJpeg)); + } + + // Cleanup + cudaFree(pJpegEncoderTemp); + delete[] pDstJpeg; + for (int i = 0; i < 3; ++i) + { + cudaFree(apdDCT[i]); + cudaFree(apSrcImage[i]); + } + + return EXIT_SUCCESS; +} diff --git a/src/nvdecoder/DrawImageOnGPU.cu b/src/nvdecoder/DrawImageOnGPU.cu deleted file mode 100644 index 1fa99dc..0000000 --- a/src/nvdecoder/DrawImageOnGPU.cu +++ /dev/null @@ -1,126 +0,0 @@ -#include "cuda_kernels.h" - -#include "../interface/logger.hpp" - -typedef unsigned char uchar; -typedef unsigned int uint32; -typedef int int32; - -namespace cuda_common -{ - __global__ void kernel_drawPixel(float* d_srcRGB, int src_width, int src_height, - int left, int top, int right, int bottom) - { - const int x = blockIdx.x * blockDim.x + threadIdx.x; - const int y = blockIdx.y * blockDim.y + threadIdx.y; - - if (((x == left || x == right) && y >= top && y <= bottom) || ((y == top || y == bottom) && x >= left && x <= right)) - { - d_srcRGB[(y*src_width) + x] = 0; - d_srcRGB[(src_width*src_height) + (y*src_width) + x] = 255; - d_srcRGB[(2 * src_width*src_height) + (y*src_width) + x] = 0; - } - } - - cudaError_t DrawImage(float* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom) - { - dim3 block(32, 16, 1); - dim3 grid((src_width + (block.x - 1)) / block.x, (src_height + (block.y - 1)) / block.y, 1); - - kernel_drawPixel << < grid, block >> >(d_srcRGB, src_width, src_height, left, top, right, bottom); - - cudaError_t cudaStatus = cudaGetLastError(); - if (cudaStatus != 
cudaSuccess) { - LOG_ERROR("Draw 32 kernel_memcopy launch failed:{}",cudaGetErrorString(cudaStatus)); - return cudaStatus; - } - - cudaStatus = cudaDeviceSynchronize(); - if (cudaStatus != cudaSuccess) { - LOG_ERROR("cudaDeviceSynchronize returned error code {} after launching kernel_bilinear!", cudaStatus); - return cudaStatus; - } - - return cudaStatus; - } - - __global__ void kernel_drawPixel(unsigned char* d_srcRGB, int src_width, int src_height, - int left, int top, int right, int bottom) - { - const int x = blockIdx.x * blockDim.x + threadIdx.x; - const int y = blockIdx.y * blockDim.y + threadIdx.y; - - if (((x == left || x == right) && y >= top && y <= bottom) || ((y == top || y == bottom) && x >= left && x <= right)) - { - d_srcRGB[(y*src_width) + x] = 0; - d_srcRGB[(src_width*src_height) + (y*src_width) + x] = 255; - d_srcRGB[(2 * src_width*src_height) + (y*src_width) + x] = 0; - } - } - - cudaError_t DrawImage(unsigned char* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom) - { - dim3 block(32, 16, 1); - dim3 grid((src_width + (block.x - 1)) / block.x, (src_height + (block.y - 1)) / block.y, 1); - - kernel_drawPixel << < grid, block >> >(d_srcRGB, src_width, src_height, left, top, right, bottom); - - cudaError_t cudaStatus = cudaGetLastError(); - if (cudaStatus != cudaSuccess) { - LOG_ERROR("Draw 68 kernel_memcopy launch failed: {}",cudaGetErrorString(cudaStatus)); - return cudaStatus; - } - - cudaStatus = cudaDeviceSynchronize(); - if (cudaStatus != cudaSuccess) { - LOG_ERROR("cudaDeviceSynchronize returned error code {} after launching kernel_bilinear!", cudaStatus); - return cudaStatus; - } - - return cudaStatus; - } - - __global__ void kernel_drawLine(float* d_srcRGB, int src_width, int src_height, - int begin_x, int begin_y, int end_x, int end_y) - { - int min_x = end_x < begin_x ? end_x : begin_x; - int max_x = end_x < begin_x ? begin_x : end_x; - - int min_y = end_y < begin_y ? 
end_y : begin_y; - int max_y = end_y < begin_y ? begin_y : end_y; - - const int x = blockIdx.x * blockDim.x + threadIdx.x; - const int y = blockIdx.y * blockDim.y + threadIdx.y; - - if ((x - begin_x) * (end_y - begin_y) == (end_x - begin_x) * (y - begin_y) - && min_x <= x && x <= max_x - && min_y <= y && y <= max_y) - { - d_srcRGB[(y*src_width) + x] = 0; - d_srcRGB[(src_width*src_height) + (y*src_width) + x] = 255; - d_srcRGB[(2 * src_width*src_height) + (y*src_width) + x] = 0; - } - } - - cudaError_t DrawLine(float* d_srcRGB, int src_width, int src_height, int begin_x, int begin_y, int end_x, int end_y) - { - dim3 block(32, 16, 1); - dim3 grid((src_width + (block.x - 1)) / block.x, (src_height + (block.y - 1)) / block.y, 1); - - kernel_drawLine << < grid, block >> >(d_srcRGB, src_width, src_height, begin_x, begin_y, end_x, end_y); - - cudaError_t cudaStatus = cudaGetLastError(); - if (cudaStatus != cudaSuccess) { - LOG_ERROR("Draw 112 kernel_memcopy launch failed: {}",cudaGetErrorString(cudaStatus)); - return cudaStatus; - } - - cudaStatus = cudaDeviceSynchronize(); - if (cudaStatus != cudaSuccess) { - LOG_ERROR("cudaDeviceSynchronize returned error code {} after launching kernel_bilinear!", cudaStatus); - return cudaStatus; - } - - return cudaStatus; - } -} \ No newline at end of file diff --git a/src/nvdecoder/FFCuContextManager.cpp b/src/nvdecoder/FFCuContextManager.cpp deleted file mode 100644 index 382c4d8..0000000 --- a/src/nvdecoder/FFCuContextManager.cpp +++ /dev/null @@ -1,41 +0,0 @@ -#include "FFCuContextManager.h" - -#include "common_header.h" - -using namespace std; - -extern "C" -{ - #include <libavcodec/avcodec.h> - #include <libavdevice/avdevice.h> - #include <libavformat/avformat.h> - #include <libavfilter/avfilter.h> - #include <libavutil/avutil.h> - #include <libavutil/pixdesc.h> - #include <libswscale/swscale.h> - #include <libavutil/imgutils.h> -} - -FFCuContextManager::~FFCuContextManager() -{ - for(auto iter = ctxMap.begin(); iter != 
ctxMap.end(); iter++){ - av_buffer_unref(&iter->second); - } - ctxMap.clear(); -} - -AVBufferRef *FFCuContextManager::getCuCtx(string gpuid) -{ - AVBufferRef *hw_device_ctx = ctxMap[gpuid]; - if (nullptr == hw_device_ctx) - { - // 初始化硬件解码器 - if (av_hwdevice_ctx_create(&hw_device_ctx, AV_HWDEVICE_TYPE_CUDA, gpuid.c_str(), nullptr, 0) < 0) - { - LOG_ERROR("Failed to create specified HW device."); - return nullptr; - } - ctxMap[gpuid] = hw_device_ctx; - } - return hw_device_ctx; -} \ No newline at end of file diff --git a/src/nvdecoder/FFCuContextManager.h b/src/nvdecoder/FFCuContextManager.h deleted file mode 100644 index 758167c..0000000 --- a/src/nvdecoder/FFCuContextManager.h +++ /dev/null @@ -1,28 +0,0 @@ - -#include<map> -#include<string> - -using namespace std; - -struct AVBufferRef; - -class FFCuContextManager{ -public: - static FFCuContextManager* getInstance(){ - static FFCuContextManager* singleton = nullptr; - if (singleton == nullptr){ - singleton = new FFCuContextManager(); - } - return singleton; - } - - AVBufferRef *getCuCtx(string gpuid); - -private: - FFCuContextManager(){} - ~FFCuContextManager(); - -private: - map<string,AVBufferRef *> ctxMap; - -}; \ No newline at end of file diff --git a/src/nvdecoder/FFNvDecoder.cpp b/src/nvdecoder/FFNvDecoder.cpp deleted file mode 100644 index e64e2a5..0000000 --- a/src/nvdecoder/FFNvDecoder.cpp +++ /dev/null @@ -1,513 +0,0 @@ -#include "FFNvDecoder.h" - -#include <chrono> -#include <thread> -#include <fstream> - -#include <chrono> - -#include "FFCuContextManager.h" - -#include "common_header.h" - -#include "GpuRgbMemory.hpp" -#include "cuda_kernels.h" - -using namespace std; - -// 参考博客: https://blog.csdn.net/qq_40116098/article/details/120704340 - -static AVPixelFormat get_hw_format(AVCodecContext *avctx, const AVPixelFormat *pix_fmts) -{ - FFNvDecoder* _this = (FFNvDecoder*)avctx->opaque; - - const AVPixelFormat *p; - - for (p = pix_fmts; *p != -1; p++) { - if (*p == _this->getHwPixFmt()) - return *p; - } - - 
LOG_ERROR("Failed to get HW surface format"); - return AV_PIX_FMT_NONE; -} - -FFNvDecoder::FFNvDecoder() -{ - // 初始化解码对象 - fmt_ctx = nullptr; - avctx = nullptr; - m_bRunning = false; - - stream = nullptr; - stream_index = -1; - hw_pix_fmt = AV_PIX_FMT_NONE; - m_dec_name = ""; - - m_bPause = false; - m_bReal = true; - - m_decode_thread = 0; - m_post_decode_thread = 0; - - m_bFinished = false; - m_dec_keyframe = false; - m_fps = 0.0; -} - -FFNvDecoder::~FFNvDecoder() -{ - m_dec_keyframe = false; -} - -bool FFNvDecoder::init(FFDecConfig& cfg) -{ - m_cfg = cfg; - m_dec_name = cfg.dec_name; - - fstream infile(cfg.uri); - if (infile.is_open()){ - m_bReal = false; - infile.close(); - }else { - m_bReal = true; - } - - post_decoded_cbk = cfg.post_decoded_cbk; - decode_finished_cbk = cfg.decode_finished_cbk; - - return init(cfg.uri.c_str(), cfg.gpuid.c_str(),cfg.force_tcp); -} - -bool FFNvDecoder::init(const char* uri, const char* gpuid, bool force_tcp) -{ - // av_log_set_level(AV_LOG_DEBUG); - - avformat_network_init(); - - // 打开输入视频文件 - AVDictionary *options = nullptr; - av_dict_set( &options, "bufsize", "655360", 0 ); - av_dict_set( &options, "rtsp_transport", force_tcp ? 
"tcp" : "udp", 0 ); - // av_dict_set( &options, "listen_timeout", "30", 0 ); // 单位为s - av_dict_set( &options, "stimeout", "30000000", 0 ); // 单位为 百万分之一秒 - - fmt_ctx = avformat_alloc_context(); - const char* input_file = uri; - if (avformat_open_input(&fmt_ctx, input_file, nullptr, &options) != 0) { - LOG_ERROR("Cannot open input file:{}",input_file); - return false; - } - - // 查找流信息 - if (avformat_find_stream_info(fmt_ctx, nullptr) < 0) { - LOG_ERROR("Cannot find input stream information"); - return false; - } - - // 查找视频流信息 - AVCodec *decoder = nullptr; - stream_index = av_find_best_stream(fmt_ctx, AVMEDIA_TYPE_VIDEO, -1, -1, &decoder, 0); - if (stream_index < 0) { - LOG_ERROR("Cannot find a video stream in the input file"); - return false; - } - - string cuvid_dec_name = string(decoder->name) + "_cuvid"; - AVCodec *vcodec = avcodec_find_decoder_by_name(cuvid_dec_name.c_str()); - if (!(avctx = avcodec_alloc_context3(vcodec))) - return (bool)AVERROR(ENOMEM); - - // 得到视频流对象 - stream = fmt_ctx->streams[stream_index]; - if (avcodec_parameters_to_context(avctx, stream->codecpar) < 0) - return false; - - m_fps = av_q2d(stream ->avg_frame_rate); - - avctx->opaque = this; - // 设置解码器管理器的像素格式回调函数 - avctx->get_format = get_hw_format; - - hw_pix_fmt = AV_PIX_FMT_CUDA; - - FFCuContextManager* pCtxMgr = FFCuContextManager::getInstance(); - - AVBufferRef *hw_device_ctx = pCtxMgr->getCuCtx(gpuid); - if(nullptr == hw_device_ctx){ - av_log(nullptr, AV_LOG_ERROR, "create CUDA context failed ! 
\n"); - return false; - } - avctx->hw_device_ctx = av_buffer_ref(hw_device_ctx); - if (nullptr == avctx->hw_device_ctx) - { - return false; - } - - // 打开解码器流 - AVDictionary *op = nullptr; - av_dict_set( &op, "gpu", gpuid, 0 ); - // av_dict_set( &op, "surfaces", "5", 0 ); - if (avcodec_open2(avctx, vcodec, &op) < 0) { - LOG_ERROR("Failed to open codec for stream"); - return false; - } - - return true; -} - -bool FFNvDecoder::isSurport(FFDecConfig& cfg) -{ - bool bRet = init(cfg); - decode_finished(); - return bRet; -} - -bool FFNvDecoder::start(){ - - m_bRunning = true; - - pthread_create(&m_decode_thread,0, - [](void* arg) - { - FFNvDecoder* a=(FFNvDecoder*)arg; - a->decode_thread(); - return (void*)0; - } - ,this); - - return true; -} - -void FFNvDecoder::decode_thread() -{ - AVPacket* pkt ; - pkt = av_packet_alloc(); - av_init_packet( pkt ); - - pthread_create(&m_post_decode_thread,0, - [](void* arg) - { - FFNvDecoder* a=(FFNvDecoder*)arg; - a->post_decode_thread(); - return (void*)0; - } - ,this); - - // long start_time = UtilTools::get_cur_time_ms(); - - while (m_bRunning) - { - if (!m_bReal) - { - if (m_bPause) - { - std::this_thread::sleep_for(std::chrono::milliseconds(3)); - continue; - } - } - - int result = av_read_frame(fmt_ctx, pkt); - if (result == AVERROR_EOF || result < 0) - { - LOG_ERROR("Failed to read frame!"); - break; - } - - if (m_dec_keyframe && !(pkt->flags & AV_PKT_FLAG_KEY)) { - av_packet_unref(pkt); - continue; - } - - if (stream_index == pkt->stream_index){ - result = avcodec_send_packet(avctx, pkt); - if (result < 0){ - av_packet_unref(pkt); - LOG_ERROR("{} - Failed to send pkt: {}", m_dec_name, result); - continue; - } - - AVFrame* gpuFrame = av_frame_alloc(); - result = avcodec_receive_frame(avctx, gpuFrame); - if ((result == AVERROR(EAGAIN) || result == AVERROR_EOF) || result < 0){ - LOG_ERROR("{} - Failed to receive frame: {}", m_dec_name, result); - av_frame_free(&gpuFrame); - av_packet_unref(pkt); - continue; - } - 
av_packet_unref(pkt); - - if (m_bReal){ - if (m_bPause){ - av_frame_free(&gpuFrame); - std::this_thread::sleep_for(std::chrono::milliseconds(3)); - continue; - } - } - - if(gpuFrame != nullptr){ - m_queue_mutex.lock(); - if(mFrameQueue.size() <= 10){ - mFrameQueue.push(gpuFrame); - }else{ - av_frame_free(&gpuFrame); - } - m_queue_mutex.unlock(); - } - } - av_packet_unref(pkt); - } - - m_bRunning = false; - av_packet_free(&pkt); - - // long end_time = UtilTools::get_cur_time_ms(); - // cout << "解码用时:" << end_time - start_time << endl; - - if (m_post_decode_thread != 0) - { - pthread_join(m_post_decode_thread,0); - } - - decode_finished_cbk(m_finishedDecArg); - - decode_finished(); - - // 清空队列 - while(mFrameQueue.size() > 0){ - AVFrame * gpuFrame = mFrameQueue.front(); - av_frame_free(&gpuFrame); - mFrameQueue.pop(); - } - - LOG_INFO("{} - decode thread exited.", m_dec_name); -} - -void FFNvDecoder::decode_finished(){ - if (avctx) - { - avcodec_free_context(&avctx); - } - - if (fmt_ctx) - { - avformat_close_input(&fmt_ctx); - } - - m_bFinished = true; - m_dec_keyframe = false; -} - -void FFNvDecoder::post_decode_thread(){ - int skip_frame = m_cfg.skip_frame; - if (skip_frame <= 0){ - skip_frame = 1; - } - - int index = 0; - while (m_bRunning) - { - if(mFrameQueue.size() > 0){ - std::lock_guard<std::mutex> l(m_snapshot_mutex); - // 取队头数据 - m_queue_mutex.lock(); - AVFrame * gpuFrame = mFrameQueue.front(); - mFrameQueue.pop(); - m_queue_mutex.unlock(); - // 跳帧 - if (skip_frame == 1 || index % skip_frame == 0){ - post_decoded_cbk(m_postDecArg, convert2bgr(gpuFrame)); - index = 0; - } - - av_frame_free(&gpuFrame); - - index++; - } - } - - LOG_INFO("post decode thread exited."); -} - -void FFNvDecoder::close(){ - m_bRunning=false; - if(m_decode_thread != 0){ - pthread_join(m_decode_thread,0); - } - m_dec_keyframe = false; -} - -AVPixelFormat FFNvDecoder::getHwPixFmt(){ - return hw_pix_fmt; -} - -bool FFNvDecoder::isRunning(){ - return m_bRunning; -} - -bool 
FFNvDecoder::isFinished(){ - return m_bFinished; -} - -bool FFNvDecoder::isPausing(){ - return m_bPause; -} - -bool FFNvDecoder::getResolution( int &width, int &height ){ - if (avctx != nullptr) - { - width = avctx->width; - height = avctx->height; - return true; - } - - return false; -} - -void FFNvDecoder::pause(){ - m_bPause = true; -} - -void FFNvDecoder::resume(){ - m_bPause = false; -} - -void FFNvDecoder::setDecKeyframe(bool bKeyframe) -{ - m_dec_keyframe = bKeyframe; -} - -int FFNvDecoder::getCachedQueueLength(){ - m_queue_mutex.lock(); - int queue_size = mFrameQueue.size(); - m_queue_mutex.lock(); - return queue_size; -} - -float FFNvDecoder::fps(){ - return m_fps; -} - -void FFNvDecoder::setPostDecArg(const void* postDecArg){ - m_postDecArg = postDecArg; -} - -void FFNvDecoder::setFinishedDecArg(const void* finishedDecArg){ - m_finishedDecArg = finishedDecArg; -} - -DeviceRgbMemory* FFNvDecoder::convert2bgr(AVFrame * gpuFrame){ - if (gpuFrame != nullptr && gpuFrame->format == AV_PIX_FMT_CUDA ){ - LOG_DEBUG("decode task: gpuid: {} width: {} height: {}", m_cfg.gpuid, gpuFrame->width, gpuFrame->height); - GpuRgbMemory* gpuMem = new GpuRgbMemory(3, gpuFrame->width, gpuFrame->height, getName(), m_cfg.gpuid, false, true); - - do{ - if (gpuMem->getMem() == nullptr){ - LOG_ERROR("new GpuRgbMemory failed !!!"); - break; - } - - cudaSetDevice(atoi(m_cfg.gpuid.c_str())); - cuda_common::setColorSpace( ITU_709, 0 ); - cudaError_t cudaStatus = cuda_common::CUDAToBGR((CUdeviceptr)gpuFrame->data[0],(CUdeviceptr)gpuFrame->data[1], gpuFrame->linesize[0], gpuFrame->linesize[1], gpuMem->getMem(), gpuFrame->width, gpuFrame->height); - cudaDeviceSynchronize(); - if (cudaStatus != cudaSuccess) { - LOG_ERROR("CUDAToBGR failed failed !!!"); - break; - } - - return gpuMem; - }while(0); - - delete gpuMem; - gpuMem = nullptr; - } - - return nullptr; -} - -FFImgInfo* FFNvDecoder::snapshot(){ - - // 锁住停止队列消耗 - std::lock_guard<std::mutex> l(m_snapshot_mutex); - - AVFrame * gpuFrame = 
nullptr; - - bool bFirst = true; - while(true){ - m_queue_mutex.lock(); - if(mFrameQueue.size() <= 0){ - m_queue_mutex.unlock(); - if(bFirst){ - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - bFirst = false; - continue; - }else{ - // 再进来说明前面已经等了 100 ms - // 100 ms都没有等到解码数据,则退出 - return nullptr; - } - } - - // 队列中数据大于1 - gpuFrame = mFrameQueue.front(); - m_queue_mutex.unlock(); - break; - } - - if (gpuFrame != nullptr && gpuFrame->format == AV_PIX_FMT_CUDA ){ - LOG_DEBUG("decode task: gpuid: {} width: {} height: {}", m_cfg.gpuid, gpuFrame->width, gpuFrame->height); - GpuRgbMemory* gpuMem = new GpuRgbMemory(3, gpuFrame->width, gpuFrame->height, getName(), m_cfg.gpuid, false, true); - - if (gpuMem->getMem() == nullptr){ - LOG_ERROR("new GpuRgbMemory failed !!!"); - return nullptr; - } - - cudaSetDevice(atoi(m_cfg.gpuid.c_str())); - cuda_common::setColorSpace( ITU_709, 0 ); - cudaError_t cudaStatus = cuda_common::CUDAToBGR((CUdeviceptr)gpuFrame->data[0],(CUdeviceptr)gpuFrame->data[1], gpuFrame->linesize[0], gpuFrame->linesize[1], gpuMem->getMem(), gpuFrame->width, gpuFrame->height); - cudaDeviceSynchronize(); - if (cudaStatus != cudaSuccess) { - LOG_ERROR("CUDAToBGR failed failed !!!"); - return nullptr; - } - - unsigned char * pHwRgb = gpuMem->getMem(); - int channel = gpuMem->getChannel(); - int width = gpuMem->getWidth(); - int height = gpuMem->getHeight(); - - if (pHwRgb != nullptr && channel > 0 && width > 0 && height > 0){ - int nSize = channel * height * width; - - LOG_INFO("channel:{} height:{} width:{}", channel, height, width); - // unsigned char* cpu_data = new unsigned char[nSize]; - - unsigned char* cpu_data = (unsigned char *)av_malloc(nSize * sizeof(unsigned char)); - - cudaMemcpy(cpu_data, pHwRgb, nSize * sizeof(unsigned char), cudaMemcpyDeviceToHost); - cudaDeviceSynchronize(); - - delete gpuMem; - gpuMem = nullptr; - - FFImgInfo* imgInfo = new FFImgInfo(); - imgInfo->dec_name = m_dec_name; - imgInfo->pData = cpu_data; - imgInfo->height 
= height; - imgInfo->width = width; - imgInfo->timestamp = UtilTools::get_cur_time_ms(); - imgInfo->index = m_index; - - m_index++; - - return imgInfo; - } - - delete gpuMem; - gpuMem = nullptr; - } - - return nullptr; -} \ No newline at end of file diff --git a/src/nvdecoder/FFNvDecoder.h b/src/nvdecoder/FFNvDecoder.h deleted file mode 100644 index 4784ab6..0000000 --- a/src/nvdecoder/FFNvDecoder.h +++ /dev/null @@ -1,107 +0,0 @@ -#include<string> -#include <pthread.h> - -#include <mutex> - -extern "C" -{ - #include <libavcodec/avcodec.h> - #include <libavdevice/avdevice.h> - #include <libavformat/avformat.h> - #include <libavfilter/avfilter.h> - #include <libavutil/avutil.h> - #include <libavutil/pixdesc.h> - #include <libswscale/swscale.h> - #include <libavutil/imgutils.h> -} - -#include "common_header.h" - -#include "../interface/AbstractDecoder.h" - -using namespace std; - -class FFNvDecoder : public AbstractDecoder { -public: - FFNvDecoder(); - ~FFNvDecoder(); - bool init(FFDecConfig& cfg); - void close(); - bool start(); - void pause(); - void resume(); - - void setDecKeyframe(bool bKeyframe); - - bool isRunning(); - bool isFinished(); - bool isPausing(); - bool getResolution( int &width, int &height ); - - bool isSurport(FFDecConfig& cfg); - - int getCachedQueueLength(); - - float fps(); - - DECODER_TYPE getDecoderType(){ return DECODER_TYPE_FFMPEG; } - - FFImgInfo* snapshot(); - - void setName(string nm){ - m_dec_name = nm; - } - - string getName(){ - return m_dec_name; - } - - void setPostDecArg(const void* postDecArg); - void setFinishedDecArg(const void* finishedDecArg); - -public: - AVPixelFormat getHwPixFmt(); - -private: - void decode_thread(); - void post_decode_thread(); - bool init(const char* uri, const char* gpuid, bool force_tcp); - void decode_finished(); - - DeviceRgbMemory* convert2bgr(AVFrame * gpuFrame); - -private: - string m_dec_name; - FFDecConfig m_cfg; - - AVStream* stream; - AVCodecContext *avctx; - int stream_index; - 
AVFormatContext *fmt_ctx; - AVPixelFormat hw_pix_fmt; - - pthread_t m_decode_thread; - pthread_t m_post_decode_thread; - - bool m_bRunning; - bool m_bFinished; - - bool m_bPause; - - bool m_bReal; // 是否实时流 - - float m_fps; - - queue<AVFrame*> mFrameQueue; - mutex m_queue_mutex; - mutex m_snapshot_mutex; - long m_index{0}; - - bool m_dec_keyframe; - - const void * m_postDecArg; - POST_DECODE_CALLBACK post_decoded_cbk; // 解码数据回调接口 - - const void * m_finishedDecArg; - DECODE_FINISHED_CALLBACK decode_finished_cbk; -}; \ No newline at end of file diff --git a/src/nvdecoder/GpuRgbMemory.hpp b/src/nvdecoder/GpuRgbMemory.hpp deleted file mode 100644 index 35eac65..0000000 --- a/src/nvdecoder/GpuRgbMemory.hpp +++ /dev/null @@ -1,34 +0,0 @@ -#include<string> - -#include "../interface/DeviceRgbMemory.hpp" -#include "cuda_kernels.h" -#include "define.hpp" -#include "common_header.h" - -using namespace std; - -class GpuRgbMemory : public DeviceRgbMemory{ - -public: - GpuRgbMemory(int _channel, int _width, int _height, string _id, string _gpuid, bool _key_frame, bool _isused) - :DeviceRgbMemory(_channel, _width, _height, _id, _gpuid, _key_frame, _isused){ - gpuid = _gpuid; - cudaSetDevice(atoi(gpuid.c_str())); - CHECK_CUDA(cudaMalloc((void **)&pHwRgb, data_size * sizeof(unsigned char))); - } - - ~GpuRgbMemory(){ - if (pHwRgb) { - cudaSetDevice(atoi(gpuid.c_str())); - CHECK_CUDA(cudaFree(pHwRgb)); - pHwRgb = nullptr; - } - } - - string getGpuId() { - return gpuid; - } - -private: - string gpuid; -}; \ No newline at end of file diff --git a/src/nvdecoder/ImageSaveGPU.cpp b/src/nvdecoder/ImageSaveGPU.cpp deleted file mode 100644 index dde9b64..0000000 --- a/src/nvdecoder/ImageSaveGPU.cpp +++ /dev/null @@ -1,123 +0,0 @@ -#include "cuda_kernels.h" - -#include "common_header.h" - - -//int saveJPEG(const char *szOutputFile, float* d_srcRGB, int img_width, int img_height) -//{ -// return jpegNPP(szOutputFile, d_srcRGB, img_width, img_height); -// //return 0; -//} -// -//int 
saveJPEG(const char *szOutputFile, unsigned char* d_srcRGB, int img_width, int img_height) -//{ -// return jpegNPP(szOutputFile, d_srcRGB, img_width, img_height); -// //return 0; -//} -// -//int saveJPEG(const char *szOutputFile, unsigned char* d_srcRGB) -//{ -// return jpegNPP(szOutputFile, d_srcRGB); -//} -// -//int saveJPEG(const char *szOutputFile, float* d_srcRGB) -//{ -// return jpegNPP(szOutputFile, d_srcRGB); -//} - -int resizeFrame(float* d_srcRGB, int src_width, int src_height, float* d_dstRGB, int dst_width, int dst_height) -{ - cudaError_t cudaStatus = cuda_common::ResizeImage(d_srcRGB, src_width, src_height, d_dstRGB, dst_width, dst_height); - if (cudaStatus != cudaSuccess) { - LOG_ERROR("cuda_common::ResizeImage failed: {}",cudaGetErrorString(cudaStatus)); - return -1; - } - - return 0; -} - -//int initTables() -//{ -// initTable(); -// return 0; -//} -// -//int initTables(int flag, int width, int height) -//{ -// initTable(0, width, height); -// return 0; -//} - -int drawImageOnGPU(float* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom) -{ - cuda_common::DrawImage(d_srcRGB, src_width, src_height, left, top, right, bottom); - return 0; -} - -int drawImageOnGPU(unsigned char* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom) -{ - cuda_common::DrawImage(d_srcRGB, src_width, src_height, left, top, right, bottom); - return 0; -} - -int drawLineOnGPU(float* d_srcRGB, int src_width, int src_height, int begin_x, int begin_y, int end_x, int end_y) -{ - cuda_common::DrawLine(d_srcRGB, src_width, src_height, begin_x, begin_y, end_x, end_y); - return 0; -} - -//int releaseJpegSaver() -//{ -// releaseJpegNPP(); -// return 0; -//} - -int partMemCopy(unsigned char* d_srcRGB, int src_width, int src_height, unsigned char* d_dstRGB, int left, int top, int right, int bottom) -{ - cudaError_t cudaStatus = cuda_common::PartMemCopy(d_srcRGB, src_width, src_height, d_dstRGB, left, top, right, bottom); - if 
(cudaStatus != cudaSuccess) { - LOG_ERROR("cuda_common::77 PartMemCopy failed: {} {} {} {} {} {} {}",cudaGetErrorString(cudaStatus), left, top, right, bottom, src_height, d_dstRGB); - return -1; - } - - return 0; -} -//#include <fstream> -//extern std::ofstream g_os; -int PartMemResizeBatch(unsigned char * d_srcRGB, int src_width, int src_height, unsigned char** d_dstRGB, - int count, int* vleft, int * vtop, int* vright, int* vbottom, int *dst_w, int *dst_h, - float submeanb, float submeang, float submeanr, - float varianceb, float varianceg, float variancer) -{ - //g_os << "cudaMemcpyHostToDevice begin 9" << std::endl; - cudaError_t cudaStatus = cuda_common::PartMemResizeBatch( - d_srcRGB, src_width, src_height, d_dstRGB, count, vleft, vtop, vright, vbottom, dst_w, dst_h, - submeanb, submeang, submeanr, - varianceb, varianceg, variancer); - //g_os << "cudaMemcpyHostToDevice end 9" << std::endl; - if (cudaStatus != cudaSuccess) { - LOG_ERROR("cuda_common::PartMemResizeBatch failed: {}",cudaGetErrorString(cudaStatus)); - return -1; - } - - return 0; -} - - -//int PartMemResizeBatch(float * d_srcRGB, int src_width, int src_height, unsigned char* d_dstRGB, -// int count, int* vleft, int * vtop, int* vright, int* vbottom, int dst_w, int dst_h, -// float submeanb, float submeang, float submeanr, -// float varianceb, float varianceg, float variancer) -// -//{ -// cudaError_t cudaStatus = cuda_common::PartMemResizeBatch( -// d_srcRGB, src_width, src_height, d_dstRGB, count, vleft, vtop, vright, vbottom, dst_w, dst_h, -// submeanb, submeang, submeanr, -// varianceb, varianceg, variancer); -// if (cudaStatus != cudaSuccess) { -// fprintf(stderr, "cuda_common::PartMemCopy failed: %s\n", cudaGetErrorString(cudaStatus)); -// return -1; -// } -// -// return 0; -//} \ No newline at end of file diff --git a/src/nvdecoder/ImageSaveGPU.h b/src/nvdecoder/ImageSaveGPU.h deleted file mode 100644 index 272a6d2..0000000 --- a/src/nvdecoder/ImageSaveGPU.h +++ /dev/null @@ -1,65 +0,0 @@ 
-/******************************************************************************************* -* Version: VPT_x64_V2.0.0_20170904 -* CopyRight: 中科院自动化研究所模式识别实验室图像视频组 -* UpdateDate: 20170904 -* Content: 人车物监测跟踪 -********************************************************************************************/ - -#ifndef IMAGESAVEGPU_H_ -#define IMAGESAVEGPU_H_ - -#ifdef _MSC_VER - #ifdef IMAGESAVEGPU_EXPORTS - #define IMAGESAVEGPU_API __declspec(dllexport) - #else - #define IMAGESAVEGPU_API __declspec(dllimport) - #endif -#else -#define IMAGESAVEGPU_API __attribute__((visibility ("default"))) -#endif -// 功能:保存成jpeg文件 -// szOutputFile 输出图片路径,如D:\\out.jpg -// d_srcRGB 输入RGB数据,由cudaMalloc分配的显存空间,数据排列形式为:BBBBBB......GGGGGG......RRRRRRRR...... -// img_width RGB数据图片的宽度 -// img_height RGB数据图片的高度 -// -//IMAGESAVEGPU_API int saveJPEG(const char *szOutputFile, float* d_srcRGB, int img_width, int img_height); -//IMAGESAVEGPU_API int saveJPEG(const char *szOutputFile, float* d_srcRGB); -// -//IMAGESAVEGPU_API int saveJPEG(const char *szOutputFile, unsigned char* d_srcRGB, int img_width, int img_height); -//IMAGESAVEGPU_API int saveJPEG(const char *szOutputFile, unsigned char* d_srcRGB); - -// 功能:防缩图像 -IMAGESAVEGPU_API int resizeFrame(float* d_srcRGB, int src_width, int src_height, float* d_dstRGB, int dst_width, int dst_height); - -// 功能:部分拷贝数据 -IMAGESAVEGPU_API int partMemCopy(unsigned char* d_srcRGB, int src_width, int src_height, unsigned char* d_dstRGB, int left, int top, int right, int bottom); - -//IMAGESAVEGPU_API int partMemResizeImage(float * d_srcRGB, int src_width, int src_height, unsigned char** d_dstRGB, -// int* vleft, int * vtop, int* vright, int* vbottom, int *dst_w, int *dst_h, -// float submeanb, float submeang, float submeanr, -// float varianceb, float varianceg, float variancer); - - -IMAGESAVEGPU_API int PartMemResizeBatch(unsigned char * d_srcRGB, int src_width, int src_height, unsigned char** d_dstRGB, - int count, int* vleft, int * vtop, int* vright, int* 
vbottom, int *dst_w, int *dst_h, - float submeanb, float submeang, float submeanr, - float varianceb, float varianceg, float variancer); - - -//// 功能:初始化GPU保存图像的各种量化表 -//IMAGESAVEGPU_API int initTables(); -//IMAGESAVEGPU_API int initTables(int falg, int width, int height); -// -//// 功能:释放资源 -//IMAGESAVEGPU_API int releaseJpegSaver(); - -// 功能:在GPU中绘制快照包围框 -IMAGESAVEGPU_API int drawImageOnGPU(float* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom); - -IMAGESAVEGPU_API int drawImageOnGPU(unsigned char* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom); - -// 功能:在GPU中绘制直线 -IMAGESAVEGPU_API int drawLineOnGPU(float* d_srcRGB, int src_width, int src_height, int begin_x, int begin_y, int end_x, int end_y); - -#endif diff --git a/src/nvdecoder/Makefile b/src/nvdecoder/Makefile deleted file mode 100644 index 8b6ceff..0000000 --- a/src/nvdecoder/Makefile +++ /dev/null @@ -1,102 +0,0 @@ -# 各项目录 -LIB_DIR:=$(BUILD_DIR)/$(MODULE)/lib -DEP_DIR:=$(BUILD_DIR)/$(MODULE)/.dep -OBJ_DIR:=$(BUILD_DIR)/$(MODULE)/obj -SRC_DIR:=$(TOP_DIR)/$(MODULE) - -# 源文件以及中间目标文件和依赖文件 -SRCS:=$(notdir $(wildcard $(SRC_DIR)/*.cpp)) -OBJS:=$(addprefix $(OBJ_DIR)/, $(patsubst %.cpp, %.o, $(SRCS))) -DEPS:=$(addprefix $(DEP_DIR)/, $(patsubst %.cpp, %.d,a $(SRCS))) - -CUDA_ROOT = /usr/local/cuda-11.1 -NVCC = $(CUDA_ROOT)/bin/nvcc - -# 自动生成头文件依赖选项 -DEPFLAGS=-MT $@ -MMD -MP -MF $(DEP_DIR)/$*.d - -DEFS = -DENABLE_DVPP_INTERFACE - -# 最终目标文件 -TARGET:=$(LIB_DIR)/$(MODULE).a - - -PROJECT_ROOT= /mnt/data/cmhu/FFNvDecoder - -DEPEND_DIR = $(PROJECT_ROOT)/bin -THIRDPARTY_ROOT = $(PROJECT_ROOT)/3rdparty -SPDLOG_ROOT = $(THIRDPARTY_ROOT)/spdlog-1.9.2/release -JRTP_ROOT = $(THIRDPARTY_ROOT)/jrtp_export - - -INCLUDE= -I $(DEPEND_DIR)/include \ - -I $(CUDA_ROOT)/include \ - -I $(TOP_DIR)/common/inc \ - -I $(TOP_DIR)/common/UtilNPP \ - -I $(TOP_DIR)/ \ - -I $(SPDLOG_ROOT)/include \ - -I $(JRTP_ROOT)/jrtplib/include/jrtplib3 \ - -I 
$(JRTP_ROOT)/jthread/include/jthread - -LIBSPATH= -L $(DEPEND_DIR)/lib -lavformat -lavcodec -lswscale -lavutil -lavfilter -lswresample -lavdevice \ - -L $(CUDA_ROOT)/lib64 -lcuda -lcudart -lnvcuvid -lcurand -lcublas -lnvjpeg \ - -L $(SPDLOG_ROOT) -l:libspdlog.a \ - -L $(JRTP_ROOT)/jthread/lib -l:libjthread.a \ - -L $(JRTP_ROOT)/jrtplib/lib -l:libjrtp.a - - -CXXFLAGS= -g -O0 -fPIC $(INCLUDE) $(DEFS) -lpthread -lrt -lz -fexceptions -std=c++11 -fvisibility=hidden -Wl,-Bsymbolic -ldl -Wwrite-strings -# CFLAGS= -g -fPIC -O0 $(INCLUDE) -pthread -lrt -lz -std=c++11 -fvisibility=hidden -Wl,-Bsymbolic -ldl - # -DUNICODE -D_UNICODE - -NFLAGS_LIB=-g -c -shared -Xcompiler -fPIC -Xcompiler -fvisibility=hidden -NFLAGS = $(NFLAGS_LIB) $(INCLUDE) -std=c++11 - -# CU_SOURCES = $(wildcard ${SRC_DIR}/*.cu) -# CU_OBJS = $(patsubst %.cu, %.o, $(notdir $(CU_SOURCES))) - -CU_SOURCES:=$(notdir $(wildcard $(SRC_DIR)/*.cu)) -CU_OBJS:=$(addprefix $(OBJ_DIR)/, $(patsubst %.cu, %.o, $(CU_SOURCES))) - - -# 默认最终目标 -.PHONY:all -all:$(TARGET) - -# 生成最终目标 -$(TARGET):$(OBJS) $(CU_OBJS) | $(LIB_DIR) - @echo -e "\e[32m""Linking static library $(TARGET)""\e[0m" -# @ar -rc $@ $^ - -# 若没有lib目录则自动生成 -$(LIB_DIR): - @mkdir -p $@ - -# 生成中间目标文件 -$(OBJ_DIR)/%.o:$(SRC_DIR)/%.cpp $(DEP_DIR)/%.d | $(OBJ_DIR) $(DEP_DIR) -# @echo -e "\e[33m""Building object $@""\e[0m" - @$(CXX) -c $(DEPFLAGS) $(CXXFLAGS) $(INCS) $(LIBSPATH) $(MACROS) -o $@ $< - -$(OBJ_DIR)%.o:$(SRC_DIR)/%.cu - @echo "#######################CU_OBJS:$@###############" - $(NVCC) $(NFLAGS) -o $@ $< - - -# 若没有obj目录则自动生成 -$(OBJ_DIR): - @mkdir -p $@ - -# 若没有.dep目录则自动生成 -$(DEP_DIR): - @mkdir -p $@ - -# 依赖文件会在生成中间文件的时候自动生成,这里只是为了防止报错 -$(DEPS): - -# 引入中间目标文件头文件依赖关系 -include $(wildcard $(DEPS)) - -# 直接删除组件build目录 -.PHONY:clean -clean: - @rm -rf $(BUILD_DIR)/$(MODULE) diff --git a/src/nvdecoder/NV12ToRGB.cu b/src/nvdecoder/NV12ToRGB.cu deleted file mode 100644 index 68e54ac..0000000 --- a/src/nvdecoder/NV12ToRGB.cu +++ /dev/null @@ -1,345 +0,0 @@ - -#include 
"cuda_kernels.h" - -#include <builtin_types.h> -#include "helper_cuda_drvapi.h" - -typedef unsigned char uint8; -typedef unsigned int uint32; -typedef int int32; - -#define COLOR_COMPONENT_MASK 0x3FF -#define COLOR_COMPONENT_BIT_SIZE 10 - -namespace cuda_common -{ - -#define MUL(x,y) ((x)*(y)) - - __constant__ float constHueColorSpaceMat2[9]; //默认分配到0卡上,未找到分配到指定卡上设置方法,当前也未用到,先注释掉 - - __device__ void YUV2RGB2(uint32 *yuvi, float *red, float *green, float *blue) - { - float luma, chromaCb, chromaCr; - - // Prepare for hue adjustment - luma = (float)yuvi[0]; - chromaCb = (float)((int32)yuvi[1] - 512.0f); - chromaCr = (float)((int32)yuvi[2] - 512.0f); - - - // Convert YUV To RGB with hue adjustment - *red = MUL(luma, constHueColorSpaceMat2[0]) + - MUL(chromaCb, constHueColorSpaceMat2[1]) + - MUL(chromaCr, constHueColorSpaceMat2[2]); - *green = MUL(luma, constHueColorSpaceMat2[3]) + - MUL(chromaCb, constHueColorSpaceMat2[4]) + - MUL(chromaCr, constHueColorSpaceMat2[5]); - *blue = MUL(luma, constHueColorSpaceMat2[6]) + - MUL(chromaCb, constHueColorSpaceMat2[7]) + - MUL(chromaCr, constHueColorSpaceMat2[8]); - - } - - __device__ unsigned char clip_v(int x, int min_val, int max_val) { - if (x>max_val) { - return max_val; - } - else if (x<min_val) { - return min_val; - } - else { - return x; - } - } - // CUDA kernel for outputing the final RGB output from NV12; - extern "C" - __global__ void NV12ToRGB_drvapi2(uint32 *srcImage, size_t nSourcePitch, unsigned char *dstImage, int width, int height) - { - - int32 x, y; - uint32 yuv101010Pel[2]; - uint32 processingPitch = ((width)+63) & ~63; - uint8 *srcImageU8 = (uint8 *)srcImage; - - processingPitch = nSourcePitch; - - // Pad borders with duplicate pixels, and we multiply by 2 because we process 2 pixels per thread - x = blockIdx.x * (blockDim.x << 1) + (threadIdx.x << 1); - y = blockIdx.y * blockDim.y + threadIdx.y; - - if (x >= width) - { - //printf("x >= width\n"); - //*flag = -1; - return; //x = width - 1; - } - //return; 
//x = width - 1; - - if (y >= height) - { - //printf("y >= height\n"); - //*flag = -1; - return; // y = height - 1; - } - - // Read 2 Luma components at a time, so we don't waste processing since CbCr are decimated this way. - // if we move to texture we could read 4 luminance values - yuv101010Pel[0] = (srcImageU8[y * processingPitch + x]) << 2; - yuv101010Pel[1] = (srcImageU8[y * processingPitch + x + 1]) << 2; - - uint32 chromaOffset = processingPitch * height; - int32 y_chroma = y >> 1; - - if (y & 1) // odd scanline ? - { - uint32 chromaCb; - uint32 chromaCr; - - chromaCb = srcImageU8[chromaOffset + y_chroma * processingPitch + x]; - chromaCr = srcImageU8[chromaOffset + y_chroma * processingPitch + x + 1]; - - if (y_chroma < ((height >> 1) - 1)) // interpolate chroma vertically - { - chromaCb = (chromaCb + srcImageU8[chromaOffset + (y_chroma + 1) * processingPitch + x] + 1) >> 1; - chromaCr = (chromaCr + srcImageU8[chromaOffset + (y_chroma + 1) * processingPitch + x + 1] + 1) >> 1; - } - - yuv101010Pel[0] |= (chromaCb << (COLOR_COMPONENT_BIT_SIZE + 2)); - yuv101010Pel[0] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2)); - - yuv101010Pel[1] |= (chromaCb << (COLOR_COMPONENT_BIT_SIZE + 2)); - yuv101010Pel[1] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2)); - } - else - { - yuv101010Pel[0] |= ((uint32)srcImageU8[chromaOffset + y_chroma * processingPitch + x] << (COLOR_COMPONENT_BIT_SIZE + 2)); - yuv101010Pel[0] |= ((uint32)srcImageU8[chromaOffset + y_chroma * processingPitch + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2)); - - yuv101010Pel[1] |= ((uint32)srcImageU8[chromaOffset + y_chroma * processingPitch + x] << (COLOR_COMPONENT_BIT_SIZE + 2)); - yuv101010Pel[1] |= ((uint32)srcImageU8[chromaOffset + y_chroma * processingPitch + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2)); - } - - // this steps performs the color conversion - uint32 yuvi[6]; - float red[2], green[2], blue[2]; - - yuvi[0] = (yuv101010Pel[0] & COLOR_COMPONENT_MASK); - yuvi[1] = 
((yuv101010Pel[0] >> COLOR_COMPONENT_BIT_SIZE) & COLOR_COMPONENT_MASK); - yuvi[2] = ((yuv101010Pel[0] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK); - - yuvi[3] = (yuv101010Pel[1] & COLOR_COMPONENT_MASK); - yuvi[4] = ((yuv101010Pel[1] >> COLOR_COMPONENT_BIT_SIZE) & COLOR_COMPONENT_MASK); - yuvi[5] = ((yuv101010Pel[1] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK); - - // YUV to RGB Transformation conversion - YUV2RGB2(&yuvi[0], &red[0], &green[0], &blue[0]); - YUV2RGB2(&yuvi[3], &red[1], &green[1], &blue[1]); - - - dstImage[y * width * 3 + x * 3] = clip_v(blue[0] * 0.25,0 ,255); - dstImage[y * width * 3 + x * 3 + 3] = clip_v(blue[1] * 0.25,0, 255); - - dstImage[width * y * 3 + x * 3 + 1] = clip_v(green[0] * 0.25,0 ,255); - dstImage[width * y * 3 + x * 3 + 4] = clip_v(green[1] * 0.25,0, 255); - - dstImage[width * y * 3 + x * 3 + 2] = clip_v(red[0] * 0.25, 0, 255); - dstImage[width * y * 3 + x * 3 + 5] = clip_v(red[1] * 0.25,0 ,255); - - - //dstImage[y * width * 3 + x * 3] = blue[0] * 0.25; - //dstImage[y * width * 3 + x * 3 + 3] = blue[1] * 0.25; - - //dstImage[width * y * 3 + x * 3 + 1] =green[0] * 0.25; - //dstImage[width * y * 3 + x * 3 + 4] = green[1] * 0.25; - - //dstImage[width * y * 3 + x * 3 + 2] = red[0] * 0.25; - //dstImage[width * y * 3 + x * 3 + 5] = red[1] * 0.25; - - // Clamp the results to BBBBBB....GGGGGGG.......RRRRRRR.... 
- // dstImage[y * width + x] = blue[0] * 0.25; - // dstImage[y * width + x + 1] = blue[1] * 0.25; - - // dstImage[width * height + y * width + x] = green[0] * 0.25; - // dstImage[width * height + y * width + x + 1] = green[1] * 0.25; - - // dstImage[width * height * 2 + y * width + x] = red[0] * 0.25; - // dstImage[width * height * 2 + y * width + x + 1] = red[1] * 0.25; - return; - - } - - // CUDA kernel for outputing the final RGB output from NV12; - extern "C" - __global__ void CUDAToBGR_drvapi(uint32 *dataY, uint32 *dataUV, size_t pitchY, size_t pitchUV, unsigned char *dstImage, int width, int height) - { - - int32 x, y; - - // Pad borders with duplicate pixels, and we multiply by 2 because we process 2 pixels per thread - x = blockIdx.x * (blockDim.x << 1) + (threadIdx.x << 1); - y = blockIdx.y * blockDim.y + threadIdx.y; - - if (x >= width) - { - return; - } - - if (y >= height) - { - return; - } - - uint32 yuv101010Pel[2]; - uint8 *srcImageU8_Y = (uint8 *)dataY; - uint8 *srcImageU8_UV = (uint8 *)dataUV; - - // Read 2 Luma components at a time, so we don't waste processing since CbCr are decimated this way. - // if we move to texture we could read 4 luminance values - yuv101010Pel[0] = (srcImageU8_Y[y * pitchY + x]) << 2; - yuv101010Pel[1] = (srcImageU8_Y[y * pitchY + x + 1]) << 2; - - int32 y_chroma = y >> 1; - - if (y & 1) // odd scanline ? 
- { - uint32 chromaCb; - uint32 chromaCr; - - chromaCb = srcImageU8_UV[y_chroma * pitchUV + x]; - chromaCr = srcImageU8_UV[y_chroma * pitchUV + x + 1]; - - if (y_chroma < ((height >> 1) - 1)) // interpolate chroma vertically - { - chromaCb = (chromaCb + srcImageU8_UV[(y_chroma + 1) * pitchUV + x] + 1) >> 1; - chromaCr = (chromaCr + srcImageU8_UV[(y_chroma + 1) * pitchUV + x + 1] + 1) >> 1; - } - - yuv101010Pel[0] |= (chromaCb << (COLOR_COMPONENT_BIT_SIZE + 2)); - yuv101010Pel[0] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2)); - - yuv101010Pel[1] |= (chromaCb << (COLOR_COMPONENT_BIT_SIZE + 2)); - yuv101010Pel[1] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2)); - } - else - { - yuv101010Pel[0] |= ((uint32)srcImageU8_UV[y_chroma * pitchUV + x] << (COLOR_COMPONENT_BIT_SIZE + 2)); - yuv101010Pel[0] |= ((uint32)srcImageU8_UV[y_chroma * pitchUV + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2)); - - yuv101010Pel[1] |= ((uint32)srcImageU8_UV[y_chroma * pitchUV + x] << (COLOR_COMPONENT_BIT_SIZE + 2)); - yuv101010Pel[1] |= ((uint32)srcImageU8_UV[y_chroma * pitchUV + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2)); - } - - // this steps performs the color conversion - uint32 yuvi[6]; - float red[2], green[2], blue[2]; - - yuvi[0] = (yuv101010Pel[0] & COLOR_COMPONENT_MASK); - yuvi[1] = ((yuv101010Pel[0] >> COLOR_COMPONENT_BIT_SIZE) & COLOR_COMPONENT_MASK); - yuvi[2] = ((yuv101010Pel[0] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK); - - yuvi[3] = (yuv101010Pel[1] & COLOR_COMPONENT_MASK); - yuvi[4] = ((yuv101010Pel[1] >> COLOR_COMPONENT_BIT_SIZE) & COLOR_COMPONENT_MASK); - yuvi[5] = ((yuv101010Pel[1] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK); - - // YUV to RGB Transformation conversion - YUV2RGB2(&yuvi[0], &red[0], &green[0], &blue[0]); - YUV2RGB2(&yuvi[3], &red[1], &green[1], &blue[1]); - - - dstImage[y * width * 3 + x * 3] = clip_v(blue[0] * 0.25,0 ,255); - dstImage[y * width * 3 + x * 3 + 3] = clip_v(blue[1] * 0.25,0, 255); - - 
dstImage[width * y * 3 + x * 3 + 1] = clip_v(green[0] * 0.25,0 ,255); - dstImage[width * y * 3 + x * 3 + 4] = clip_v(green[1] * 0.25,0, 255); - - dstImage[width * y * 3 + x * 3 + 2] = clip_v(red[0] * 0.25, 0, 255); - dstImage[width * y * 3 + x * 3 + 5] = clip_v(red[1] * 0.25,0 ,255); - } - - cudaError_t setColorSpace(FF_ColorSpace CSC, float hue) - { - float hueSin = sin(hue); - float hueCos = cos(hue); - - float hueCSC[9]; - if (CSC == ITU_601) - { - //CCIR 601 - hueCSC[0] = 1.1644f; - hueCSC[1] = hueSin * 1.5960f; - hueCSC[2] = hueCos * 1.5960f; - hueCSC[3] = 1.1644f; - hueCSC[4] = (hueCos * -0.3918f) - (hueSin * 0.8130f); - hueCSC[5] = (hueSin * 0.3918f) - (hueCos * 0.8130f); - hueCSC[6] = 1.1644f; - hueCSC[7] = hueCos * 2.0172f; - hueCSC[8] = hueSin * -2.0172f; - } - else if (CSC == ITU_709) - { - //CCIR 709 - hueCSC[0] = 1.0f; - hueCSC[1] = hueSin * 1.57480f; - hueCSC[2] = hueCos * 1.57480f; - hueCSC[3] = 1.0; - hueCSC[4] = (hueCos * -0.18732f) - (hueSin * 0.46812f); - hueCSC[5] = (hueSin * 0.18732f) - (hueCos * 0.46812f); - hueCSC[6] = 1.0f; - hueCSC[7] = hueCos * 1.85560f; - hueCSC[8] = hueSin * -1.85560f; - } - - cudaError_t cudaStatus = cudaMemcpyToSymbol(constHueColorSpaceMat2, hueCSC, 9 * sizeof(float), 0, cudaMemcpyHostToDevice); - float tmpf[9]; - memset(tmpf, 0, 9 * sizeof(float)); - cudaMemcpyFromSymbol(tmpf, constHueColorSpaceMat2, 9 * sizeof(float), 0, ::cudaMemcpyDefault); - cudaDeviceSynchronize(); - - if (cudaStatus != cudaSuccess) { - fprintf(stderr, "cudaMemcpyToSymbol failed: %s\n", cudaGetErrorString(cudaStatus)); - } - - return cudaStatus; - } - - cudaError_t NV12ToRGBnot(CUdeviceptr d_srcNV12, size_t nSourcePitch, unsigned char* d_dstRGB, int width, int height) - { - dim3 block(32, 16, 1); - dim3 grid((width + (2 * block.x - 1)) / (2 * block.x), (height + (block.y - 1)) / block.y, 1); - NV12ToRGB_drvapi2 << < grid, block >> >((uint32 *)d_srcNV12, nSourcePitch, d_dstRGB, width, height); - cudaError_t cudaStatus = cudaGetLastError(); - if 
(cudaStatus != cudaSuccess) { - fprintf(stderr, "NV12ToRGB_drvapi launch failed: %s\n", cudaGetErrorString(cudaStatus)); - return cudaStatus; - } - - cudaStatus = cudaDeviceSynchronize(); - if (cudaStatus != cudaSuccess) { - fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching NV12ToRGB_drvapi !\n", cudaStatus); - return cudaStatus; - } - - return cudaStatus; - } - - cudaError_t CUDAToBGR(CUdeviceptr dataY, CUdeviceptr dataUV, size_t pitchY, size_t pitchUV, unsigned char* d_dstRGB, int width, int height) - { - dim3 block(32, 16, 1); - dim3 grid((width + (2 * block.x - 1)) / (2 * block.x), (height + (block.y - 1)) / block.y, 1); - CUDAToBGR_drvapi << < grid, block >> >((uint32 *)dataY, (uint32 *)dataUV, pitchY, pitchUV, d_dstRGB, width, height); - cudaError_t cudaStatus = cudaGetLastError(); - if (cudaStatus != cudaSuccess) { - fprintf(stderr, "NV12ToRGB_drvapi launch failed: %s\n", cudaGetErrorString(cudaStatus)); - return cudaStatus; - } - - cudaStatus = cudaDeviceSynchronize(); - if (cudaStatus != cudaSuccess) { - fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching NV12ToRGB_drvapi !\n", cudaStatus); - return cudaStatus; - } - - return cudaStatus; - } -} \ No newline at end of file diff --git a/src/nvdecoder/NvDecoderApi.cpp b/src/nvdecoder/NvDecoderApi.cpp deleted file mode 100644 index efb63cd..0000000 --- a/src/nvdecoder/NvDecoderApi.cpp +++ /dev/null @@ -1,133 +0,0 @@ -#include "NvDecoderApi.h" -#include "FFNvDecoder.h" - -NvDecoderApi::NvDecoderApi(){ - m_pDecoder = nullptr; -} - -NvDecoderApi::~NvDecoderApi(){ - if(m_pDecoder != nullptr){ - delete m_pDecoder; - m_pDecoder = nullptr; - } -} - -bool NvDecoderApi::init(FFDecConfig& cfg){ - m_pDecoder = new FFNvDecoder(); - if(m_pDecoder != nullptr){ - return m_pDecoder->init(cfg); - } - return false; -} - -void NvDecoderApi::close(){ - if(m_pDecoder != nullptr){ - return m_pDecoder->close(); - } -} - -bool NvDecoderApi::start(){ - if(m_pDecoder != nullptr){ - 
return m_pDecoder->start(); - } - return false; -} - -void NvDecoderApi::pause(){ - if(m_pDecoder != nullptr){ - return m_pDecoder->pause(); - } -} - -void NvDecoderApi::resume(){ - if(m_pDecoder != nullptr){ - return m_pDecoder->resume(); - } -} - -void NvDecoderApi::setDecKeyframe(bool bKeyframe){ - if(m_pDecoder != nullptr){ - return m_pDecoder->setDecKeyframe(bKeyframe); - } -} - -bool NvDecoderApi::isRunning(){ - if(m_pDecoder != nullptr){ - return m_pDecoder->isRunning(); - } - return false; -} - -bool NvDecoderApi::isFinished(){ - if(m_pDecoder != nullptr){ - return m_pDecoder->isFinished(); - } - return false; -} - -bool NvDecoderApi::isPausing(){ - if(m_pDecoder != nullptr){ - return m_pDecoder->isPausing(); - } - return false; -} - -bool NvDecoderApi::getResolution(int &width, int &height){ - if(m_pDecoder != nullptr){ - return m_pDecoder->getResolution(width, height); - } - return false; -} - -bool NvDecoderApi::isSurport(FFDecConfig& cfg){ - if(m_pDecoder != nullptr){ - return m_pDecoder->isSurport(cfg); - } - return false; -} - -float NvDecoderApi::fps(){ - if(m_pDecoder != nullptr){ - return m_pDecoder->fps(); - } - return 0.0; -} - -int NvDecoderApi::getCachedQueueLength(){ - if(m_pDecoder != nullptr){ - return m_pDecoder->getCachedQueueLength(); - } - return 0; -} - -void NvDecoderApi::setName(string nm){ - if(m_pDecoder != nullptr){ - return m_pDecoder->setName(nm); - } -} - -string NvDecoderApi::getName(){ - if(m_pDecoder != nullptr){ - return m_pDecoder->getName(); - } - return nullptr; -} - -FFImgInfo* NvDecoderApi::snapshot(){ - if(m_pDecoder != nullptr){ - return m_pDecoder->snapshot(); - } - return nullptr; -} - -void NvDecoderApi::setPostDecArg(const void* postDecArg){ - if(m_pDecoder != nullptr){ - return m_pDecoder->setPostDecArg(postDecArg); - } -} - -void NvDecoderApi::setFinishedDecArg(const void* finishedDecArg){ - if(m_pDecoder != nullptr){ - return m_pDecoder->setFinishedDecArg(finishedDecArg); - } -} \ No newline at end of file diff 
--git a/src/nvdecoder/NvDecoderApi.h b/src/nvdecoder/NvDecoderApi.h deleted file mode 100644 index f742dd8..0000000 --- a/src/nvdecoder/NvDecoderApi.h +++ /dev/null @@ -1,44 +0,0 @@ -#include<string> -#include <pthread.h> - -#include "common_header.h" -#include "../interface/AbstractDecoder.h" - -using namespace std; - -class FFNvDecoder; - -class NvDecoderApi : public AbstractDecoder{ -public: - NvDecoderApi(); - ~NvDecoderApi(); - bool init(FFDecConfig& cfg); - void close(); - bool start(); - void pause(); - void resume(); - - void setDecKeyframe(bool bKeyframe); - - bool isRunning(); - bool isFinished(); - bool isPausing(); - bool getResolution( int &width, int &height ); - - bool isSurport(FFDecConfig& cfg); - - int getCachedQueueLength(); - - float fps(); - - FFImgInfo* snapshot(); - - DECODER_TYPE getDecoderType(){ return DECODER_TYPE_DVPP; } - void setName(string nm); - string getName(); - - void setPostDecArg(const void* postDecArg); - void setFinishedDecArg(const void* finishedDecArg); -private: - FFNvDecoder* m_pDecoder; -}; \ No newline at end of file diff --git a/src/nvdecoder/NvJpegEncoder.cpp b/src/nvdecoder/NvJpegEncoder.cpp deleted file mode 100644 index 7ee0727..0000000 --- a/src/nvdecoder/NvJpegEncoder.cpp +++ /dev/null @@ -1,90 +0,0 @@ -#include "NvJpegEncoder.h" - -#include <fstream> -#include <vector> -#include <iostream> - - -#define CHECK_NVJPEG(S) do {nvjpegStatus_t status; \ - status = S; \ - if (status != NVJPEG_STATUS_SUCCESS ) std::cout << __LINE__ <<" CHECK_NVJPEG - status = " << status << std::endl; \ - } while (false) - - -int saveJpeg(const char * filepath, unsigned char* d_srcBGR, int width, int height, cudaStream_t stream) -{ - nvjpegHandle_t nvjpeg_handle; - nvjpegEncoderState_t encoder_state; - nvjpegEncoderParams_t encoder_params; - - cudaEvent_t ev_start, ev_end; - cudaEventCreate(&ev_start); - cudaEventCreate(&ev_end); - - nvjpegImage_t input; - nvjpegInputFormat_t input_format = NVJPEG_INPUT_BGRI; - int image_width = width; - 
int image_height = height; - - // int channel_size = image_width * image_height; - // for (int i = 0; i < 3; i++) - // { - // input.pitch[i] = image_width; - // (cudaMalloc((void**)&(input.channel[i]), channel_size)); - // (cudaMemset(input.channel[i], 50 * 40 * i, channel_size)); - // } - - input.channel[0] = d_srcBGR; - input.pitch[0] = image_width * 3; - - nvjpegBackend_t backend = NVJPEG_BACKEND_DEFAULT; - - CHECK_NVJPEG(nvjpegCreate(backend, nullptr, &nvjpeg_handle)); - - CHECK_NVJPEG(nvjpegEncoderParamsCreate(nvjpeg_handle, &encoder_params, stream)); - CHECK_NVJPEG(nvjpegEncoderStateCreate(nvjpeg_handle, &encoder_state, stream)); - - // set params - CHECK_NVJPEG(nvjpegEncoderParamsSetEncoding(encoder_params, nvjpegJpegEncoding_t::NVJPEG_ENCODING_PROGRESSIVE_DCT_HUFFMAN, stream)); - CHECK_NVJPEG(nvjpegEncoderParamsSetOptimizedHuffman(encoder_params, 1, stream)); - CHECK_NVJPEG(nvjpegEncoderParamsSetQuality(encoder_params, 70, stream)); - CHECK_NVJPEG(nvjpegEncoderParamsSetSamplingFactors(encoder_params, nvjpegChromaSubsampling_t::NVJPEG_CSS_420, stream)); - - cudaEventRecord(ev_start); - CHECK_NVJPEG(nvjpegEncodeImage(nvjpeg_handle, encoder_state, encoder_params, &input, input_format, image_width, image_height, stream)); - cudaEventRecord(ev_end); - - std::vector<unsigned char> obuffer; - size_t length; - CHECK_NVJPEG(nvjpegEncodeRetrieveBitstream( - nvjpeg_handle, - encoder_state, - NULL, - &length, - stream)); - - obuffer.resize(length); - CHECK_NVJPEG(nvjpegEncodeRetrieveBitstream( - nvjpeg_handle, - encoder_state, - obuffer.data(), - &length, - stream)); - - cudaEventSynchronize(ev_end); - - // 用完销毁,避免显存泄露 - nvjpegEncoderParamsDestroy(encoder_params); - nvjpegEncoderStateDestroy(encoder_state); - nvjpegDestroy(nvjpeg_handle); - - float ms; - cudaEventElapsedTime(&ms, ev_start, ev_end); - // std::cout << "time spend " << ms << " ms" << std::endl; - - std::ofstream outputFile(filepath, std::ios::out | std::ios::binary); - 
outputFile.write(reinterpret_cast<const char *>(obuffer.data()), static_cast<int>(length)); - outputFile.close(); - - return 0; -} \ No newline at end of file diff --git a/src/nvdecoder/NvJpegEncoder.h b/src/nvdecoder/NvJpegEncoder.h deleted file mode 100644 index 3c27ba8..0000000 --- a/src/nvdecoder/NvJpegEncoder.h +++ /dev/null @@ -1,3 +0,0 @@ -#include <nvjpeg.h> - -int saveJpeg(const char * filepath, unsigned char* d_srcBGR, int width, int height, cudaStream_t stream); \ No newline at end of file diff --git a/src/nvdecoder/PartMemCopy.cu b/src/nvdecoder/PartMemCopy.cu deleted file mode 100644 index 396765b..0000000 --- a/src/nvdecoder/PartMemCopy.cu +++ /dev/null @@ -1,289 +0,0 @@ -#include "cuda_kernels.h" -#include <algorithm> -typedef unsigned char uchar; -typedef unsigned int uint32; -typedef int int32; - -#define MAX_SNAPSHOT_WIDTH 320 -#define MAX_SNAPSHOT_HEIGHT 320 - -namespace cuda_common -{ - __global__ void kernel_memcopy(unsigned char* d_srcRGB, int src_width, int src_height, - unsigned char* d_dstRGB, int left, int top, int right, int bottom) - { - const int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - const int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - const int dst_width = right - left; - const int dst_height = bottom - top; - if (dst_x < dst_width && dst_y < dst_height) - { - int src_x = left + dst_x; - int src_y = top + dst_y; - - //bgr...bgr...bgr... - d_dstRGB[(dst_y*dst_width + dst_x) * 3] = (unsigned char)d_srcRGB[(src_y*src_width + src_x) * 3]; - d_dstRGB[(dst_y*dst_width + dst_x) - * 3 + 1] = (unsigned char)d_srcRGB[(src_y*src_width + src_x) * 3 + 1]; - d_dstRGB[(dst_y*dst_width + dst_x) * 3 + 2] = (unsigned char)d_srcRGB[(src_y*src_width + src_x) * 3 + 2]; - - //bbb...ggg...rrr... 
- //d_dstRGB[(dst_y*dst_width) + dst_x] = (unsigned char)d_srcRGB[(src_y*src_width) + src_x]; - //d_dstRGB[(dst_width*dst_height) + (dst_y*dst_width) + dst_x] = (unsigned char)d_srcRGB[(src_width*src_height) + (src_y*src_width) + src_x]; - //d_dstRGB[(2 * dst_width*dst_height) + (dst_y*dst_width) + dst_x] = (unsigned char)d_srcRGB[(2 * src_width*src_height) + (src_y*src_width) + src_x]; - - /* memcpy(d_dstRGB + (dst_y*src_width) + dst_x, d_srcRGB + (src_y*src_width) + src_x, sizeof(float)); - memcpy(d_dstRGB + (src_width*src_height) + (dst_y*src_width) + dst_x, d_srcRGB + (src_width*src_height) + (src_y*src_width) + src_x, sizeof(float)); - memcpy(d_dstRGB + (2 * src_width*src_height) + (dst_y*src_width) + dst_x, d_srcRGB + (2 * src_width*src_height) + (src_y*src_width) + src_x, sizeof(float));*/ - } - } - - cudaError_t PartMemCopy(unsigned char* d_srcRGB, int src_width, int src_height, unsigned char* d_dstRGB, int left, int top, int right, int bottom) - { - dim3 block(32, 16, 1); - dim3 grid(((right - left) + (block.x - 1)) / block.x, ((bottom - top) + (block.y - 1)) / block.y, 1); - - kernel_memcopy << < grid, block >> > (d_srcRGB, src_width, src_height, d_dstRGB, left, top, right, bottom); - - cudaError_t cudaStatus = cudaGetLastError(); - if (cudaStatus != cudaSuccess) { - fprintf(stderr, "Part 50 kernel_memcopy launch failed: %s\n", cudaGetErrorString(cudaStatus)); - return cudaStatus; - } - cudaStatus = cudaDeviceSynchronize(); - if (cudaStatus != cudaSuccess) { - fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_bilinear!\n", cudaStatus); - return cudaStatus; - } - return cudaStatus; - } - - - // __global__ void kernel_memcopy_mean_variance(float* d_srcRGB, int src_width, int src_height, - // unsigned char* vd_dstRGB, int count, int * vleft, int* vtop, int* vright, int * vbottom, float submeanb,float submeang, float submeanr, float varianceb,float varianceg, float variancer) - // { - // const int dst_x = blockIdx.x * 
blockDim.x + threadIdx.x; - // const int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - // for (int i=0;i<count;i++) - // { - // const int left = vleft[i]; - // const int right = vright[i]; - // const int top = vtop[i]; - // const int bottom = vbottom[i]; - // - // const int dst_width = right - left; - // const int dst_height = bottom - top; - // - // - // unsigned char * d_dstRGB = vd_dstRGB + i * ; - // - // if (dst_x < dst_width && dst_y < dst_height) - // { - // int src_x = left + dst_x; - // int src_y = top + dst_y; - // - // d_dstRGB[(dst_y*dst_width) + dst_x] = (d_srcRGB[(src_y*src_width) + src_x] - submeanb)*varianceb; - // d_dstRGB[(dst_width*dst_height) + (dst_y*dst_width) + dst_x] = (d_srcRGB[(src_width*src_height) + (src_y*src_width) + src_x] -submeang)*varianceg; - // d_dstRGB[(2 * dst_width*dst_height) + (dst_y*dst_width) + dst_x] = (d_srcRGB[(2 * src_width*src_height) + (src_y*src_width) + src_x] - submeanr) * variancer; - // - // } - // } - // } - __global__ void PartCopy_ResizeImgBilinearBGR_Mean_Variance_CUDAKernel( - unsigned char * d_srcRGB, int srcimg_width, int srcimg_height, - int* vleft, int* vtop, int* vright, int * vbottom, - unsigned char** vd_dstRGB, int count, int *dst_width, int *dst_height, - float submeanb, float submeang, float submeanr, - float varianceb, float varianceg, float variancer) - { - int i = blockIdx.z; - - //for (int i = 0; i<count; i++) - { - const int left = vleft[i]; - const int right = vright[i]; - const int top = vtop[i]; - const int bottom = vbottom[i]; - const int cur_dst_width = dst_width[i]; - const int cur_dst_height = dst_height[i]; - - unsigned char* d_dstRGB = vd_dstRGB[i]; - - const int src_width = right - left; - const int src_height = bottom - top; - const int x = blockIdx.x * blockDim.x + threadIdx.x;// + left; - const int y = blockIdx.y * blockDim.y + threadIdx.y;//+ top; - const int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - const int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - - /*if 
(dst_x == 0 && dst_y == 0) - printf("%d %d %d %d %d\n", i, vleft[i], vright[i], cur_dst_width, cur_dst_height);*/ - - unsigned char * src_img = d_srcRGB; - unsigned char * dst_img = d_dstRGB; - if (dst_x < cur_dst_width && dst_y < cur_dst_height) - { - float fx = (x + 0.5)*src_width / (float)cur_dst_width - 0.5 + left; - float fy = (y + 0.5)*src_height / (float)cur_dst_height - 0.5 + top; - int ax = floor(fx); - int ay = floor(fy); - if (ax < 0) - { - ax = 0; - } - if (ax > srcimg_width - 2) - { - ax = srcimg_width - 2; - } - if (ay < 0) { - ay = 0; - } - if (ay > srcimg_height - 2) - { - ay = srcimg_height - 2; - } - - int A = ax + ay*srcimg_width; - int B = ax + ay*srcimg_width + 1; - int C = ax + ay*srcimg_width + srcimg_width; - int D = ax + ay*srcimg_width + srcimg_width + 1; - - float w1, w2, w3, w4; - w1 = fx - ax; - w2 = 1 - w1; - w3 = fy - ay; - w4 = 1 - w3; - float blue = src_img[A * 3] * w2*w4 + src_img[B * 3] * w1*w4 + src_img[C * 3] * w2*w3 + src_img[D * 3] * w1*w3; - float green = src_img[A * 3 + 1] * w2*w4 + src_img[B * 3 + 1] * w1*w4 - + src_img[C * 3 + 1] * w2*w3 + src_img[D * 3 + 1] * w1*w3; - float red = src_img[A * 3 + 2] * w2*w4 + src_img[B * 3 + 2] * w1*w4 - + src_img[C * 3 + 2] * w2*w3 + src_img[D * 3 + 2] * w1*w3; - - /*dst_img[(dst_y * dst_width + dst_x) * 3] = (unsigned char)(blue - submeanb)*varianceb; - dst_img[(dst_y * dst_width + dst_x) * 3 + 1] =(unsigned char) (green - submeang)*varianceg; - dst_img[(dst_y * dst_width + dst_x) * 3 + 2] = (unsigned char) (red - submeanr)*variancer;*/ - - if (blue < 0) - blue = 0; - else if (blue > 255) - blue = 255; - - if (green < 0) - green = 0; - else if (green > 255) - green = 255; - - if (red < 0) - red = 0; - else if (red > 255) - red = 255; - - dst_img[(dst_y * cur_dst_width + dst_x) * 3] = (unsigned char)blue; - dst_img[(dst_y * cur_dst_width + dst_x) * 3 + 1] = (unsigned char)green; - dst_img[(dst_y * cur_dst_width + dst_x) * 3 + 2] = (unsigned char)red; - - - /*if (src_img[(dst_y * dst_width 
+ dst_x) * 3] < 0) - src_img[(dst_y * dst_width + dst_x) * 3] = 0; - else if (src_img[(dst_y * dst_width + dst_x) * 3] > 255) - src_img[(dst_y * dst_width + dst_x) * 3] = 255; - - if (src_img[(dst_y * dst_width + dst_x) * 3 + 1] < 0) - src_img[(dst_y * dst_width + dst_x) * 3 + 1] = 0; - else if (src_img[(dst_y * dst_width + dst_x) * 3 + 1] > 255) - src_img[(dst_y * dst_width + dst_x) * 3 + 1] = 255; - - if (src_img[(dst_y * dst_width + dst_x) * 3 + 2] < 0) - src_img[(dst_y * dst_width + dst_x) * 3 + 2] = 0; - else if (src_img[(dst_y * dst_width + dst_x) * 3 + 2] > 255) - src_img[(dst_y * dst_width + dst_x) * 3 + 2] = 255; - - - dst_img[(dst_y * dst_width + dst_x) * 3] = (unsigned char)src_img[(dst_y * dst_width + dst_x) * 3]; - dst_img[(dst_y * dst_width + dst_x) * 3 + 1] = (unsigned char)src_img[(dst_y * dst_width + dst_x) * 3 + 1]; - dst_img[(dst_y * dst_width + dst_x) * 3 + 2] = (unsigned char)src_img[(dst_y * dst_width + dst_x) * 3 + 2];*/ - } - } - } - - cudaError_t PartMemResizeBatch(unsigned char* d_srcRGB, int src_width, int src_height, unsigned char** d_dstRGB, int count, int* left, int* top, int* right, int* bottom, int *dst_w, int *dst_h, float submeanb, float submeang, float submeanr, - float varianceb, float varianceg, float variancer) - { - /* cudaEvent_t start, stop; - float time; - cudaEventCreate(&start); - cudaEventCreate(&stop); - cudaEventRecord(start, 0);*/ - - dim3 block(32, 16, 1); - dim3 grid((*std::max_element(dst_w, dst_w+ count) + (block.x - 1)) / block.x, (*std::max_element(dst_h, dst_h + count) + (block.y - 1)) / block.y, count); - - int * gpu_left; - cudaMalloc(&gpu_left, 1000 * sizeof(int)); - cudaMemcpy(gpu_left, left, count * sizeof(int), cudaMemcpyHostToDevice); - - int * gpu_right; - cudaMalloc(&gpu_right, 1000 * sizeof(int)); - cudaMemcpy(gpu_right, right, count * sizeof(int), cudaMemcpyHostToDevice); - - int * gpu_top; - cudaMalloc(&gpu_top, 1000 * sizeof(int)); - cudaMemcpy(gpu_top, top, count * sizeof(int), 
cudaMemcpyHostToDevice); - - int * gpu_bottom; - cudaMalloc(&gpu_bottom, 1000 * sizeof(int)); - cudaMemcpy(gpu_bottom, bottom, count * sizeof(int), cudaMemcpyHostToDevice); - - int * gpu_dst_w; - cudaMalloc(&gpu_dst_w, 1000 * sizeof(int)); - cudaMemcpy(gpu_dst_w, dst_w, count * sizeof(int), cudaMemcpyHostToDevice); - - int * gpu_dst_h; - cudaMalloc(&gpu_dst_h, 1000 * sizeof(int)); - cudaMemcpy(gpu_dst_h, dst_h, count * sizeof(int), cudaMemcpyHostToDevice); - - unsigned char** gpu_dst_rgb; - cudaMalloc(&gpu_dst_rgb, 1000 * sizeof(unsigned char*)); - cudaMemcpy(gpu_dst_rgb, d_dstRGB, count * sizeof(unsigned char*), cudaMemcpyHostToDevice); - - //cudaMemcpy(cpu_personfloat, d_srcRGB, 112*224*2*sizeof(float), cudaMemcpyDeviceToHost); - // for(int i=0;i<100;i++) - // { - // printf("the score is %f\t",cpu_personfloat[i]); - // } - PartCopy_ResizeImgBilinearBGR_Mean_Variance_CUDAKernel << < grid, block >> > ( - d_srcRGB, src_width, src_height, - gpu_left, gpu_top, gpu_right, gpu_bottom, - gpu_dst_rgb, count, gpu_dst_w, gpu_dst_h, - submeanb, submeang, submeanr, - varianceb, varianceg, variancer); - cudaFree(gpu_top); - cudaFree(gpu_bottom); - cudaFree(gpu_left); - cudaFree(gpu_right); - cudaFree(gpu_dst_w); - cudaFree(gpu_dst_h); - cudaFree(gpu_dst_rgb); - - cudaError_t cudaStatus = cudaGetLastError(); - if (cudaStatus != cudaSuccess) { - fprintf(stderr, "Part 270 kernel_memcopy launch failed: %s\n", cudaGetErrorString(cudaStatus)); - return cudaStatus; - } - cudaStatus = cudaDeviceSynchronize(); - if (cudaStatus != cudaSuccess) { - fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_bilinear!\n", cudaStatus); - return cudaStatus; - } - - /*cudaEventRecord(stop, 0); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&time, start, stop); - cudaEventDestroy(start); - cudaEventDestroy(stop); - printf("�˺�������ʱ��:%f\n", time);*/ - - return cudaStatus; - } - -} \ No newline at end of file diff --git a/src/nvdecoder/RGB2YUV.cu 
b/src/nvdecoder/RGB2YUV.cu deleted file mode 100644 index 7202c3a..0000000 --- a/src/nvdecoder/RGB2YUV.cu +++ /dev/null @@ -1,263 +0,0 @@ - - -#include "cuda_kernels.h" - -typedef unsigned char uint8; -typedef unsigned int uint32; -typedef int int32; - -namespace cuda_common -{ - __device__ unsigned char clip_value(unsigned char x, unsigned char min_val, unsigned char max_val){ - if (x>max_val){ - return max_val; - } - else if (x<min_val){ - return min_val; - } - else{ - return x; - } - } - - __global__ void kernel_rgb2yuv(unsigned char *src_img, unsigned char* Y, unsigned char* u, unsigned char* v, - int src_width, int src_height, size_t yPitch) - { - const int x = blockIdx.x * blockDim.x + threadIdx.x; - const int y = blockIdx.y * blockDim.y + threadIdx.y; - - if (x >= src_width) - return; //x = width - 1; - - if (y >= src_height) - return; // y = height - 1; - - int B = src_img[y * src_width * 3 + x * 3]; - int G = src_img[y * src_width * 3 + x * 3 + 1]; - int R = src_img[y * src_width * 3 + x * 3 + 2]; - - /*int B = src_img[y * src_width + x]; - int G = src_img[src_width * src_height + y * src_width + x]; - int R = src_img[src_width * src_height * 2 + y * src_width + x];*/ - - Y[y * yPitch + x] = clip_value((unsigned char)(0.299 * R + 0.587 * G + 0.114 * B), 0, 255); - u[y * src_width + x] = clip_value((unsigned char)(-0.147 * R - 0.289 * G + 0.436 * B + 128), 0, 255); - v[y * src_width + x] = clip_value((unsigned char)(0.615 * R - 0.515 * G - 0.100 * B + 128), 0, 255); - - //Y[y * yPitch + x] = clip_value((unsigned char)(0.257 * R + 0.504 * G + 0.098 * B + 16), 0, 255); - //u[y * src_width + x] = clip_value((unsigned char)(-0.148 * R - 0.291 * G + 0.439 * B + 128), 0, 255); - //v[y * src_width + x] = clip_value((unsigned char)(0.439 * R - 0.368 * G - 0.071 * B + 128), 0, 255); - } - - __global__ void kernel_rgb2yuv(float *src_img, unsigned char* Y, unsigned char* u, unsigned char* v, - int src_width, int src_height, size_t yPitch) - { - const int x = 
blockIdx.x * blockDim.x + threadIdx.x; - const int y = blockIdx.y * blockDim.y + threadIdx.y; - - if (x >= src_width) - return; //x = width - 1; - - if (y >= src_height) - return; // y = height - 1; - - float B = src_img[y * src_width + x]; - float G = src_img[src_width * src_height + y * src_width + x]; - float R = src_img[src_width * src_height * 2 + y * src_width + x]; - - Y[y * yPitch + x] = clip_value((unsigned char)(0.299 * R + 0.587 * G + 0.114 * B), 0, 255); - u[y * src_width + x] = clip_value((unsigned char)(-0.147 * R - 0.289 * G + 0.436 * B + 128), 0, 255); - v[y * src_width + x] = clip_value((unsigned char)(0.615 * R - 0.515 * G - 0.100 * B + 128), 0, 255); - - //Y[y * yPitch + x] = clip_value((unsigned char)(0.257 * R + 0.504 * G + 0.098 * B + 16), 0, 255); - //u[y * src_width + x] = clip_value((unsigned char)(-0.148 * R - 0.291 * G + 0.439 * B + 128), 0, 255); - //v[y * src_width + x] = clip_value((unsigned char)(0.439 * R - 0.368 * G - 0.071 * B + 128), 0, 255); - } - - extern "C" - __global__ void kernel_resize_UV(unsigned char* src_img, unsigned char *dst_img, - int src_width, int src_height, int dst_width, int dst_height, int nPitch) - { - const int x = blockIdx.x * blockDim.x + threadIdx.x; - const int y = blockIdx.y * blockDim.y + threadIdx.y; - - if (x >= dst_width) - return; //x = width - 1; - - if (y >= dst_height) - return; // y = height - 1; - - float fx = (x + 0.5)*src_width / (float)dst_width - 0.5; - float fy = (y + 0.5)*src_height / (float)dst_height - 0.5; - int ax = floor(fx); - int ay = floor(fy); - if (ax < 0) - { - ax = 0; - } - else if (ax > src_width - 2) - { - ax = src_width - 2; - } - - if (ay < 0){ - ay = 0; - } - else if (ay > src_height - 2) - { - ay = src_height - 2; - } - - int A = ax + ay*src_width; - int B = ax + ay*src_width + 1; - int C = ax + ay*src_width + src_width; - int D = ax + ay*src_width + src_width + 1; - - float w1, w2, w3, w4; - w1 = fx - ax; - w2 = 1 - w1; - w3 = fy - ay; - w4 = 1 - w3; - - unsigned char 
val = src_img[A] * w2*w4 + src_img[B] * w1*w4 + src_img[C] * w2*w3 + src_img[D] * w1*w3; - - dst_img[y * nPitch + x] = clip_value(val,0,255); - } - - cudaError_t RGB2YUV(float* d_srcRGB, int src_width, int src_height, - unsigned char* Y, size_t yPitch, int yWidth, int yHeight, - unsigned char* U, size_t uPitch, int uWidth, int uHeight, - unsigned char* V, size_t vPitch, int vWidth, int vHeight) - { - unsigned char * u ; - unsigned char * v ; - - cudaError_t cudaStatus; - - cudaStatus = cudaMalloc((void**)&u, src_width * src_height * sizeof(unsigned char)); - cudaStatus = cudaMalloc((void**)&v, src_width * src_height * sizeof(unsigned char)); - - dim3 block(32, 16, 1); - dim3 grid((src_width + (block.x - 1)) / block.x, (src_height + (block.y - 1)) / block.y, 1); - dim3 grid1((uWidth + (block.x - 1)) / block.x, (uHeight + (block.y - 1)) / block.y, 1); - dim3 grid2((vWidth + (block.x - 1)) / block.x, (vHeight + (block.y - 1)) / block.y, 1); - - kernel_rgb2yuv << < grid, block >> >(d_srcRGB, Y, u, v, src_width, src_height, yPitch); - - cudaStatus = cudaGetLastError(); - if (cudaStatus != cudaSuccess) { - fprintf(stderr, "kernel_rgb2yuv launch failed: %s\n", cudaGetErrorString(cudaStatus)); - goto Error; - } - - cudaStatus = cudaDeviceSynchronize(); - if (cudaStatus != cudaSuccess) { - fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_rgb2yuv!\n", cudaStatus); - goto Error; - } - - kernel_resize_UV << < grid1, block >> >(u, U, src_width, src_height, uWidth, uHeight, uPitch); - - cudaStatus = cudaGetLastError(); - if (cudaStatus != cudaSuccess) { - fprintf(stderr, "kernel_resize_UV launch failed: %s\n", cudaGetErrorString(cudaStatus)); - goto Error; - } - - cudaStatus = cudaDeviceSynchronize(); - if (cudaStatus != cudaSuccess) { - fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_resize_UV!\n", cudaStatus); - goto Error; - } - - kernel_resize_UV << < grid2, block >> >(v, V, src_width, src_height, 
vWidth, vHeight, vPitch); - - cudaStatus = cudaGetLastError(); - if (cudaStatus != cudaSuccess) { - fprintf(stderr, "kernel_resize_UV launch failed: %s\n", cudaGetErrorString(cudaStatus)); - goto Error; - } - - cudaStatus = cudaDeviceSynchronize(); - if (cudaStatus != cudaSuccess) { - fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_resize_UV!\n", cudaStatus); - goto Error; - } - -Error : - cudaFree(u); - cudaFree(v); - - return cudaStatus; - } - - - - cudaError_t RGB2YUV(unsigned char* d_srcRGB, int src_width, int src_height, - unsigned char* Y, size_t yPitch, int yWidth, int yHeight, - unsigned char* U, size_t uPitch, int uWidth, int uHeight, - unsigned char* V, size_t vPitch, int vWidth, int vHeight) - { - unsigned char * u; - unsigned char * v; - - cudaError_t cudaStatus; - - cudaStatus = cudaMalloc((void**)&u, src_width * src_height * sizeof(unsigned char)); - cudaStatus = cudaMalloc((void**)&v, src_width * src_height * sizeof(unsigned char)); - - dim3 block(32, 16, 1); - dim3 grid((src_width + (block.x - 1)) / block.x, (src_height + (block.y - 1)) / block.y, 1); - dim3 grid1((uWidth + (block.x - 1)) / block.x, (uHeight + (block.y - 1)) / block.y, 1); - dim3 grid2((vWidth + (block.x - 1)) / block.x, (vHeight + (block.y - 1)) / block.y, 1); - - kernel_rgb2yuv << < grid, block >> >(d_srcRGB, Y, u, v, src_width, src_height, yPitch); - - cudaStatus = cudaGetLastError(); - if (cudaStatus != cudaSuccess) { - fprintf(stderr, "kernel_rgb2yuv launch failed: %s\n", cudaGetErrorString(cudaStatus)); - goto Error; - } - - cudaStatus = cudaDeviceSynchronize(); - if (cudaStatus != cudaSuccess) { - fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_rgb2yuv!\n", cudaStatus); - goto Error; - } - - kernel_resize_UV << < grid1, block >> >(u, U, src_width, src_height, uWidth, uHeight, uPitch); - - cudaStatus = cudaGetLastError(); - if (cudaStatus != cudaSuccess) { - fprintf(stderr, "kernel_resize_UV launch 
failed: %s\n", cudaGetErrorString(cudaStatus)); - goto Error; - } - - cudaStatus = cudaDeviceSynchronize(); - if (cudaStatus != cudaSuccess) { - fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_resize_UV!\n", cudaStatus); - goto Error; - } - - kernel_resize_UV << < grid2, block >> >(v, V, src_width, src_height, vWidth, vHeight, vPitch); - - cudaStatus = cudaGetLastError(); - if (cudaStatus != cudaSuccess) { - fprintf(stderr, "kernel_resize_UV launch failed: %s\n", cudaGetErrorString(cudaStatus)); - goto Error; - } - - cudaStatus = cudaDeviceSynchronize(); - if (cudaStatus != cudaSuccess) { - fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_resize_UV!\n", cudaStatus); - goto Error; - } - - Error: - cudaFree(u); - cudaFree(v); - - return cudaStatus; - } -} - diff --git a/src/nvdecoder/ResizeImage.cu b/src/nvdecoder/ResizeImage.cu deleted file mode 100644 index fdc6961..0000000 --- a/src/nvdecoder/ResizeImage.cu +++ /dev/null @@ -1,84 +0,0 @@ -#include "cuda_kernels.h" - -typedef unsigned char uchar; -typedef unsigned int uint32; -typedef int int32; - -namespace cuda_common -{ - __global__ void kernel_bilinear(float *src_img, float *dst_img, - int src_width, int src_height, int dst_width, int dst_height) - { - const int x = blockIdx.x * blockDim.x + threadIdx.x; - const int y = blockIdx.y * blockDim.y + threadIdx.y; - - if (x < dst_width && y < dst_height) - { - float fx = (x + 0.5)*src_width / (float)dst_width - 0.5; - float fy = (y + 0.5)*src_height / (float)dst_height - 0.5; - int ax = floor(fx); - int ay = floor(fy); - if (ax < 0) - { - ax = 0; - } - else if (ax > src_width - 2) - { - ax = src_width - 2; - } - - if (ay < 0){ - ay = 0; - } - else if (ay > src_height - 2) - { - ay = src_height - 2; - } - - int A = ax + ay*src_width; - int B = ax + ay*src_width + 1; - int C = ax + ay*src_width + src_width; - int D = ax + ay*src_width + src_width + 1; - - float w1, w2, w3, w4; - w1 = fx - ax; - w2 
= 1 - w1; - w3 = fy - ay; - w4 = 1 - w3; - - float blue = src_img[A] * w2*w4 + src_img[B] * w1*w4 + src_img[C] * w2*w3 + src_img[D] * w1*w3; - - float green = src_img[src_width * src_height + A] * w2*w4 + src_img[src_width * src_height + B] * w1*w4 - + src_img[src_width * src_height + C] * w2*w3 + src_img[src_width * src_height + D] * w1*w3; - - float red = src_img[src_width * src_height * 2 + A] * w2*w4 + src_img[src_width * src_height * 2 + B] * w1*w4 - + src_img[src_width * src_height * 2 + C] * w2*w3 + src_img[src_width * src_height * 2 + D] * w1*w3; - - dst_img[y * dst_width + x] = blue; - dst_img[dst_width * dst_height + y * dst_width + x] = green; - dst_img[dst_width * dst_height * 2 + y * dst_width + x] = red; - } - } - - cudaError_t ResizeImage(float* d_srcRGB, int src_width, int src_height, float* d_dstRGB, int dst_width, int dst_height) - { - dim3 block(32, 16, 1); - dim3 grid((dst_width + (block.x - 1)) / block.x, (dst_height + (block.y - 1)) / block.y, 1); - - kernel_bilinear << < grid, block >> >(d_srcRGB, d_dstRGB, src_width, src_height, dst_width, dst_height); - - cudaError_t cudaStatus = cudaGetLastError(); - if (cudaStatus != cudaSuccess) { - fprintf(stderr, "kernel_bilinear launch failed: %s\n", cudaGetErrorString(cudaStatus)); - return cudaStatus; - } - - cudaStatus = cudaDeviceSynchronize(); - if (cudaStatus != cudaSuccess) { - fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_bilinear!\n", cudaStatus); - return cudaStatus; - } - - return cudaStatus; - } -} \ No newline at end of file diff --git a/src/nvdecoder/common_header.h b/src/nvdecoder/common_header.h deleted file mode 100644 index cf45c91..0000000 --- a/src/nvdecoder/common_header.h +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef _COMMON_HEADER_H_ -#define _COMMON_HEADER_H_ - - -#include "../interface/logger.hpp" -#include "../interface/utiltools.hpp" -#include "../interface/interface_headers.h" - -#endif \ No newline at end of file diff --git 
a/src/nvdecoder/cuda_kernels.h b/src/nvdecoder/cuda_kernels.h deleted file mode 100644 index cd1eb00..0000000 --- a/src/nvdecoder/cuda_kernels.h +++ /dev/null @@ -1,63 +0,0 @@ -#pragma once -#include "cuda_runtime.h" -#include "device_launch_parameters.h" - -#include <stdio.h> -#include <stdlib.h> - -#include <string.h> -#include <math.h> - -#include <cuda.h> - -typedef enum -{ - ITU_601 = 1, - ITU_709 = 2 -} FF_ColorSpace; - -namespace cuda_common -{ - cudaError_t setColorSpace(FF_ColorSpace CSC, float hue); - - cudaError_t NV12ToRGBnot(CUdeviceptr d_srcNV12, size_t nSourcePitch, unsigned char* d_dstRGB, int width, int height); - cudaError_t CUDAToBGR(CUdeviceptr dataY, CUdeviceptr dataUV, size_t pitchY, size_t pitchUV, unsigned char* d_dstRGB, int width, int height); - - - cudaError_t ResizeImage(float* d_srcRGB, int src_width, int src_height, float* d_dstRGB, int dst_width, int dst_height); - - cudaError_t RGB2YUV(float* d_srcRGB, int src_width, int src_height, - unsigned char* Y, size_t yPitch, int yWidth, int yHeight, - unsigned char* U, size_t uPitch, int uWidth, int uHeight, - unsigned char* V, size_t vPitch, int vWidth, int vHeight); - - cudaError_t RGB2YUV(unsigned char* d_srcRGB, int src_width, int src_height, - unsigned char* Y, size_t yPitch, int yWidth, int yHeight, - unsigned char* U, size_t uPitch, int uWidth, int uHeight, - unsigned char* V, size_t vPitch, int vWidth, int vHeight); - - cudaError_t PartMemCopy(unsigned char* d_srcRGB, int src_width, int src_height, unsigned char* d_dstRGB, int left, int top, int right, int bottom); - // cudaError_t PartMemResize(float* d_srcRGB, int src_width, int src_height, float* d_dstRGB, int left, int top, int right, int bottom); - - cudaError_t PartMemResizeBatch(unsigned char* d_srcRGB, int srcimg_width, int srcimg_height, unsigned char** d_dstRGB, int count, - int* left, int* top, int* right, int* bottom, int *dst_w, int *dst_h, - float submeanb, float submeang, float submeanr, - float varianceb, float 
varianceg, float variancer); - - cudaError_t DrawImage(float* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom); - cudaError_t DrawImage(unsigned char* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom); - - cudaError_t DrawLine(float* d_srcRGB, int src_width, int src_height, int begin_x, int begin_y, int end_x, int end_y); -} - - -int jpegNPP(const char *szOutputFile, float* d_srcRGB, int img_width, int img_height); -int jpegNPP(const char *szOutputFile, unsigned char* d_srcRGB, int img_width, int img_height); - -int jpegNPP(const char *szOutputFile, float* d_srcRGB); -int jpegNPP(const char *szOutputFile, unsigned char* d_srcRGB); - -int initTable(); -int initTable(int flag, int width, int height); -int releaseJpegNPP(); - diff --git a/src/nvdecoder/define.hpp b/src/nvdecoder/define.hpp deleted file mode 100644 index 2eaafe0..0000000 --- a/src/nvdecoder/define.hpp +++ /dev/null @@ -1,11 +0,0 @@ -#pragma once - -#include <string> - - -#define CHECK_CUDA(call) \ -{\ - const cudaError_t error_code = call;\ - if (cudaSuccess != error_code)\ - LOG_ERROR("CUDA error, code: {} reason: {}", error_code, cudaGetErrorString(error_code));\ -} \ No newline at end of file diff --git a/src/nvdecoder/jpegNPP.cpp-1 b/src/nvdecoder/jpegNPP.cpp-1 deleted file mode 100644 index f0bf2e6..0000000 --- a/src/nvdecoder/jpegNPP.cpp-1 +++ /dev/null @@ -1,1193 +0,0 @@ -/* -* Copyright 1993-2015 NVIDIA Corporation. All rights reserved. -* -* NOTICE TO USER: -* -* This source code is subject to NVIDIA ownership rights under U.S. and -* international Copyright laws. -* -* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE -* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR -* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH -* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF -* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
-* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, -* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS -* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE -* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE -* OR PERFORMANCE OF THIS SOURCE CODE. -* -* U.S. Government End Users. This source code is a "commercial item" as -* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of -* "commercial computer software" and "commercial computer software -* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) -* and is provided to the U.S. Government only as a commercial end item. -* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through -* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the -* source code with only those rights set forth herein. -*/ - -// This sample needs at least CUDA 5.5 and a GPU that has at least Compute Capability 2.0 - -// This sample demonstrates a simple image processing pipeline. -// First, a JPEG file is huffman decoded and inverse DCT transformed and dequantized. -// Then the different planes are resized. Finally, the resized image is quantized, forward -// DCT transformed and huffman encoded. 
- -#include "cuda_kernels.h" - -#include <npp.h> -#include <cuda_runtime.h> -#include "common/UtilNPP/Exceptions.h" - -#include "Endianess.h" -#include <math.h> - -#include <string.h> -#include <fstream> -#include <iostream> - -#include "common/inc/helper_string.h" -#include "common/inc/helper_cuda.h" -//#include "MacroDef.h" -#include "cuda.h" - -using namespace std; - -struct FrameHeader -{ - unsigned char nSamplePrecision; - unsigned short nHeight; - unsigned short nWidth; - unsigned char nComponents; - unsigned char aComponentIdentifier[3]; - unsigned char aSamplingFactors[3]; - unsigned char aQuantizationTableSelector[3]; -}; - -struct ScanHeader -{ - unsigned char nComponents; - unsigned char aComponentSelector[3]; - unsigned char aHuffmanTablesSelector[3]; - unsigned char nSs; - unsigned char nSe; - unsigned char nA; -}; - -struct QuantizationTable -{ - unsigned char nPrecisionAndIdentifier; - unsigned char aTable[64]; -}; - -struct HuffmanTable -{ - unsigned char nClassAndIdentifier; - unsigned char aCodes[16]; - unsigned char aTable[256]; -}; - -//??准?炼??藕?量??模?? -//unsigned char std_Y_QT[64] = -//{ -// 16, 11, 10, 16, 24, 40, 51, 61, -// 12, 12, 14, 19, 26, 58, 60, 55, -// 14, 13, 16, 24, 40, 57, 69, 56, -// 14, 17, 22, 29, 51, 87, 80, 62, -// 18, 22, 37, 56, 68, 109, 103, 77, -// 24, 35, 55, 64, 81, 104, 113, 92, -// 49, 64, 78, 87, 103, 121, 120, 101, -// 72, 92, 95, 98, 112, 100, 103, 99 -//}; -// -////??准色???藕?量??模?? -//unsigned char std_UV_QT[64] = -//{ -// 17, 18, 24, 47, 99, 99, 99, 99, -// 18, 21, 26, 66, 99, 99, 99, 99, -// 24, 26, 56, 99, 99, 99, 99, 99, -// 47, 66, 99, 99, 99, 99, 99, 99, -// 99, 99, 99, 99, 99, 99, 99, 99, -// 99, 99, 99, 99, 99, 99, 99, 99, -// 99, 99, 99, 99, 99, 99, 99, 99, -// 99, 99, 99, 99, 99, 99, 99, 99 -//}; - -////?炼??藕?量??模?? 
-//unsigned char std_Y_QT[64] = -//{ -// 6, 4, 5, 6, 5, 4, 6, 6, -// 5, 6, 7, 7, 6, 8, 10, 16, -// 10, 10, 9, 9, 10, 20, 14, 15, -// 12, 16, 23, 20, 24, 24, 23, 20, -// 22, 22, 26, 29, 37, 31, 26, 27, -// 35, 28, 22, 22, 32, 44, 32, 35, -// 38, 39, 41, 42, 41, 25, 31, 45, -// 48, 45, 40, 48, 37, 40, 41, 40 -//}; -// -////色???藕?量??模?? -//unsigned char std_UV_QT[64] = -//{ -// 7, 7, 7, 10, 8, 10, 19, 10, -// 10, 19, 40, 26, 22, 26, 40, 40, -// 40, 40, 40, 40, 40, 40, 40, 40, -// 40, 40, 40, 40, 40, 40, 40, 40, -// 40, 40, 40, 40, 40, 40, 40, 40, -// 40, 40, 40, 40, 40, 40, 40, 40, -// 40, 40, 40, 40, 40, 40, 40, 40, -// 40, 40, 40, 40, 40, 40, 40, 40 -//}; - -//?炼??藕?量??模?? -unsigned char std_Y_QT[64] = -{ - 0.75 * 6, 0.75 * 4, 0.75 * 5, 0.75 * 6, 0.75 * 5, 0.75 * 4, 0.75 * 6, 0.75 * 6, - 0.75 * 5, 0.75 * 6, 0.75 * 7, 0.75 * 7, 0.75 * 6, 0.75 * 8, 0.75 * 10, 0.75 * 16, - 0.75 * 10, 0.75 * 10, 0.75 * 9, 0.75 * 9, 0.75 * 10, 0.75 * 20, 0.75 * 14, 0.75 * 15, - 0.75 * 12, 0.75 * 16, 0.75 * 23, 0.75 * 20, 0.75 * 24, 0.75 * 24, 0.75 * 23, 0.75 * 20, - 0.75 * 22, 0.75 * 22, 0.75 * 26, 0.75 * 29, 0.75 * 37, 0.75 * 31, 0.75 * 26, 0.75 * 27, - 0.75 * 35, 0.75 * 28, 0.75 * 22, 0.75 * 22, 0.75 * 32, 0.75 * 44, 0.75 * 32, 0.75 * 35, - 0.75 * 38, 0.75 * 39, 0.75 * 41, 0.75 * 42, 0.75 * 41, 0.75 * 25, 0.75 * 31, 0.75 * 45, - 0.75 * 48, 0.75 * 45, 0.75 * 40, 0.75 * 48, 0.75 * 37, 0.75 * 40, 0.75 * 41, 0.75 * 40 -}; - -//色???藕?量??模?? 
-unsigned char std_UV_QT[64] = -{ - 0.75 * 7, 0.75 * 7, 0.75 * 7, 0.75 * 10, 0.75 * 8, 0.75 * 10, 0.75 * 19, 0.75 * 10, - 0.75 * 10, 0.75 * 19, 0.75 * 40, 0.75 * 26, 0.75 * 22, 0.75 * 26, 0.75 * 40, 0.75 * 40, - 30, 30, 30, 30, 30, 30, 30, 30, - 30, 30, 30, 30, 30, 30, 30, 30, - 30, 30, 30, 30, 30, 30, 30, 30, - 30, 30, 30, 30, 30, 30, 30, 30, - 30, 30, 30, 30, 30, 30, 30, 30, - 30, 30, 30, 30, 30, 30, 30, 30 -}; - -unsigned char STD_DC_Y_NRCODES[16] = { 0, 1, 5, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0 }; -unsigned char STD_DC_Y_VALUES[12] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 }; - -unsigned char STD_DC_UV_NRCODES[16] = { 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0 }; -unsigned char STD_DC_UV_VALUES[12] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 }; - -unsigned char STD_AC_Y_NRCODES[16] = { 0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 0X7D }; -unsigned char STD_AC_Y_VALUES[162] = -{ - 0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12, - 0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07, - 0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08, - 0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0, - 0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0a, 0x16, - 0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28, - 0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, - 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, - 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, - 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, - 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, - 0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, - 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, - 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, - 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, - 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5, - 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4, - 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2, - 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, - 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, - 0xf9, 0xfa -}; - -unsigned char STD_AC_UV_NRCODES[16] = { 0, 2, 1, 2, 4, 4, 3, 4, 7, 5, 4, 4, 0, 1, 
2, 0X77 }; -unsigned char STD_AC_UV_VALUES[162] = -{ - 0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21, - 0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71, - 0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91, - 0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0, - 0x15, 0x62, 0x72, 0xd1, 0x0a, 0x16, 0x24, 0x34, - 0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26, - 0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38, - 0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, - 0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, - 0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, - 0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, - 0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, - 0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, - 0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, - 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, - 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, - 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, - 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, - 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, - 0xea, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, - 0xf9, 0xfa -}; - -int DivUp(int x, int d) -{ - return (x + d - 1) / d; -} - -template<typename T> -void writeAndAdvance(unsigned char *&pData, T nElement) -{ - writeBigEndian<T>(pData, nElement); - pData += sizeof(T); -} - -void writeMarker(unsigned char nMarker, unsigned char *&pData) -{ - *pData++ = 0x0ff; - *pData++ = nMarker; -} - -void writeJFIFTag(unsigned char *&pData) -{ - const char JFIF_TAG[] = - { - 0x4a, 0x46, 0x49, 0x46, 0x00, - 0x01, 0x02, - 0x00, - 0x00, 0x01, 0x00, 0x01, - 0x00, 0x00 - }; - - writeMarker(0x0e0, pData); - writeAndAdvance<unsigned short>(pData, sizeof(JFIF_TAG) + sizeof(unsigned short)); - memcpy(pData, JFIF_TAG, sizeof(JFIF_TAG)); - pData += sizeof(JFIF_TAG); -} - -void writeFrameHeader(const FrameHeader &header, unsigned char *&pData) -{ - unsigned char aTemp[128]; - unsigned char *pTemp = aTemp; - - writeAndAdvance<unsigned char>(pTemp, header.nSamplePrecision); - writeAndAdvance<unsigned 
short>(pTemp, header.nHeight); - writeAndAdvance<unsigned short>(pTemp, header.nWidth); - writeAndAdvance<unsigned char>(pTemp, header.nComponents); - - for (int c = 0; c<header.nComponents; ++c) - { - writeAndAdvance<unsigned char>(pTemp, header.aComponentIdentifier[c]); - writeAndAdvance<unsigned char>(pTemp, header.aSamplingFactors[c]); - writeAndAdvance<unsigned char>(pTemp, header.aQuantizationTableSelector[c]); - } - - unsigned short nLength = (unsigned short)(pTemp - aTemp); - - writeMarker(0x0C0, pData); - writeAndAdvance<unsigned short>(pData, nLength + 2); - memcpy(pData, aTemp, nLength); - pData += nLength; -} - -void writeScanHeader(const ScanHeader &header, unsigned char *&pData) -{ - unsigned char aTemp[128]; - unsigned char *pTemp = aTemp; - - writeAndAdvance<unsigned char>(pTemp, header.nComponents); - - for (int c = 0; c<header.nComponents; ++c) - { - writeAndAdvance<unsigned char>(pTemp, header.aComponentSelector[c]); - writeAndAdvance<unsigned char>(pTemp, header.aHuffmanTablesSelector[c]); - } - - writeAndAdvance<unsigned char>(pTemp, header.nSs); - writeAndAdvance<unsigned char>(pTemp, header.nSe); - writeAndAdvance<unsigned char>(pTemp, header.nA); - - unsigned short nLength = (unsigned short)(pTemp - aTemp); - - writeMarker(0x0DA, pData); - writeAndAdvance<unsigned short>(pData, nLength + 2); - memcpy(pData, aTemp, nLength); - pData += nLength; -} - -void writeQuantizationTable(const QuantizationTable &table, unsigned char *&pData) -{ - writeMarker(0x0DB, pData); - writeAndAdvance<unsigned short>(pData, sizeof(QuantizationTable) + 2); - memcpy(pData, &table, sizeof(QuantizationTable)); - pData += sizeof(QuantizationTable); -} - -void writeHuffmanTable(const HuffmanTable &table, unsigned char *&pData) -{ - writeMarker(0x0C4, pData); - - // Number of Codes for Bit Lengths [1..16] - int nCodeCount = 0; - - for (int i = 0; i < 16; ++i) - { - nCodeCount += table.aCodes[i]; - } - - writeAndAdvance<unsigned short>(pData, 17 + nCodeCount + 2); - 
memcpy(pData, &table, 17 + nCodeCount); - pData += 17 + nCodeCount; -} - -bool printfNPPinfo(int cudaVerMajor, int cudaVerMinor) -{ - const NppLibraryVersion *libVer = nppGetLibVersion(); - - printf("NPP Library Version %d.%d.%d\n", libVer->major, libVer->minor, libVer->build); - - int driverVersion, runtimeVersion; - cudaDriverGetVersion(&driverVersion); - cudaRuntimeGetVersion(&runtimeVersion); - - printf(" CUDA Driver Version: %d.%d\n", driverVersion / 1000, (driverVersion % 100) / 10); - printf(" CUDA Runtime Version: %d.%d\n", runtimeVersion / 1000, (runtimeVersion % 100) / 10); - - bool bVal = checkCudaCapabilities(cudaVerMajor, cudaVerMinor); - return bVal; -} - -NppiDCTState *pDCTState; -FrameHeader oFrameHeader; -FrameHeader oFrameHeaderFixedSize; -ScanHeader oScanHeader; -QuantizationTable aQuantizationTables[4]; -Npp8u *pdQuantizationTables; -HuffmanTable aHuffmanTables[4]; -HuffmanTable *pHuffmanDCTables; -HuffmanTable *pHuffmanACTables; -int nMCUBlocksH; -int nMCUBlocksV; -int nMCUBlocksHFixedSize; -int nMCUBlocksVFixedSize; -Npp8u *pdScan; -NppiEncodeHuffmanSpec *apHuffmanDCTable[3]; -NppiEncodeHuffmanSpec *apHuffmanACTable[3]; -unsigned char *pDstJpeg; -unsigned char *pDstOutput; -int nRestartInterval; - -int initTable() -{ - NPP_CHECK_NPP(nppiDCTInitAlloc(&pDCTState)); - - nRestartInterval = -1; - - cudaMalloc(&pdQuantizationTables, 64 * 4); - pHuffmanDCTables = aHuffmanTables; - pHuffmanACTables = &aHuffmanTables[2]; - memset(aQuantizationTables, 0, 4 * sizeof(QuantizationTable)); - memset(aHuffmanTables, 0, 4 * sizeof(HuffmanTable)); - memset(&oFrameHeader, 0, sizeof(FrameHeader)); - - - //????Huffman?? 
- aHuffmanTables[0].nClassAndIdentifier = 0; - memcpy(aHuffmanTables[0].aCodes, STD_DC_Y_NRCODES, 16); - memcpy(aHuffmanTables[0].aTable, STD_DC_Y_VALUES, 12); - - aHuffmanTables[1].nClassAndIdentifier = 1; - memcpy(aHuffmanTables[1].aCodes, STD_DC_UV_NRCODES, 16); - memcpy(aHuffmanTables[1].aTable, STD_DC_UV_VALUES, 12); - - aHuffmanTables[2].nClassAndIdentifier = 16; - memcpy(aHuffmanTables[2].aCodes, STD_AC_Y_NRCODES, 16); - memcpy(aHuffmanTables[2].aTable, STD_AC_Y_VALUES, 162); - - aHuffmanTables[3].nClassAndIdentifier = 17; - memcpy(aHuffmanTables[3].aCodes, STD_AC_UV_NRCODES, 16); - memcpy(aHuffmanTables[3].aTable, STD_AC_UV_VALUES, 162); - - - //????量???? - aQuantizationTables[0].nPrecisionAndIdentifier = 0; - memcpy(aQuantizationTables[0].aTable, std_Y_QT, 64); - aQuantizationTables[1].nPrecisionAndIdentifier = 1; - memcpy(aQuantizationTables[1].aTable, std_UV_QT, 64); - - NPP_CHECK_CUDA(cudaMemcpyAsync(pdQuantizationTables, aQuantizationTables[0].aTable, 64, cudaMemcpyHostToDevice)); - NPP_CHECK_CUDA(cudaMemcpyAsync(pdQuantizationTables + 64, aQuantizationTables[1].aTable, 64, cudaMemcpyHostToDevice)); - - oFrameHeader.nSamplePrecision = 8; - oFrameHeader.nComponents = 3; - oFrameHeader.aComponentIdentifier[0] = 1; - oFrameHeader.aComponentIdentifier[1] = 2; - oFrameHeader.aComponentIdentifier[2] = 3; - oFrameHeader.aSamplingFactors[0] = 34; - oFrameHeader.aSamplingFactors[1] = 17; - oFrameHeader.aSamplingFactors[2] = 17; - oFrameHeader.aQuantizationTableSelector[0] = 0; - oFrameHeader.aQuantizationTableSelector[1] = 1; - oFrameHeader.aQuantizationTableSelector[2] = 1; - - for (int i = 0; i < oFrameHeader.nComponents; ++i) - { - nMCUBlocksV = max(nMCUBlocksV, oFrameHeader.aSamplingFactors[i] & 0x0f); - nMCUBlocksH = max(nMCUBlocksH, oFrameHeader.aSamplingFactors[i] >> 4); - } - NPP_CHECK_CUDA(cudaMalloc(&pdScan, 4 << 20)); - - - - oScanHeader.nComponents = 3; - oScanHeader.aComponentSelector[0] = 1; - oScanHeader.aComponentSelector[1] = 2; - 
oScanHeader.aComponentSelector[2] = 3; - oScanHeader.aHuffmanTablesSelector[0] = 0; - oScanHeader.aHuffmanTablesSelector[1] = 17; - oScanHeader.aHuffmanTablesSelector[2] = 17; - oScanHeader.nSs = 0; - oScanHeader.nSe = 63; - oScanHeader.nA = 0; - - - return 0; -} - -NppiSize aSrcSize[3]; -Npp16s *apdDCT[3];// = { 0, 0, 0 }; -Npp32s aDCTStep[3]; - -Npp8u *apSrcImage[3];// = { 0, 0, 0 }; -Npp32s aSrcImageStep[3]; -size_t aSrcPitch[3]; - - -int releaseJpegNPP() -{ - nppiDCTFree(pDCTState); - cudaFree(pdQuantizationTables); - cudaFree(pdScan); - for (int i = 0; i < 3; ++i) - { - cudaFree(apdDCT[i]); - cudaFree(apSrcImage[i]); - } - return 0; -} - - -int initTable(int flag, int width, int height) -{ - //????帧头 - oFrameHeaderFixedSize.nSamplePrecision = 8; - oFrameHeaderFixedSize.nComponents = 3; - oFrameHeaderFixedSize.aComponentIdentifier[0] = 1; - oFrameHeaderFixedSize.aComponentIdentifier[1] = 2; - oFrameHeaderFixedSize.aComponentIdentifier[2] = 3; - oFrameHeaderFixedSize.aSamplingFactors[0] = 34; - oFrameHeaderFixedSize.aSamplingFactors[1] = 17; - oFrameHeaderFixedSize.aSamplingFactors[2] = 17; - oFrameHeaderFixedSize.aQuantizationTableSelector[0] = 0; - oFrameHeaderFixedSize.aQuantizationTableSelector[1] = 1; - oFrameHeaderFixedSize.aQuantizationTableSelector[2] = 1; - oFrameHeaderFixedSize.nWidth = width; - oFrameHeaderFixedSize.nHeight = height; - - for (int i = 0; i < oFrameHeaderFixedSize.nComponents; ++i) - { - nMCUBlocksVFixedSize = max(nMCUBlocksVFixedSize, oFrameHeaderFixedSize.aSamplingFactors[i] & 0x0f); - nMCUBlocksHFixedSize = max(nMCUBlocksHFixedSize, oFrameHeaderFixedSize.aSamplingFactors[i] >> 4); - } - - for (int i = 0; i < oFrameHeaderFixedSize.nComponents; ++i) - { - NppiSize oBlocks; - NppiSize oBlocksPerMCU = { oFrameHeaderFixedSize.aSamplingFactors[i] >> 4, oFrameHeaderFixedSize.aSamplingFactors[i] & 0x0f }; - - oBlocks.width = (int)ceil((oFrameHeaderFixedSize.nWidth + 7) / 8 * - static_cast<float>(oBlocksPerMCU.width) / nMCUBlocksHFixedSize); 
- oBlocks.width = DivUp(oBlocks.width, oBlocksPerMCU.width) * oBlocksPerMCU.width; - - oBlocks.height = (int)ceil((oFrameHeaderFixedSize.nHeight + 7) / 8 * - static_cast<float>(oBlocksPerMCU.height) / nMCUBlocksVFixedSize); - oBlocks.height = DivUp(oBlocks.height, oBlocksPerMCU.height) * oBlocksPerMCU.height; - - aSrcSize[i].width = oBlocks.width * 8; - aSrcSize[i].height = oBlocks.height * 8; - - // Allocate Memory - size_t nPitch; - NPP_CHECK_CUDA(cudaMallocPitch(&apdDCT[i], &nPitch, oBlocks.width * 64 * sizeof(Npp16s), oBlocks.height)); - aDCTStep[i] = static_cast<Npp32s>(nPitch); - - NPP_CHECK_CUDA(cudaMallocPitch(&apSrcImage[i], &nPitch, aSrcSize[i].width, aSrcSize[i].height)); - - aSrcPitch[i] = nPitch; - aSrcImageStep[i] = static_cast<Npp32s>(nPitch); - } - - return 0; -} - -int jpegNPP(const char *szOutputFile, float* d_srcRGB) -{ - //RGB2YUV - cudaError_t cudaStatus; - cudaStatus = cuda_common::RGB2YUV(d_srcRGB, oFrameHeaderFixedSize.nWidth, oFrameHeaderFixedSize.nHeight, - apSrcImage[0], aSrcPitch[0], aSrcSize[0].width, aSrcSize[0].height, - apSrcImage[1], aSrcPitch[1], aSrcSize[1].width, aSrcSize[1].height, - apSrcImage[2], aSrcPitch[2], aSrcSize[2].width, aSrcSize[2].height); - - /** - * Forward DCT, quantization and level shift part of the JPEG encoding. - * Input is expected in 8x8 macro blocks and output is expected to be in 64x1 - * macro blocks. The new version of the primitive takes the ROI in image pixel size and - * works with DCT coefficients that are in zig-zag order. 
- */ - int k = 0; - //LOG_INFO("NPP_CHECK_NPP:%d", 1); - if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[0], aSrcImageStep[0], - apdDCT[0], aDCTStep[0], - pdQuantizationTables + k * 64, - aSrcSize[0], - pDCTState))) - { - printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); - return EXIT_FAILURE; - } - - k = 1; - //LOG_INFO("NPP_CHECK_NPP:%d", 2); - if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[1], aSrcImageStep[1], - apdDCT[1], aDCTStep[1], - pdQuantizationTables + k * 64, - aSrcSize[1], - pDCTState))) - { - printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); - return EXIT_FAILURE; - } - - //LOG_INFO("NPP_CHECK_NPP:%d", 3); - if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[2], aSrcImageStep[2], - apdDCT[2], aDCTStep[2], - pdQuantizationTables + k * 64, - aSrcSize[2], - pDCTState))) - { - printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); - return EXIT_FAILURE; - } - - // Huffman Encoding - - Npp32s nScanLength; - Npp8u *pJpegEncoderTemp; - -#if (CUDA_VERSION == 8000) - Npp32s nTempSize; //when using CUDA8 -#else - size_t nTempSize; //when using CUDA9 -#endif - //modified by Junlin 190221 - - //LOG_INFO("NPP_CHECK_NPP:%d",4); - if (NPP_SUCCESS != (nppiEncodeHuffmanGetSize(aSrcSize[0], 3, &nTempSize))) - { - printf("nppiEncodeHuffmanGetSize Failed!\n"); - return EXIT_FAILURE; - } - - //LOG_INFO("NPP_CHECK_CUDA:%d",5); - NPP_CHECK_CUDA(cudaMalloc(&pJpegEncoderTemp, nTempSize)); - - /** - * Allocates memory and creates a Huffman table in a format that is suitable for the encoder. 
- */ - NppStatus t_status; - t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[0].aCodes, nppiDCTable, &apHuffmanDCTable[0]); - t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[0].aCodes, nppiACTable, &apHuffmanACTable[0]); - t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[1]); - t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[1]); - t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[2]); - t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[2]); - - /** - * Huffman Encoding of the JPEG Encoding. - * Input is expected to be 64x1 macro blocks and output is expected as byte stuffed huffman encoded JPEG scan. - */ - Npp32s nSs = 0; - Npp32s nSe = 63; - Npp32s nH = 0; - Npp32s nL = 0; - //LOG_INFO("NPP_CHECK_NPP:%d",6); - if (NPP_SUCCESS != (nppiEncodeHuffmanScan_JPEG_8u16s_P3R(apdDCT, aDCTStep, - 0, nSs, nSe, nH, nL, - pdScan, &nScanLength, - apHuffmanDCTable, - apHuffmanACTable, - aSrcSize, - pJpegEncoderTemp))) - { - printf("nppiEncodeHuffmanScan_JPEG_8u16s_P3R Failed!\n"); - return EXIT_FAILURE; - } - - for (int i = 0; i < 3; ++i) - { - nppiEncodeHuffmanSpecFree_JPEG(apHuffmanDCTable[i]); - nppiEncodeHuffmanSpecFree_JPEG(apHuffmanACTable[i]); - } - // Write JPEG - pDstJpeg = new unsigned char[4 << 20]{}; - pDstOutput = pDstJpeg; - - writeMarker(0x0D8, pDstOutput); - writeJFIFTag(pDstOutput); - writeQuantizationTable(aQuantizationTables[0], pDstOutput); - writeQuantizationTable(aQuantizationTables[1], pDstOutput); - writeHuffmanTable(pHuffmanDCTables[0], pDstOutput); - writeHuffmanTable(pHuffmanACTables[0], pDstOutput); - writeHuffmanTable(pHuffmanDCTables[1], pDstOutput); - writeHuffmanTable(pHuffmanACTables[1], pDstOutput); - writeFrameHeader(oFrameHeaderFixedSize, pDstOutput); - writeScanHeader(oScanHeader, 
pDstOutput); - - //LOG_INFO("NPP_CHECK_CUDA:%d",7); - NPP_CHECK_CUDA(cudaMemcpy(pDstOutput, pdScan, nScanLength, cudaMemcpyDeviceToHost)); - - pDstOutput += nScanLength; - writeMarker(0x0D9, pDstOutput); - { - // Write result to file. - std::ofstream outputFile(szOutputFile, ios::out | ios::binary); - outputFile.write(reinterpret_cast<const char *>(pDstJpeg), static_cast<int>(pDstOutput - pDstJpeg)); - } - - // Cleanup - cudaFree(pJpegEncoderTemp); - delete[] pDstJpeg; - - - return EXIT_SUCCESS; -} - -int jpegNPP(const char *szOutputFile, unsigned char* d_srcRGB) -{ - //RGB2YUV - cudaError_t cudaStatus; - cudaStatus = cuda_common::RGB2YUV(d_srcRGB, oFrameHeaderFixedSize.nWidth, oFrameHeaderFixedSize.nHeight, - apSrcImage[0], aSrcPitch[0], aSrcSize[0].width, aSrcSize[0].height, - apSrcImage[1], aSrcPitch[1], aSrcSize[1].width, aSrcSize[1].height, - apSrcImage[2], aSrcPitch[2], aSrcSize[2].width, aSrcSize[2].height); - - /** - * Forward DCT, quantization and level shift part of the JPEG encoding. - * Input is expected in 8x8 macro blocks and output is expected to be in 64x1 - * macro blocks. The new version of the primitive takes the ROI in image pixel size and - * works with DCT coefficients that are in zig-zag order. 
- */ - int k = 0; - //LOG_INFO("NPP_CHECK_NPP:%d", 1); - if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[0], aSrcImageStep[0], - apdDCT[0], aDCTStep[0], - pdQuantizationTables + k * 64, - aSrcSize[0], - pDCTState))) - { - printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); - return EXIT_FAILURE; - } - - k = 1; - //LOG_INFO("NPP_CHECK_NPP:%d", 2); - if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[1], aSrcImageStep[1], - apdDCT[1], aDCTStep[1], - pdQuantizationTables + k * 64, - aSrcSize[1], - pDCTState))) - { - printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); - return EXIT_FAILURE; - } - - //LOG_INFO("NPP_CHECK_NPP:%d", 3); - if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[2], aSrcImageStep[2], - apdDCT[2], aDCTStep[2], - pdQuantizationTables + k * 64, - aSrcSize[2], - pDCTState))) - { - printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); - return EXIT_FAILURE; - } - - // Huffman Encoding - - Npp32s nScanLength; - Npp8u *pJpegEncoderTemp; - -#if (CUDA_VERSION == 8000) - Npp32s nTempSize; //when using CUDA8 -#else - size_t nTempSize; //when using CUDA9 -#endif - //modified by Junlin 190221 - - //LOG_INFO("NPP_CHECK_NPP:%d",4); - if (NPP_SUCCESS != (nppiEncodeHuffmanGetSize(aSrcSize[0], 3, &nTempSize))) - { - printf("nppiEncodeHuffmanGetSize Failed!\n"); - return EXIT_FAILURE; - } - - //LOG_INFO("NPP_CHECK_CUDA:%d",5); - NPP_CHECK_CUDA(cudaMalloc(&pJpegEncoderTemp, nTempSize)); - - /** - * Allocates memory and creates a Huffman table in a format that is suitable for the encoder. 
- */ - NppStatus t_status; - t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[0].aCodes, nppiDCTable, &apHuffmanDCTable[0]); - t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[0].aCodes, nppiACTable, &apHuffmanACTable[0]); - t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[1]); - t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[1]); - t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[2]); - t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[2]); - - /** - * Huffman Encoding of the JPEG Encoding. - * Input is expected to be 64x1 macro blocks and output is expected as byte stuffed huffman encoded JPEG scan. - */ - Npp32s nSs = 0; - Npp32s nSe = 63; - Npp32s nH = 0; - Npp32s nL = 0; - //LOG_INFO("NPP_CHECK_NPP:%d",6); - if (NPP_SUCCESS != (nppiEncodeHuffmanScan_JPEG_8u16s_P3R(apdDCT, aDCTStep, - 0, nSs, nSe, nH, nL, - pdScan, &nScanLength, - apHuffmanDCTable, - apHuffmanACTable, - aSrcSize, - pJpegEncoderTemp))) - { - printf("nppiEncodeHuffmanScan_JPEG_8u16s_P3R Failed!\n"); - return EXIT_FAILURE; - } - - for (int i = 0; i < 3; ++i) - { - nppiEncodeHuffmanSpecFree_JPEG(apHuffmanDCTable[i]); - nppiEncodeHuffmanSpecFree_JPEG(apHuffmanACTable[i]); - } - // Write JPEG - pDstJpeg = new unsigned char[4 << 20]{}; - pDstOutput = pDstJpeg; - - writeMarker(0x0D8, pDstOutput); - writeJFIFTag(pDstOutput); - writeQuantizationTable(aQuantizationTables[0], pDstOutput); - writeQuantizationTable(aQuantizationTables[1], pDstOutput); - writeHuffmanTable(pHuffmanDCTables[0], pDstOutput); - writeHuffmanTable(pHuffmanACTables[0], pDstOutput); - writeHuffmanTable(pHuffmanDCTables[1], pDstOutput); - writeHuffmanTable(pHuffmanACTables[1], pDstOutput); - writeFrameHeader(oFrameHeaderFixedSize, pDstOutput); - writeScanHeader(oScanHeader, 
pDstOutput); - - //LOG_INFO("NPP_CHECK_CUDA:%d",7); - NPP_CHECK_CUDA(cudaMemcpy(pDstOutput, pdScan, nScanLength, cudaMemcpyDeviceToHost)); - - pDstOutput += nScanLength; - writeMarker(0x0D9, pDstOutput); - { - // Write result to file. - std::ofstream outputFile(szOutputFile, ios::out | ios::binary); - outputFile.write(reinterpret_cast<const char *>(pDstJpeg), static_cast<int>(pDstOutput - pDstJpeg)); - } - - // Cleanup - cudaFree(pJpegEncoderTemp); - delete[] pDstJpeg; - - - return EXIT_SUCCESS; -} - - -int jpegNPP(const char *szOutputFile, float* d_srcRGB, int img_width, int img_height) -{ - NppiSize aSrcSize[3]; - Npp16s *apdDCT[3] = { 0, 0, 0 }; - Npp32s aDCTStep[3]; - - Npp8u *apSrcImage[3] = { 0, 0, 0 }; - Npp32s aSrcImageStep[3]; - size_t aSrcPitch[3]; - - - //????帧头 - oFrameHeader.nWidth = img_width; - oFrameHeader.nHeight = img_height; - - for (int i = 0; i < oFrameHeader.nComponents; ++i) - { - NppiSize oBlocks; - NppiSize oBlocksPerMCU = { oFrameHeader.aSamplingFactors[i] >> 4, oFrameHeader.aSamplingFactors[i] & 0x0f }; - - oBlocks.width = (int)ceil((oFrameHeader.nWidth + 7) / 8 * - static_cast<float>(oBlocksPerMCU.width) / nMCUBlocksH); - oBlocks.width = DivUp(oBlocks.width, oBlocksPerMCU.width) * oBlocksPerMCU.width; - - oBlocks.height = (int)ceil((oFrameHeader.nHeight + 7) / 8 * - static_cast<float>(oBlocksPerMCU.height) / nMCUBlocksV); - oBlocks.height = DivUp(oBlocks.height, oBlocksPerMCU.height) * oBlocksPerMCU.height; - - aSrcSize[i].width = oBlocks.width * 8; - aSrcSize[i].height = oBlocks.height * 8; - - // Allocate Memory - size_t nPitch; - //LOG_INFO("NPP_CHECK_CUDA:%d",1); - NPP_CHECK_CUDA(cudaMallocPitch(&apdDCT[i], &nPitch, oBlocks.width * 64 * sizeof(Npp16s), oBlocks.height)); - aDCTStep[i] = static_cast<Npp32s>(nPitch); - - //LOG_INFO("NPP_CHECK_CUDA:%d",2); - NPP_CHECK_CUDA(cudaMallocPitch(&apSrcImage[i], &nPitch, aSrcSize[i].width, aSrcSize[i].height)); - - aSrcPitch[i] = nPitch; - aSrcImageStep[i] = static_cast<Npp32s>(nPitch); - } - - 
//RGB2YUV - cudaError_t cudaStatus; - cudaStatus = cuda_common::RGB2YUV(d_srcRGB, img_width, img_height, - apSrcImage[0], aSrcPitch[0], aSrcSize[0].width, aSrcSize[0].height, - apSrcImage[1], aSrcPitch[1], aSrcSize[1].width, aSrcSize[1].height, - apSrcImage[2], aSrcPitch[2], aSrcSize[2].width, aSrcSize[2].height); - - /** - * Forward DCT, quantization and level shift part of the JPEG encoding. - * Input is expected in 8x8 macro blocks and output is expected to be in 64x1 - * macro blocks. The new version of the primitive takes the ROI in image pixel size and - * works with DCT coefficients that are in zig-zag order. - */ - int k = 0; - //LOG_INFO("NPP_CHECK_CUDA:%d",3); - if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[0], aSrcImageStep[0], - apdDCT[0], aDCTStep[0], - pdQuantizationTables + k * 64, - aSrcSize[0], - pDCTState))) - { - printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); - return EXIT_FAILURE; - } - k = 1; - - //LOG_INFO("NPP_CHECK_CUDA:%d",4); - if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[1], aSrcImageStep[1], - apdDCT[1], aDCTStep[1], - pdQuantizationTables + k * 64, - aSrcSize[1], - pDCTState))) - { - printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); - return EXIT_FAILURE; - } - - //LOG_INFO("NPP_CHECK_CUDA:%d",5); - if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[2], aSrcImageStep[2], - apdDCT[2], aDCTStep[2], - pdQuantizationTables + k * 64, - aSrcSize[2], - pDCTState))) - { - printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); - return EXIT_FAILURE; - } - - // Huffman Encoding - - Npp32s nScanLength; - Npp8u *pJpegEncoderTemp; - -#if (CUDA_VERSION == 8000) - Npp32s nTempSize; //when using CUDA8 -#else - size_t nTempSize; //when using CUDA9 -#endif - //modified by Junlin 190221 - - //LOG_INFO("NPP_CHECK_CUDA:%d",6); - if (NPP_SUCCESS != (nppiEncodeHuffmanGetSize(aSrcSize[0], 3, &nTempSize))) - { - printf("nppiEncodeHuffmanGetSize Failed!\n"); - 
return EXIT_FAILURE; - } - - //LOG_INFO("NPP_CHECK_CUDA:%d",7); - NPP_CHECK_CUDA(cudaMalloc(&pJpegEncoderTemp, nTempSize)); - - /** - * Allocates memory and creates a Huffman table in a format that is suitable for the encoder. - */ - NppStatus t_status; - t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[0].aCodes, nppiDCTable, &apHuffmanDCTable[0]); - t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[0].aCodes, nppiACTable, &apHuffmanACTable[0]); - t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[1]); - t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[1]); - t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[2]); - t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[2]); - - /** - * Huffman Encoding of the JPEG Encoding. - * Input is expected to be 64x1 macro blocks and output is expected as byte stuffed huffman encoded JPEG scan. 
- */ - Npp32s nSs = 0; - Npp32s nSe = 63; - Npp32s nH = 0; - Npp32s nL = 0; - //LOG_INFO("NPP_CHECK_CUDA:%d",8); - if (NPP_SUCCESS != (nppiEncodeHuffmanScan_JPEG_8u16s_P3R(apdDCT, aDCTStep, - 0, nSs, nSe, nH, nL, - pdScan, &nScanLength, - apHuffmanDCTable, - apHuffmanACTable, - aSrcSize, - pJpegEncoderTemp))) - { - printf("nppiEncodeHuffmanScan_JPEG_8u16s_P3R Failed!\n"); - return EXIT_FAILURE; - } - - for (int i = 0; i < 3; ++i) - { - nppiEncodeHuffmanSpecFree_JPEG(apHuffmanDCTable[i]); - nppiEncodeHuffmanSpecFree_JPEG(apHuffmanACTable[i]); - } - // Write JPEG - pDstJpeg = new unsigned char[4 << 20]{}; - pDstOutput = pDstJpeg; - - writeMarker(0x0D8, pDstOutput); - writeJFIFTag(pDstOutput); - writeQuantizationTable(aQuantizationTables[0], pDstOutput); - writeQuantizationTable(aQuantizationTables[1], pDstOutput); - writeHuffmanTable(pHuffmanDCTables[0], pDstOutput); - writeHuffmanTable(pHuffmanACTables[0], pDstOutput); - writeHuffmanTable(pHuffmanDCTables[1], pDstOutput); - writeHuffmanTable(pHuffmanACTables[1], pDstOutput); - writeFrameHeader(oFrameHeader, pDstOutput); - writeScanHeader(oScanHeader, pDstOutput); - - //LOG_INFO("NPP_CHECK_CUDA:%d",9); - NPP_CHECK_CUDA(cudaMemcpy(pDstOutput, pdScan, nScanLength, cudaMemcpyDeviceToHost)); - - pDstOutput += nScanLength; - writeMarker(0x0D9, pDstOutput); - - { - // Write result to file. 
- std::ofstream outputFile(szOutputFile, ios::out | ios::binary); - outputFile.write(reinterpret_cast<const char *>(pDstJpeg), static_cast<int>(pDstOutput - pDstJpeg)); - } - - // Cleanup - cudaFree(pJpegEncoderTemp); - delete[] pDstJpeg; - for (int i = 0; i < 3; ++i) - { - cudaFree(apdDCT[i]); - cudaFree(apSrcImage[i]); - } - - return EXIT_SUCCESS; -} - - -int jpegNPP(const char *szOutputFile, unsigned char* d_srcRGB, int img_width, int img_height) -{ - NppiSize aSrcSize[3]; - Npp16s *apdDCT[3] = { 0, 0, 0 }; - Npp32s aDCTStep[3]; - - Npp8u *apSrcImage[3] = { 0, 0, 0 }; - Npp32s aSrcImageStep[3]; - size_t aSrcPitch[3]; - - - //????帧头 - oFrameHeader.nWidth = img_width; - oFrameHeader.nHeight = img_height; - - for (int i = 0; i < oFrameHeader.nComponents; ++i) - { - NppiSize oBlocks; - NppiSize oBlocksPerMCU = { oFrameHeader.aSamplingFactors[i] >> 4, oFrameHeader.aSamplingFactors[i] & 0x0f }; - - oBlocks.width = (int)ceil((oFrameHeader.nWidth + 7) / 8 * - static_cast<float>(oBlocksPerMCU.width) / nMCUBlocksH); - oBlocks.width = DivUp(oBlocks.width, oBlocksPerMCU.width) * oBlocksPerMCU.width; - - oBlocks.height = (int)ceil((oFrameHeader.nHeight + 7) / 8 * - static_cast<float>(oBlocksPerMCU.height) / nMCUBlocksV); - oBlocks.height = DivUp(oBlocks.height, oBlocksPerMCU.height) * oBlocksPerMCU.height; - - aSrcSize[i].width = oBlocks.width * 8; - aSrcSize[i].height = oBlocks.height * 8; - - // Allocate Memory - size_t nPitch; - //LOG_INFO("NPP_CHECK_CUDA:%d",1); - NPP_CHECK_CUDA(cudaMallocPitch(&apdDCT[i], &nPitch, oBlocks.width * 64 * sizeof(Npp16s), oBlocks.height)); - aDCTStep[i] = static_cast<Npp32s>(nPitch); - - //LOG_INFO("NPP_CHECK_CUDA:%d",2); - NPP_CHECK_CUDA(cudaMallocPitch(&apSrcImage[i], &nPitch, aSrcSize[i].width, aSrcSize[i].height)); - - aSrcPitch[i] = nPitch; - aSrcImageStep[i] = static_cast<Npp32s>(nPitch); - } - - //RGB2YUV - cudaError_t cudaStatus; - cudaStatus = cuda_common::RGB2YUV(d_srcRGB, img_width, img_height, - apSrcImage[0], aSrcPitch[0], 
aSrcSize[0].width, aSrcSize[0].height, - apSrcImage[1], aSrcPitch[1], aSrcSize[1].width, aSrcSize[1].height, - apSrcImage[2], aSrcPitch[2], aSrcSize[2].width, aSrcSize[2].height); - - /** - * Forward DCT, quantization and level shift part of the JPEG encoding. - * Input is expected in 8x8 macro blocks and output is expected to be in 64x1 - * macro blocks. The new version of the primitive takes the ROI in image pixel size and - * works with DCT coefficients that are in zig-zag order. - */ - int k = 0; - //LOG_INFO("NPP_CHECK_CUDA:%d",3); - if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[0], aSrcImageStep[0], - apdDCT[0], aDCTStep[0], - pdQuantizationTables + k * 64, - aSrcSize[0], - pDCTState))) - { - printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); - return EXIT_FAILURE; - } - k = 1; - - //LOG_INFO("NPP_CHECK_CUDA:%d",4); - if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[1], aSrcImageStep[1], - apdDCT[1], aDCTStep[1], - pdQuantizationTables + k * 64, - aSrcSize[1], - pDCTState))) - { - printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); - return EXIT_FAILURE; - } - - //LOG_INFO("NPP_CHECK_CUDA:%d",5); - if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[2], aSrcImageStep[2], - apdDCT[2], aDCTStep[2], - pdQuantizationTables + k * 64, - aSrcSize[2], - pDCTState))) - { - printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); - return EXIT_FAILURE; - } - - // Huffman Encoding - - Npp32s nScanLength; - Npp8u *pJpegEncoderTemp; - -#if (CUDA_VERSION == 8000) - Npp32s nTempSize; //when using CUDA8 -#else - size_t nTempSize; //when using CUDA9 -#endif - //modified by Junlin 190221 - - //LOG_INFO("NPP_CHECK_CUDA:%d",6); - if (NPP_SUCCESS != (nppiEncodeHuffmanGetSize(aSrcSize[0], 3, &nTempSize))) - { - printf("nppiEncodeHuffmanGetSize Failed!\n"); - return EXIT_FAILURE; - } - - //LOG_INFO("NPP_CHECK_CUDA:%d",7); - NPP_CHECK_CUDA(cudaMalloc(&pJpegEncoderTemp, nTempSize)); - - /** - * 
Allocates memory and creates a Huffman table in a format that is suitable for the encoder. - */ - NppStatus t_status; - t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[0].aCodes, nppiDCTable, &apHuffmanDCTable[0]); - t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[0].aCodes, nppiACTable, &apHuffmanACTable[0]); - t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[1]); - t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[1]); - t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[2]); - t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[2]); - - /** - * Huffman Encoding of the JPEG Encoding. - * Input is expected to be 64x1 macro blocks and output is expected as byte stuffed huffman encoded JPEG scan. - */ - Npp32s nSs = 0; - Npp32s nSe = 63; - Npp32s nH = 0; - Npp32s nL = 0; - //LOG_INFO("NPP_CHECK_CUDA:%d",8); - if (NPP_SUCCESS != (nppiEncodeHuffmanScan_JPEG_8u16s_P3R(apdDCT, aDCTStep, - 0, nSs, nSe, nH, nL, - pdScan, &nScanLength, - apHuffmanDCTable, - apHuffmanACTable, - aSrcSize, - pJpegEncoderTemp))) - { - printf("nppiEncodeHuffmanScan_JPEG_8u16s_P3R Failed!\n"); - return EXIT_FAILURE; - } - - for (int i = 0; i < 3; ++i) - { - nppiEncodeHuffmanSpecFree_JPEG(apHuffmanDCTable[i]); - nppiEncodeHuffmanSpecFree_JPEG(apHuffmanACTable[i]); - } - // Write JPEG - pDstJpeg = new unsigned char[4 << 20]{}; - pDstOutput = pDstJpeg; - - writeMarker(0x0D8, pDstOutput); - writeJFIFTag(pDstOutput); - writeQuantizationTable(aQuantizationTables[0], pDstOutput); - writeQuantizationTable(aQuantizationTables[1], pDstOutput); - writeHuffmanTable(pHuffmanDCTables[0], pDstOutput); - writeHuffmanTable(pHuffmanACTables[0], pDstOutput); - writeHuffmanTable(pHuffmanDCTables[1], pDstOutput); - writeHuffmanTable(pHuffmanACTables[1], pDstOutput); 
- writeFrameHeader(oFrameHeader, pDstOutput); - writeScanHeader(oScanHeader, pDstOutput); - - //LOG_INFO("NPP_CHECK_CUDA:%d",9); - NPP_CHECK_CUDA(cudaMemcpy(pDstOutput, pdScan, nScanLength, cudaMemcpyDeviceToHost)); - - pDstOutput += nScanLength; - writeMarker(0x0D9, pDstOutput); - - { - // Write result to file. - std::ofstream outputFile(szOutputFile, ios::out | ios::binary); - outputFile.write(reinterpret_cast<const char *>(pDstJpeg), static_cast<int>(pDstOutput - pDstJpeg)); - } - - // Cleanup - cudaFree(pJpegEncoderTemp); - delete[] pDstJpeg; - for (int i = 0; i < 3; ++i) - { - cudaFree(apdDCT[i]); - cudaFree(apSrcImage[i]); - } - - return EXIT_SUCCESS; -} -- libgit2 0.21.4