From 92989af0db1827cabea63ec2a20ff37eb9ac047d Mon Sep 17 00:00:00 2001 From: ming Date: Mon, 27 Feb 2023 17:18:36 +0800 Subject: [PATCH] 更新解码器 --- .gitignore | 2 ++ .vscode/launch.json | 2 +- README.md | 2 +- src/AbstractDecoder.cpp | 114 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/AbstractDecoder.h | 30 +++++++++++++++++++++++++++++- src/DrawImageOnGPU.cu | 126 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/FFCuContextManager.cpp | 5 +++-- src/FFNvDecoder.cpp | 84 +++++++++++++++++++++++++++++++++++++++++++++--------------------------------------- src/FFNvDecoder.h | 5 ++--- src/FFNvDecoderManager.cpp | 44 +++++++++++++++++++++++++++++++++++++++++--- src/FFNvDecoderManager.h | 13 ++++++------- src/FrameQueue.cpp | 85 ------------------------------------------------------------------------------------- src/FrameQueue.h | 42 ------------------------------------------ src/GpuRgbMemory.hpp | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/ImageSaveGPU.cpp | 123 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/ImageSaveGPU.h | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/NV12ToRGB.cu | 7 +++---- src/PartMemCopy.cu | 289 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/RGB2YUV.cu | 263 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/ResizeImage.cu | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/common/inc/helper_cuda_drvapi.h | 19 +++++-------------- src/cuda_kernels.h | 45 +++++++++++++++++++++++++++++++++++++++++---- src/define.hpp | 6 ++++++ src/gb28181/FFGB28181Decoder.cpp | 149 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------------------------------------------- src/gb28181/FFGB28181Decoder.h | 11 +++++++---- src/gb28181/RTPReceiver.cpp | 11 ++++++----- src/gb28181/RTPReceiver.h | 4 ++-- src/gb28181/RTPTcpReceiver.cpp | 111 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------- src/gb28181/RTPTcpReceiver.h | 12 +++++++++--- src/gb28181/RTPUdpReceiver.cpp | 15 +++------------ src/gb28181/common_header.h | 8 ++++++++ src/gb28181/demuxer.h | 2 ++ src/jpegNPP.cpp-1 | 1193 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/main.cpp | 30 +++++++++++------------------- src/utiltools.hpp | 18 ++++++++++++++++++ 35 files changed, 2765 insertions(+), 340 deletions(-) create mode 100644 src/AbstractDecoder.cpp create mode 100644 src/DrawImageOnGPU.cu delete mode 100644 src/FrameQueue.cpp delete mode 100644 src/FrameQueue.h create mode 100644 src/GpuRgbMemory.hpp create mode 100644 src/ImageSaveGPU.cpp create mode 100644 src/ImageSaveGPU.h create mode 100644 src/PartMemCopy.cu create mode 100644 src/RGB2YUV.cu create mode 100644 src/ResizeImage.cu create mode 100644 src/gb28181/common_header.h create mode 100644 src/jpegNPP.cpp-1 create mode 100644 src/utiltools.hpp diff --git a/.gitignore b/.gitignore index 1b7582d..43dff6d 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,5 @@ ffmpeg-4.2.2/ .vscode/ bin/ .idea/ + +3rdparty/ \ No newline at end of file diff --git a/.vscode/launch.json b/.vscode/launch.json index 8c146b1..cc4d00d 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -6,7 +6,7 @@ "type": "cppdbg", "request": "launch", "program": "${workspaceFolder}/bin/lib/test", - "args": ["rtsp://122.97.218.170:8604/openUrl/V5nXRHa?params=eyJwcm90b2NhbCI6InJ0c3AiLCJjbGllbnRUeXBlIjoib3Blbl9hcGkiLCJleHByaWVUaW1lIjotMSwicHJvdG9jb2wiOiJydHNwIiwiZXhwaXJlVGltZSI6MzAwLCJlbmFibGVNR0MiOnRydWUsImV4cGFuZCI6InN0YW5kYXJkPXJ0c3Amc3RyZWFtZm9ybT1ydHAiLCJhIjoiMTBjZjM4N2JjY2Y5NDg3YzhjNWYzNjE2M2ViMWUyNTJ8MXwwfDEiLCJ0IjoxfQ==","0"], + "args": ["rtsp","3", "30012"], "stopAtEntry": false, "cwd": "${workspaceFolder}/bin/lib", "environment": [], diff --git a/README.md b/README.md index 9e1def6..4f3dc70 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ 支持 cuvid 需要安装 nv-codec-headers, 进入 nv-codec-headers 文件夹后以sudo权限make && make install即可 3. 编译ffmpeg ~~~ -./configure --enable-debug --extra-cflags=-g --extra-ldflags=-g --disable-optimizations --disable-stripping --enable-cuda --enable-cuvid --enable-nvenc --disable-x86asm --enable-nonfree --enable-libnpp --extra-cflags=-I/usr/local/cuda-11.7/targets/x86_64-linux/include --extra-cflags=-fPIC --extra-ldflags=-L/usr/local/cuda-11.7/targets/x86_64-linux/lib --enable-shared --enable-pic --enable-ffplay --prefix=../bin +./configure --enable-debug --extra-cflags=-g --extra-ldflags=-g --disable-optimizations --disable-stripping --enable-cuda --enable-cuvid --enable-nvenc --disable-x86asm --enable-nonfree --enable-libnpp --disable-vaapi --extra-cflags=-I/usr/local/cuda-11.7/targets/x86_64-linux/include --extra-cflags=-fPIC --extra-ldflags=-L/usr/local/cuda-11.7/targets/x86_64-linux/lib --enable-shared --enable-pic --enable-ffplay --prefix=../bin ~~~ 其中以下是用于调试的,编译release可以去掉: ~~~ diff --git a/src/AbstractDecoder.cpp b/src/AbstractDecoder.cpp new file mode 100644 index 0000000..0e51524 --- /dev/null +++ b/src/AbstractDecoder.cpp @@ -0,0 +1,114 @@ +#include "AbstractDecoder.h" + +#include "logger.hpp" +#include "GpuRgbMemory.hpp" +#include "cuda_kernels.h" + +#include "utiltools.hpp" + + +FFImgInfo* AbstractDecoder::snapshot(){ + + // 锁住停止队列消耗 + std::lock_guard l(m_snapshot_mutex); + + AVFrame * gpuFrame = nullptr; + + bool bFirst = true; + while(true){ + m_queue_mutex.lock(); + if(mFrameQueue.size() <= 0){ + m_queue_mutex.unlock(); + if(bFirst){ + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + bFirst = false; + continue; + }else{ + // 再进来说明前面已经等了 100 ms + // 100 ms都没有等到解码数据,则退出 + return nullptr; + } + } + + // 队列中数据大于1 + gpuFrame = mFrameQueue.front(); + m_queue_mutex.unlock(); + break; + } + + if (gpuFrame != nullptr && gpuFrame->format == AV_PIX_FMT_CUDA ){ + LOG_DEBUG("decode task: gpuid: {} width: {} height: {}", m_cfg.gpuid, gpuFrame->width, gpuFrame->height); + GpuRgbMemory* gpuMem = new GpuRgbMemory(3, gpuFrame->width, gpuFrame->height, getName(), m_cfg.gpuid , true); + + if (gpuMem->getMem() == nullptr){ + LOG_ERROR("new GpuRgbMemory failed !!!"); + return nullptr; + } + + cudaSetDevice(atoi(m_cfg.gpuid.c_str())); + cuda_common::setColorSpace( ITU_709, 0 ); + cudaError_t cudaStatus = cuda_common::CUDAToBGR((CUdeviceptr)gpuFrame->data[0],(CUdeviceptr)gpuFrame->data[1], gpuFrame->linesize[0], gpuFrame->linesize[1], gpuMem->getMem(), gpuFrame->width, gpuFrame->height); + cudaDeviceSynchronize(); + if (cudaStatus != cudaSuccess) { + LOG_ERROR("CUDAToBGR failed failed !!!"); + return nullptr; + } + + unsigned char * pHwRgb = gpuMem->getMem(); + int channel = gpuMem->getChannel(); + int width = gpuMem->getWidth(); + int height = gpuMem->getHeight(); + + if (pHwRgb != nullptr && channel > 0 && width > 0 && height > 0){ + int nSize = channel * height * width; + + LOG_INFO("channel:{} height:{} width:{}", channel, height, width); + // unsigned char* cpu_data = new unsigned char[nSize]; + + unsigned char* cpu_data = (unsigned char *)av_malloc(nSize * sizeof(unsigned char)); + + cudaMemcpy(cpu_data, pHwRgb, nSize * sizeof(unsigned char), cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + + delete gpuMem; + gpuMem = nullptr; + + FFImgInfo* imgInfo = new FFImgInfo(); + imgInfo->dec_name = m_dec_name; + imgInfo->pData = cpu_data; + imgInfo->height = height; + imgInfo->width = width; + imgInfo->timestamp = UtilTools::get_cur_time_ms(); + imgInfo->index = m_index; + + m_index++; + + return imgInfo; + } + + delete gpuMem; + gpuMem = nullptr; + } + + return nullptr; +} + +bool AbstractDecoder::isSnapTime(){ + if(m_snap_time_interval <= 0){ + return false; + } + long cur_time = UtilTools::get_cur_time_ms(); + if(cur_time - m_last_snap_time > m_snap_time_interval){ + return true; + } + return false; +} + +void AbstractDecoder::updateLastSnapTime(){ + m_last_snap_time = UtilTools::get_cur_time_ms(); +} + +void AbstractDecoder::setSnapTimeInterval(long interval){ + m_snap_time_interval = interval; + m_last_snap_time = UtilTools::get_cur_time_ms(); +} \ No newline at end of file diff --git a/src/AbstractDecoder.h b/src/AbstractDecoder.h index 8f696bc..b5a5665 100644 --- a/src/AbstractDecoder.h +++ b/src/AbstractDecoder.h @@ -15,6 +15,9 @@ extern "C" #include } +#include +#include + using namespace std; /************************************************** @@ -32,7 +35,7 @@ typedef void(*POST_DECODE_CALLBACK)(const void * userPtr, AVFrame * gpuFrame); typedef void(*DECODE_FINISHED_CALLBACK)(const void* userPtr); -typedef bool(*DECODE_REQUEST_STREAM_CALLBACK)(); +typedef bool(*DECODE_REQUEST_STREAM_CALLBACK)(const char* deviceId); struct FFDecConfig{ string uri; // 视频地址 @@ -51,6 +54,15 @@ enum DECODER_TYPE{ DECODER_TYPE_FFMPEG }; +struct FFImgInfo{ + string dec_name; + int width; + int height; + unsigned char * pData; + long timestamp; + long index; +}; + class AbstractDecoder { public: virtual ~AbstractDecoder(){}; @@ -83,6 +95,14 @@ public: return m_dec_name; } + FFImgInfo* snapshot(); + + bool isSnapTime(); + + void updateLastSnapTime(); + + void setSnapTimeInterval(long interval); + public: const void * m_postDecArg; POST_DECODE_CALLBACK post_decoded_cbk; @@ -95,6 +115,14 @@ public: bool m_dec_keyframe; FFDecConfig m_cfg; + + queue mFrameQueue; + mutex m_queue_mutex; + mutex m_snapshot_mutex; + + long m_snap_time_interval{-1}; + long m_last_snap_time; + long m_index{0}; }; #endif // _ABSTRACT_DECODER_H_ \ No newline at end of file diff --git a/src/DrawImageOnGPU.cu b/src/DrawImageOnGPU.cu new file mode 100644 index 0000000..8770cea --- /dev/null +++ b/src/DrawImageOnGPU.cu @@ -0,0 +1,126 @@ +#include "cuda_kernels.h" + +#include "logger.hpp" + +typedef unsigned char uchar; +typedef unsigned int uint32; +typedef int int32; + +namespace cuda_common +{ + __global__ void kernel_drawPixel(float* d_srcRGB, int src_width, int src_height, + int left, int top, int right, int bottom) + { + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (((x == left || x == right) && y >= top && y <= bottom) || ((y == top || y == bottom) && x >= left && x <= right)) + { + d_srcRGB[(y*src_width) + x] = 0; + d_srcRGB[(src_width*src_height) + (y*src_width) + x] = 255; + d_srcRGB[(2 * src_width*src_height) + (y*src_width) + x] = 0; + } + } + + cudaError_t DrawImage(float* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom) + { + dim3 block(32, 16, 1); + dim3 grid((src_width + (block.x - 1)) / block.x, (src_height + (block.y - 1)) / block.y, 1); + + kernel_drawPixel << < grid, block >> >(d_srcRGB, src_width, src_height, left, top, right, bottom); + + cudaError_t cudaStatus = cudaGetLastError(); + if (cudaStatus != cudaSuccess) { + LOG_ERROR("Draw 32 kernel_memcopy launch failed:{}",cudaGetErrorString(cudaStatus)); + return cudaStatus; + } + + cudaStatus = cudaDeviceSynchronize(); + if (cudaStatus != cudaSuccess) { + LOG_ERROR("cudaDeviceSynchronize returned error code {} after launching kernel_bilinear!", cudaStatus); + return cudaStatus; + } + + return cudaStatus; + } + + __global__ void kernel_drawPixel(unsigned char* d_srcRGB, int src_width, int src_height, + int left, int top, int right, int bottom) + { + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (((x == left || x == right) && y >= top && y <= bottom) || ((y == top || y == bottom) && x >= left && x <= right)) + { + d_srcRGB[(y*src_width) + x] = 0; + d_srcRGB[(src_width*src_height) + (y*src_width) + x] = 255; + d_srcRGB[(2 * src_width*src_height) + (y*src_width) + x] = 0; + } + } + + cudaError_t DrawImage(unsigned char* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom) + { + dim3 block(32, 16, 1); + dim3 grid((src_width + (block.x - 1)) / block.x, (src_height + (block.y - 1)) / block.y, 1); + + kernel_drawPixel << < grid, block >> >(d_srcRGB, src_width, src_height, left, top, right, bottom); + + cudaError_t cudaStatus = cudaGetLastError(); + if (cudaStatus != cudaSuccess) { + LOG_ERROR("Draw 68 kernel_memcopy launch failed: {}",cudaGetErrorString(cudaStatus)); + return cudaStatus; + } + + cudaStatus = cudaDeviceSynchronize(); + if (cudaStatus != cudaSuccess) { + LOG_ERROR("cudaDeviceSynchronize returned error code {} after launching kernel_bilinear!", cudaStatus); + return cudaStatus; + } + + return cudaStatus; + } + + __global__ void kernel_drawLine(float* d_srcRGB, int src_width, int src_height, + int begin_x, int begin_y, int end_x, int end_y) + { + int min_x = end_x < begin_x ? end_x : begin_x; + int max_x = end_x < begin_x ? begin_x : end_x; + + int min_y = end_y < begin_y ? end_y : begin_y; + int max_y = end_y < begin_y ? begin_y : end_y; + + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; + + if ((x - begin_x) * (end_y - begin_y) == (end_x - begin_x) * (y - begin_y) + && min_x <= x && x <= max_x + && min_y <= y && y <= max_y) + { + d_srcRGB[(y*src_width) + x] = 0; + d_srcRGB[(src_width*src_height) + (y*src_width) + x] = 255; + d_srcRGB[(2 * src_width*src_height) + (y*src_width) + x] = 0; + } + } + + cudaError_t DrawLine(float* d_srcRGB, int src_width, int src_height, int begin_x, int begin_y, int end_x, int end_y) + { + dim3 block(32, 16, 1); + dim3 grid((src_width + (block.x - 1)) / block.x, (src_height + (block.y - 1)) / block.y, 1); + + kernel_drawLine << < grid, block >> >(d_srcRGB, src_width, src_height, begin_x, begin_y, end_x, end_y); + + cudaError_t cudaStatus = cudaGetLastError(); + if (cudaStatus != cudaSuccess) { + LOG_ERROR("Draw 112 kernel_memcopy launch failed: {}",cudaGetErrorString(cudaStatus)); + return cudaStatus; + } + + cudaStatus = cudaDeviceSynchronize(); + if (cudaStatus != cudaSuccess) { + LOG_ERROR("cudaDeviceSynchronize returned error code {} after launching kernel_bilinear!", cudaStatus); + return cudaStatus; + } + + return cudaStatus; + } +} \ No newline at end of file diff --git a/src/FFCuContextManager.cpp b/src/FFCuContextManager.cpp index 8f5b09a..db097d6 100644 --- a/src/FFCuContextManager.cpp +++ b/src/FFCuContextManager.cpp @@ -1,5 +1,6 @@ #include "FFCuContextManager.h" -#include + +#include "logger.hpp" using namespace std; @@ -19,7 +20,7 @@ AVBufferRef *FFCuContextManager::getCuCtx(string gpuid) // 初始化硬件解码器 if (av_hwdevice_ctx_create(&hw_device_ctx, AV_HWDEVICE_TYPE_CUDA, gpuid.c_str(), nullptr, 0) < 0) { - av_log(nullptr, AV_LOG_ERROR, "Failed to create specified HW device ! \n"); + LOG_ERROR("Failed to create specified HW device."); return nullptr; } ctxMap[gpuid] = hw_device_ctx; diff --git a/src/FFNvDecoder.cpp b/src/FFNvDecoder.cpp index 4b20546..9aff5fd 100644 --- a/src/FFNvDecoder.cpp +++ b/src/FFNvDecoder.cpp @@ -10,6 +10,8 @@ #include "logger.hpp" +#include "utiltools.hpp" + using namespace std; // 参考博客: https://blog.csdn.net/qq_40116098/article/details/120704340 @@ -175,18 +177,6 @@ bool FFNvDecoder::start(){ return true; } -static long long get_cur_time(){ - // 获取操作系统当前时间点(精确到微秒) - chrono::time_point tpMicro - = chrono::time_point_cast(chrono::system_clock::now()); - // (微秒精度的)时间点 => (微秒精度的)时间戳 - time_t totalMicroSeconds = tpMicro.time_since_epoch().count(); - - long long currentTime = ((long long)totalMicroSeconds)/1000; - - return currentTime; -} - void FFNvDecoder::decode_thread() { AVPacket* pkt ; @@ -202,7 +192,7 @@ void FFNvDecoder::decode_thread() } ,this); - // long start_time = get_cur_time(); + // long start_time = UtilTools::get_cur_time_ms(); while (m_bRunning) { @@ -214,13 +204,6 @@ void FFNvDecoder::decode_thread() continue; } } - - AVFrame * gpuFrame = mFrameQueue.getTail(); - if (gpuFrame == nullptr) - { - std::this_thread::sleep_for(std::chrono::milliseconds(1)); - continue; - } int result = av_read_frame(fmt_ctx, pkt); if (result == AVERROR_EOF || result < 0) @@ -247,25 +230,37 @@ void FFNvDecoder::decode_thread() if (stream_index == pkt->stream_index){ result = avcodec_send_packet(avctx, pkt); if (result < 0){ + av_packet_unref(pkt); LOG_ERROR("{} - Failed to send pkt: {}", m_dec_name, result); continue; } + AVFrame* gpuFrame = av_frame_alloc(); result = avcodec_receive_frame(avctx, gpuFrame); if ((result == AVERROR(EAGAIN) || result == AVERROR_EOF) || result < 0){ LOG_ERROR("{} - Failed to receive frame: {}", m_dec_name, result); + av_frame_free(&gpuFrame); + av_packet_unref(pkt); continue; } + av_packet_unref(pkt); - mFrameQueue.addTail(); + if(gpuFrame != nullptr){ + m_queue_mutex.lock(); + if(mFrameQueue.size() <= 10){ + mFrameQueue.push(gpuFrame); + }else{ + av_frame_free(&gpuFrame); + } + m_queue_mutex.unlock(); + } } av_packet_unref(pkt); } m_bRunning = false; - // long end_time = get_cur_time(); - + // long end_time = UtilTools::get_cur_time_ms(); // cout << "解码用时:" << end_time - start_time << endl; if (m_post_decode_thread != 0) @@ -277,6 +272,13 @@ void FFNvDecoder::decode_thread() decode_finished(); + // 清空队列 + while(mFrameQueue.size() > 0){ + AVFrame * gpuFrame = mFrameQueue.front(); + av_frame_free(&gpuFrame); + mFrameQueue.pop(); + } + LOG_INFO("{} - decode thread exited.", m_dec_name); } @@ -302,24 +304,25 @@ void FFNvDecoder::post_decode_thread(){ } int index = 0; - while (m_bRunning || mFrameQueue.length() > 0) + while (m_bRunning) { - AVFrame * gpuFrame = mFrameQueue.getHead(); - if (gpuFrame == nullptr) - { - std::this_thread::sleep_for(std::chrono::milliseconds(3)); - continue; - } + if(mFrameQueue.size() > 0){ + std::lock_guard l(m_snapshot_mutex); + // 取队头数据 + m_queue_mutex.lock(); + AVFrame * gpuFrame = mFrameQueue.front(); + mFrameQueue.pop(); + m_queue_mutex.unlock(); + // 跳帧 + if (skip_frame == 1 || index % skip_frame == 0){ + post_decoded_cbk(m_postDecArg, gpuFrame); + index = 0; + } - // 跳帧 - if (skip_frame == 1 || index % skip_frame == 0){ - post_decoded_cbk(m_postDecArg, gpuFrame); - index = 0; - } - - mFrameQueue.addHead(); + av_frame_free(&gpuFrame); - index++; + index++; + } } LOG_INFO("post decode thread exited."); @@ -374,7 +377,10 @@ void FFNvDecoder::setDecKeyframe(bool bKeyframe) } int FFNvDecoder::getCachedQueueLength(){ - return mFrameQueue.length(); + m_queue_mutex.lock(); + int queue_size = mFrameQueue.size(); + m_queue_mutex.lock(); + return queue_size; } float FFNvDecoder::fps(){ diff --git a/src/FFNvDecoder.h b/src/FFNvDecoder.h index 3a7c2b9..68d2a2f 100644 --- a/src/FFNvDecoder.h +++ b/src/FFNvDecoder.h @@ -1,10 +1,10 @@ #include #include -#include "FrameQueue.h" - #include "AbstractDecoder.h" +#include + using namespace std; class FFNvDecoder : public AbstractDecoder{ @@ -55,7 +55,6 @@ private: bool m_bFinished; bool m_bPause; - FrameQueue mFrameQueue; bool m_bReal; // 是否实时流 diff --git a/src/FFNvDecoderManager.cpp b/src/FFNvDecoderManager.cpp index edb4669..b15ef22 100644 --- a/src/FFNvDecoderManager.cpp +++ b/src/FFNvDecoderManager.cpp @@ -116,11 +116,12 @@ AbstractDecoder* FFNvDecoderManager::getDecoderByName(const string name) return nullptr; } -void FFNvDecoderManager::startDecode(AbstractDecoder* dec){ +bool FFNvDecoderManager::startDecode(AbstractDecoder* dec){ if (dec != nullptr && !dec->isRunning()) { - dec->start(); + return dec->start(); } + return false; } bool FFNvDecoderManager::startDecodeByName(const string name){ @@ -486,7 +487,7 @@ FFImgInfo* FFNvDecoderManager::snapshot(const string& uri){ } // 计算解码后原始数据所需缓冲区大小,并分配内存空间 Determine required buffer size and allocate buffer - numBytes = av_image_get_buffer_size(AV_PIX_FMT_RGB24, codec_ctx->width, codec_ctx->height, 1); + numBytes = av_image_get_buffer_size(AV_PIX_FMT_BGR24, codec_ctx->width, codec_ctx->height, 1); buffer = (uint8_t *)av_malloc(numBytes * sizeof(uint8_t)); pFrameRGB = av_frame_alloc(); @@ -560,3 +561,40 @@ void FFNvDecoderManager::releaseFFImgInfo(FFImgInfo* info){ info = nullptr; } } + +FFImgInfo* FFNvDecoderManager::snapshot_in_task(const string name){ + if (name.empty()){ + LOG_ERROR("name 为空!"); + return nullptr; + } + + std::lock_guard l(m_mutex); + + auto dec = decoderMap.find(name); + if (dec != decoderMap.end()){ + return dec->second->snapshot(); + } + + LOG_ERROR("没有找到name为{}的解码器",name); + return nullptr; +} + +vector FFNvDecoderManager::timing_snapshot_all(){ + + closeAllFinishedDecoder(); + + std::lock_guard l(m_mutex); + + vector vec; + for(auto it = decoderMap.begin(); it != decoderMap.end(); ++it){ + if(it->second->isSnapTime()){ + FFImgInfo* imginfo = it->second->snapshot(); + if(imginfo != nullptr){ + vec.push_back(imginfo); + } + it->second->updateLastSnapTime(); + } + } + + return vec; +} \ No newline at end of file diff --git a/src/FFNvDecoderManager.h b/src/FFNvDecoderManager.h index 4eaca45..685b1f9 100644 --- a/src/FFNvDecoderManager.h +++ b/src/FFNvDecoderManager.h @@ -14,14 +14,9 @@ struct MgrDecConfig string name{""}; // 解码器名称 }; -struct FFImgInfo{ - int width; - int height; - unsigned char * pData; -}; - /** * 解码器管理类,单例类 + * 谨防死锁 **/ class FFNvDecoderManager { public: @@ -90,7 +85,7 @@ public: * 返回:void * 备注: **************************************************/ - void startDecode(AbstractDecoder*); + bool startDecode(AbstractDecoder*); /************************************************** * 接口:startAllDecode @@ -257,6 +252,10 @@ public: **************************************************/ void releaseFFImgInfo(FFImgInfo* info); + FFImgInfo* snapshot_in_task(const string name); + + vector timing_snapshot_all(); + private: FFNvDecoderManager(){} diff --git a/src/FrameQueue.cpp b/src/FrameQueue.cpp deleted file mode 100644 index 23abef1..0000000 --- a/src/FrameQueue.cpp +++ /dev/null @@ -1,85 +0,0 @@ -#include "FrameQueue.h" - -FrameQueue::FrameQueue(/* args */) -{ - for (size_t i = 0; i < Maxsize; i++) - { - base[i] = av_frame_alloc(); - } - - front = rear = 0;//头指针和尾指针置为零,队列为空 -} - -FrameQueue::~FrameQueue() -{ - if (base) - { - for (size_t i = 0; i < Maxsize; i++) - { - if (base[i]) - { - av_frame_free(&base[i]); - } - } - } - - rear = front = 0; -} - -//循环队列的入队 -AVFrame* FrameQueue::getTail() -{ - //插入一个元素e为Q的新的队尾元素 - if ((rear + 1) % Maxsize == front) - return nullptr;//队满 - return base[rear];//获取队尾元素 -} - -// 将队尾元素添加到队列中 -void FrameQueue::addTail() -{ - rear = (rear + 1) % Maxsize;//队尾指针加1 -} - -//循环队列的出队 -AVFrame* FrameQueue::deQueue() -{ - //删除Q的队头元素,用e返回其值 - if (front == rear) - return nullptr;//队空 - AVFrame* e = base[front];//保存队头元素 - front = (front + 1) % Maxsize;//队头指针加1 - return e; -} - -//取循环队列的队头元素 -AVFrame* FrameQueue::getHead() -{ - //返回Q的队头元素,不修改队头指针 - if (front == rear) - return nullptr;//队列为空,取元素失败 - return base[front]; -} - -void FrameQueue::addHead() -{ - front = (front + 1) % Maxsize;//队头指针加1 -} - -int FrameQueue::length() -{ - return (rear - front + Maxsize) % Maxsize; -} - -bool FrameQueue::isEmpty() -{ - if (front == rear) - return true; - - return false; -} - -void FrameQueue::clearQueue() -{ - rear = front = 0; -} \ No newline at end of file diff --git a/src/FrameQueue.h b/src/FrameQueue.h deleted file mode 100644 index 743bad6..0000000 --- a/src/FrameQueue.h +++ /dev/null @@ -1,42 +0,0 @@ -#include -#include - -extern "C" -{ - #include - #include - #include - #include - #include - #include - #include -} - -using namespace std; - -#define Maxsize 5 // 循环队列的大小 - -// 循环队列 -class FrameQueue -{ -private: - /* data */ -public: - FrameQueue(/* args */); - ~FrameQueue(); - - AVFrame* getTail(); - void addTail(); - AVFrame* deQueue(); - AVFrame* getHead(); - void addHead(); - void clearQueue(); - - int length(); - bool isEmpty(); - -private: - AVFrame* base[Maxsize]; - atomic front; - atomic rear; -}; \ No newline at end of file diff --git a/src/GpuRgbMemory.hpp b/src/GpuRgbMemory.hpp new file mode 100644 index 0000000..8e3d15b --- /dev/null +++ b/src/GpuRgbMemory.hpp @@ -0,0 +1,86 @@ +#include + +#include "cuda_kernels.h" +#include "define.hpp" +#include "utiltools.hpp" + +using namespace std; + +class GpuRgbMemory{ + +public: + GpuRgbMemory(int _channel, int _width, int _height, string _id, string _gpuid, bool _isused){ + channel = _channel; + width = _width; + height = _height; + size = channel * width * height; + isused = _isused; + id = _id; + gpuid = _gpuid; + timestamp = UtilTools::get_cur_time_ms(); + + cudaSetDevice(atoi(gpuid.c_str())); + CHECK_CUDA(cudaMalloc((void **)&pHwRgb, size * sizeof(unsigned char))); + } + + ~GpuRgbMemory(){ + if (pHwRgb) { + cudaSetDevice(atoi(gpuid.c_str())); + CHECK_CUDA(cudaFree(pHwRgb)); + pHwRgb = nullptr; + } + } + + int getSize() { + return size; + } + + bool isIsused() { + return isused; + } + + void setIsused(bool _isused) { + isused = _isused; + // 更新时间戳 + timestamp = UtilTools::get_cur_time_ms(); + } + + string getId() { + return id; + } + + string getGpuId() { + return gpuid; + } + + unsigned char* getMem(){ + return pHwRgb; + } + + long long getTimesstamp(){ + return timestamp; + } + + int getWidth(){ + return width; + } + + int getHeight(){ + return height; + } + + int getChannel(){ + return channel; + } + +private: + int size; + bool isused; + string id; + string gpuid; + unsigned char * pHwRgb{nullptr}; + long long timestamp; + int width{0}; + int height{0}; + int channel{3}; +}; \ No newline at end of file diff --git a/src/ImageSaveGPU.cpp b/src/ImageSaveGPU.cpp new file mode 100644 index 0000000..9382a27 --- /dev/null +++ b/src/ImageSaveGPU.cpp @@ -0,0 +1,123 @@ +#include "cuda_kernels.h" + +#include "logger.hpp" + + +//int saveJPEG(const char *szOutputFile, float* d_srcRGB, int img_width, int img_height) +//{ +// return jpegNPP(szOutputFile, d_srcRGB, img_width, img_height); +// //return 0; +//} +// +//int saveJPEG(const char *szOutputFile, unsigned char* d_srcRGB, int img_width, int img_height) +//{ +// return jpegNPP(szOutputFile, d_srcRGB, img_width, img_height); +// //return 0; +//} +// +//int saveJPEG(const char *szOutputFile, unsigned char* d_srcRGB) +//{ +// return jpegNPP(szOutputFile, d_srcRGB); +//} +// +//int saveJPEG(const char *szOutputFile, float* d_srcRGB) +//{ +// return jpegNPP(szOutputFile, d_srcRGB); +//} + +int resizeFrame(float* d_srcRGB, int src_width, int src_height, float* d_dstRGB, int dst_width, int dst_height) +{ + cudaError_t cudaStatus = cuda_common::ResizeImage(d_srcRGB, src_width, src_height, d_dstRGB, dst_width, dst_height); + if (cudaStatus != cudaSuccess) { + LOG_ERROR("cuda_common::ResizeImage failed: {}",cudaGetErrorString(cudaStatus)); + return -1; + } + + return 0; +} + +//int initTables() +//{ +// initTable(); +// return 0; +//} +// +//int initTables(int flag, int width, int height) +//{ +// initTable(0, width, height); +// return 0; +//} + +int drawImageOnGPU(float* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom) +{ + cuda_common::DrawImage(d_srcRGB, src_width, src_height, left, top, right, bottom); + return 0; +} + +int drawImageOnGPU(unsigned char* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom) +{ + cuda_common::DrawImage(d_srcRGB, src_width, src_height, left, top, right, bottom); + return 0; +} + +int drawLineOnGPU(float* d_srcRGB, int src_width, int src_height, int begin_x, int begin_y, int end_x, int end_y) +{ + cuda_common::DrawLine(d_srcRGB, src_width, src_height, begin_x, begin_y, end_x, end_y); + return 0; +} + +//int releaseJpegSaver() +//{ +// releaseJpegNPP(); +// return 0; +//} + +int partMemCopy(unsigned char* d_srcRGB, int src_width, int src_height, unsigned char* d_dstRGB, int left, int top, int right, int bottom) +{ + cudaError_t cudaStatus = cuda_common::PartMemCopy(d_srcRGB, src_width, src_height, d_dstRGB, left, top, right, bottom); + if (cudaStatus != cudaSuccess) { + LOG_ERROR("cuda_common::77 PartMemCopy failed: {} {} {} {} {} {} {}",cudaGetErrorString(cudaStatus), left, top, right, bottom, src_height, d_dstRGB); + return -1; + } + + return 0; +} +//#include +//extern std::ofstream g_os; +int PartMemResizeBatch(unsigned char * d_srcRGB, int src_width, int src_height, unsigned char** d_dstRGB, + int count, int* vleft, int * vtop, int* vright, int* vbottom, int *dst_w, int *dst_h, + float submeanb, float submeang, float submeanr, + float varianceb, float varianceg, float variancer) +{ + //g_os << "cudaMemcpyHostToDevice begin 9" << std::endl; + cudaError_t cudaStatus = cuda_common::PartMemResizeBatch( + d_srcRGB, src_width, src_height, d_dstRGB, count, vleft, vtop, vright, vbottom, dst_w, dst_h, + submeanb, submeang, submeanr, + varianceb, varianceg, variancer); + //g_os << "cudaMemcpyHostToDevice end 9" << std::endl; + if (cudaStatus != cudaSuccess) { + LOG_ERROR("cuda_common::PartMemResizeBatch failed: {}",cudaGetErrorString(cudaStatus)); + return -1; + } + + return 0; +} + + +//int PartMemResizeBatch(float * d_srcRGB, int src_width, int src_height, unsigned char* d_dstRGB, +// int count, int* vleft, int * vtop, int* vright, int* vbottom, int dst_w, int dst_h, +// float submeanb, float submeang, float submeanr, +// float varianceb, float varianceg, float variancer) +// +//{ +// cudaError_t cudaStatus = cuda_common::PartMemResizeBatch( +// d_srcRGB, src_width, src_height, d_dstRGB, count, vleft, vtop, vright, vbottom, dst_w, dst_h, +// submeanb, submeang, submeanr, +// varianceb, varianceg, variancer); +// if (cudaStatus != cudaSuccess) { +// fprintf(stderr, "cuda_common::PartMemCopy failed: %s\n", cudaGetErrorString(cudaStatus)); +// return -1; +// } +// +// return 0; +//} \ No newline at end of file diff --git a/src/ImageSaveGPU.h b/src/ImageSaveGPU.h new file mode 100644 index 0000000..272a6d2 --- /dev/null +++ b/src/ImageSaveGPU.h @@ -0,0 +1,65 @@ +/******************************************************************************************* +* Version: VPT_x64_V2.0.0_20170904 +* CopyRight: 中科院自动化研究所模式识别实验室图像视频组 +* UpdateDate: 20170904 +* Content: 人车物监测跟踪 +********************************************************************************************/ + +#ifndef IMAGESAVEGPU_H_ +#define IMAGESAVEGPU_H_ + +#ifdef _MSC_VER + #ifdef IMAGESAVEGPU_EXPORTS + #define IMAGESAVEGPU_API __declspec(dllexport) + #else + #define IMAGESAVEGPU_API __declspec(dllimport) + #endif +#else +#define IMAGESAVEGPU_API __attribute__((visibility ("default"))) +#endif +// 功能:保存成jpeg文件 +// szOutputFile 输出图片路径,如D:\\out.jpg +// d_srcRGB 输入RGB数据,由cudaMalloc分配的显存空间,数据排列形式为:BBBBBB......GGGGGG......RRRRRRRR...... +// img_width RGB数据图片的宽度 +// img_height RGB数据图片的高度 +// +//IMAGESAVEGPU_API int saveJPEG(const char *szOutputFile, float* d_srcRGB, int img_width, int img_height); +//IMAGESAVEGPU_API int saveJPEG(const char *szOutputFile, float* d_srcRGB); +// +//IMAGESAVEGPU_API int saveJPEG(const char *szOutputFile, unsigned char* d_srcRGB, int img_width, int img_height); +//IMAGESAVEGPU_API int saveJPEG(const char *szOutputFile, unsigned char* d_srcRGB); + +// 功能:防缩图像 +IMAGESAVEGPU_API int resizeFrame(float* d_srcRGB, int src_width, int src_height, float* d_dstRGB, int dst_width, int dst_height); + +// 功能:部分拷贝数据 +IMAGESAVEGPU_API int partMemCopy(unsigned char* d_srcRGB, int src_width, int src_height, unsigned char* d_dstRGB, int left, int top, int right, int bottom); + +//IMAGESAVEGPU_API int partMemResizeImage(float * d_srcRGB, int src_width, int src_height, unsigned char** d_dstRGB, +// int* vleft, int * vtop, int* vright, int* vbottom, int *dst_w, int *dst_h, +// float submeanb, float submeang, float submeanr, +// float varianceb, float varianceg, float variancer); + + +IMAGESAVEGPU_API int PartMemResizeBatch(unsigned char * d_srcRGB, int src_width, int src_height, unsigned char** d_dstRGB, + int count, int* vleft, int * vtop, int* vright, int* vbottom, int *dst_w, int *dst_h, + float submeanb, float submeang, float submeanr, + float varianceb, float varianceg, float variancer); + + +//// 功能:初始化GPU保存图像的各种量化表 +//IMAGESAVEGPU_API int initTables(); +//IMAGESAVEGPU_API int initTables(int falg, int width, int height); +// +//// 功能:释放资源 +//IMAGESAVEGPU_API int releaseJpegSaver(); + +// 功能:在GPU中绘制快照包围框 +IMAGESAVEGPU_API int drawImageOnGPU(float* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom); + +IMAGESAVEGPU_API int drawImageOnGPU(unsigned char* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom); + +// 功能:在GPU中绘制直线 +IMAGESAVEGPU_API int drawLineOnGPU(float* d_srcRGB, int src_width, int src_height, int begin_x, int begin_y, int end_x, int end_y); + +#endif diff --git a/src/NV12ToRGB.cu b/src/NV12ToRGB.cu index 0ec5f00..58e1dff 100644 --- a/src/NV12ToRGB.cu +++ b/src/NV12ToRGB.cu @@ -257,14 +257,13 @@ namespace cuda_common dstImage[width * y * 3 + x * 3 + 5] = clip_v(red[1] * 0.25,0 ,255); } - cudaError_t setColorSpace(e_ColorSpace CSC, float hue) + cudaError_t setColorSpace(FF_ColorSpace CSC, float hue) { - float hueSin = sin(hue); float hueCos = cos(hue); float hueCSC[9]; - if (CSC == ITU601) + if (CSC == ITU_601) { //CCIR 601 hueCSC[0] = 1.1644f; @@ -277,7 +276,7 @@ namespace cuda_common hueCSC[7] = hueCos * 2.0172f; hueCSC[8] = hueSin * -2.0172f; } - else if (CSC == ITU709) + else if (CSC == ITU_709) { //CCIR 709 hueCSC[0] = 1.0f; diff --git a/src/PartMemCopy.cu b/src/PartMemCopy.cu new file mode 100644 index 0000000..396765b --- /dev/null +++ b/src/PartMemCopy.cu @@ -0,0 +1,289 @@ +#include "cuda_kernels.h" +#include +typedef unsigned char uchar; +typedef unsigned int uint32; +typedef int int32; + +#define MAX_SNAPSHOT_WIDTH 320 +#define MAX_SNAPSHOT_HEIGHT 320 + +namespace cuda_common +{ + __global__ void kernel_memcopy(unsigned char* d_srcRGB, int src_width, int src_height, + unsigned char* d_dstRGB, int left, int top, int right, int bottom) + { + const int dst_x = blockIdx.x * blockDim.x + threadIdx.x; + const int dst_y = blockIdx.y * blockDim.y + threadIdx.y; + const int dst_width = right - left; + const int dst_height = bottom - top; + if (dst_x < dst_width && dst_y < dst_height) + { + int src_x = left + dst_x; + int src_y = top + dst_y; + + //bgr...bgr...bgr... + d_dstRGB[(dst_y*dst_width + dst_x) * 3] = (unsigned char)d_srcRGB[(src_y*src_width + src_x) * 3]; + d_dstRGB[(dst_y*dst_width + dst_x) + * 3 + 1] = (unsigned char)d_srcRGB[(src_y*src_width + src_x) * 3 + 1]; + d_dstRGB[(dst_y*dst_width + dst_x) * 3 + 2] = (unsigned char)d_srcRGB[(src_y*src_width + src_x) * 3 + 2]; + + //bbb...ggg...rrr... + //d_dstRGB[(dst_y*dst_width) + dst_x] = (unsigned char)d_srcRGB[(src_y*src_width) + src_x]; + //d_dstRGB[(dst_width*dst_height) + (dst_y*dst_width) + dst_x] = (unsigned char)d_srcRGB[(src_width*src_height) + (src_y*src_width) + src_x]; + //d_dstRGB[(2 * dst_width*dst_height) + (dst_y*dst_width) + dst_x] = (unsigned char)d_srcRGB[(2 * src_width*src_height) + (src_y*src_width) + src_x]; + + /* memcpy(d_dstRGB + (dst_y*src_width) + dst_x, d_srcRGB + (src_y*src_width) + src_x, sizeof(float)); + memcpy(d_dstRGB + (src_width*src_height) + (dst_y*src_width) + dst_x, d_srcRGB + (src_width*src_height) + (src_y*src_width) + src_x, sizeof(float)); + memcpy(d_dstRGB + (2 * src_width*src_height) + (dst_y*src_width) + dst_x, d_srcRGB + (2 * src_width*src_height) + (src_y*src_width) + src_x, sizeof(float));*/ + } + } + + cudaError_t PartMemCopy(unsigned char* d_srcRGB, int src_width, int src_height, unsigned char* d_dstRGB, int left, int top, int right, int bottom) + { + dim3 block(32, 16, 1); + dim3 grid(((right - left) + (block.x - 1)) / block.x, ((bottom - top) + (block.y - 1)) / block.y, 1); + + kernel_memcopy << < grid, block >> > (d_srcRGB, src_width, src_height, d_dstRGB, left, top, right, bottom); + + cudaError_t cudaStatus = cudaGetLastError(); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "Part 50 kernel_memcopy launch failed: %s\n", cudaGetErrorString(cudaStatus)); + return cudaStatus; + } + cudaStatus = cudaDeviceSynchronize(); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_bilinear!\n", cudaStatus); + return cudaStatus; + } + return cudaStatus; + } + + + // __global__ void kernel_memcopy_mean_variance(float* d_srcRGB, int src_width, int src_height, + // unsigned char* vd_dstRGB, int count, int * vleft, int* vtop, int* vright, int * vbottom, float submeanb,float submeang, float submeanr, float varianceb,float varianceg, float variancer) + // { + // const int dst_x = blockIdx.x * blockDim.x + threadIdx.x; + // const int dst_y = blockIdx.y * blockDim.y + threadIdx.y; + // for (int i=0;i srcimg_width - 2) + { + ax = srcimg_width - 2; + } + if (ay < 0) { + ay = 0; + } + if (ay > srcimg_height - 2) + { + ay = srcimg_height - 2; + } + + int A = ax + ay*srcimg_width; + int B = ax + ay*srcimg_width + 1; + int C = ax + ay*srcimg_width + srcimg_width; + int D = ax + ay*srcimg_width + srcimg_width + 1; + + float w1, w2, w3, w4; + w1 = fx - ax; + w2 = 1 - w1; + w3 = fy - ay; + w4 = 1 - w3; + float blue = src_img[A * 3] * w2*w4 + src_img[B * 3] * w1*w4 + src_img[C * 3] * w2*w3 + src_img[D * 3] * w1*w3; + float green = src_img[A * 3 + 1] * w2*w4 + src_img[B * 3 + 1] * w1*w4 + + src_img[C * 3 + 1] * w2*w3 + src_img[D * 3 + 1] * w1*w3; + float red = src_img[A * 3 + 2] * w2*w4 + src_img[B * 3 + 2] * w1*w4 + + src_img[C * 3 + 2] * w2*w3 + src_img[D * 3 + 2] * w1*w3; + + /*dst_img[(dst_y * dst_width + dst_x) * 3] = (unsigned char)(blue - submeanb)*varianceb; + dst_img[(dst_y * dst_width + dst_x) * 3 + 1] =(unsigned char) (green - submeang)*varianceg; + dst_img[(dst_y * dst_width + dst_x) * 3 + 2] = (unsigned char) (red - submeanr)*variancer;*/ + + if (blue < 0) + blue = 0; + else if (blue > 255) + blue = 255; + + if (green < 0) + green = 0; + else if (green > 255) + green = 255; + + if (red < 0) + red = 0; + else if (red > 255) + red = 255; + + dst_img[(dst_y * cur_dst_width + dst_x) * 3] = (unsigned char)blue; + dst_img[(dst_y * cur_dst_width + dst_x) * 3 + 1] = (unsigned char)green; + dst_img[(dst_y * cur_dst_width + dst_x) * 3 + 2] = (unsigned char)red; + + + /*if (src_img[(dst_y * dst_width + dst_x) * 3] < 0) + src_img[(dst_y * dst_width + dst_x) * 3] = 0; + else if (src_img[(dst_y * dst_width + dst_x) * 3] > 255) + src_img[(dst_y * dst_width + dst_x) * 3] = 255; + + if (src_img[(dst_y * dst_width + dst_x) * 3 + 1] < 0) + src_img[(dst_y * dst_width + dst_x) * 3 + 1] = 0; + else if (src_img[(dst_y * dst_width + dst_x) * 3 + 1] > 255) + src_img[(dst_y * dst_width + dst_x) * 3 + 1] = 255; + + if (src_img[(dst_y * dst_width + dst_x) * 3 + 2] < 0) + src_img[(dst_y * dst_width + dst_x) * 3 + 2] = 0; + else if (src_img[(dst_y * dst_width + dst_x) * 3 + 2] > 255) + src_img[(dst_y * dst_width + dst_x) * 3 + 2] = 255; + + + dst_img[(dst_y * dst_width + dst_x) * 3] = (unsigned char)src_img[(dst_y * dst_width + dst_x) * 3]; + dst_img[(dst_y * dst_width + dst_x) * 3 + 1] = (unsigned char)src_img[(dst_y * dst_width + dst_x) * 3 + 1]; + dst_img[(dst_y * dst_width + dst_x) * 3 + 2] = (unsigned char)src_img[(dst_y * dst_width + dst_x) * 3 + 2];*/ + } + } + } + + cudaError_t PartMemResizeBatch(unsigned char* d_srcRGB, int src_width, int src_height, unsigned char** d_dstRGB, int count, int* left, int* top, int* right, int* bottom, int *dst_w, int *dst_h, float submeanb, float submeang, float submeanr, + float varianceb, float varianceg, float variancer) + { + /* cudaEvent_t start, stop; + float time; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start, 0);*/ + + dim3 block(32, 16, 1); + dim3 grid((*std::max_element(dst_w, dst_w+ count) + (block.x - 1)) / block.x, (*std::max_element(dst_h, dst_h + count) + (block.y - 1)) / block.y, count); + + int * gpu_left; + cudaMalloc(&gpu_left, 1000 * sizeof(int)); + cudaMemcpy(gpu_left, left, count * sizeof(int), cudaMemcpyHostToDevice); + + int * gpu_right; + cudaMalloc(&gpu_right, 1000 * sizeof(int)); + cudaMemcpy(gpu_right, right, count * sizeof(int), cudaMemcpyHostToDevice); + + int * gpu_top; + cudaMalloc(&gpu_top, 1000 * sizeof(int)); + cudaMemcpy(gpu_top, top, count * sizeof(int), cudaMemcpyHostToDevice); + + int * gpu_bottom; + cudaMalloc(&gpu_bottom, 1000 * sizeof(int)); + cudaMemcpy(gpu_bottom, bottom, count * sizeof(int), cudaMemcpyHostToDevice); + + int * gpu_dst_w; + cudaMalloc(&gpu_dst_w, 1000 * sizeof(int)); + cudaMemcpy(gpu_dst_w, dst_w, count * sizeof(int), cudaMemcpyHostToDevice); + + int * gpu_dst_h; + cudaMalloc(&gpu_dst_h, 1000 * sizeof(int)); + cudaMemcpy(gpu_dst_h, dst_h, count * sizeof(int), cudaMemcpyHostToDevice); + + unsigned char** gpu_dst_rgb; + cudaMalloc(&gpu_dst_rgb, 1000 * sizeof(unsigned char*)); + cudaMemcpy(gpu_dst_rgb, d_dstRGB, count * sizeof(unsigned char*), cudaMemcpyHostToDevice); + + //cudaMemcpy(cpu_personfloat, d_srcRGB, 112*224*2*sizeof(float), cudaMemcpyDeviceToHost); + // for(int i=0;i<100;i++) + // { + // printf("the score is %f\t",cpu_personfloat[i]); + // } + PartCopy_ResizeImgBilinearBGR_Mean_Variance_CUDAKernel << < grid, block >> > ( + d_srcRGB, src_width, src_height, + gpu_left, gpu_top, gpu_right, gpu_bottom, + gpu_dst_rgb, count, gpu_dst_w, gpu_dst_h, + submeanb, submeang, submeanr, + varianceb, varianceg, variancer); + cudaFree(gpu_top); + cudaFree(gpu_bottom); + cudaFree(gpu_left); + cudaFree(gpu_right); + cudaFree(gpu_dst_w); + cudaFree(gpu_dst_h); + cudaFree(gpu_dst_rgb); + + cudaError_t cudaStatus = cudaGetLastError(); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "Part 270 kernel_memcopy launch failed: %s\n", cudaGetErrorString(cudaStatus)); + return cudaStatus; + } + cudaStatus = cudaDeviceSynchronize(); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_bilinear!\n", cudaStatus); + return cudaStatus; + } + + /*cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&time, start, stop); + cudaEventDestroy(start); + cudaEventDestroy(stop); + printf("˺ʱ:%f\n", time);*/ + + return cudaStatus; + } + +} \ No newline at end of file diff --git a/src/RGB2YUV.cu b/src/RGB2YUV.cu new file mode 100644 index 0000000..7202c3a --- /dev/null +++ b/src/RGB2YUV.cu @@ -0,0 +1,263 @@ + + +#include "cuda_kernels.h" + +typedef unsigned char uint8; +typedef unsigned int uint32; +typedef int int32; + +namespace cuda_common +{ + __device__ unsigned char clip_value(unsigned char x, unsigned char min_val, unsigned char max_val){ + if (x>max_val){ + return max_val; + } + else if (x= src_width) + return; //x = width - 1; + + if (y >= src_height) + return; // y = height - 1; + + int B = src_img[y * src_width * 3 + x * 3]; + int G = src_img[y * src_width * 3 + x * 3 + 1]; + int R = src_img[y * src_width * 3 + x * 3 + 2]; + + /*int B = src_img[y * src_width + x]; + int G = src_img[src_width * src_height + y * src_width + x]; + int R = src_img[src_width * src_height * 2 + y * src_width + x];*/ + + Y[y * yPitch + x] = clip_value((unsigned char)(0.299 * R + 0.587 * G + 0.114 * B), 0, 255); + u[y * src_width + x] = clip_value((unsigned char)(-0.147 * R - 0.289 * G + 0.436 * B + 128), 0, 255); + v[y * src_width + x] = clip_value((unsigned char)(0.615 * R - 0.515 * G - 0.100 * B + 128), 0, 255); + + //Y[y * yPitch + x] = clip_value((unsigned char)(0.257 * R + 0.504 * G + 0.098 * B + 16), 0, 255); + //u[y * src_width + x] = clip_value((unsigned char)(-0.148 * R - 0.291 * G + 0.439 * B + 128), 0, 255); + //v[y * src_width + x] = clip_value((unsigned char)(0.439 * R - 0.368 * G - 0.071 * B + 128), 0, 255); + } + + __global__ void kernel_rgb2yuv(float *src_img, unsigned char* Y, unsigned char* u, unsigned char* v, + int src_width, int src_height, size_t yPitch) + { + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= src_width) + return; //x = width - 1; + + if (y >= src_height) + return; // y = height - 1; + + float B = src_img[y * src_width + x]; + float G = src_img[src_width * src_height + y * src_width + x]; + float R = src_img[src_width * src_height * 2 + y * src_width + x]; + + Y[y * yPitch + x] = clip_value((unsigned char)(0.299 * R + 0.587 * G + 0.114 * B), 0, 255); + u[y * src_width + x] = clip_value((unsigned char)(-0.147 * R - 0.289 * G + 0.436 * B + 128), 0, 255); + v[y * src_width + x] = clip_value((unsigned char)(0.615 * R - 0.515 * G - 0.100 * B + 128), 0, 255); + + //Y[y * yPitch + x] = clip_value((unsigned char)(0.257 * R + 0.504 * G + 0.098 * B + 16), 0, 255); + //u[y * src_width + x] = clip_value((unsigned char)(-0.148 * R - 0.291 * G + 0.439 * B + 128), 0, 255); + //v[y * src_width + x] = clip_value((unsigned char)(0.439 * R - 0.368 * G - 0.071 * B + 128), 0, 255); + } + + extern "C" + __global__ void kernel_resize_UV(unsigned char* src_img, unsigned char *dst_img, + int src_width, int src_height, int dst_width, int dst_height, int nPitch) + { + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= dst_width) + return; //x = width - 1; + + if (y >= dst_height) + return; // y = height - 1; + + float fx = (x + 0.5)*src_width / (float)dst_width - 0.5; + float fy = (y + 0.5)*src_height / (float)dst_height - 0.5; + int ax = floor(fx); + int ay = floor(fy); + if (ax < 0) + { + ax = 0; + } + else if (ax > src_width - 2) + { + ax = src_width - 2; + } + + if (ay < 0){ + ay = 0; + } + else if (ay > src_height - 2) + { + ay = src_height - 2; + } + + int A = ax + ay*src_width; + int B = ax + ay*src_width + 1; + int C = ax + ay*src_width + src_width; + int D = ax + ay*src_width + src_width + 1; + + float w1, w2, w3, w4; + w1 = fx - ax; + w2 = 1 - w1; + w3 = fy - ay; + w4 = 1 - w3; + + unsigned char val = src_img[A] * w2*w4 + src_img[B] * w1*w4 + src_img[C] * w2*w3 + src_img[D] * w1*w3; + + dst_img[y * nPitch + x] = clip_value(val,0,255); + } + + cudaError_t RGB2YUV(float* d_srcRGB, int src_width, int src_height, + unsigned char* Y, size_t yPitch, int yWidth, int yHeight, + unsigned char* U, size_t uPitch, int uWidth, int uHeight, + unsigned char* V, size_t vPitch, int vWidth, int vHeight) + { + unsigned char * u ; + unsigned char * v ; + + cudaError_t cudaStatus; + + cudaStatus = cudaMalloc((void**)&u, src_width * src_height * sizeof(unsigned char)); + cudaStatus = cudaMalloc((void**)&v, src_width * src_height * sizeof(unsigned char)); + + dim3 block(32, 16, 1); + dim3 grid((src_width + (block.x - 1)) / block.x, (src_height + (block.y - 1)) / block.y, 1); + dim3 grid1((uWidth + (block.x - 1)) / block.x, (uHeight + (block.y - 1)) / block.y, 1); + dim3 grid2((vWidth + (block.x - 1)) / block.x, (vHeight + (block.y - 1)) / block.y, 1); + + kernel_rgb2yuv << < grid, block >> >(d_srcRGB, Y, u, v, src_width, src_height, yPitch); + + cudaStatus = cudaGetLastError(); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "kernel_rgb2yuv launch failed: %s\n", cudaGetErrorString(cudaStatus)); + goto Error; + } + + cudaStatus = cudaDeviceSynchronize(); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_rgb2yuv!\n", cudaStatus); + goto Error; + } + + kernel_resize_UV << < grid1, block >> >(u, U, src_width, src_height, uWidth, uHeight, uPitch); + + cudaStatus = cudaGetLastError(); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "kernel_resize_UV launch failed: %s\n", cudaGetErrorString(cudaStatus)); + goto Error; + } + + cudaStatus = cudaDeviceSynchronize(); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_resize_UV!\n", cudaStatus); + goto Error; + } + + kernel_resize_UV << < grid2, block >> >(v, V, src_width, src_height, vWidth, vHeight, vPitch); + + cudaStatus = cudaGetLastError(); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "kernel_resize_UV launch failed: %s\n", cudaGetErrorString(cudaStatus)); + goto Error; + } + + cudaStatus = cudaDeviceSynchronize(); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_resize_UV!\n", cudaStatus); + goto Error; + } + +Error : + cudaFree(u); + cudaFree(v); + + return cudaStatus; + } + + + + cudaError_t RGB2YUV(unsigned char* d_srcRGB, int src_width, int src_height, + unsigned char* Y, size_t yPitch, int yWidth, int yHeight, + unsigned char* U, size_t uPitch, int uWidth, int uHeight, + unsigned char* V, size_t vPitch, int vWidth, int vHeight) + { + unsigned char * u; + unsigned char * v; + + cudaError_t cudaStatus; + + cudaStatus = cudaMalloc((void**)&u, src_width * src_height * sizeof(unsigned char)); + cudaStatus = cudaMalloc((void**)&v, src_width * src_height * sizeof(unsigned char)); + + dim3 block(32, 16, 1); + dim3 grid((src_width + (block.x - 1)) / block.x, (src_height + (block.y - 1)) / block.y, 1); + dim3 grid1((uWidth + (block.x - 1)) / block.x, (uHeight + (block.y - 1)) / block.y, 1); + dim3 grid2((vWidth + (block.x - 1)) / block.x, (vHeight + (block.y - 1)) / block.y, 1); + + kernel_rgb2yuv << < grid, block >> >(d_srcRGB, Y, u, v, src_width, src_height, yPitch); + + cudaStatus = cudaGetLastError(); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "kernel_rgb2yuv launch failed: %s\n", cudaGetErrorString(cudaStatus)); + goto Error; + } + + cudaStatus = cudaDeviceSynchronize(); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_rgb2yuv!\n", cudaStatus); + goto Error; + } + + kernel_resize_UV << < grid1, block >> >(u, U, src_width, src_height, uWidth, uHeight, uPitch); + + cudaStatus = cudaGetLastError(); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "kernel_resize_UV launch failed: %s\n", cudaGetErrorString(cudaStatus)); + goto Error; + } + + cudaStatus = cudaDeviceSynchronize(); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_resize_UV!\n", cudaStatus); + goto Error; + } + + kernel_resize_UV << < grid2, block >> >(v, V, src_width, src_height, vWidth, vHeight, vPitch); + + cudaStatus = cudaGetLastError(); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "kernel_resize_UV launch failed: %s\n", cudaGetErrorString(cudaStatus)); + goto Error; + } + + cudaStatus = cudaDeviceSynchronize(); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_resize_UV!\n", cudaStatus); + goto Error; + } + + Error: + cudaFree(u); + cudaFree(v); + + return cudaStatus; + } +} + diff --git a/src/ResizeImage.cu b/src/ResizeImage.cu new file mode 100644 index 0000000..fdc6961 --- /dev/null +++ b/src/ResizeImage.cu @@ -0,0 +1,84 @@ +#include "cuda_kernels.h" + +typedef unsigned char uchar; +typedef unsigned int uint32; +typedef int int32; + +namespace cuda_common +{ + __global__ void kernel_bilinear(float *src_img, float *dst_img, + int src_width, int src_height, int dst_width, int dst_height) + { + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x < dst_width && y < dst_height) + { + float fx = (x + 0.5)*src_width / (float)dst_width - 0.5; + float fy = (y + 0.5)*src_height / (float)dst_height - 0.5; + int ax = floor(fx); + int ay = floor(fy); + if (ax < 0) + { + ax = 0; + } + else if (ax > src_width - 2) + { + ax = src_width - 2; + } + + if (ay < 0){ + ay = 0; + } + else if (ay > src_height - 2) + { + ay = src_height - 2; + } + + int A = ax + ay*src_width; + int B = ax + ay*src_width + 1; + int C = ax + ay*src_width + src_width; + int D = ax + ay*src_width + src_width + 1; + + float w1, w2, w3, w4; + w1 = fx - ax; + w2 = 1 - w1; + w3 = fy - ay; + w4 = 1 - w3; + + float blue = src_img[A] * w2*w4 + src_img[B] * w1*w4 + src_img[C] * w2*w3 + src_img[D] * w1*w3; + + float green = src_img[src_width * src_height + A] * w2*w4 + src_img[src_width * src_height + B] * w1*w4 + + src_img[src_width * src_height + C] * w2*w3 + src_img[src_width * src_height + D] * w1*w3; + + float red = src_img[src_width * src_height * 2 + A] * w2*w4 + src_img[src_width * src_height * 2 + B] * w1*w4 + + src_img[src_width * src_height * 2 + C] * w2*w3 + src_img[src_width * src_height * 2 + D] * w1*w3; + + dst_img[y * dst_width + x] = blue; + dst_img[dst_width * dst_height + y * dst_width + x] = green; + dst_img[dst_width * dst_height * 2 + y * dst_width + x] = red; + } + } + + cudaError_t ResizeImage(float* d_srcRGB, int src_width, int src_height, float* d_dstRGB, int dst_width, int dst_height) + { + dim3 block(32, 16, 1); + dim3 grid((dst_width + (block.x - 1)) / block.x, (dst_height + (block.y - 1)) / block.y, 1); + + kernel_bilinear << < grid, block >> >(d_srcRGB, d_dstRGB, src_width, src_height, dst_width, dst_height); + + cudaError_t cudaStatus = cudaGetLastError(); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "kernel_bilinear launch failed: %s\n", cudaGetErrorString(cudaStatus)); + return cudaStatus; + } + + cudaStatus = cudaDeviceSynchronize(); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel_bilinear!\n", cudaStatus); + return cudaStatus; + } + + return cudaStatus; + } +} \ No newline at end of file diff --git a/src/common/inc/helper_cuda_drvapi.h b/src/common/inc/helper_cuda_drvapi.h index ffca8e8..76eacb5 100644 --- a/src/common/inc/helper_cuda_drvapi.h +++ b/src/common/inc/helper_cuda_drvapi.h @@ -218,8 +218,7 @@ inline int gpuGetMaxGflopsDeviceIdDRV() // Find the best major SM Architecture GPU device while (current_device < device_count) { - checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device)); - checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device)); + checkCudaErrors(cuDeviceComputeCapability(&major, &minor, current_device)); if (major > 0 && major < 9999) { @@ -240,9 +239,7 @@ inline int gpuGetMaxGflopsDeviceIdDRV() checkCudaErrors(cuDeviceGetAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, current_device)); - - checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device)); - checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device)); + checkCudaErrors(cuDeviceComputeCapability(&major, &minor, current_device)); int computeMode; getCudaAttribute(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device); @@ -320,9 +317,7 @@ inline int gpuGetMaxGflopsGLDeviceIdDRV() while (current_device < device_count) { checkCudaErrors(cuDeviceGetName(deviceName, 256, current_device)); - - checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device)); - checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device)); + checkCudaErrors(cuDeviceComputeCapability(&major, &minor, current_device)); #if CUDA_VERSION >= 3020 checkCudaErrors(cuDeviceGetAttribute(&bTCC, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, current_device)); @@ -374,9 +369,7 @@ inline int gpuGetMaxGflopsGLDeviceIdDRV() checkCudaErrors(cuDeviceGetAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, current_device)); - - checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device)); - checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device)); + checkCudaErrors(cuDeviceComputeCapability(&major, &minor, current_device)); #if CUDA_VERSION >= 3020 checkCudaErrors(cuDeviceGetAttribute(&bTCC, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, current_device)); @@ -507,9 +500,7 @@ inline bool checkCudaCapabilitiesDRV(int major_version, int minor_version, int d checkCudaErrors(cuDeviceGet(&cuDevice, devID)); checkCudaErrors(cuDeviceGetName(name, 100, cuDevice)); - - checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, devID)); - checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, devID)); + checkCudaErrors(cuDeviceComputeCapability(&major, &minor, devID)); if ((major > major_version) || (major == major_version && minor >= minor_version)) diff --git a/src/cuda_kernels.h b/src/cuda_kernels.h index 0741054..cd1eb00 100644 --- a/src/cuda_kernels.h +++ b/src/cuda_kernels.h @@ -12,15 +12,52 @@ typedef enum { - ITU601 = 1, - ITU709 = 2 -} e_ColorSpace; + ITU_601 = 1, + ITU_709 = 2 +} FF_ColorSpace; namespace cuda_common { - cudaError_t setColorSpace(e_ColorSpace CSC, float hue); + cudaError_t setColorSpace(FF_ColorSpace CSC, float hue); cudaError_t NV12ToRGBnot(CUdeviceptr d_srcNV12, size_t nSourcePitch, unsigned char* d_dstRGB, int width, int height); cudaError_t CUDAToBGR(CUdeviceptr dataY, CUdeviceptr dataUV, size_t pitchY, size_t pitchUV, unsigned char* d_dstRGB, int width, int height); + + + cudaError_t ResizeImage(float* d_srcRGB, int src_width, int src_height, float* d_dstRGB, int dst_width, int dst_height); + + cudaError_t RGB2YUV(float* d_srcRGB, int src_width, int src_height, + unsigned char* Y, size_t yPitch, int yWidth, int yHeight, + unsigned char* U, size_t uPitch, int uWidth, int uHeight, + unsigned char* V, size_t vPitch, int vWidth, int vHeight); + + cudaError_t RGB2YUV(unsigned char* d_srcRGB, int src_width, int src_height, + unsigned char* Y, size_t yPitch, int yWidth, int yHeight, + unsigned char* U, size_t uPitch, int uWidth, int uHeight, + unsigned char* V, size_t vPitch, int vWidth, int vHeight); + + cudaError_t PartMemCopy(unsigned char* d_srcRGB, int src_width, int src_height, unsigned char* d_dstRGB, int left, int top, int right, int bottom); + // cudaError_t PartMemResize(float* d_srcRGB, int src_width, int src_height, float* d_dstRGB, int left, int top, int right, int bottom); + + cudaError_t PartMemResizeBatch(unsigned char* d_srcRGB, int srcimg_width, int srcimg_height, unsigned char** d_dstRGB, int count, + int* left, int* top, int* right, int* bottom, int *dst_w, int *dst_h, + float submeanb, float submeang, float submeanr, + float varianceb, float varianceg, float variancer); + + cudaError_t DrawImage(float* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom); + cudaError_t DrawImage(unsigned char* d_srcRGB, int src_width, int src_height, int left, int top, int right, int bottom); + + cudaError_t DrawLine(float* d_srcRGB, int src_width, int src_height, int begin_x, int begin_y, int end_x, int end_y); } + +int jpegNPP(const char *szOutputFile, float* d_srcRGB, int img_width, int img_height); +int jpegNPP(const char *szOutputFile, unsigned char* d_srcRGB, int img_width, int img_height); + +int jpegNPP(const char *szOutputFile, float* d_srcRGB); +int jpegNPP(const char *szOutputFile, unsigned char* d_srcRGB); + +int initTable(); +int initTable(int flag, int width, int height); +int releaseJpegNPP(); + diff --git a/src/define.hpp b/src/define.hpp index 6c147cf..26fcc61 100644 --- a/src/define.hpp +++ b/src/define.hpp @@ -5,3 +5,9 @@ #define __FILENAME__ (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) +#define CHECK_CUDA(call) \ +{\ + const cudaError_t error_code = call;\ + if (cudaSuccess != error_code)\ + LOG_ERROR("CUDA error, code: {} reason: {}", error_code, cudaGetErrorString(error_code));\ +} \ No newline at end of file diff --git a/src/gb28181/FFGB28181Decoder.cpp b/src/gb28181/FFGB28181Decoder.cpp index debb325..2207c98 100644 --- a/src/gb28181/FFGB28181Decoder.cpp +++ b/src/gb28181/FFGB28181Decoder.cpp @@ -10,11 +10,13 @@ extern "C" { #include "libswscale/swscale.h" } -#include "../logger.hpp" - #include"RTPTcpReceiver.h" #include"RTPUdpReceiver.h" +#include + +#include "common_header.h" + #define ECLOSED 0 #define ECLOSING 1 #define ERUNNING 2 @@ -36,6 +38,7 @@ FFGB28181Decoder::FFGB28181Decoder() { m_frameSkip = 1; m_port = -1; m_dec_keyframe = false; + m_post_decode_thread = 0; } FFGB28181Decoder::~FFGB28181Decoder() @@ -47,11 +50,6 @@ FFGB28181Decoder::~FFGB28181Decoder() avcodec_free_context(&m_pAVCodecCtx); } - if (m_pAVFrame) { - av_frame_free(&m_pAVFrame); - m_pAVFrame = NULL; - } - m_dec_keyframe = false; LOG_INFO("destroy OK--{}", m_dec_name); @@ -74,9 +72,22 @@ void FFGB28181Decoder::close(){ m_rtpPtr = nullptr; } - LOG_INFO("解码器关闭成功 --{}", m_dec_name); + if (gpu_options) av_dict_free(&gpu_options); + + if (m_post_decode_thread != 0) + { + pthread_join(m_post_decode_thread,0); + } + + while(mFrameQueue.size() > 0){ + AVFrame * gpuFrame = mFrameQueue.front(); + av_frame_free(&gpuFrame); + mFrameQueue.pop(); + } m_status = ECLOSED; + + LOG_INFO("解码器关闭成功 --{}", m_dec_name); } bool FFGB28181Decoder::init(FFDecConfig& cfg){ @@ -124,7 +135,18 @@ bool FFGB28181Decoder::start() { LOG_INFO("start - {} {}: ", m_dec_name, m_port); - return m_rtpPtr->Open((uint16_t)m_port); + bool bRet = m_rtpPtr->Open((uint16_t)m_port); + if(bRet){ + pthread_create(&m_post_decode_thread,0, + [](void* arg) + { + FFGB28181Decoder* a=(FFGB28181Decoder*)arg; + a->post_decode_thread(); + return (void*)0; + } + ,this); + } + return bRet; } void FFGB28181Decoder::setDecKeyframe(bool bKeyframe){ @@ -151,15 +173,12 @@ void FFGB28181Decoder::stream_callback(int videoType, char* data, int len, int i return; } - AVPacket framePacket = {}, mp4Packet = {}; + AVPacket framePacket = {}; av_init_packet(&framePacket); - av_init_packet(&mp4Packet); framePacket.size = len; framePacket.data = (uint8_t*)data; - AVDictionary *gpu_options = nullptr; - if (m_pAVCodecCtx == nullptr) { LOG_INFO("frame data is zero --{}", m_dec_name); if (VIDEO_TYPE_H264 == videoType) { @@ -192,7 +211,6 @@ void FFGB28181Decoder::stream_callback(int videoType, char* data, int len, int i } m_pAVCodecCtx = avcodec_alloc_context3(m_pAVCodec); - if (m_gpuid >= 0) { char gpui[8] = { 0 }; @@ -211,8 +229,6 @@ void FFGB28181Decoder::stream_callback(int videoType, char* data, int len, int i if (avcodec_open2(m_pAVCodecCtx, m_pAVCodec, &gpu_options) < 0) return; - - m_pAVFrame = av_frame_alloc(); } //开始解码 @@ -220,6 +236,7 @@ void FFGB28181Decoder::stream_callback(int videoType, char* data, int len, int i if (ret < 0) { //send_exception(RunMessageType::E2002, e_msg); LOG_ERROR("Real stream视频解码失败,请检查视频设备{}: avcodec_send_packet failed. ret={}", m_dec_name, ret); + av_packet_unref(&framePacket); return; } @@ -228,61 +245,67 @@ void FFGB28181Decoder::stream_callback(int videoType, char* data, int len, int i frameH = m_pAVCodecCtx->height; if (frameW <= 0 || frameH <= 0) { LOG_ERROR("[{}] frame W or H is error! ({},{})", m_dec_name, frameW, frameH); + av_packet_unref(&framePacket); return; } } // m_fps = m_pAVCodecCtx->pkt_timebase.den == 0 ? 25.0 : av_q2d(m_pAVCodecCtx->pkt_timebase); m_fps = av_q2d(m_pAVCodecCtx->framerate); - LOG_DEBUG("frameW {}--frameH {}", frameW, frameH); - while (ret >= 0) { - ret = avcodec_receive_frame(m_pAVCodecCtx, m_pAVFrame); - if (ret == AVERROR_EOF || ret == AVERROR(EAGAIN)) - return; - else if (ret < 0) { - if (m_frameCount % 10 == 0){ - //send_exception(RunMessageType::E2002, e_msg); - LOG_ERROR("Real stream视频解码失败,请检查视频设备{}: avcodec_receive_frame failed. ret={}", m_dec_name, ret); - } - continue; - } + // LOG_DEBUG("frameW {}--frameH {}", frameW, frameH); + + AVFrame* gpuFrame = av_frame_alloc(); + ret = avcodec_receive_frame(m_pAVCodecCtx, gpuFrame); + if ((ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) || ret < 0){ + LOG_ERROR("{} - Failed to receive frame: {}", m_dec_name, ret); + av_packet_unref(&framePacket); + av_frame_free(&gpuFrame); + return; + } - if (++m_frameCount % m_frameSkip != 0) continue; - - if (m_pAVFrame->width != frameW || m_pAVFrame->height != frameH){ - LOG_INFO("AVFrame is inconsistent: width is {}, height is {}; original frameW is {}, frameH is {}--{}", m_pAVFrame->width, m_pAVFrame->height, frameW, frameH , m_dec_name); - continue; - } - - LOG_DEBUG("curpos is: {}", m_frameCount); - - post_decoded_cbk(m_postDecArg, m_pAVFrame); - - //LOG_count++; - //if (LOG_count > 100000) { - // LOG_INFO("Real frame send_shm_videoframe pts={}-{}", localPts, m_dec_name); - // //LOG_count = 0; - //} - //} - //catch (GeneralException2& e) - //{ - // LOG_ERROR("send_shm_videoframe failed! {}--{}--{}", e.err_code(), e.err_msg(), m_dec_name); - // if (e.err_code() == -666) { - // this->close(); - // } - // - // if (e.err_code() == ERROR_MEMORY) { - // if (m_frameCount % 10 == 0) { - // string e_msg; - // format_string(e_msg, "服务器资源内存分配失败, 在vas模块%s文件%d行出现无法获取内存的情况!", __FILE__, __LINE__); - // send_exception(RunMessageType::F4001, e_msg); - // LOG_ERROR("{}", e_msg); - // } - // } - // return; - //} + av_packet_unref(&framePacket); + + if (gpuFrame->width != frameW || gpuFrame->height != frameH){ + LOG_INFO("AVFrame is inconsistent: width is {}, height is {}; original frameW is {}, frameH is {}--{}", gpuFrame->width, gpuFrame->height, frameW, frameH , m_dec_name); + av_frame_free(&gpuFrame); + return; + } + + m_queue_mutex.lock(); + if(mFrameQueue.size() <= 10){ + mFrameQueue.push(gpuFrame); + }else{ + av_frame_free(&gpuFrame); } + m_queue_mutex.unlock(); +} + +void FFGB28181Decoder::post_decode_thread(){ + + int index = 0; + while (isRunning()) + { + if(mFrameQueue.size() > 0){ + std::lock_guard l(m_snapshot_mutex); + // 取队头数据 + m_queue_mutex.lock(); + AVFrame * gpuFrame = mFrameQueue.front(); + mFrameQueue.pop(); + m_queue_mutex.unlock(); + // 跳帧 + if (m_frameSkip == 1 || index % m_frameSkip == 0){ + post_decoded_cbk(m_postDecArg, gpuFrame); + } + + av_frame_free(&gpuFrame); + + index++; + if(index >= 100000){ + index = 0; + } + } + } - if (gpu_options) av_dict_free(&gpu_options); + LOG_INFO("post decode thread exited."); } void FFGB28181Decoder::stream_end_callback() diff --git a/src/gb28181/FFGB28181Decoder.h b/src/gb28181/FFGB28181Decoder.h index 12a085c..1f31a5b 100644 --- a/src/gb28181/FFGB28181Decoder.h +++ b/src/gb28181/FFGB28181Decoder.h @@ -6,6 +6,7 @@ #include "../AbstractDecoder.h" #include +#include struct AVFormatContext; struct AVCodecContext; @@ -14,6 +15,7 @@ struct AVFrame; struct AVPacket; struct SwsContext; +using namespace std; class FFGB28181Decoder: public AbstractDecoder { @@ -45,19 +47,16 @@ public: public: void stream_callback(int videoType, char* data, int len, int isKey, uint64_t pts, uint64_t localPts); void stream_end_callback(); + void post_decode_thread(); private: AVCodecContext* m_pAVCodecCtx {}; const AVCodec* m_pAVCodec {}; - AVFrame* m_pAVFrame {}; int m_gpuid {-1}; RTPReceiver* m_rtpPtr; int m_port; - uint64_t m_frameCount {}; - - AVFrame* pFrameRGB {}; uint64_t m_startPts {}; uint64_t m_lastPts {}; //上一次pts的值 @@ -71,6 +70,10 @@ private: int log_count {}; std::atomic_int m_status {}; + + AVDictionary *gpu_options = nullptr; + + pthread_t m_post_decode_thread; }; #endif // _GB28181_DECODER_H_ diff --git a/src/gb28181/RTPReceiver.cpp b/src/gb28181/RTPReceiver.cpp index 1be46d3..77b8062 100644 --- a/src/gb28181/RTPReceiver.cpp +++ b/src/gb28181/RTPReceiver.cpp @@ -1,8 +1,9 @@ -#include "RTPReceiver.h" +#include "RTPReceiver.h" #include "rtppacket.h" -#include "../logger.hpp" #include +#include "common_header.h" + #define BUFFERSIZE_1024 1024 const int kVideoFrameSize = BUFFERSIZE_1024*BUFFERSIZE_1024*5*2; @@ -174,7 +175,7 @@ int RTPReceiver::OnPsProcess() LOG_INFO("[{}] started.", m_deviceID); while (!m_bPsExit) { m_psFrameMutex.lock(); - LOG_DEBUG("[{}] PS frame size : {}", m_deviceID, m_psVideoFrames.size()); + // LOG_DEBUG("[{}] PS frame size : {}", m_deviceID, m_psVideoFrames.size()); if (m_psVideoFrames.size() <= 0){ m_psFrameMutex.unlock(); std::this_thread::sleep_for(std::chrono::milliseconds(10)); @@ -257,7 +258,7 @@ int RTPReceiver::ParsePacket(RTPPacket* packet){ break; } - LOG_DEBUG("[{}] ParsePacket GetPayloadLength", m_deviceID); + // LOG_DEBUG("[{}] ParsePacket GetPayloadLength", m_deviceID); if (mark) { @@ -271,7 +272,7 @@ int RTPReceiver::ParsePacket(RTPPacket* packet){ std::lock_guard l(m_psFrameMutex); if (m_psVideoFrames.size() < 100) { - LOG_DEBUG("[{}]ParsePacket push", m_deviceID); + // LOG_DEBUG("[{}]ParsePacket push", m_deviceID); m_psVideoFrames.push(new Frame(frameBuf, offset, false)); } else { diff --git a/src/gb28181/RTPReceiver.h b/src/gb28181/RTPReceiver.h index 8a7f8f9..6ec70dd 100644 --- a/src/gb28181/RTPReceiver.h +++ b/src/gb28181/RTPReceiver.h @@ -32,7 +32,7 @@ typedef void(*CallBack_VodFileEnd)(void* userdata); /** * 请求流 */ -typedef bool(*CallBack_Request_Stream)(); +typedef bool(*CallBack_Request_Stream)(const char* deviceId); // 标识帧, 注意buffer需要自己开辟和释放 struct Frame { @@ -85,7 +85,7 @@ class RTPReceiver{ public: RTPReceiver(); - ~RTPReceiver(); + virtual ~RTPReceiver(); virtual bool Open(uint16_t localPort) = 0; virtual bool IsOpened() = 0; diff --git a/src/gb28181/RTPTcpReceiver.cpp b/src/gb28181/RTPTcpReceiver.cpp index 3e20a78..4cb85f7 100644 --- a/src/gb28181/RTPTcpReceiver.cpp +++ b/src/gb28181/RTPTcpReceiver.cpp @@ -1,14 +1,7 @@ #include"RTPTcpReceiver.h" -#include "../logger.hpp" +#include "common_header.h" -static long long get_cur_time() { - - chrono::time_point tpMicro - = chrono::time_point_cast(chrono::system_clock::now()); - - return tpMicro.time_since_epoch().count(); -} // class TcpRTPSession : public RTPSession // { @@ -65,7 +58,7 @@ public: LOG_ERROR("Error sending over socket {}, removing destination", sock); DeleteDestination(RTPTCPAddress(sock)); if(nullptr != tcpReceiver && !tcpReceiver->isClosing()){ - tcpReceiver->RequestStream(); + tcpReceiver->ReConnect(); } } @@ -90,6 +83,16 @@ static int rtp_revc_thread_(void* param) return self->OnRtpRecv(); } +static int listen_finish_thread_(void* param) +{ + if (!param) + { + return -1; + } + + RTPTcpReceiver* self = (RTPTcpReceiver*)param; + return self->ListenFinish(); +} RTPTcpReceiver::RTPTcpReceiver() : m_bRtpExit(false) @@ -143,11 +146,19 @@ bool RTPTcpReceiver::IsOpened(){ } void RTPTcpReceiver::Close(){ + m_bRtpExit = true; + + if(m_listenFinishThread.joinable()){ + m_listenFinishThread.join(); + } +} + +void RTPTcpReceiver::close_task(){ + m_bRtpExit = true; m_bClosing = true; m_bAccepted = true; - m_bRtpExit = true; LOG_DEBUG("[{}] 1.", m_deviceID); @@ -207,20 +218,22 @@ int RTPTcpReceiver::initSession(int localPort){ status = m_rtpSessionPtr->Create(*m_pSessparams, m_pTrans); if (status < 0) { - LOG_ERROR("[{}] create session error!!", m_deviceID); + // 若status = -59 ,需运行 export LOGNAME=root ,见 https://blog.csdn.net/m0_37876242/article/details/128588162 + LOG_ERROR("[{}] create session error: {}", m_deviceID, status); return -1; } m_rtpThread = std::thread(rtp_revc_thread_, this); + m_listenFinishThread = std::thread(listen_finish_thread_, this); InitPS(); - bool bRet = RequestStream(); - if (!bRet) - { - LOG_INFO("[{}] 请求流失败!", m_deviceID); - return -1; - } + // bool bRet = RequestStream(); + // if (!bRet) + // { + // LOG_INFO("[{}] 请求流失败!", m_deviceID); + // return -1; + // } LOG_INFO("[{}] 初始化成功, congratulations !!!", m_deviceID); @@ -240,17 +253,56 @@ int RTPTcpReceiver::OnRtpRecv() SocketType nServer = -1; LOG_INFO("[{}] Poll started.", m_deviceID); - int status = -1; + int reconn_times = 0; + int reaccept_times = 0; + bool bReconn = false; while(!m_bRtpExit){ while(!m_bAccepted){ + if(m_bRtpExit){ + goto end_flag; + } + + while (!bReconn){ + if(m_bRtpExit){ + goto end_flag; + } + + reconn_times++; + if(reconn_times > 10){ + // 10次请求都失败,结束任务 + m_bRtpExit = true; + goto end_flag; + } + LOG_DEBUG("[{}] RequestStream...", m_deviceID); + bReconn = RequestStream(); + if (bReconn){ + LOG_DEBUG("[{}] RequestStream, True", m_deviceID); + continue; + } + LOG_DEBUG("[{}] RequestStream, False", m_deviceID); + + std::this_thread::sleep_for(std::chrono::seconds(3)); + } + LOG_DEBUG("[{}] accepting...", m_deviceID); nServer = accept(m_nListener, (sockaddr*)&clientAddr, (socklen_t * ) &nLen); if (-1 == nServer){ - std::this_thread::sleep_for(std::chrono::milliseconds(10)); + reaccept_times++; + LOG_DEBUG("[{}] reaccept_times = {}", m_deviceID, reaccept_times); + if(reaccept_times > 600){ + LOG_DEBUG("[{}] reaccept_times > 600", m_deviceID); + bReconn = false; + reaccept_times = 0; + } + std::this_thread::sleep_for(std::chrono::milliseconds(50)); continue; } + LOG_DEBUG("[{}] accept success", m_deviceID); m_rtpSessionPtr->AddDestination(RTPTCPAddress(nServer)); m_bAccepted = true; + bReconn = false; + reconn_times = 0; + reaccept_times = 0; LOG_INFO("[{}] nServer={}", m_deviceID, nServer); break; @@ -265,7 +317,7 @@ int RTPTcpReceiver::OnRtpRecv() while ((pack = m_rtpSessionPtr->GetNextPacket()) != NULL) { - LOG_DEBUG("[{}] time: {} ", m_deviceID, get_cur_time()); + // LOG_DEBUG("[{}] time: {} ", m_deviceID, UtilTools::get_cur_time_ms()); ParsePacket(pack); m_rtpSessionPtr->DeletePacket(pack); @@ -279,6 +331,8 @@ int RTPTcpReceiver::OnRtpRecv() std::this_thread::sleep_for(std::chrono::milliseconds(10)); } +end_flag: + m_rtpSessionPtr->Destroy(); if(nServer > 0){ @@ -293,13 +347,18 @@ int RTPTcpReceiver::OnRtpRecv() return 0; } -bool RTPTcpReceiver::RequestStream(){ - bool bConnect = m_callback_request_stream(); - if(!bConnect){ - Close(); - return false; +int RTPTcpReceiver::ListenFinish(){ + while(!m_bRtpExit){ + std::this_thread::sleep_for(std::chrono::seconds(3)); } + + close_task(); +} + +bool RTPTcpReceiver::ReConnect(){ m_bAccepted = false; +} - return true; +bool RTPTcpReceiver::RequestStream(){ + return m_callback_request_stream(m_deviceID.c_str()); } \ No newline at end of file diff --git a/src/gb28181/RTPTcpReceiver.h b/src/gb28181/RTPTcpReceiver.h index aa78e58..d7e9d30 100644 --- a/src/gb28181/RTPTcpReceiver.h +++ b/src/gb28181/RTPTcpReceiver.h @@ -57,11 +57,14 @@ public: public: int OnRtpRecv(); + bool ReConnect(); + int ListenFinish(); bool RequestStream(); bool isClosing(); private: int initSession(int localPort); + void close_task(); private: @@ -77,9 +80,12 @@ private: std::thread m_rtpThread; // RTP接收线程 SocketType m_nListener; - RTPSession* m_rtpSessionPtr; // RTP会话 - RTPSessionParams* m_pSessparams; - MyTCPTransmitter* m_pTrans; + RTPSession* m_rtpSessionPtr; // RTP会话 + RTPSessionParams* m_pSessparams; + MyTCPTransmitter* m_pTrans; + + std::thread m_listenFinishThread; // RTP接收线程 + }; #endif // _RTP_TCP_RECEIVER_H_ diff --git a/src/gb28181/RTPUdpReceiver.cpp b/src/gb28181/RTPUdpReceiver.cpp index 49818ea..5c93368 100644 --- a/src/gb28181/RTPUdpReceiver.cpp +++ b/src/gb28181/RTPUdpReceiver.cpp @@ -6,7 +6,7 @@ #include #include -#include "../logger.hpp" +#include "common_header.h" using namespace std; @@ -42,15 +42,6 @@ private: } }; - -static long long get_cur_time() { - - chrono::time_point tpMicro - = chrono::time_point_cast(chrono::system_clock::now()); - - return tpMicro.time_since_epoch().count(); -} - static int rtp_revc_thread_(void* param) { if (!param) @@ -175,7 +166,7 @@ int RTPUdpReceiver::OnRtpRecv() if (m_rtpSessionPtr->GotoFirstSourceWithData()) { LOG_INFO("OnRtpRecv GotoFirstSourceWithData --{}", m_deviceID); - last_recv_ts = get_cur_time(); + last_recv_ts = UtilTools::get_cur_time_ms(); m_idleCount = 0; m_noDataCount = 0; do @@ -261,7 +252,7 @@ int RTPUdpReceiver::OnRtpRecv() // //若是30000,时长大约 18s // if(m_idleCount > 30000) // { - // uint64_t cts = get_cur_time(); + // uint64_t cts = UtilTools::get_cur_time_ms(); // float duration_not_recv = (cts - last_recv_ts) / 1000.0; // // //LOG_ERROR("************I haven't got stream from hik gateway exceed {}s,send eof********{}******", duration_not_recv, m_deviceID); diff --git a/src/gb28181/common_header.h b/src/gb28181/common_header.h new file mode 100644 index 0000000..2f0c324 --- /dev/null +++ b/src/gb28181/common_header.h @@ -0,0 +1,8 @@ +#ifndef _COMMON_HEADER_H_ +#define _COMMON_HEADER_H_ + + +#include "../logger.hpp" +#include "../utiltools.hpp" + +#endif \ No newline at end of file diff --git a/src/gb28181/demuxer.h b/src/gb28181/demuxer.h index 7e6ab5c..ca89aba 100644 --- a/src/gb28181/demuxer.h +++ b/src/gb28181/demuxer.h @@ -8,9 +8,11 @@ { CMpeg2Demux class. } { } {*******************************************************/ + #ifndef _DEMUXER_H_ #define _DEMUXER_H_ + #include #include "buffer.h" diff --git a/src/jpegNPP.cpp-1 b/src/jpegNPP.cpp-1 new file mode 100644 index 0000000..f0bf2e6 --- /dev/null +++ b/src/jpegNPP.cpp-1 @@ -0,0 +1,1193 @@ +/* +* Copyright 1993-2015 NVIDIA Corporation. All rights reserved. +* +* NOTICE TO USER: +* +* This source code is subject to NVIDIA ownership rights under U.S. and +* international Copyright laws. +* +* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE +* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR +* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH +* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF +* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, +* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE +* OR PERFORMANCE OF THIS SOURCE CODE. +* +* U.S. Government End Users. This source code is a "commercial item" as +* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of +* "commercial computer software" and "commercial computer software +* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) +* and is provided to the U.S. Government only as a commercial end item. +* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through +* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the +* source code with only those rights set forth herein. +*/ + +// This sample needs at least CUDA 5.5 and a GPU that has at least Compute Capability 2.0 + +// This sample demonstrates a simple image processing pipeline. +// First, a JPEG file is huffman decoded and inverse DCT transformed and dequantized. +// Then the different planes are resized. Finally, the resized image is quantized, forward +// DCT transformed and huffman encoded. + +#include "cuda_kernels.h" + +#include +#include +#include "common/UtilNPP/Exceptions.h" + +#include "Endianess.h" +#include + +#include +#include +#include + +#include "common/inc/helper_string.h" +#include "common/inc/helper_cuda.h" +//#include "MacroDef.h" +#include "cuda.h" + +using namespace std; + +struct FrameHeader +{ + unsigned char nSamplePrecision; + unsigned short nHeight; + unsigned short nWidth; + unsigned char nComponents; + unsigned char aComponentIdentifier[3]; + unsigned char aSamplingFactors[3]; + unsigned char aQuantizationTableSelector[3]; +}; + +struct ScanHeader +{ + unsigned char nComponents; + unsigned char aComponentSelector[3]; + unsigned char aHuffmanTablesSelector[3]; + unsigned char nSs; + unsigned char nSe; + unsigned char nA; +}; + +struct QuantizationTable +{ + unsigned char nPrecisionAndIdentifier; + unsigned char aTable[64]; +}; + +struct HuffmanTable +{ + unsigned char nClassAndIdentifier; + unsigned char aCodes[16]; + unsigned char aTable[256]; +}; + +//??准?炼??藕?量??模?? +//unsigned char std_Y_QT[64] = +//{ +// 16, 11, 10, 16, 24, 40, 51, 61, +// 12, 12, 14, 19, 26, 58, 60, 55, +// 14, 13, 16, 24, 40, 57, 69, 56, +// 14, 17, 22, 29, 51, 87, 80, 62, +// 18, 22, 37, 56, 68, 109, 103, 77, +// 24, 35, 55, 64, 81, 104, 113, 92, +// 49, 64, 78, 87, 103, 121, 120, 101, +// 72, 92, 95, 98, 112, 100, 103, 99 +//}; +// +////??准色???藕?量??模?? +//unsigned char std_UV_QT[64] = +//{ +// 17, 18, 24, 47, 99, 99, 99, 99, +// 18, 21, 26, 66, 99, 99, 99, 99, +// 24, 26, 56, 99, 99, 99, 99, 99, +// 47, 66, 99, 99, 99, 99, 99, 99, +// 99, 99, 99, 99, 99, 99, 99, 99, +// 99, 99, 99, 99, 99, 99, 99, 99, +// 99, 99, 99, 99, 99, 99, 99, 99, +// 99, 99, 99, 99, 99, 99, 99, 99 +//}; + +////?炼??藕?量??模?? +//unsigned char std_Y_QT[64] = +//{ +// 6, 4, 5, 6, 5, 4, 6, 6, +// 5, 6, 7, 7, 6, 8, 10, 16, +// 10, 10, 9, 9, 10, 20, 14, 15, +// 12, 16, 23, 20, 24, 24, 23, 20, +// 22, 22, 26, 29, 37, 31, 26, 27, +// 35, 28, 22, 22, 32, 44, 32, 35, +// 38, 39, 41, 42, 41, 25, 31, 45, +// 48, 45, 40, 48, 37, 40, 41, 40 +//}; +// +////色???藕?量??模?? +//unsigned char std_UV_QT[64] = +//{ +// 7, 7, 7, 10, 8, 10, 19, 10, +// 10, 19, 40, 26, 22, 26, 40, 40, +// 40, 40, 40, 40, 40, 40, 40, 40, +// 40, 40, 40, 40, 40, 40, 40, 40, +// 40, 40, 40, 40, 40, 40, 40, 40, +// 40, 40, 40, 40, 40, 40, 40, 40, +// 40, 40, 40, 40, 40, 40, 40, 40, +// 40, 40, 40, 40, 40, 40, 40, 40 +//}; + +//?炼??藕?量??模?? +unsigned char std_Y_QT[64] = +{ + 0.75 * 6, 0.75 * 4, 0.75 * 5, 0.75 * 6, 0.75 * 5, 0.75 * 4, 0.75 * 6, 0.75 * 6, + 0.75 * 5, 0.75 * 6, 0.75 * 7, 0.75 * 7, 0.75 * 6, 0.75 * 8, 0.75 * 10, 0.75 * 16, + 0.75 * 10, 0.75 * 10, 0.75 * 9, 0.75 * 9, 0.75 * 10, 0.75 * 20, 0.75 * 14, 0.75 * 15, + 0.75 * 12, 0.75 * 16, 0.75 * 23, 0.75 * 20, 0.75 * 24, 0.75 * 24, 0.75 * 23, 0.75 * 20, + 0.75 * 22, 0.75 * 22, 0.75 * 26, 0.75 * 29, 0.75 * 37, 0.75 * 31, 0.75 * 26, 0.75 * 27, + 0.75 * 35, 0.75 * 28, 0.75 * 22, 0.75 * 22, 0.75 * 32, 0.75 * 44, 0.75 * 32, 0.75 * 35, + 0.75 * 38, 0.75 * 39, 0.75 * 41, 0.75 * 42, 0.75 * 41, 0.75 * 25, 0.75 * 31, 0.75 * 45, + 0.75 * 48, 0.75 * 45, 0.75 * 40, 0.75 * 48, 0.75 * 37, 0.75 * 40, 0.75 * 41, 0.75 * 40 +}; + +//色???藕?量??模?? +unsigned char std_UV_QT[64] = +{ + 0.75 * 7, 0.75 * 7, 0.75 * 7, 0.75 * 10, 0.75 * 8, 0.75 * 10, 0.75 * 19, 0.75 * 10, + 0.75 * 10, 0.75 * 19, 0.75 * 40, 0.75 * 26, 0.75 * 22, 0.75 * 26, 0.75 * 40, 0.75 * 40, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30 +}; + +unsigned char STD_DC_Y_NRCODES[16] = { 0, 1, 5, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0 }; +unsigned char STD_DC_Y_VALUES[12] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 }; + +unsigned char STD_DC_UV_NRCODES[16] = { 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0 }; +unsigned char STD_DC_UV_VALUES[12] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 }; + +unsigned char STD_AC_Y_NRCODES[16] = { 0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 0X7D }; +unsigned char STD_AC_Y_VALUES[162] = +{ + 0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12, + 0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07, + 0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08, + 0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0, + 0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0a, 0x16, + 0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28, + 0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, + 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, + 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, + 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, + 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, + 0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, + 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, + 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, + 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, + 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5, + 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4, + 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2, + 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, + 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, + 0xf9, 0xfa +}; + +unsigned char STD_AC_UV_NRCODES[16] = { 0, 2, 1, 2, 4, 4, 3, 4, 7, 5, 4, 4, 0, 1, 2, 0X77 }; +unsigned char STD_AC_UV_VALUES[162] = +{ + 0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21, + 0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71, + 0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91, + 0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0, + 0x15, 0x62, 0x72, 0xd1, 0x0a, 0x16, 0x24, 0x34, + 0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26, + 0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38, + 0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, + 0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, + 0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, + 0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, + 0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + 0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, + 0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, + 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, + 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, + 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, + 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, + 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, + 0xea, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, + 0xf9, 0xfa +}; + +int DivUp(int x, int d) +{ + return (x + d - 1) / d; +} + +template +void writeAndAdvance(unsigned char *&pData, T nElement) +{ + writeBigEndian(pData, nElement); + pData += sizeof(T); +} + +void writeMarker(unsigned char nMarker, unsigned char *&pData) +{ + *pData++ = 0x0ff; + *pData++ = nMarker; +} + +void writeJFIFTag(unsigned char *&pData) +{ + const char JFIF_TAG[] = + { + 0x4a, 0x46, 0x49, 0x46, 0x00, + 0x01, 0x02, + 0x00, + 0x00, 0x01, 0x00, 0x01, + 0x00, 0x00 + }; + + writeMarker(0x0e0, pData); + writeAndAdvance(pData, sizeof(JFIF_TAG) + sizeof(unsigned short)); + memcpy(pData, JFIF_TAG, sizeof(JFIF_TAG)); + pData += sizeof(JFIF_TAG); +} + +void writeFrameHeader(const FrameHeader &header, unsigned char *&pData) +{ + unsigned char aTemp[128]; + unsigned char *pTemp = aTemp; + + writeAndAdvance(pTemp, header.nSamplePrecision); + writeAndAdvance(pTemp, header.nHeight); + writeAndAdvance(pTemp, header.nWidth); + writeAndAdvance(pTemp, header.nComponents); + + for (int c = 0; c(pTemp, header.aComponentIdentifier[c]); + writeAndAdvance(pTemp, header.aSamplingFactors[c]); + writeAndAdvance(pTemp, header.aQuantizationTableSelector[c]); + } + + unsigned short nLength = (unsigned short)(pTemp - aTemp); + + writeMarker(0x0C0, pData); + writeAndAdvance(pData, nLength + 2); + memcpy(pData, aTemp, nLength); + pData += nLength; +} + +void writeScanHeader(const ScanHeader &header, unsigned char *&pData) +{ + unsigned char aTemp[128]; + unsigned char *pTemp = aTemp; + + writeAndAdvance(pTemp, header.nComponents); + + for (int c = 0; c(pTemp, header.aComponentSelector[c]); + writeAndAdvance(pTemp, header.aHuffmanTablesSelector[c]); + } + + writeAndAdvance(pTemp, header.nSs); + writeAndAdvance(pTemp, header.nSe); + writeAndAdvance(pTemp, header.nA); + + unsigned short nLength = (unsigned short)(pTemp - aTemp); + + writeMarker(0x0DA, pData); + writeAndAdvance(pData, nLength + 2); + memcpy(pData, aTemp, nLength); + pData += nLength; +} + +void writeQuantizationTable(const QuantizationTable &table, unsigned char *&pData) +{ + writeMarker(0x0DB, pData); + writeAndAdvance(pData, sizeof(QuantizationTable) + 2); + memcpy(pData, &table, sizeof(QuantizationTable)); + pData += sizeof(QuantizationTable); +} + +void writeHuffmanTable(const HuffmanTable &table, unsigned char *&pData) +{ + writeMarker(0x0C4, pData); + + // Number of Codes for Bit Lengths [1..16] + int nCodeCount = 0; + + for (int i = 0; i < 16; ++i) + { + nCodeCount += table.aCodes[i]; + } + + writeAndAdvance(pData, 17 + nCodeCount + 2); + memcpy(pData, &table, 17 + nCodeCount); + pData += 17 + nCodeCount; +} + +bool printfNPPinfo(int cudaVerMajor, int cudaVerMinor) +{ + const NppLibraryVersion *libVer = nppGetLibVersion(); + + printf("NPP Library Version %d.%d.%d\n", libVer->major, libVer->minor, libVer->build); + + int driverVersion, runtimeVersion; + cudaDriverGetVersion(&driverVersion); + cudaRuntimeGetVersion(&runtimeVersion); + + printf(" CUDA Driver Version: %d.%d\n", driverVersion / 1000, (driverVersion % 100) / 10); + printf(" CUDA Runtime Version: %d.%d\n", runtimeVersion / 1000, (runtimeVersion % 100) / 10); + + bool bVal = checkCudaCapabilities(cudaVerMajor, cudaVerMinor); + return bVal; +} + +NppiDCTState *pDCTState; +FrameHeader oFrameHeader; +FrameHeader oFrameHeaderFixedSize; +ScanHeader oScanHeader; +QuantizationTable aQuantizationTables[4]; +Npp8u *pdQuantizationTables; +HuffmanTable aHuffmanTables[4]; +HuffmanTable *pHuffmanDCTables; +HuffmanTable *pHuffmanACTables; +int nMCUBlocksH; +int nMCUBlocksV; +int nMCUBlocksHFixedSize; +int nMCUBlocksVFixedSize; +Npp8u *pdScan; +NppiEncodeHuffmanSpec *apHuffmanDCTable[3]; +NppiEncodeHuffmanSpec *apHuffmanACTable[3]; +unsigned char *pDstJpeg; +unsigned char *pDstOutput; +int nRestartInterval; + +int initTable() +{ + NPP_CHECK_NPP(nppiDCTInitAlloc(&pDCTState)); + + nRestartInterval = -1; + + cudaMalloc(&pdQuantizationTables, 64 * 4); + pHuffmanDCTables = aHuffmanTables; + pHuffmanACTables = &aHuffmanTables[2]; + memset(aQuantizationTables, 0, 4 * sizeof(QuantizationTable)); + memset(aHuffmanTables, 0, 4 * sizeof(HuffmanTable)); + memset(&oFrameHeader, 0, sizeof(FrameHeader)); + + + //????Huffman?? + aHuffmanTables[0].nClassAndIdentifier = 0; + memcpy(aHuffmanTables[0].aCodes, STD_DC_Y_NRCODES, 16); + memcpy(aHuffmanTables[0].aTable, STD_DC_Y_VALUES, 12); + + aHuffmanTables[1].nClassAndIdentifier = 1; + memcpy(aHuffmanTables[1].aCodes, STD_DC_UV_NRCODES, 16); + memcpy(aHuffmanTables[1].aTable, STD_DC_UV_VALUES, 12); + + aHuffmanTables[2].nClassAndIdentifier = 16; + memcpy(aHuffmanTables[2].aCodes, STD_AC_Y_NRCODES, 16); + memcpy(aHuffmanTables[2].aTable, STD_AC_Y_VALUES, 162); + + aHuffmanTables[3].nClassAndIdentifier = 17; + memcpy(aHuffmanTables[3].aCodes, STD_AC_UV_NRCODES, 16); + memcpy(aHuffmanTables[3].aTable, STD_AC_UV_VALUES, 162); + + + //????量???? + aQuantizationTables[0].nPrecisionAndIdentifier = 0; + memcpy(aQuantizationTables[0].aTable, std_Y_QT, 64); + aQuantizationTables[1].nPrecisionAndIdentifier = 1; + memcpy(aQuantizationTables[1].aTable, std_UV_QT, 64); + + NPP_CHECK_CUDA(cudaMemcpyAsync(pdQuantizationTables, aQuantizationTables[0].aTable, 64, cudaMemcpyHostToDevice)); + NPP_CHECK_CUDA(cudaMemcpyAsync(pdQuantizationTables + 64, aQuantizationTables[1].aTable, 64, cudaMemcpyHostToDevice)); + + oFrameHeader.nSamplePrecision = 8; + oFrameHeader.nComponents = 3; + oFrameHeader.aComponentIdentifier[0] = 1; + oFrameHeader.aComponentIdentifier[1] = 2; + oFrameHeader.aComponentIdentifier[2] = 3; + oFrameHeader.aSamplingFactors[0] = 34; + oFrameHeader.aSamplingFactors[1] = 17; + oFrameHeader.aSamplingFactors[2] = 17; + oFrameHeader.aQuantizationTableSelector[0] = 0; + oFrameHeader.aQuantizationTableSelector[1] = 1; + oFrameHeader.aQuantizationTableSelector[2] = 1; + + for (int i = 0; i < oFrameHeader.nComponents; ++i) + { + nMCUBlocksV = max(nMCUBlocksV, oFrameHeader.aSamplingFactors[i] & 0x0f); + nMCUBlocksH = max(nMCUBlocksH, oFrameHeader.aSamplingFactors[i] >> 4); + } + NPP_CHECK_CUDA(cudaMalloc(&pdScan, 4 << 20)); + + + + oScanHeader.nComponents = 3; + oScanHeader.aComponentSelector[0] = 1; + oScanHeader.aComponentSelector[1] = 2; + oScanHeader.aComponentSelector[2] = 3; + oScanHeader.aHuffmanTablesSelector[0] = 0; + oScanHeader.aHuffmanTablesSelector[1] = 17; + oScanHeader.aHuffmanTablesSelector[2] = 17; + oScanHeader.nSs = 0; + oScanHeader.nSe = 63; + oScanHeader.nA = 0; + + + return 0; +} + +NppiSize aSrcSize[3]; +Npp16s *apdDCT[3];// = { 0, 0, 0 }; +Npp32s aDCTStep[3]; + +Npp8u *apSrcImage[3];// = { 0, 0, 0 }; +Npp32s aSrcImageStep[3]; +size_t aSrcPitch[3]; + + +int releaseJpegNPP() +{ + nppiDCTFree(pDCTState); + cudaFree(pdQuantizationTables); + cudaFree(pdScan); + for (int i = 0; i < 3; ++i) + { + cudaFree(apdDCT[i]); + cudaFree(apSrcImage[i]); + } + return 0; +} + + +int initTable(int flag, int width, int height) +{ + //????帧头 + oFrameHeaderFixedSize.nSamplePrecision = 8; + oFrameHeaderFixedSize.nComponents = 3; + oFrameHeaderFixedSize.aComponentIdentifier[0] = 1; + oFrameHeaderFixedSize.aComponentIdentifier[1] = 2; + oFrameHeaderFixedSize.aComponentIdentifier[2] = 3; + oFrameHeaderFixedSize.aSamplingFactors[0] = 34; + oFrameHeaderFixedSize.aSamplingFactors[1] = 17; + oFrameHeaderFixedSize.aSamplingFactors[2] = 17; + oFrameHeaderFixedSize.aQuantizationTableSelector[0] = 0; + oFrameHeaderFixedSize.aQuantizationTableSelector[1] = 1; + oFrameHeaderFixedSize.aQuantizationTableSelector[2] = 1; + oFrameHeaderFixedSize.nWidth = width; + oFrameHeaderFixedSize.nHeight = height; + + for (int i = 0; i < oFrameHeaderFixedSize.nComponents; ++i) + { + nMCUBlocksVFixedSize = max(nMCUBlocksVFixedSize, oFrameHeaderFixedSize.aSamplingFactors[i] & 0x0f); + nMCUBlocksHFixedSize = max(nMCUBlocksHFixedSize, oFrameHeaderFixedSize.aSamplingFactors[i] >> 4); + } + + for (int i = 0; i < oFrameHeaderFixedSize.nComponents; ++i) + { + NppiSize oBlocks; + NppiSize oBlocksPerMCU = { oFrameHeaderFixedSize.aSamplingFactors[i] >> 4, oFrameHeaderFixedSize.aSamplingFactors[i] & 0x0f }; + + oBlocks.width = (int)ceil((oFrameHeaderFixedSize.nWidth + 7) / 8 * + static_cast(oBlocksPerMCU.width) / nMCUBlocksHFixedSize); + oBlocks.width = DivUp(oBlocks.width, oBlocksPerMCU.width) * oBlocksPerMCU.width; + + oBlocks.height = (int)ceil((oFrameHeaderFixedSize.nHeight + 7) / 8 * + static_cast(oBlocksPerMCU.height) / nMCUBlocksVFixedSize); + oBlocks.height = DivUp(oBlocks.height, oBlocksPerMCU.height) * oBlocksPerMCU.height; + + aSrcSize[i].width = oBlocks.width * 8; + aSrcSize[i].height = oBlocks.height * 8; + + // Allocate Memory + size_t nPitch; + NPP_CHECK_CUDA(cudaMallocPitch(&apdDCT[i], &nPitch, oBlocks.width * 64 * sizeof(Npp16s), oBlocks.height)); + aDCTStep[i] = static_cast(nPitch); + + NPP_CHECK_CUDA(cudaMallocPitch(&apSrcImage[i], &nPitch, aSrcSize[i].width, aSrcSize[i].height)); + + aSrcPitch[i] = nPitch; + aSrcImageStep[i] = static_cast(nPitch); + } + + return 0; +} + +int jpegNPP(const char *szOutputFile, float* d_srcRGB) +{ + //RGB2YUV + cudaError_t cudaStatus; + cudaStatus = cuda_common::RGB2YUV(d_srcRGB, oFrameHeaderFixedSize.nWidth, oFrameHeaderFixedSize.nHeight, + apSrcImage[0], aSrcPitch[0], aSrcSize[0].width, aSrcSize[0].height, + apSrcImage[1], aSrcPitch[1], aSrcSize[1].width, aSrcSize[1].height, + apSrcImage[2], aSrcPitch[2], aSrcSize[2].width, aSrcSize[2].height); + + /** + * Forward DCT, quantization and level shift part of the JPEG encoding. + * Input is expected in 8x8 macro blocks and output is expected to be in 64x1 + * macro blocks. The new version of the primitive takes the ROI in image pixel size and + * works with DCT coefficients that are in zig-zag order. + */ + int k = 0; + //LOG_INFO("NPP_CHECK_NPP:%d", 1); + if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[0], aSrcImageStep[0], + apdDCT[0], aDCTStep[0], + pdQuantizationTables + k * 64, + aSrcSize[0], + pDCTState))) + { + printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); + return EXIT_FAILURE; + } + + k = 1; + //LOG_INFO("NPP_CHECK_NPP:%d", 2); + if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[1], aSrcImageStep[1], + apdDCT[1], aDCTStep[1], + pdQuantizationTables + k * 64, + aSrcSize[1], + pDCTState))) + { + printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); + return EXIT_FAILURE; + } + + //LOG_INFO("NPP_CHECK_NPP:%d", 3); + if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[2], aSrcImageStep[2], + apdDCT[2], aDCTStep[2], + pdQuantizationTables + k * 64, + aSrcSize[2], + pDCTState))) + { + printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); + return EXIT_FAILURE; + } + + // Huffman Encoding + + Npp32s nScanLength; + Npp8u *pJpegEncoderTemp; + +#if (CUDA_VERSION == 8000) + Npp32s nTempSize; //when using CUDA8 +#else + size_t nTempSize; //when using CUDA9 +#endif + //modified by Junlin 190221 + + //LOG_INFO("NPP_CHECK_NPP:%d",4); + if (NPP_SUCCESS != (nppiEncodeHuffmanGetSize(aSrcSize[0], 3, &nTempSize))) + { + printf("nppiEncodeHuffmanGetSize Failed!\n"); + return EXIT_FAILURE; + } + + //LOG_INFO("NPP_CHECK_CUDA:%d",5); + NPP_CHECK_CUDA(cudaMalloc(&pJpegEncoderTemp, nTempSize)); + + /** + * Allocates memory and creates a Huffman table in a format that is suitable for the encoder. + */ + NppStatus t_status; + t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[0].aCodes, nppiDCTable, &apHuffmanDCTable[0]); + t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[0].aCodes, nppiACTable, &apHuffmanACTable[0]); + t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[1]); + t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[1]); + t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[2]); + t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[2]); + + /** + * Huffman Encoding of the JPEG Encoding. + * Input is expected to be 64x1 macro blocks and output is expected as byte stuffed huffman encoded JPEG scan. + */ + Npp32s nSs = 0; + Npp32s nSe = 63; + Npp32s nH = 0; + Npp32s nL = 0; + //LOG_INFO("NPP_CHECK_NPP:%d",6); + if (NPP_SUCCESS != (nppiEncodeHuffmanScan_JPEG_8u16s_P3R(apdDCT, aDCTStep, + 0, nSs, nSe, nH, nL, + pdScan, &nScanLength, + apHuffmanDCTable, + apHuffmanACTable, + aSrcSize, + pJpegEncoderTemp))) + { + printf("nppiEncodeHuffmanScan_JPEG_8u16s_P3R Failed!\n"); + return EXIT_FAILURE; + } + + for (int i = 0; i < 3; ++i) + { + nppiEncodeHuffmanSpecFree_JPEG(apHuffmanDCTable[i]); + nppiEncodeHuffmanSpecFree_JPEG(apHuffmanACTable[i]); + } + // Write JPEG + pDstJpeg = new unsigned char[4 << 20]{}; + pDstOutput = pDstJpeg; + + writeMarker(0x0D8, pDstOutput); + writeJFIFTag(pDstOutput); + writeQuantizationTable(aQuantizationTables[0], pDstOutput); + writeQuantizationTable(aQuantizationTables[1], pDstOutput); + writeHuffmanTable(pHuffmanDCTables[0], pDstOutput); + writeHuffmanTable(pHuffmanACTables[0], pDstOutput); + writeHuffmanTable(pHuffmanDCTables[1], pDstOutput); + writeHuffmanTable(pHuffmanACTables[1], pDstOutput); + writeFrameHeader(oFrameHeaderFixedSize, pDstOutput); + writeScanHeader(oScanHeader, pDstOutput); + + //LOG_INFO("NPP_CHECK_CUDA:%d",7); + NPP_CHECK_CUDA(cudaMemcpy(pDstOutput, pdScan, nScanLength, cudaMemcpyDeviceToHost)); + + pDstOutput += nScanLength; + writeMarker(0x0D9, pDstOutput); + { + // Write result to file. + std::ofstream outputFile(szOutputFile, ios::out | ios::binary); + outputFile.write(reinterpret_cast(pDstJpeg), static_cast(pDstOutput - pDstJpeg)); + } + + // Cleanup + cudaFree(pJpegEncoderTemp); + delete[] pDstJpeg; + + + return EXIT_SUCCESS; +} + +int jpegNPP(const char *szOutputFile, unsigned char* d_srcRGB) +{ + //RGB2YUV + cudaError_t cudaStatus; + cudaStatus = cuda_common::RGB2YUV(d_srcRGB, oFrameHeaderFixedSize.nWidth, oFrameHeaderFixedSize.nHeight, + apSrcImage[0], aSrcPitch[0], aSrcSize[0].width, aSrcSize[0].height, + apSrcImage[1], aSrcPitch[1], aSrcSize[1].width, aSrcSize[1].height, + apSrcImage[2], aSrcPitch[2], aSrcSize[2].width, aSrcSize[2].height); + + /** + * Forward DCT, quantization and level shift part of the JPEG encoding. + * Input is expected in 8x8 macro blocks and output is expected to be in 64x1 + * macro blocks. The new version of the primitive takes the ROI in image pixel size and + * works with DCT coefficients that are in zig-zag order. + */ + int k = 0; + //LOG_INFO("NPP_CHECK_NPP:%d", 1); + if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[0], aSrcImageStep[0], + apdDCT[0], aDCTStep[0], + pdQuantizationTables + k * 64, + aSrcSize[0], + pDCTState))) + { + printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); + return EXIT_FAILURE; + } + + k = 1; + //LOG_INFO("NPP_CHECK_NPP:%d", 2); + if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[1], aSrcImageStep[1], + apdDCT[1], aDCTStep[1], + pdQuantizationTables + k * 64, + aSrcSize[1], + pDCTState))) + { + printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); + return EXIT_FAILURE; + } + + //LOG_INFO("NPP_CHECK_NPP:%d", 3); + if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[2], aSrcImageStep[2], + apdDCT[2], aDCTStep[2], + pdQuantizationTables + k * 64, + aSrcSize[2], + pDCTState))) + { + printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); + return EXIT_FAILURE; + } + + // Huffman Encoding + + Npp32s nScanLength; + Npp8u *pJpegEncoderTemp; + +#if (CUDA_VERSION == 8000) + Npp32s nTempSize; //when using CUDA8 +#else + size_t nTempSize; //when using CUDA9 +#endif + //modified by Junlin 190221 + + //LOG_INFO("NPP_CHECK_NPP:%d",4); + if (NPP_SUCCESS != (nppiEncodeHuffmanGetSize(aSrcSize[0], 3, &nTempSize))) + { + printf("nppiEncodeHuffmanGetSize Failed!\n"); + return EXIT_FAILURE; + } + + //LOG_INFO("NPP_CHECK_CUDA:%d",5); + NPP_CHECK_CUDA(cudaMalloc(&pJpegEncoderTemp, nTempSize)); + + /** + * Allocates memory and creates a Huffman table in a format that is suitable for the encoder. + */ + NppStatus t_status; + t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[0].aCodes, nppiDCTable, &apHuffmanDCTable[0]); + t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[0].aCodes, nppiACTable, &apHuffmanACTable[0]); + t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[1]); + t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[1]); + t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[2]); + t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[2]); + + /** + * Huffman Encoding of the JPEG Encoding. + * Input is expected to be 64x1 macro blocks and output is expected as byte stuffed huffman encoded JPEG scan. + */ + Npp32s nSs = 0; + Npp32s nSe = 63; + Npp32s nH = 0; + Npp32s nL = 0; + //LOG_INFO("NPP_CHECK_NPP:%d",6); + if (NPP_SUCCESS != (nppiEncodeHuffmanScan_JPEG_8u16s_P3R(apdDCT, aDCTStep, + 0, nSs, nSe, nH, nL, + pdScan, &nScanLength, + apHuffmanDCTable, + apHuffmanACTable, + aSrcSize, + pJpegEncoderTemp))) + { + printf("nppiEncodeHuffmanScan_JPEG_8u16s_P3R Failed!\n"); + return EXIT_FAILURE; + } + + for (int i = 0; i < 3; ++i) + { + nppiEncodeHuffmanSpecFree_JPEG(apHuffmanDCTable[i]); + nppiEncodeHuffmanSpecFree_JPEG(apHuffmanACTable[i]); + } + // Write JPEG + pDstJpeg = new unsigned char[4 << 20]{}; + pDstOutput = pDstJpeg; + + writeMarker(0x0D8, pDstOutput); + writeJFIFTag(pDstOutput); + writeQuantizationTable(aQuantizationTables[0], pDstOutput); + writeQuantizationTable(aQuantizationTables[1], pDstOutput); + writeHuffmanTable(pHuffmanDCTables[0], pDstOutput); + writeHuffmanTable(pHuffmanACTables[0], pDstOutput); + writeHuffmanTable(pHuffmanDCTables[1], pDstOutput); + writeHuffmanTable(pHuffmanACTables[1], pDstOutput); + writeFrameHeader(oFrameHeaderFixedSize, pDstOutput); + writeScanHeader(oScanHeader, pDstOutput); + + //LOG_INFO("NPP_CHECK_CUDA:%d",7); + NPP_CHECK_CUDA(cudaMemcpy(pDstOutput, pdScan, nScanLength, cudaMemcpyDeviceToHost)); + + pDstOutput += nScanLength; + writeMarker(0x0D9, pDstOutput); + { + // Write result to file. + std::ofstream outputFile(szOutputFile, ios::out | ios::binary); + outputFile.write(reinterpret_cast(pDstJpeg), static_cast(pDstOutput - pDstJpeg)); + } + + // Cleanup + cudaFree(pJpegEncoderTemp); + delete[] pDstJpeg; + + + return EXIT_SUCCESS; +} + + +int jpegNPP(const char *szOutputFile, float* d_srcRGB, int img_width, int img_height) +{ + NppiSize aSrcSize[3]; + Npp16s *apdDCT[3] = { 0, 0, 0 }; + Npp32s aDCTStep[3]; + + Npp8u *apSrcImage[3] = { 0, 0, 0 }; + Npp32s aSrcImageStep[3]; + size_t aSrcPitch[3]; + + + //????帧头 + oFrameHeader.nWidth = img_width; + oFrameHeader.nHeight = img_height; + + for (int i = 0; i < oFrameHeader.nComponents; ++i) + { + NppiSize oBlocks; + NppiSize oBlocksPerMCU = { oFrameHeader.aSamplingFactors[i] >> 4, oFrameHeader.aSamplingFactors[i] & 0x0f }; + + oBlocks.width = (int)ceil((oFrameHeader.nWidth + 7) / 8 * + static_cast(oBlocksPerMCU.width) / nMCUBlocksH); + oBlocks.width = DivUp(oBlocks.width, oBlocksPerMCU.width) * oBlocksPerMCU.width; + + oBlocks.height = (int)ceil((oFrameHeader.nHeight + 7) / 8 * + static_cast(oBlocksPerMCU.height) / nMCUBlocksV); + oBlocks.height = DivUp(oBlocks.height, oBlocksPerMCU.height) * oBlocksPerMCU.height; + + aSrcSize[i].width = oBlocks.width * 8; + aSrcSize[i].height = oBlocks.height * 8; + + // Allocate Memory + size_t nPitch; + //LOG_INFO("NPP_CHECK_CUDA:%d",1); + NPP_CHECK_CUDA(cudaMallocPitch(&apdDCT[i], &nPitch, oBlocks.width * 64 * sizeof(Npp16s), oBlocks.height)); + aDCTStep[i] = static_cast(nPitch); + + //LOG_INFO("NPP_CHECK_CUDA:%d",2); + NPP_CHECK_CUDA(cudaMallocPitch(&apSrcImage[i], &nPitch, aSrcSize[i].width, aSrcSize[i].height)); + + aSrcPitch[i] = nPitch; + aSrcImageStep[i] = static_cast(nPitch); + } + + //RGB2YUV + cudaError_t cudaStatus; + cudaStatus = cuda_common::RGB2YUV(d_srcRGB, img_width, img_height, + apSrcImage[0], aSrcPitch[0], aSrcSize[0].width, aSrcSize[0].height, + apSrcImage[1], aSrcPitch[1], aSrcSize[1].width, aSrcSize[1].height, + apSrcImage[2], aSrcPitch[2], aSrcSize[2].width, aSrcSize[2].height); + + /** + * Forward DCT, quantization and level shift part of the JPEG encoding. + * Input is expected in 8x8 macro blocks and output is expected to be in 64x1 + * macro blocks. The new version of the primitive takes the ROI in image pixel size and + * works with DCT coefficients that are in zig-zag order. + */ + int k = 0; + //LOG_INFO("NPP_CHECK_CUDA:%d",3); + if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[0], aSrcImageStep[0], + apdDCT[0], aDCTStep[0], + pdQuantizationTables + k * 64, + aSrcSize[0], + pDCTState))) + { + printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); + return EXIT_FAILURE; + } + k = 1; + + //LOG_INFO("NPP_CHECK_CUDA:%d",4); + if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[1], aSrcImageStep[1], + apdDCT[1], aDCTStep[1], + pdQuantizationTables + k * 64, + aSrcSize[1], + pDCTState))) + { + printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); + return EXIT_FAILURE; + } + + //LOG_INFO("NPP_CHECK_CUDA:%d",5); + if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[2], aSrcImageStep[2], + apdDCT[2], aDCTStep[2], + pdQuantizationTables + k * 64, + aSrcSize[2], + pDCTState))) + { + printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); + return EXIT_FAILURE; + } + + // Huffman Encoding + + Npp32s nScanLength; + Npp8u *pJpegEncoderTemp; + +#if (CUDA_VERSION == 8000) + Npp32s nTempSize; //when using CUDA8 +#else + size_t nTempSize; //when using CUDA9 +#endif + //modified by Junlin 190221 + + //LOG_INFO("NPP_CHECK_CUDA:%d",6); + if (NPP_SUCCESS != (nppiEncodeHuffmanGetSize(aSrcSize[0], 3, &nTempSize))) + { + printf("nppiEncodeHuffmanGetSize Failed!\n"); + return EXIT_FAILURE; + } + + //LOG_INFO("NPP_CHECK_CUDA:%d",7); + NPP_CHECK_CUDA(cudaMalloc(&pJpegEncoderTemp, nTempSize)); + + /** + * Allocates memory and creates a Huffman table in a format that is suitable for the encoder. + */ + NppStatus t_status; + t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[0].aCodes, nppiDCTable, &apHuffmanDCTable[0]); + t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[0].aCodes, nppiACTable, &apHuffmanACTable[0]); + t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[1]); + t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[1]); + t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[2]); + t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[2]); + + /** + * Huffman Encoding of the JPEG Encoding. + * Input is expected to be 64x1 macro blocks and output is expected as byte stuffed huffman encoded JPEG scan. + */ + Npp32s nSs = 0; + Npp32s nSe = 63; + Npp32s nH = 0; + Npp32s nL = 0; + //LOG_INFO("NPP_CHECK_CUDA:%d",8); + if (NPP_SUCCESS != (nppiEncodeHuffmanScan_JPEG_8u16s_P3R(apdDCT, aDCTStep, + 0, nSs, nSe, nH, nL, + pdScan, &nScanLength, + apHuffmanDCTable, + apHuffmanACTable, + aSrcSize, + pJpegEncoderTemp))) + { + printf("nppiEncodeHuffmanScan_JPEG_8u16s_P3R Failed!\n"); + return EXIT_FAILURE; + } + + for (int i = 0; i < 3; ++i) + { + nppiEncodeHuffmanSpecFree_JPEG(apHuffmanDCTable[i]); + nppiEncodeHuffmanSpecFree_JPEG(apHuffmanACTable[i]); + } + // Write JPEG + pDstJpeg = new unsigned char[4 << 20]{}; + pDstOutput = pDstJpeg; + + writeMarker(0x0D8, pDstOutput); + writeJFIFTag(pDstOutput); + writeQuantizationTable(aQuantizationTables[0], pDstOutput); + writeQuantizationTable(aQuantizationTables[1], pDstOutput); + writeHuffmanTable(pHuffmanDCTables[0], pDstOutput); + writeHuffmanTable(pHuffmanACTables[0], pDstOutput); + writeHuffmanTable(pHuffmanDCTables[1], pDstOutput); + writeHuffmanTable(pHuffmanACTables[1], pDstOutput); + writeFrameHeader(oFrameHeader, pDstOutput); + writeScanHeader(oScanHeader, pDstOutput); + + //LOG_INFO("NPP_CHECK_CUDA:%d",9); + NPP_CHECK_CUDA(cudaMemcpy(pDstOutput, pdScan, nScanLength, cudaMemcpyDeviceToHost)); + + pDstOutput += nScanLength; + writeMarker(0x0D9, pDstOutput); + + { + // Write result to file. + std::ofstream outputFile(szOutputFile, ios::out | ios::binary); + outputFile.write(reinterpret_cast(pDstJpeg), static_cast(pDstOutput - pDstJpeg)); + } + + // Cleanup + cudaFree(pJpegEncoderTemp); + delete[] pDstJpeg; + for (int i = 0; i < 3; ++i) + { + cudaFree(apdDCT[i]); + cudaFree(apSrcImage[i]); + } + + return EXIT_SUCCESS; +} + + +int jpegNPP(const char *szOutputFile, unsigned char* d_srcRGB, int img_width, int img_height) +{ + NppiSize aSrcSize[3]; + Npp16s *apdDCT[3] = { 0, 0, 0 }; + Npp32s aDCTStep[3]; + + Npp8u *apSrcImage[3] = { 0, 0, 0 }; + Npp32s aSrcImageStep[3]; + size_t aSrcPitch[3]; + + + //????帧头 + oFrameHeader.nWidth = img_width; + oFrameHeader.nHeight = img_height; + + for (int i = 0; i < oFrameHeader.nComponents; ++i) + { + NppiSize oBlocks; + NppiSize oBlocksPerMCU = { oFrameHeader.aSamplingFactors[i] >> 4, oFrameHeader.aSamplingFactors[i] & 0x0f }; + + oBlocks.width = (int)ceil((oFrameHeader.nWidth + 7) / 8 * + static_cast(oBlocksPerMCU.width) / nMCUBlocksH); + oBlocks.width = DivUp(oBlocks.width, oBlocksPerMCU.width) * oBlocksPerMCU.width; + + oBlocks.height = (int)ceil((oFrameHeader.nHeight + 7) / 8 * + static_cast(oBlocksPerMCU.height) / nMCUBlocksV); + oBlocks.height = DivUp(oBlocks.height, oBlocksPerMCU.height) * oBlocksPerMCU.height; + + aSrcSize[i].width = oBlocks.width * 8; + aSrcSize[i].height = oBlocks.height * 8; + + // Allocate Memory + size_t nPitch; + //LOG_INFO("NPP_CHECK_CUDA:%d",1); + NPP_CHECK_CUDA(cudaMallocPitch(&apdDCT[i], &nPitch, oBlocks.width * 64 * sizeof(Npp16s), oBlocks.height)); + aDCTStep[i] = static_cast(nPitch); + + //LOG_INFO("NPP_CHECK_CUDA:%d",2); + NPP_CHECK_CUDA(cudaMallocPitch(&apSrcImage[i], &nPitch, aSrcSize[i].width, aSrcSize[i].height)); + + aSrcPitch[i] = nPitch; + aSrcImageStep[i] = static_cast(nPitch); + } + + //RGB2YUV + cudaError_t cudaStatus; + cudaStatus = cuda_common::RGB2YUV(d_srcRGB, img_width, img_height, + apSrcImage[0], aSrcPitch[0], aSrcSize[0].width, aSrcSize[0].height, + apSrcImage[1], aSrcPitch[1], aSrcSize[1].width, aSrcSize[1].height, + apSrcImage[2], aSrcPitch[2], aSrcSize[2].width, aSrcSize[2].height); + + /** + * Forward DCT, quantization and level shift part of the JPEG encoding. + * Input is expected in 8x8 macro blocks and output is expected to be in 64x1 + * macro blocks. The new version of the primitive takes the ROI in image pixel size and + * works with DCT coefficients that are in zig-zag order. + */ + int k = 0; + //LOG_INFO("NPP_CHECK_CUDA:%d",3); + if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[0], aSrcImageStep[0], + apdDCT[0], aDCTStep[0], + pdQuantizationTables + k * 64, + aSrcSize[0], + pDCTState))) + { + printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); + return EXIT_FAILURE; + } + k = 1; + + //LOG_INFO("NPP_CHECK_CUDA:%d",4); + if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[1], aSrcImageStep[1], + apdDCT[1], aDCTStep[1], + pdQuantizationTables + k * 64, + aSrcSize[1], + pDCTState))) + { + printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); + return EXIT_FAILURE; + } + + //LOG_INFO("NPP_CHECK_CUDA:%d",5); + if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[2], aSrcImageStep[2], + apdDCT[2], aDCTStep[2], + pdQuantizationTables + k * 64, + aSrcSize[2], + pDCTState))) + { + printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); + return EXIT_FAILURE; + } + + // Huffman Encoding + + Npp32s nScanLength; + Npp8u *pJpegEncoderTemp; + +#if (CUDA_VERSION == 8000) + Npp32s nTempSize; //when using CUDA8 +#else + size_t nTempSize; //when using CUDA9 +#endif + //modified by Junlin 190221 + + //LOG_INFO("NPP_CHECK_CUDA:%d",6); + if (NPP_SUCCESS != (nppiEncodeHuffmanGetSize(aSrcSize[0], 3, &nTempSize))) + { + printf("nppiEncodeHuffmanGetSize Failed!\n"); + return EXIT_FAILURE; + } + + //LOG_INFO("NPP_CHECK_CUDA:%d",7); + NPP_CHECK_CUDA(cudaMalloc(&pJpegEncoderTemp, nTempSize)); + + /** + * Allocates memory and creates a Huffman table in a format that is suitable for the encoder. + */ + NppStatus t_status; + t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[0].aCodes, nppiDCTable, &apHuffmanDCTable[0]); + t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[0].aCodes, nppiACTable, &apHuffmanACTable[0]); + t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[1]); + t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[1]); + t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[2]); + t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[2]); + + /** + * Huffman Encoding of the JPEG Encoding. + * Input is expected to be 64x1 macro blocks and output is expected as byte stuffed huffman encoded JPEG scan. + */ + Npp32s nSs = 0; + Npp32s nSe = 63; + Npp32s nH = 0; + Npp32s nL = 0; + //LOG_INFO("NPP_CHECK_CUDA:%d",8); + if (NPP_SUCCESS != (nppiEncodeHuffmanScan_JPEG_8u16s_P3R(apdDCT, aDCTStep, + 0, nSs, nSe, nH, nL, + pdScan, &nScanLength, + apHuffmanDCTable, + apHuffmanACTable, + aSrcSize, + pJpegEncoderTemp))) + { + printf("nppiEncodeHuffmanScan_JPEG_8u16s_P3R Failed!\n"); + return EXIT_FAILURE; + } + + for (int i = 0; i < 3; ++i) + { + nppiEncodeHuffmanSpecFree_JPEG(apHuffmanDCTable[i]); + nppiEncodeHuffmanSpecFree_JPEG(apHuffmanACTable[i]); + } + // Write JPEG + pDstJpeg = new unsigned char[4 << 20]{}; + pDstOutput = pDstJpeg; + + writeMarker(0x0D8, pDstOutput); + writeJFIFTag(pDstOutput); + writeQuantizationTable(aQuantizationTables[0], pDstOutput); + writeQuantizationTable(aQuantizationTables[1], pDstOutput); + writeHuffmanTable(pHuffmanDCTables[0], pDstOutput); + writeHuffmanTable(pHuffmanACTables[0], pDstOutput); + writeHuffmanTable(pHuffmanDCTables[1], pDstOutput); + writeHuffmanTable(pHuffmanACTables[1], pDstOutput); + writeFrameHeader(oFrameHeader, pDstOutput); + writeScanHeader(oScanHeader, pDstOutput); + + //LOG_INFO("NPP_CHECK_CUDA:%d",9); + NPP_CHECK_CUDA(cudaMemcpy(pDstOutput, pdScan, nScanLength, cudaMemcpyDeviceToHost)); + + pDstOutput += nScanLength; + writeMarker(0x0D9, pDstOutput); + + { + // Write result to file. + std::ofstream outputFile(szOutputFile, ios::out | ios::binary); + outputFile.write(reinterpret_cast(pDstJpeg), static_cast(pDstOutput - pDstJpeg)); + } + + // Cleanup + cudaFree(pJpegEncoderTemp); + delete[] pDstJpeg; + for (int i = 0; i < 3; ++i) + { + cudaFree(apdDCT[i]); + cudaFree(apSrcImage[i]); + } + + return EXIT_SUCCESS; +} diff --git a/src/main.cpp b/src/main.cpp index 9a43f7f..d24e8f4 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -10,8 +10,6 @@ #include - - #include @@ -24,6 +22,7 @@ #include "arpa/inet.h" #endif +#include "utiltools.hpp" #define MIN_RTP_PORT 10000 #define MAX_RTP_PORT 60000 @@ -88,7 +87,7 @@ int sum2 = 0; cudaStream_t stream[2]; -string data_home = "/data/tongtu/"; +string data_home = "/mnt/data/cmhu/tmp/"; #define checkCudaErrors(S) do {CUresult status; \ @@ -183,7 +182,7 @@ void postDecoded(const void * userPtr, AVFrame * gpuFrame){ cudaError_t cudaStatus; if(pHwRgb[0] == nullptr){ // cudaStreamCreate(&stream[0]); - cuda_common::setColorSpace( ITU709, 0 ); + cuda_common::setColorSpace( ITU_709, 0 ); cudaStatus = cudaMalloc((void **)&pHwRgb[0], 3 * gpuFrame->width * gpuFrame->height * sizeof(unsigned char)); } cudaStatus = cuda_common::CUDAToBGR((CUdeviceptr)gpuFrame->data[0],(CUdeviceptr)gpuFrame->data[1], gpuFrame->linesize[0], gpuFrame->linesize[1], pHwRgb[0], gpuFrame->width, gpuFrame->height); @@ -208,7 +207,7 @@ void postDecoded(const void * userPtr, AVFrame * gpuFrame){ cudaError_t cudaStatus; if(pHwRgb[1] == nullptr){ // cudaStreamCreate(&stream[1]); - cuda_common::setColorSpace( ITU709, 0 ); + cuda_common::setColorSpace( ITU_709, 0 ); cudaStatus = cudaMalloc((void **)&pHwRgb[1], 3 * gpuFrame->width * gpuFrame->height * sizeof(unsigned char)); } cudaStatus = cuda_common::CUDAToBGR((CUdeviceptr)gpuFrame->data[0],(CUdeviceptr)gpuFrame->data[1], gpuFrame->linesize[0], gpuFrame->linesize[1], pHwRgb[1], gpuFrame->width, gpuFrame->height); @@ -231,13 +230,6 @@ bool count_flag = false; int count = 0; int count_std = 100; -static long long get_cur_time(){ - // 获取操作系统当前时间点(精确到ms) - chrono::time_point tpMicro - = chrono::time_point_cast(chrono::system_clock::now()); - - return tpMicro.time_since_epoch().count(); -} static int sum = 0; unsigned char *pHwData = nullptr; @@ -255,13 +247,13 @@ void postDecoded0(const void * userPtr, AVFrame * gpuFrame){ { count_flag = true; count = 0; - end_time = start_time = get_cur_time(); + end_time = start_time = UtilTools::get_cur_time_ms(); } count++; sum ++ ; if (count >= count_std) { - // end_time = get_cur_time(); + // end_time = UtilTools::get_cur_time_ms(); // long time_using = end_time - start_time; // double time_per_frame = double(time_using)/count_std ; // cout << count_std << "帧用时:" << time_using << "ms 每帧用时:" << time_per_frame << "ms" << endl; @@ -278,7 +270,7 @@ void postDecoded0(const void * userPtr, AVFrame * gpuFrame){ // cout << "gpu id : " << decoder->m_cfg.gpuid.c_str() << endl; cudaError_t cudaStatus; if(pHwData == nullptr){ - cuda_common::setColorSpace( ITU709, 0 ); + cuda_common::setColorSpace( ITU_709, 0 ); cudaStatus = cudaMalloc((void **)&pHwData, 3 * gpuFrame->width * gpuFrame->height * sizeof(unsigned char)); } cudaStatus = cuda_common::CUDAToBGR((CUdeviceptr)gpuFrame->data[0],(CUdeviceptr)gpuFrame->data[1], gpuFrame->linesize[0], gpuFrame->linesize[1], pHwData, gpuFrame->width, gpuFrame->height); @@ -296,10 +288,10 @@ void postDecoded0(const void * userPtr, AVFrame * gpuFrame){ } void decode_finished_cbk(const void* userPtr){ - cout << "当前时间戳: " << get_cur_time() << endl; + cout << "当前时间戳: " << UtilTools::get_cur_time_ms() << endl; } -bool decode_request_stream_cbk(){ +bool decode_request_stream_cbk(const char* deviceId){ cout << "需在此请求流" << endl; return true; } @@ -374,7 +366,7 @@ void logFF(void *, int level, const char *fmt, va_list ap) int main(int argc, char* argv[]){ - test_uri = argv[1]; + test_uri = "rtsp://admin:admin@123456@192.168.60.176:554/cam/realmonitor?channel=1&subtype=0";//argv[1]; char* gpuid = argv[2]; int port = atoi(argv[3]); cout << test_uri << " gpu_id:" << gpuid << " port:" << port << endl; @@ -393,7 +385,7 @@ int main(int argc, char* argv[]){ std::this_thread::sleep_for(std::chrono::minutes(1)); FFNvDecoderManager* pDecManager = FFNvDecoderManager::getInstance(); int count = pDecManager->count(); - cout << "当前时间:" << get_cur_time() << " 当前运行路数: " << pDecManager->count() << endl; + cout << "当前时间:" << UtilTools::get_cur_time_ms() << " 当前运行路数: " << pDecManager->count() << endl; } return (void*)0; diff --git a/src/utiltools.hpp b/src/utiltools.hpp new file mode 100644 index 0000000..8caff91 --- /dev/null +++ b/src/utiltools.hpp @@ -0,0 +1,18 @@ +#ifndef _UTIL_TOOLS_HPP_ +#define _UTIL_TOOLS_HPP_ + +#include + +using namespace std; + +namespace UtilTools{ + + static long get_cur_time_ms() { + chrono::time_point tpMicro + = chrono::time_point_cast(chrono::system_clock::now()); + return tpMicro.time_since_epoch().count(); + } + +} + +#endif \ No newline at end of file -- libgit2 0.21.4