/* * Copyright 1993-2015 NVIDIA Corporation. All rights reserved. * * NOTICE TO USER: * * This source code is subject to NVIDIA ownership rights under U.S. and * international Copyright laws. * * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE * OR PERFORMANCE OF THIS SOURCE CODE. * * U.S. Government End Users. This source code is a "commercial item" as * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of * "commercial computer software" and "commercial computer software * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) * and is provided to the U.S. Government only as a commercial end item. * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the * source code with only those rights set forth herein. */ // This sample needs at least CUDA 5.5 and a GPU that has at least Compute Capability 2.0 // This sample demonstrates a simple image processing pipeline. // First, a JPEG file is huffman decoded and inverse DCT transformed and dequantized. // Then the different planes are resized. Finally, the resized image is quantized, forward // DCT transformed and huffman encoded. #include "cuda_kernels.h" #include #include #include "common/UtilNPP/Exceptions.h" #include "Endianess.h" #include #include #include #include #include "common/inc/helper_string.h" #include "common/inc/helper_cuda.h" //#include "MacroDef.h" #include "cuda.h" using namespace std; struct FrameHeader { unsigned char nSamplePrecision; unsigned short nHeight; unsigned short nWidth; unsigned char nComponents; unsigned char aComponentIdentifier[3]; unsigned char aSamplingFactors[3]; unsigned char aQuantizationTableSelector[3]; }; struct ScanHeader { unsigned char nComponents; unsigned char aComponentSelector[3]; unsigned char aHuffmanTablesSelector[3]; unsigned char nSs; unsigned char nSe; unsigned char nA; }; struct QuantizationTable { unsigned char nPrecisionAndIdentifier; unsigned char aTable[64]; }; struct HuffmanTable { unsigned char nClassAndIdentifier; unsigned char aCodes[16]; unsigned char aTable[256]; }; //??准?炼??藕?量??模?? //unsigned char std_Y_QT[64] = //{ // 16, 11, 10, 16, 24, 40, 51, 61, // 12, 12, 14, 19, 26, 58, 60, 55, // 14, 13, 16, 24, 40, 57, 69, 56, // 14, 17, 22, 29, 51, 87, 80, 62, // 18, 22, 37, 56, 68, 109, 103, 77, // 24, 35, 55, 64, 81, 104, 113, 92, // 49, 64, 78, 87, 103, 121, 120, 101, // 72, 92, 95, 98, 112, 100, 103, 99 //}; // ////??准色???藕?量??模?? //unsigned char std_UV_QT[64] = //{ // 17, 18, 24, 47, 99, 99, 99, 99, // 18, 21, 26, 66, 99, 99, 99, 99, // 24, 26, 56, 99, 99, 99, 99, 99, // 47, 66, 99, 99, 99, 99, 99, 99, // 99, 99, 99, 99, 99, 99, 99, 99, // 99, 99, 99, 99, 99, 99, 99, 99, // 99, 99, 99, 99, 99, 99, 99, 99, // 99, 99, 99, 99, 99, 99, 99, 99 //}; ////?炼??藕?量??模?? //unsigned char std_Y_QT[64] = //{ // 6, 4, 5, 6, 5, 4, 6, 6, // 5, 6, 7, 7, 6, 8, 10, 16, // 10, 10, 9, 9, 10, 20, 14, 15, // 12, 16, 23, 20, 24, 24, 23, 20, // 22, 22, 26, 29, 37, 31, 26, 27, // 35, 28, 22, 22, 32, 44, 32, 35, // 38, 39, 41, 42, 41, 25, 31, 45, // 48, 45, 40, 48, 37, 40, 41, 40 //}; // ////色???藕?量??模?? //unsigned char std_UV_QT[64] = //{ // 7, 7, 7, 10, 8, 10, 19, 10, // 10, 19, 40, 26, 22, 26, 40, 40, // 40, 40, 40, 40, 40, 40, 40, 40, // 40, 40, 40, 40, 40, 40, 40, 40, // 40, 40, 40, 40, 40, 40, 40, 40, // 40, 40, 40, 40, 40, 40, 40, 40, // 40, 40, 40, 40, 40, 40, 40, 40, // 40, 40, 40, 40, 40, 40, 40, 40 //}; //?炼??藕?量??模?? unsigned char std_Y_QT[64] = { 0.75 * 6, 0.75 * 4, 0.75 * 5, 0.75 * 6, 0.75 * 5, 0.75 * 4, 0.75 * 6, 0.75 * 6, 0.75 * 5, 0.75 * 6, 0.75 * 7, 0.75 * 7, 0.75 * 6, 0.75 * 8, 0.75 * 10, 0.75 * 16, 0.75 * 10, 0.75 * 10, 0.75 * 9, 0.75 * 9, 0.75 * 10, 0.75 * 20, 0.75 * 14, 0.75 * 15, 0.75 * 12, 0.75 * 16, 0.75 * 23, 0.75 * 20, 0.75 * 24, 0.75 * 24, 0.75 * 23, 0.75 * 20, 0.75 * 22, 0.75 * 22, 0.75 * 26, 0.75 * 29, 0.75 * 37, 0.75 * 31, 0.75 * 26, 0.75 * 27, 0.75 * 35, 0.75 * 28, 0.75 * 22, 0.75 * 22, 0.75 * 32, 0.75 * 44, 0.75 * 32, 0.75 * 35, 0.75 * 38, 0.75 * 39, 0.75 * 41, 0.75 * 42, 0.75 * 41, 0.75 * 25, 0.75 * 31, 0.75 * 45, 0.75 * 48, 0.75 * 45, 0.75 * 40, 0.75 * 48, 0.75 * 37, 0.75 * 40, 0.75 * 41, 0.75 * 40 }; //色???藕?量??模?? unsigned char std_UV_QT[64] = { 0.75 * 7, 0.75 * 7, 0.75 * 7, 0.75 * 10, 0.75 * 8, 0.75 * 10, 0.75 * 19, 0.75 * 10, 0.75 * 10, 0.75 * 19, 0.75 * 40, 0.75 * 26, 0.75 * 22, 0.75 * 26, 0.75 * 40, 0.75 * 40, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30 }; unsigned char STD_DC_Y_NRCODES[16] = { 0, 1, 5, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0 }; unsigned char STD_DC_Y_VALUES[12] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 }; unsigned char STD_DC_UV_NRCODES[16] = { 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0 }; unsigned char STD_DC_UV_VALUES[12] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 }; unsigned char STD_AC_Y_NRCODES[16] = { 0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 0X7D }; unsigned char STD_AC_Y_VALUES[162] = { 0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12, 0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07, 0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08, 0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0, 0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0a, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa }; unsigned char STD_AC_UV_NRCODES[16] = { 0, 2, 1, 2, 4, 4, 3, 4, 7, 5, 4, 4, 0, 1, 2, 0X77 }; unsigned char STD_AC_UV_VALUES[162] = { 0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21, 0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71, 0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91, 0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0, 0x15, 0x62, 0x72, 0xd1, 0x0a, 0x16, 0x24, 0x34, 0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa }; int DivUp(int x, int d) { return (x + d - 1) / d; } template void writeAndAdvance(unsigned char *&pData, T nElement) { writeBigEndian(pData, nElement); pData += sizeof(T); } void writeMarker(unsigned char nMarker, unsigned char *&pData) { *pData++ = 0x0ff; *pData++ = nMarker; } void writeJFIFTag(unsigned char *&pData) { const char JFIF_TAG[] = { 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01, 0x02, 0x00, 0x00, 0x01, 0x00, 0x01, 0x00, 0x00 }; writeMarker(0x0e0, pData); writeAndAdvance(pData, sizeof(JFIF_TAG) + sizeof(unsigned short)); memcpy(pData, JFIF_TAG, sizeof(JFIF_TAG)); pData += sizeof(JFIF_TAG); } void writeFrameHeader(const FrameHeader &header, unsigned char *&pData) { unsigned char aTemp[128]; unsigned char *pTemp = aTemp; writeAndAdvance(pTemp, header.nSamplePrecision); writeAndAdvance(pTemp, header.nHeight); writeAndAdvance(pTemp, header.nWidth); writeAndAdvance(pTemp, header.nComponents); for (int c = 0; c(pTemp, header.aComponentIdentifier[c]); writeAndAdvance(pTemp, header.aSamplingFactors[c]); writeAndAdvance(pTemp, header.aQuantizationTableSelector[c]); } unsigned short nLength = (unsigned short)(pTemp - aTemp); writeMarker(0x0C0, pData); writeAndAdvance(pData, nLength + 2); memcpy(pData, aTemp, nLength); pData += nLength; } void writeScanHeader(const ScanHeader &header, unsigned char *&pData) { unsigned char aTemp[128]; unsigned char *pTemp = aTemp; writeAndAdvance(pTemp, header.nComponents); for (int c = 0; c(pTemp, header.aComponentSelector[c]); writeAndAdvance(pTemp, header.aHuffmanTablesSelector[c]); } writeAndAdvance(pTemp, header.nSs); writeAndAdvance(pTemp, header.nSe); writeAndAdvance(pTemp, header.nA); unsigned short nLength = (unsigned short)(pTemp - aTemp); writeMarker(0x0DA, pData); writeAndAdvance(pData, nLength + 2); memcpy(pData, aTemp, nLength); pData += nLength; } void writeQuantizationTable(const QuantizationTable &table, unsigned char *&pData) { writeMarker(0x0DB, pData); writeAndAdvance(pData, sizeof(QuantizationTable) + 2); memcpy(pData, &table, sizeof(QuantizationTable)); pData += sizeof(QuantizationTable); } void writeHuffmanTable(const HuffmanTable &table, unsigned char *&pData) { writeMarker(0x0C4, pData); // Number of Codes for Bit Lengths [1..16] int nCodeCount = 0; for (int i = 0; i < 16; ++i) { nCodeCount += table.aCodes[i]; } writeAndAdvance(pData, 17 + nCodeCount + 2); memcpy(pData, &table, 17 + nCodeCount); pData += 17 + nCodeCount; } bool printfNPPinfo(int cudaVerMajor, int cudaVerMinor) { const NppLibraryVersion *libVer = nppGetLibVersion(); printf("NPP Library Version %d.%d.%d\n", libVer->major, libVer->minor, libVer->build); int driverVersion, runtimeVersion; cudaDriverGetVersion(&driverVersion); cudaRuntimeGetVersion(&runtimeVersion); printf(" CUDA Driver Version: %d.%d\n", driverVersion / 1000, (driverVersion % 100) / 10); printf(" CUDA Runtime Version: %d.%d\n", runtimeVersion / 1000, (runtimeVersion % 100) / 10); bool bVal = checkCudaCapabilities(cudaVerMajor, cudaVerMinor); return bVal; } NppiDCTState *pDCTState; FrameHeader oFrameHeader; FrameHeader oFrameHeaderFixedSize; ScanHeader oScanHeader; QuantizationTable aQuantizationTables[4]; Npp8u *pdQuantizationTables; HuffmanTable aHuffmanTables[4]; HuffmanTable *pHuffmanDCTables; HuffmanTable *pHuffmanACTables; int nMCUBlocksH; int nMCUBlocksV; int nMCUBlocksHFixedSize; int nMCUBlocksVFixedSize; Npp8u *pdScan; NppiEncodeHuffmanSpec *apHuffmanDCTable[3]; NppiEncodeHuffmanSpec *apHuffmanACTable[3]; unsigned char *pDstJpeg; unsigned char *pDstOutput; int nRestartInterval; int initTable() { NPP_CHECK_NPP(nppiDCTInitAlloc(&pDCTState)); nRestartInterval = -1; cudaMalloc(&pdQuantizationTables, 64 * 4); pHuffmanDCTables = aHuffmanTables; pHuffmanACTables = &aHuffmanTables[2]; memset(aQuantizationTables, 0, 4 * sizeof(QuantizationTable)); memset(aHuffmanTables, 0, 4 * sizeof(HuffmanTable)); memset(&oFrameHeader, 0, sizeof(FrameHeader)); //????Huffman?? aHuffmanTables[0].nClassAndIdentifier = 0; memcpy(aHuffmanTables[0].aCodes, STD_DC_Y_NRCODES, 16); memcpy(aHuffmanTables[0].aTable, STD_DC_Y_VALUES, 12); aHuffmanTables[1].nClassAndIdentifier = 1; memcpy(aHuffmanTables[1].aCodes, STD_DC_UV_NRCODES, 16); memcpy(aHuffmanTables[1].aTable, STD_DC_UV_VALUES, 12); aHuffmanTables[2].nClassAndIdentifier = 16; memcpy(aHuffmanTables[2].aCodes, STD_AC_Y_NRCODES, 16); memcpy(aHuffmanTables[2].aTable, STD_AC_Y_VALUES, 162); aHuffmanTables[3].nClassAndIdentifier = 17; memcpy(aHuffmanTables[3].aCodes, STD_AC_UV_NRCODES, 16); memcpy(aHuffmanTables[3].aTable, STD_AC_UV_VALUES, 162); //????量???? aQuantizationTables[0].nPrecisionAndIdentifier = 0; memcpy(aQuantizationTables[0].aTable, std_Y_QT, 64); aQuantizationTables[1].nPrecisionAndIdentifier = 1; memcpy(aQuantizationTables[1].aTable, std_UV_QT, 64); NPP_CHECK_CUDA(cudaMemcpyAsync(pdQuantizationTables, aQuantizationTables[0].aTable, 64, cudaMemcpyHostToDevice)); NPP_CHECK_CUDA(cudaMemcpyAsync(pdQuantizationTables + 64, aQuantizationTables[1].aTable, 64, cudaMemcpyHostToDevice)); oFrameHeader.nSamplePrecision = 8; oFrameHeader.nComponents = 3; oFrameHeader.aComponentIdentifier[0] = 1; oFrameHeader.aComponentIdentifier[1] = 2; oFrameHeader.aComponentIdentifier[2] = 3; oFrameHeader.aSamplingFactors[0] = 34; oFrameHeader.aSamplingFactors[1] = 17; oFrameHeader.aSamplingFactors[2] = 17; oFrameHeader.aQuantizationTableSelector[0] = 0; oFrameHeader.aQuantizationTableSelector[1] = 1; oFrameHeader.aQuantizationTableSelector[2] = 1; for (int i = 0; i < oFrameHeader.nComponents; ++i) { nMCUBlocksV = max(nMCUBlocksV, oFrameHeader.aSamplingFactors[i] & 0x0f); nMCUBlocksH = max(nMCUBlocksH, oFrameHeader.aSamplingFactors[i] >> 4); } NPP_CHECK_CUDA(cudaMalloc(&pdScan, 4 << 20)); oScanHeader.nComponents = 3; oScanHeader.aComponentSelector[0] = 1; oScanHeader.aComponentSelector[1] = 2; oScanHeader.aComponentSelector[2] = 3; oScanHeader.aHuffmanTablesSelector[0] = 0; oScanHeader.aHuffmanTablesSelector[1] = 17; oScanHeader.aHuffmanTablesSelector[2] = 17; oScanHeader.nSs = 0; oScanHeader.nSe = 63; oScanHeader.nA = 0; return 0; } NppiSize aSrcSize[3]; Npp16s *apdDCT[3];// = { 0, 0, 0 }; Npp32s aDCTStep[3]; Npp8u *apSrcImage[3];// = { 0, 0, 0 }; Npp32s aSrcImageStep[3]; size_t aSrcPitch[3]; int releaseJpegNPP() { nppiDCTFree(pDCTState); cudaFree(pdQuantizationTables); cudaFree(pdScan); for (int i = 0; i < 3; ++i) { cudaFree(apdDCT[i]); cudaFree(apSrcImage[i]); } return 0; } int initTable(int flag, int width, int height) { //????帧头 oFrameHeaderFixedSize.nSamplePrecision = 8; oFrameHeaderFixedSize.nComponents = 3; oFrameHeaderFixedSize.aComponentIdentifier[0] = 1; oFrameHeaderFixedSize.aComponentIdentifier[1] = 2; oFrameHeaderFixedSize.aComponentIdentifier[2] = 3; oFrameHeaderFixedSize.aSamplingFactors[0] = 34; oFrameHeaderFixedSize.aSamplingFactors[1] = 17; oFrameHeaderFixedSize.aSamplingFactors[2] = 17; oFrameHeaderFixedSize.aQuantizationTableSelector[0] = 0; oFrameHeaderFixedSize.aQuantizationTableSelector[1] = 1; oFrameHeaderFixedSize.aQuantizationTableSelector[2] = 1; oFrameHeaderFixedSize.nWidth = width; oFrameHeaderFixedSize.nHeight = height; for (int i = 0; i < oFrameHeaderFixedSize.nComponents; ++i) { nMCUBlocksVFixedSize = max(nMCUBlocksVFixedSize, oFrameHeaderFixedSize.aSamplingFactors[i] & 0x0f); nMCUBlocksHFixedSize = max(nMCUBlocksHFixedSize, oFrameHeaderFixedSize.aSamplingFactors[i] >> 4); } for (int i = 0; i < oFrameHeaderFixedSize.nComponents; ++i) { NppiSize oBlocks; NppiSize oBlocksPerMCU = { oFrameHeaderFixedSize.aSamplingFactors[i] >> 4, oFrameHeaderFixedSize.aSamplingFactors[i] & 0x0f }; oBlocks.width = (int)ceil((oFrameHeaderFixedSize.nWidth + 7) / 8 * static_cast(oBlocksPerMCU.width) / nMCUBlocksHFixedSize); oBlocks.width = DivUp(oBlocks.width, oBlocksPerMCU.width) * oBlocksPerMCU.width; oBlocks.height = (int)ceil((oFrameHeaderFixedSize.nHeight + 7) / 8 * static_cast(oBlocksPerMCU.height) / nMCUBlocksVFixedSize); oBlocks.height = DivUp(oBlocks.height, oBlocksPerMCU.height) * oBlocksPerMCU.height; aSrcSize[i].width = oBlocks.width * 8; aSrcSize[i].height = oBlocks.height * 8; // Allocate Memory size_t nPitch; NPP_CHECK_CUDA(cudaMallocPitch(&apdDCT[i], &nPitch, oBlocks.width * 64 * sizeof(Npp16s), oBlocks.height)); aDCTStep[i] = static_cast(nPitch); NPP_CHECK_CUDA(cudaMallocPitch(&apSrcImage[i], &nPitch, aSrcSize[i].width, aSrcSize[i].height)); aSrcPitch[i] = nPitch; aSrcImageStep[i] = static_cast(nPitch); } return 0; } int jpegNPP(const char *szOutputFile, float* d_srcRGB) { //RGB2YUV cudaError_t cudaStatus; cudaStatus = cuda_common::RGB2YUV(d_srcRGB, oFrameHeaderFixedSize.nWidth, oFrameHeaderFixedSize.nHeight, apSrcImage[0], aSrcPitch[0], aSrcSize[0].width, aSrcSize[0].height, apSrcImage[1], aSrcPitch[1], aSrcSize[1].width, aSrcSize[1].height, apSrcImage[2], aSrcPitch[2], aSrcSize[2].width, aSrcSize[2].height); /** * Forward DCT, quantization and level shift part of the JPEG encoding. * Input is expected in 8x8 macro blocks and output is expected to be in 64x1 * macro blocks. The new version of the primitive takes the ROI in image pixel size and * works with DCT coefficients that are in zig-zag order. */ int k = 0; //LOG_INFO("NPP_CHECK_NPP:%d", 1); if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[0], aSrcImageStep[0], apdDCT[0], aDCTStep[0], pdQuantizationTables + k * 64, aSrcSize[0], pDCTState))) { printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); return EXIT_FAILURE; } k = 1; //LOG_INFO("NPP_CHECK_NPP:%d", 2); if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[1], aSrcImageStep[1], apdDCT[1], aDCTStep[1], pdQuantizationTables + k * 64, aSrcSize[1], pDCTState))) { printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); return EXIT_FAILURE; } //LOG_INFO("NPP_CHECK_NPP:%d", 3); if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[2], aSrcImageStep[2], apdDCT[2], aDCTStep[2], pdQuantizationTables + k * 64, aSrcSize[2], pDCTState))) { printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); return EXIT_FAILURE; } // Huffman Encoding Npp32s nScanLength; Npp8u *pJpegEncoderTemp; #if (CUDA_VERSION == 8000) Npp32s nTempSize; //when using CUDA8 #else size_t nTempSize; //when using CUDA9 #endif //modified by Junlin 190221 //LOG_INFO("NPP_CHECK_NPP:%d",4); if (NPP_SUCCESS != (nppiEncodeHuffmanGetSize(aSrcSize[0], 3, &nTempSize))) { printf("nppiEncodeHuffmanGetSize Failed!\n"); return EXIT_FAILURE; } //LOG_INFO("NPP_CHECK_CUDA:%d",5); NPP_CHECK_CUDA(cudaMalloc(&pJpegEncoderTemp, nTempSize)); /** * Allocates memory and creates a Huffman table in a format that is suitable for the encoder. */ NppStatus t_status; t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[0].aCodes, nppiDCTable, &apHuffmanDCTable[0]); t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[0].aCodes, nppiACTable, &apHuffmanACTable[0]); t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[1]); t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[1]); t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[2]); t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[2]); /** * Huffman Encoding of the JPEG Encoding. * Input is expected to be 64x1 macro blocks and output is expected as byte stuffed huffman encoded JPEG scan. */ Npp32s nSs = 0; Npp32s nSe = 63; Npp32s nH = 0; Npp32s nL = 0; //LOG_INFO("NPP_CHECK_NPP:%d",6); if (NPP_SUCCESS != (nppiEncodeHuffmanScan_JPEG_8u16s_P3R(apdDCT, aDCTStep, 0, nSs, nSe, nH, nL, pdScan, &nScanLength, apHuffmanDCTable, apHuffmanACTable, aSrcSize, pJpegEncoderTemp))) { printf("nppiEncodeHuffmanScan_JPEG_8u16s_P3R Failed!\n"); return EXIT_FAILURE; } for (int i = 0; i < 3; ++i) { nppiEncodeHuffmanSpecFree_JPEG(apHuffmanDCTable[i]); nppiEncodeHuffmanSpecFree_JPEG(apHuffmanACTable[i]); } // Write JPEG pDstJpeg = new unsigned char[4 << 20]{}; pDstOutput = pDstJpeg; writeMarker(0x0D8, pDstOutput); writeJFIFTag(pDstOutput); writeQuantizationTable(aQuantizationTables[0], pDstOutput); writeQuantizationTable(aQuantizationTables[1], pDstOutput); writeHuffmanTable(pHuffmanDCTables[0], pDstOutput); writeHuffmanTable(pHuffmanACTables[0], pDstOutput); writeHuffmanTable(pHuffmanDCTables[1], pDstOutput); writeHuffmanTable(pHuffmanACTables[1], pDstOutput); writeFrameHeader(oFrameHeaderFixedSize, pDstOutput); writeScanHeader(oScanHeader, pDstOutput); //LOG_INFO("NPP_CHECK_CUDA:%d",7); NPP_CHECK_CUDA(cudaMemcpy(pDstOutput, pdScan, nScanLength, cudaMemcpyDeviceToHost)); pDstOutput += nScanLength; writeMarker(0x0D9, pDstOutput); { // Write result to file. std::ofstream outputFile(szOutputFile, ios::out | ios::binary); outputFile.write(reinterpret_cast(pDstJpeg), static_cast(pDstOutput - pDstJpeg)); } // Cleanup cudaFree(pJpegEncoderTemp); delete[] pDstJpeg; return EXIT_SUCCESS; } int jpegNPP(const char *szOutputFile, unsigned char* d_srcRGB) { //RGB2YUV cudaError_t cudaStatus; cudaStatus = cuda_common::RGB2YUV(d_srcRGB, oFrameHeaderFixedSize.nWidth, oFrameHeaderFixedSize.nHeight, apSrcImage[0], aSrcPitch[0], aSrcSize[0].width, aSrcSize[0].height, apSrcImage[1], aSrcPitch[1], aSrcSize[1].width, aSrcSize[1].height, apSrcImage[2], aSrcPitch[2], aSrcSize[2].width, aSrcSize[2].height); /** * Forward DCT, quantization and level shift part of the JPEG encoding. * Input is expected in 8x8 macro blocks and output is expected to be in 64x1 * macro blocks. The new version of the primitive takes the ROI in image pixel size and * works with DCT coefficients that are in zig-zag order. */ int k = 0; //LOG_INFO("NPP_CHECK_NPP:%d", 1); if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[0], aSrcImageStep[0], apdDCT[0], aDCTStep[0], pdQuantizationTables + k * 64, aSrcSize[0], pDCTState))) { printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); return EXIT_FAILURE; } k = 1; //LOG_INFO("NPP_CHECK_NPP:%d", 2); if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[1], aSrcImageStep[1], apdDCT[1], aDCTStep[1], pdQuantizationTables + k * 64, aSrcSize[1], pDCTState))) { printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); return EXIT_FAILURE; } //LOG_INFO("NPP_CHECK_NPP:%d", 3); if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[2], aSrcImageStep[2], apdDCT[2], aDCTStep[2], pdQuantizationTables + k * 64, aSrcSize[2], pDCTState))) { printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); return EXIT_FAILURE; } // Huffman Encoding Npp32s nScanLength; Npp8u *pJpegEncoderTemp; #if (CUDA_VERSION == 8000) Npp32s nTempSize; //when using CUDA8 #else size_t nTempSize; //when using CUDA9 #endif //modified by Junlin 190221 //LOG_INFO("NPP_CHECK_NPP:%d",4); if (NPP_SUCCESS != (nppiEncodeHuffmanGetSize(aSrcSize[0], 3, &nTempSize))) { printf("nppiEncodeHuffmanGetSize Failed!\n"); return EXIT_FAILURE; } //LOG_INFO("NPP_CHECK_CUDA:%d",5); NPP_CHECK_CUDA(cudaMalloc(&pJpegEncoderTemp, nTempSize)); /** * Allocates memory and creates a Huffman table in a format that is suitable for the encoder. */ NppStatus t_status; t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[0].aCodes, nppiDCTable, &apHuffmanDCTable[0]); t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[0].aCodes, nppiACTable, &apHuffmanACTable[0]); t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[1]); t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[1]); t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[2]); t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[2]); /** * Huffman Encoding of the JPEG Encoding. * Input is expected to be 64x1 macro blocks and output is expected as byte stuffed huffman encoded JPEG scan. */ Npp32s nSs = 0; Npp32s nSe = 63; Npp32s nH = 0; Npp32s nL = 0; //LOG_INFO("NPP_CHECK_NPP:%d",6); if (NPP_SUCCESS != (nppiEncodeHuffmanScan_JPEG_8u16s_P3R(apdDCT, aDCTStep, 0, nSs, nSe, nH, nL, pdScan, &nScanLength, apHuffmanDCTable, apHuffmanACTable, aSrcSize, pJpegEncoderTemp))) { printf("nppiEncodeHuffmanScan_JPEG_8u16s_P3R Failed!\n"); return EXIT_FAILURE; } for (int i = 0; i < 3; ++i) { nppiEncodeHuffmanSpecFree_JPEG(apHuffmanDCTable[i]); nppiEncodeHuffmanSpecFree_JPEG(apHuffmanACTable[i]); } // Write JPEG pDstJpeg = new unsigned char[4 << 20]{}; pDstOutput = pDstJpeg; writeMarker(0x0D8, pDstOutput); writeJFIFTag(pDstOutput); writeQuantizationTable(aQuantizationTables[0], pDstOutput); writeQuantizationTable(aQuantizationTables[1], pDstOutput); writeHuffmanTable(pHuffmanDCTables[0], pDstOutput); writeHuffmanTable(pHuffmanACTables[0], pDstOutput); writeHuffmanTable(pHuffmanDCTables[1], pDstOutput); writeHuffmanTable(pHuffmanACTables[1], pDstOutput); writeFrameHeader(oFrameHeaderFixedSize, pDstOutput); writeScanHeader(oScanHeader, pDstOutput); //LOG_INFO("NPP_CHECK_CUDA:%d",7); NPP_CHECK_CUDA(cudaMemcpy(pDstOutput, pdScan, nScanLength, cudaMemcpyDeviceToHost)); pDstOutput += nScanLength; writeMarker(0x0D9, pDstOutput); { // Write result to file. std::ofstream outputFile(szOutputFile, ios::out | ios::binary); outputFile.write(reinterpret_cast(pDstJpeg), static_cast(pDstOutput - pDstJpeg)); } // Cleanup cudaFree(pJpegEncoderTemp); delete[] pDstJpeg; return EXIT_SUCCESS; } int jpegNPP(const char *szOutputFile, float* d_srcRGB, int img_width, int img_height) { NppiSize aSrcSize[3]; Npp16s *apdDCT[3] = { 0, 0, 0 }; Npp32s aDCTStep[3]; Npp8u *apSrcImage[3] = { 0, 0, 0 }; Npp32s aSrcImageStep[3]; size_t aSrcPitch[3]; //????帧头 oFrameHeader.nWidth = img_width; oFrameHeader.nHeight = img_height; for (int i = 0; i < oFrameHeader.nComponents; ++i) { NppiSize oBlocks; NppiSize oBlocksPerMCU = { oFrameHeader.aSamplingFactors[i] >> 4, oFrameHeader.aSamplingFactors[i] & 0x0f }; oBlocks.width = (int)ceil((oFrameHeader.nWidth + 7) / 8 * static_cast(oBlocksPerMCU.width) / nMCUBlocksH); oBlocks.width = DivUp(oBlocks.width, oBlocksPerMCU.width) * oBlocksPerMCU.width; oBlocks.height = (int)ceil((oFrameHeader.nHeight + 7) / 8 * static_cast(oBlocksPerMCU.height) / nMCUBlocksV); oBlocks.height = DivUp(oBlocks.height, oBlocksPerMCU.height) * oBlocksPerMCU.height; aSrcSize[i].width = oBlocks.width * 8; aSrcSize[i].height = oBlocks.height * 8; // Allocate Memory size_t nPitch; //LOG_INFO("NPP_CHECK_CUDA:%d",1); NPP_CHECK_CUDA(cudaMallocPitch(&apdDCT[i], &nPitch, oBlocks.width * 64 * sizeof(Npp16s), oBlocks.height)); aDCTStep[i] = static_cast(nPitch); //LOG_INFO("NPP_CHECK_CUDA:%d",2); NPP_CHECK_CUDA(cudaMallocPitch(&apSrcImage[i], &nPitch, aSrcSize[i].width, aSrcSize[i].height)); aSrcPitch[i] = nPitch; aSrcImageStep[i] = static_cast(nPitch); } //RGB2YUV cudaError_t cudaStatus; cudaStatus = cuda_common::RGB2YUV(d_srcRGB, img_width, img_height, apSrcImage[0], aSrcPitch[0], aSrcSize[0].width, aSrcSize[0].height, apSrcImage[1], aSrcPitch[1], aSrcSize[1].width, aSrcSize[1].height, apSrcImage[2], aSrcPitch[2], aSrcSize[2].width, aSrcSize[2].height); /** * Forward DCT, quantization and level shift part of the JPEG encoding. * Input is expected in 8x8 macro blocks and output is expected to be in 64x1 * macro blocks. The new version of the primitive takes the ROI in image pixel size and * works with DCT coefficients that are in zig-zag order. */ int k = 0; //LOG_INFO("NPP_CHECK_CUDA:%d",3); if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[0], aSrcImageStep[0], apdDCT[0], aDCTStep[0], pdQuantizationTables + k * 64, aSrcSize[0], pDCTState))) { printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); return EXIT_FAILURE; } k = 1; //LOG_INFO("NPP_CHECK_CUDA:%d",4); if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[1], aSrcImageStep[1], apdDCT[1], aDCTStep[1], pdQuantizationTables + k * 64, aSrcSize[1], pDCTState))) { printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); return EXIT_FAILURE; } //LOG_INFO("NPP_CHECK_CUDA:%d",5); if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[2], aSrcImageStep[2], apdDCT[2], aDCTStep[2], pdQuantizationTables + k * 64, aSrcSize[2], pDCTState))) { printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); return EXIT_FAILURE; } // Huffman Encoding Npp32s nScanLength; Npp8u *pJpegEncoderTemp; #if (CUDA_VERSION == 8000) Npp32s nTempSize; //when using CUDA8 #else size_t nTempSize; //when using CUDA9 #endif //modified by Junlin 190221 //LOG_INFO("NPP_CHECK_CUDA:%d",6); if (NPP_SUCCESS != (nppiEncodeHuffmanGetSize(aSrcSize[0], 3, &nTempSize))) { printf("nppiEncodeHuffmanGetSize Failed!\n"); return EXIT_FAILURE; } //LOG_INFO("NPP_CHECK_CUDA:%d",7); NPP_CHECK_CUDA(cudaMalloc(&pJpegEncoderTemp, nTempSize)); /** * Allocates memory and creates a Huffman table in a format that is suitable for the encoder. */ NppStatus t_status; t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[0].aCodes, nppiDCTable, &apHuffmanDCTable[0]); t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[0].aCodes, nppiACTable, &apHuffmanACTable[0]); t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[1]); t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[1]); t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[2]); t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[2]); /** * Huffman Encoding of the JPEG Encoding. * Input is expected to be 64x1 macro blocks and output is expected as byte stuffed huffman encoded JPEG scan. */ Npp32s nSs = 0; Npp32s nSe = 63; Npp32s nH = 0; Npp32s nL = 0; //LOG_INFO("NPP_CHECK_CUDA:%d",8); if (NPP_SUCCESS != (nppiEncodeHuffmanScan_JPEG_8u16s_P3R(apdDCT, aDCTStep, 0, nSs, nSe, nH, nL, pdScan, &nScanLength, apHuffmanDCTable, apHuffmanACTable, aSrcSize, pJpegEncoderTemp))) { printf("nppiEncodeHuffmanScan_JPEG_8u16s_P3R Failed!\n"); return EXIT_FAILURE; } for (int i = 0; i < 3; ++i) { nppiEncodeHuffmanSpecFree_JPEG(apHuffmanDCTable[i]); nppiEncodeHuffmanSpecFree_JPEG(apHuffmanACTable[i]); } // Write JPEG pDstJpeg = new unsigned char[4 << 20]{}; pDstOutput = pDstJpeg; writeMarker(0x0D8, pDstOutput); writeJFIFTag(pDstOutput); writeQuantizationTable(aQuantizationTables[0], pDstOutput); writeQuantizationTable(aQuantizationTables[1], pDstOutput); writeHuffmanTable(pHuffmanDCTables[0], pDstOutput); writeHuffmanTable(pHuffmanACTables[0], pDstOutput); writeHuffmanTable(pHuffmanDCTables[1], pDstOutput); writeHuffmanTable(pHuffmanACTables[1], pDstOutput); writeFrameHeader(oFrameHeader, pDstOutput); writeScanHeader(oScanHeader, pDstOutput); //LOG_INFO("NPP_CHECK_CUDA:%d",9); NPP_CHECK_CUDA(cudaMemcpy(pDstOutput, pdScan, nScanLength, cudaMemcpyDeviceToHost)); pDstOutput += nScanLength; writeMarker(0x0D9, pDstOutput); { // Write result to file. std::ofstream outputFile(szOutputFile, ios::out | ios::binary); outputFile.write(reinterpret_cast(pDstJpeg), static_cast(pDstOutput - pDstJpeg)); } // Cleanup cudaFree(pJpegEncoderTemp); delete[] pDstJpeg; for (int i = 0; i < 3; ++i) { cudaFree(apdDCT[i]); cudaFree(apSrcImage[i]); } return EXIT_SUCCESS; } int jpegNPP(const char *szOutputFile, unsigned char* d_srcRGB, int img_width, int img_height) { NppiSize aSrcSize[3]; Npp16s *apdDCT[3] = { 0, 0, 0 }; Npp32s aDCTStep[3]; Npp8u *apSrcImage[3] = { 0, 0, 0 }; Npp32s aSrcImageStep[3]; size_t aSrcPitch[3]; //????帧头 oFrameHeader.nWidth = img_width; oFrameHeader.nHeight = img_height; for (int i = 0; i < oFrameHeader.nComponents; ++i) { NppiSize oBlocks; NppiSize oBlocksPerMCU = { oFrameHeader.aSamplingFactors[i] >> 4, oFrameHeader.aSamplingFactors[i] & 0x0f }; oBlocks.width = (int)ceil((oFrameHeader.nWidth + 7) / 8 * static_cast(oBlocksPerMCU.width) / nMCUBlocksH); oBlocks.width = DivUp(oBlocks.width, oBlocksPerMCU.width) * oBlocksPerMCU.width; oBlocks.height = (int)ceil((oFrameHeader.nHeight + 7) / 8 * static_cast(oBlocksPerMCU.height) / nMCUBlocksV); oBlocks.height = DivUp(oBlocks.height, oBlocksPerMCU.height) * oBlocksPerMCU.height; aSrcSize[i].width = oBlocks.width * 8; aSrcSize[i].height = oBlocks.height * 8; // Allocate Memory size_t nPitch; //LOG_INFO("NPP_CHECK_CUDA:%d",1); NPP_CHECK_CUDA(cudaMallocPitch(&apdDCT[i], &nPitch, oBlocks.width * 64 * sizeof(Npp16s), oBlocks.height)); aDCTStep[i] = static_cast(nPitch); //LOG_INFO("NPP_CHECK_CUDA:%d",2); NPP_CHECK_CUDA(cudaMallocPitch(&apSrcImage[i], &nPitch, aSrcSize[i].width, aSrcSize[i].height)); aSrcPitch[i] = nPitch; aSrcImageStep[i] = static_cast(nPitch); } //RGB2YUV cudaError_t cudaStatus; cudaStatus = cuda_common::RGB2YUV(d_srcRGB, img_width, img_height, apSrcImage[0], aSrcPitch[0], aSrcSize[0].width, aSrcSize[0].height, apSrcImage[1], aSrcPitch[1], aSrcSize[1].width, aSrcSize[1].height, apSrcImage[2], aSrcPitch[2], aSrcSize[2].width, aSrcSize[2].height); /** * Forward DCT, quantization and level shift part of the JPEG encoding. * Input is expected in 8x8 macro blocks and output is expected to be in 64x1 * macro blocks. The new version of the primitive takes the ROI in image pixel size and * works with DCT coefficients that are in zig-zag order. */ int k = 0; //LOG_INFO("NPP_CHECK_CUDA:%d",3); if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[0], aSrcImageStep[0], apdDCT[0], aDCTStep[0], pdQuantizationTables + k * 64, aSrcSize[0], pDCTState))) { printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); return EXIT_FAILURE; } k = 1; //LOG_INFO("NPP_CHECK_CUDA:%d",4); if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[1], aSrcImageStep[1], apdDCT[1], aDCTStep[1], pdQuantizationTables + k * 64, aSrcSize[1], pDCTState))) { printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); return EXIT_FAILURE; } //LOG_INFO("NPP_CHECK_CUDA:%d",5); if (NPP_SUCCESS != (nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[2], aSrcImageStep[2], apdDCT[2], aDCTStep[2], pdQuantizationTables + k * 64, aSrcSize[2], pDCTState))) { printf("nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW Failed!\n"); return EXIT_FAILURE; } // Huffman Encoding Npp32s nScanLength; Npp8u *pJpegEncoderTemp; #if (CUDA_VERSION == 8000) Npp32s nTempSize; //when using CUDA8 #else size_t nTempSize; //when using CUDA9 #endif //modified by Junlin 190221 //LOG_INFO("NPP_CHECK_CUDA:%d",6); if (NPP_SUCCESS != (nppiEncodeHuffmanGetSize(aSrcSize[0], 3, &nTempSize))) { printf("nppiEncodeHuffmanGetSize Failed!\n"); return EXIT_FAILURE; } //LOG_INFO("NPP_CHECK_CUDA:%d",7); NPP_CHECK_CUDA(cudaMalloc(&pJpegEncoderTemp, nTempSize)); /** * Allocates memory and creates a Huffman table in a format that is suitable for the encoder. */ NppStatus t_status; t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[0].aCodes, nppiDCTable, &apHuffmanDCTable[0]); t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[0].aCodes, nppiACTable, &apHuffmanACTable[0]); t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[1]); t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[1]); t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[2]); t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[2]); /** * Huffman Encoding of the JPEG Encoding. * Input is expected to be 64x1 macro blocks and output is expected as byte stuffed huffman encoded JPEG scan. */ Npp32s nSs = 0; Npp32s nSe = 63; Npp32s nH = 0; Npp32s nL = 0; //LOG_INFO("NPP_CHECK_CUDA:%d",8); if (NPP_SUCCESS != (nppiEncodeHuffmanScan_JPEG_8u16s_P3R(apdDCT, aDCTStep, 0, nSs, nSe, nH, nL, pdScan, &nScanLength, apHuffmanDCTable, apHuffmanACTable, aSrcSize, pJpegEncoderTemp))) { printf("nppiEncodeHuffmanScan_JPEG_8u16s_P3R Failed!\n"); return EXIT_FAILURE; } for (int i = 0; i < 3; ++i) { nppiEncodeHuffmanSpecFree_JPEG(apHuffmanDCTable[i]); nppiEncodeHuffmanSpecFree_JPEG(apHuffmanACTable[i]); } // Write JPEG pDstJpeg = new unsigned char[4 << 20]{}; pDstOutput = pDstJpeg; writeMarker(0x0D8, pDstOutput); writeJFIFTag(pDstOutput); writeQuantizationTable(aQuantizationTables[0], pDstOutput); writeQuantizationTable(aQuantizationTables[1], pDstOutput); writeHuffmanTable(pHuffmanDCTables[0], pDstOutput); writeHuffmanTable(pHuffmanACTables[0], pDstOutput); writeHuffmanTable(pHuffmanDCTables[1], pDstOutput); writeHuffmanTable(pHuffmanACTables[1], pDstOutput); writeFrameHeader(oFrameHeader, pDstOutput); writeScanHeader(oScanHeader, pDstOutput); //LOG_INFO("NPP_CHECK_CUDA:%d",9); NPP_CHECK_CUDA(cudaMemcpy(pDstOutput, pdScan, nScanLength, cudaMemcpyDeviceToHost)); pDstOutput += nScanLength; writeMarker(0x0D9, pDstOutput); { // Write result to file. std::ofstream outputFile(szOutputFile, ios::out | ios::binary); outputFile.write(reinterpret_cast(pDstJpeg), static_cast(pDstOutput - pDstJpeg)); } // Cleanup cudaFree(pJpegEncoderTemp); delete[] pDstJpeg; for (int i = 0; i < 3; ++i) { cudaFree(apdDCT[i]); cudaFree(apSrcImage[i]); } return EXIT_SUCCESS; }