// Copyright (C) 2019 The Android Open Source Project // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "host-common/MediaCudaVideoHelper.h" #include "host-common/MediaCudaDriverHelper.h" #include "host-common/MediaCudaUtils.h" #include "host-common/YuvConverter.h" #include "android/utils/debug.h" extern "C" { #define INIT_CUDA_GL 1 #include "host-common/dynlink_cuda.h" #include "host-common/dynlink_cudaGL.h" #include "host-common/dynlink_nvcuvid.h" } #define MEDIA_CUDA_DEBUG 0 #if MEDIA_CUDA_DEBUG #define CUDA_DPRINT(fmt, ...) \ fprintf(stderr, "media-cuda-video-helper: %s:%d " fmt "\n", __func__, \ __LINE__, ##__VA_ARGS__); #else #define CUDA_DPRINT(fmt, ...) #endif #define NVDEC_API_CALL(cuvidAPI) \ do { \ CUresult errorCode = cuvidAPI; \ if (errorCode != CUDA_SUCCESS) { \ CUDA_DPRINT("%s failed with error code %d\n", #cuvidAPI, \ (int)errorCode); \ } \ } while (0) namespace android { namespace emulation { bool MediaCudaVideoHelper::s_isCudaDecoderGood = true; using TextureFrame = MediaTexturePool::TextureFrame; using FrameInfo = MediaSnapshotState::FrameInfo; using ColorAspects = MediaSnapshotState::ColorAspects; MediaCudaVideoHelper::MediaCudaVideoHelper(OutputTreatmentMode oMode, FrameStorageMode fMode, cudaVideoCodec cudaVideoCodecType) : mUseGpuTexture(fMode == FrameStorageMode::USE_GPU_TEXTURE), mCudaVideoCodecType(cudaVideoCodecType) { mIgnoreDecoderOutput = (oMode == OutputTreatmentMode::IGNORE_RESULT); } MediaCudaVideoHelper::~MediaCudaVideoHelper() { deInit(); } void MediaCudaVideoHelper::deInit() { CUDA_DPRINT("deInit calling"); mSavedDecodedFrames.clear(); if (mCudaContext != nullptr) { NVDEC_API_CALL(cuCtxPushCurrent(mCudaContext)); if (mCudaParser != nullptr) { NVDEC_API_CALL(cuvidDestroyVideoParser(mCudaParser)); mCudaParser = nullptr; } if (mCudaDecoder != nullptr) { NVDEC_API_CALL(cuvidDestroyDecoder(mCudaDecoder)); mCudaDecoder = nullptr; } NVDEC_API_CALL(cuCtxPopCurrent(NULL)); NVDEC_API_CALL(cuvidCtxLockDestroy(mCtxLock)); } if (mCudaContext != nullptr) { CUresult myres = cuCtxDestroy(mCudaContext); if (myres != CUDA_SUCCESS) { CUDA_DPRINT("Failed to destroy cuda context; error code %d", (int)myres); } mCudaContext = nullptr; } } bool MediaCudaVideoHelper::init() { if (!s_isCudaDecoderGood) { CUDA_DPRINT( "Already verified: cuda decoder does not work on this host"); return false; } if (!MediaCudaDriverHelper::initCudaDrivers()) { CUDA_DPRINT("Failed to initCudaDrivers"); mIsGood = false; mErrorCode = 1; s_isCudaDecoderGood = false; return false; } if (mCudaContext != nullptr) { deInit(); } // cudat stuff const int gpuIndex = 0; const int cudaFlags = 0; CUdevice cudaDevice = 0; CUresult myres = cuDeviceGet(&cudaDevice, gpuIndex); if (myres != CUDA_SUCCESS) { mIsGood = false; mErrorCode = 2; s_isCudaDecoderGood = false; CUDA_DPRINT("Failed to get cuda device, error code %d", (int)myres); return false; } char buf[1024]; myres = cuDeviceGetName(buf, sizeof(buf), cudaDevice); if (myres != CUDA_SUCCESS) { mIsGood = false; mErrorCode = 3; s_isCudaDecoderGood = false; CUDA_DPRINT("Failed to get gpu device name, error code %d", (int)myres); return false; } CUDA_DPRINT("using gpu device %s", buf); myres = cuCtxCreate(&mCudaContext, cudaFlags, cudaDevice); if (myres != CUDA_SUCCESS) { mIsGood = false; s_isCudaDecoderGood = false; CUDA_DPRINT("Failed to create cuda context, error code %d", (int)myres); return false; } NVDEC_API_CALL(cuvidCtxLockCreate(&mCtxLock, mCudaContext)); CUVIDPARSERPARAMS videoParserParameters = {}; // videoParserParameters.CodecType = (mType == MediaCodecType::VP8Codec) ? // cudaVideoCodec_VP8 : cudaVideoCodec_VP9; videoParserParameters.CodecType = mCudaVideoCodecType; videoParserParameters.ulMaxNumDecodeSurfaces = 1; videoParserParameters.ulMaxDisplayDelay = 1; videoParserParameters.pUserData = this; videoParserParameters.pfnSequenceCallback = HandleVideoSequenceProc; videoParserParameters.pfnDecodePicture = HandlePictureDecodeProc; videoParserParameters.pfnDisplayPicture = HandlePictureDisplayProc; NVDEC_API_CALL( cuvidCreateVideoParser(&mCudaParser, &videoParserParameters)); CUDA_DPRINT("Successfully created cuda context %p", mCudaContext); dprint("successfully created cuda video decoder for %s, with gpu texture " "mode %s", mCudaVideoCodecType == cudaVideoCodec_H264 ? "H264" : (mCudaVideoCodecType == cudaVideoCodec_VP8 ? "VP8" : "VP9"), mUseGpuTexture ? "on" : "off"); return true; } void MediaCudaVideoHelper::decode(const uint8_t* frame, size_t szBytes, uint64_t inputPts) { CUDA_DPRINT("%s(frame=%p, sz=%zu)", __func__, frame, szBytes); CUVIDSOURCEDATAPACKET packet = {0}; packet.payload = frame; packet.payload_size = szBytes; packet.flags = CUVID_PKT_TIMESTAMP; packet.timestamp = inputPts; if (!frame || szBytes == 0) { packet.flags |= CUVID_PKT_ENDOFSTREAM; } else { ++mNumInputFrame; } NVDEC_API_CALL(cuvidParseVideoData(mCudaParser, &packet)); } void MediaCudaVideoHelper::flush() { CUDA_DPRINT("started flushing"); CUVIDSOURCEDATAPACKET packet = {0}; packet.payload = NULL; packet.payload_size = 0; packet.flags |= CUVID_PKT_ENDOFSTREAM; NVDEC_API_CALL(cuvidParseVideoData(mCudaParser, &packet)); CUDA_DPRINT("done one flushing"); } int MediaCudaVideoHelper::HandleVideoSequence(CUVIDEOFORMAT* pVideoFormat) { int nDecodeSurface = 8; // need 8 for 4K video CUVIDDECODECAPS decodecaps; memset(&decodecaps, 0, sizeof(decodecaps)); decodecaps.eCodecType = pVideoFormat->codec; decodecaps.eChromaFormat = pVideoFormat->chroma_format; decodecaps.nBitDepthMinus8 = pVideoFormat->bit_depth_luma_minus8; NVDEC_API_CALL(cuCtxPushCurrent(mCudaContext)); NVDEC_API_CALL(cuvidGetDecoderCaps(&decodecaps)); NVDEC_API_CALL(cuCtxPopCurrent(NULL)); if (!decodecaps.bIsSupported) { mIsGood = false; mErrorCode = 4; CUDA_DPRINT("Codec not supported on this GPU."); return nDecodeSurface; } if ((pVideoFormat->coded_width > decodecaps.nMaxWidth) || (pVideoFormat->coded_height > decodecaps.nMaxHeight)) { CUDA_DPRINT("Resolution not supported on this GPU"); mIsGood = false; mErrorCode = 5; return nDecodeSurface; } if ((pVideoFormat->coded_width >> 4) * (pVideoFormat->coded_height >> 4) > decodecaps.nMaxMBCount) { CUDA_DPRINT("MBCount not supported on this GPU"); mIsGood = false; mErrorCode = 6; return nDecodeSurface; } mLumaWidth = pVideoFormat->display_area.right - pVideoFormat->display_area.left; mLumaHeight = pVideoFormat->display_area.bottom - pVideoFormat->display_area.top; mChromaHeight = mLumaHeight * 0.5; // NV12 mBPP = pVideoFormat->bit_depth_luma_minus8 > 0 ? 2 : 1; if (mCudaVideoCodecType == cudaVideoCodec_H264) { if (pVideoFormat->video_signal_description.video_full_range_flag) mColorRange = 2; else mColorRange = 0; mColorPrimaries = pVideoFormat->video_signal_description.color_primaries; mColorTransfer = pVideoFormat->video_signal_description.transfer_characteristics; mColorSpace = pVideoFormat->video_signal_description.matrix_coefficients; } CUVIDDECODECREATEINFO videoDecodeCreateInfo = {0}; videoDecodeCreateInfo.CodecType = pVideoFormat->codec; videoDecodeCreateInfo.ChromaFormat = pVideoFormat->chroma_format; videoDecodeCreateInfo.OutputFormat = cudaVideoSurfaceFormat_NV12; CUDA_DPRINT("output format is %d", videoDecodeCreateInfo.OutputFormat); videoDecodeCreateInfo.bitDepthMinus8 = pVideoFormat->bit_depth_luma_minus8; if (pVideoFormat->progressive_sequence) videoDecodeCreateInfo.DeinterlaceMode = cudaVideoDeinterlaceMode_Weave; else videoDecodeCreateInfo.DeinterlaceMode = cudaVideoDeinterlaceMode_Adaptive; videoDecodeCreateInfo.ulNumOutputSurfaces = 1; // With PreferCUVID, JPEG is still decoded by CUDA while video is decoded by // NVDEC hardware videoDecodeCreateInfo.ulCreationFlags = cudaVideoCreate_PreferCUVID; videoDecodeCreateInfo.ulNumDecodeSurfaces = nDecodeSurface; videoDecodeCreateInfo.vidLock = mCtxLock; videoDecodeCreateInfo.ulWidth = pVideoFormat->coded_width; videoDecodeCreateInfo.ulHeight = pVideoFormat->coded_height; if (mOutputHeight != mLumaHeight || mOutputWidth != mLumaWidth) { CUDA_DPRINT("old width %d old height %d", mOutputWidth, mOutputHeight); mOutputWidth = mLumaWidth; mOutputHeight = mLumaHeight; CUDA_DPRINT("new width %d new height %d", mOutputWidth, mOutputHeight); unsigned int newOutBufferSize = mOutputWidth * mOutputHeight * 3 / 2; if (mOutBufferSize < newOutBufferSize) { mOutBufferSize = newOutBufferSize; } } videoDecodeCreateInfo.ulTargetWidth = pVideoFormat->coded_width; videoDecodeCreateInfo.ulTargetHeight = pVideoFormat->coded_height; mSurfaceWidth = videoDecodeCreateInfo.ulTargetWidth; mSurfaceHeight = videoDecodeCreateInfo.ulTargetHeight; NVDEC_API_CALL(cuCtxPushCurrent(mCudaContext)); if (mCudaDecoder != nullptr) { NVDEC_API_CALL(cuvidDestroyDecoder(mCudaDecoder)); mCudaDecoder = nullptr; } { size_t free, total; cuMemGetInfo(&free, &total); CUDA_DPRINT("free memory %g M, total %g M", free / 1048576.0, total / 1048576.0); } NVDEC_API_CALL(cuCtxPopCurrent(NULL)); NVDEC_API_CALL(cuCtxPushCurrent(mCudaContext)); NVDEC_API_CALL(cuvidCreateDecoder(&mCudaDecoder, &videoDecodeCreateInfo)); NVDEC_API_CALL(cuCtxPopCurrent(NULL)); CUDA_DPRINT("successfully called. decoder %p", mCudaDecoder); return nDecodeSurface; } int MediaCudaVideoHelper::HandlePictureDecode(CUVIDPICPARAMS* pPicParams) { NVDEC_API_CALL(cuvidDecodePicture(mCudaDecoder, pPicParams)); CUDA_DPRINT("successfully called."); return 1; } int MediaCudaVideoHelper::HandlePictureDisplay(CUVIDPARSERDISPINFO* pDispInfo) { if (mIgnoreDecoderOutput) { return 1; } constexpr int MAX_NUM_INPUT_WITHOUT_OUTPUT = 16; if (mNumOutputFrame == 0 && mNumInputFrame > MAX_NUM_INPUT_WITHOUT_OUTPUT) { // after more than 16 inputs, there is still no output, // probably corrupted stream, ignore everything from now on dprint("WARNING: %d frames decoded witout any output, possibly bad " "input stream. Ignore output frames (they might be corrupted) " "from now on.", MAX_NUM_INPUT_WITHOUT_OUTPUT); return 0; } CUVIDPROCPARAMS videoProcessingParameters = {}; videoProcessingParameters.progressive_frame = pDispInfo->progressive_frame; videoProcessingParameters.second_field = pDispInfo->repeat_first_field + 1; videoProcessingParameters.top_field_first = pDispInfo->top_field_first; videoProcessingParameters.unpaired_field = pDispInfo->repeat_first_field < 0; videoProcessingParameters.output_stream = 0; uint64_t myOutputPts = pDispInfo->timestamp; CUdeviceptr dpSrcFrame = 0; unsigned int nSrcPitch = 0; CUresult errorCode = cuvidMapVideoFrame(mCudaDecoder, pDispInfo->picture_index, &dpSrcFrame, &nSrcPitch, &videoProcessingParameters); if (errorCode != CUDA_SUCCESS) { CUDA_DPRINT("failed to call cuvidMapVideoFrame with error code %d\n", (int)errorCode); return 0; } NVDEC_API_CALL(cuCtxPushCurrent(mCudaContext)); unsigned int newOutBufferSize = mOutputWidth * mOutputHeight * 3 / 2; std::vector myFrame; TextureFrame texFrame; if (mUseGpuTexture && mTexturePool != nullptr) { media_cuda_utils_copy_context my_copy_context{ .src_frame = dpSrcFrame, .src_pitch = nSrcPitch, .src_surface_height = mSurfaceHeight, .dest_width = mOutputWidth, .dest_height = mOutputHeight, }; texFrame = mTexturePool->getTextureFrame(mOutputWidth, mOutputHeight); mTexturePool->saveDecodedFrameToTexture( texFrame, &my_copy_context, (void*)media_cuda_utils_nv12_updater); } else { myFrame.resize(newOutBufferSize); uint8_t* pDecodedFrame = &(myFrame[0]); CUDA_MEMCPY2D m = {0}; m.srcMemoryType = CU_MEMORYTYPE_DEVICE; m.srcDevice = dpSrcFrame; m.srcPitch = nSrcPitch; m.dstMemoryType = CU_MEMORYTYPE_HOST; m.dstDevice = (CUdeviceptr)(m.dstHost = pDecodedFrame); m.dstPitch = mOutputWidth * mBPP; m.WidthInBytes = mOutputWidth * mBPP; m.Height = mLumaHeight; CUDA_DPRINT("dstDevice %p, dstPitch %d, WidthInBytes %d Height %d", m.dstHost, (int)m.dstPitch, (int)m.WidthInBytes, (int)m.Height); NVDEC_API_CALL(cuMemcpy2DAsync(&m, 0)); m.srcDevice = (CUdeviceptr)((uint8_t*)dpSrcFrame + m.srcPitch * mSurfaceHeight); m.dstDevice = (CUdeviceptr)(m.dstHost = pDecodedFrame + m.dstPitch * mLumaHeight); m.Height = mChromaHeight; NVDEC_API_CALL(cuMemcpy2DAsync(&m, 0)); YuvConverter convert8(mOutputWidth, mOutputHeight); convert8.UVInterleavedToPlanar(pDecodedFrame); } NVDEC_API_CALL(cuStreamSynchronize(0)); NVDEC_API_CALL(cuCtxPopCurrent(NULL)); NVDEC_API_CALL(cuvidUnmapVideoFrame(mCudaDecoder, dpSrcFrame)); { std::lock_guard g(mFrameLock); mSavedDecodedFrames.push_back(MediaSnapshotState::FrameInfo{ std::move(myFrame), std::vector{texFrame.Ytex, texFrame.UVtex}, (int)mOutputWidth, (int)mOutputHeight, myOutputPts, ColorAspects{mColorPrimaries, mColorRange, mColorTransfer, mColorSpace}}); } ++mNumOutputFrame; CUDA_DPRINT("successfully called."); return 1; } } // namespace emulation } // namespace android