Prerequisites
- Install an NVIDIA graphics card in your computer;
- Install the NVIDIA driver; a driver matching CUDA version 10 or later is recommended;
- Install CUDA and cuDNN;
- Download OpenCV and opencv_contrib from GitHub, then build OpenCV locally with CMake;
Goal
- Call a CUDA kernel function that you wrote yourself from C++ code;
Why this article was written
Recently, while working on CUDA acceleration, I found that when I define a kernel function in an a.cu file and call it from an a.cpp file, errors like the following keep occurring:
C2039 "atomicAdd": not a member of "`global namespace'" C2039 "atomicMin": not a member of "`global namespace'" C2664 "uchar1 cv::cudev::max(const uchar1 &, const uchar1 &)": Cannot convert parameter 1 from "const unsigned char" to "const uchar1 &"
I spent some time searching the Internet and could not find a solution, so I decided to write down my own debugging process here, both as a record and to help others find the answer faster;
//.cpp //cuda related header file #include "cuda_runtime.h" #include "device_launch_parameters.h" #include "cuda_device_runtime_api.h" //Standard Library #include <iostream> #include <string> //Header file used by opencv #include "opencv2/core/core.hpp" #include "opencv2/highgui/highgui.hpp" #include "opencv2/imgproc/imgproc.hpp" // Reference some headers that opencv already has; #include "opencv2/cudaarithm.hpp" //cpp must not be referenced //#include "opencv2/cudev.hpp"
//.cu
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda_device_runtime_api.h"
// OpenCV device-side parameter types (cv::cuda::PtrStep, ...) --
// include this only from .cu files compiled by nvcc.
#include "opencv2/cudev.hpp"
Initially, an Internet search suggested "opencv2/cudev.hpp" as the basic header file for OpenCV's CUDA support, so it was naturally included in both the .cpp and the .cu files -- and the errors were reported one after another. The cause: including the "opencv2/cudev.hpp" header in a .cpp file leads to compilation errors, because it contains device code that only nvcc can compile;
If you want to use OpenCV's parameter types, such as cv::cuda::PtrStep and cv::cuda::PtrStepSz, you can include the header file "opencv2/cudev.hpp" in the .cu file;
Looking at the CUDA examples, the usual pattern is to put both the kernel function and its host-side wrapper function in the .cu file, and to call only the wrapper from the .cpp file.
The code is as follows:
.cpp file
//.cpp #include<iostream> #include <cuda.h> #include "cuda_runtime.h" #include "device_launch_parameters.h" #include "cuda_runtime_api.h" #include "opencv2/cudacodec.hpp" // //#include "opencv2/cudaimgproc.hpp" // //#include "opencv2/cudaobjdetect.hpp" //#include "opencv2/cudaarithm.hpp" using namespace cv; using namespace std; //Initialization cv::cuda::Stream sdr2hdrStream; cv::cuda::GpuMat dev_src_yuv8_GPU(height * 3/2, width, CV_8UC1); cv::cuda::GpuMat dev_src_RGB_32F_GPU(height, width, CV_32FC3); Mat src = imread("lena.jpg"); Mat sc_yuv; cvtColor(src, sc_yuv, COLOR_BGR2YUV_I420); dev_src_yuv8_GPU.upload(sc_yuv, sdr2hdrStream); cv::cuda::PtrStep<uchar> psrc(dev_src_yuv8_GPU.data, dev_src_yuv8_GPU.step); cv::cuda::PtrStep<float3> pdst(dev_src_RGB_32F_GPU.ptr<float3>(0), dev_src_RGB_32F_GPU.step); yuv2rgb420pCudaGPUMat(psrc, pdst, width, height, sdr2hdrStream); //Waiting for stream to finish executing sdr2hdrStream.waitForCompletion(); Mat ttttt; dev_src_RGB_32F_GPU.download(ttttt); cv::imwrite("tttt_dTmp_dst.png", ttttt * 255);
.cu file
#include <cuda.h>
#include <cstdio>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda_device_runtime_api.h"
#include "cuda_runtime_api.h"
#include "cuda_texture_types.h"
// Device-side OpenCV header: only safe to include in .cu files.
#include "opencv2/cudev.hpp"

// Threads per block edge; 16x16 = 256 threads per block.
int hlg_threads = 16;

// Kernel: convert one pixel of limited-range I420 (YUV420 planar) to
// float RGB in [0,1].
//
// Layout assumption: src is h*3/2 rows of w bytes -- the Y plane occupies
// rows 0..h-1; the half-resolution U and V planes follow, each packed two
// half-width chroma rows per w-wide buffer row (U starts at row h, V at
// row h + h/4). w and h are assumed even -- TODO confirm for all callers.
__global__ void yuv2rgb420p_private_GPUMat(cv::cuda::PtrStep<uchar> src,
                                           cv::cuda::PtrStep<float3> dst,
                                           int w, int h)
{
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    if (x < w && y < h) {
        // __fdividef trades a few ulps of accuracy for a ~20-cycle divide.
        float y_ = __fdividef(src(y, x) - 16, 219.0F);

        // Chroma sample for pixel (x, y) sits at (cx, cy) in the half-res
        // plane; each w-wide buffer row holds two half-width chroma rows.
        // (Bug fix: the original indexed row h + (y >> 2) at column x >> 1,
        // which reads the wrong half of the buffer row for every odd
        // half-resolution chroma row.)
        int cx = x >> 1;
        int cy = y >> 1;
        int crow = cy >> 1;                          // buffer row inside the plane
        int ccol = ((cy & 1) ? (w >> 1) : 0) + cx;   // left/right half of that row
        float u_ = __fdividef(src(h + crow, ccol) - 128, 224.0F);
        float v_ = __fdividef(src(h + (h >> 2) + crow, ccol) - 128, 224.0F);

        // NOTE(review): the constants match BT.2020 limited-range
        // YCbCr -> R'G'B' (1/0.678 = 1.4749, 0.2627/0.678 = 0.3875,
        // 0.0593/0.678 = 0.0875) -- confirm this colorimetry is intended.
        float r = __saturatef(__fmul_rn(1.4746F, v_) + y_);
        float b = __saturatef(__fmul_rn(1.8814F, u_) + y_);
        float g = __saturatef(__fmul_rn(1.4749F, y_)
                              - __fmul_rn(0.3875F, r)
                              - __fmul_rn(0.0875F, b));
        dst(y, x).x = r;
        dst(y, x).y = g;
        dst(y, x).z = b;
    }
}

// Host-side wrapper: launches the conversion kernel on the caller-owned
// OpenCV stream. Grid dims are ceil-divided so partial edge tiles are
// still covered; the kernel's bounds check discards the overshoot.
void yuv2rgb420pCudaGPUMat(cv::cuda::PtrStep<uchar> src,
                           cv::cuda::PtrStep<float3> dst,
                           int w, int h,
                           cv::cuda::Stream& sdr2hdrStream_)
{
    int bx = (w + hlg_threads - 1) / hlg_threads;
    int by = (h + hlg_threads - 1) / hlg_threads;
    dim3 blocks(bx, by);
    dim3 threads(hlg_threads, hlg_threads);
    // Unwrap the raw cudaStream_t from the OpenCV stream object.
    cudaStream_t s = cv::cuda::StreamAccessor::getStream(sdr2hdrStream_);
    yuv2rgb420p_private_GPUMat<<<blocks, threads, 0, s>>>(src, dst, w, h);
    // Launch-configuration errors surface via cudaGetLastError(), not
    // from the launch statement itself.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        fprintf(stderr, "yuv2rgb420p launch failed: %s\n", cudaGetErrorString(err));
}
The points in the kernel function that can be optimized are as follows:
- Write floating-point literals with an F suffix (0.9F instead of 0.9) when defining and using constants, to avoid double-to-float conversions;
- Division uses __fdividef (about 20 clock cycles), which is faster than the ordinary "/" division operator (about 36 clock cycles);
- __saturatef clamps its input to the range [0, 1]: values below 0 become 0, values above 1 become 1, and other inputs pass through unchanged;
Plans
- Write about CUDA texture reads and the use of textures, comparing them with the texture object API (cudaTextureObject_t);
- Cover the use of multiple streams when calling CUDA;