Using Custom CUDA Kernels from OpenCV C++ Code


  • Install an NVIDIA graphics card in your computer;
  • Install the NVIDIA driver (a version newer than 10 is recommended);
  • Install CUDA and cuDNN;
  • Download opencv and opencv_contrib from GitHub, then build OpenCV locally with CMake;


  • In the .cpp file, call the CUDA kernel functions that you wrote yourself;

Reasons for writing this article

Recently, while writing CUDA acceleration code, I found that whenever I put a kernel function in a .cu file and called it from a .cpp file, compilation errors such as the following always occurred:

C2039 "atomicAdd": not a member of "`global namespace'"
C2039 "atomicMin": not a member of "`global namespace'"
C2664 "uchar1 cv::cudev::max(const uchar1 &, const uchar1 &)": Cannot convert parameter 1 from "const unsigned char" to "const uchar1 &"

I spent some time searching the Internet and could not find a solution, so I decided to write down how I solved it, both as a record for myself and to help others find the answer more quickly.

//cuda related header file
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda_device_runtime_api.h"

//Standard Library
#include <iostream>
#include <string>

//Header file used by opencv
#include "opencv2/core/core.hpp"  
#include "opencv2/highgui/highgui.hpp"  
#include "opencv2/imgproc/imgproc.hpp"

// Reference some headers that opencv already has;
#include "opencv2/cudaarithm.hpp"

//Do NOT include the following header in the .cpp file (it causes the compile errors listed above)
//#include "opencv2/cudev.hpp"
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda_device_runtime_api.h"

//Parameter types using opencv 
#include "opencv2/cudev.hpp"

Initially, I read on the Internet that "opencv2/cudev.hpp" is the basic header file for OpenCV's CUDA support, so I naturally included it in both the .cpp and the .cu files. Errors were then reported one after another. The cause was that "opencv2/cudev.hpp" was included in the .cpp file: this header must only be included in .cu files (compiled by nvcc), otherwise compilation fails.

If you want to use OpenCV's parameter types, such as cv::cuda::PtrStep and cv::cuda::PtrStepSz, in your kernels, include the header file "opencv2/cudev.hpp" in the .cu file.

Following the CUDA samples, we usually put both the kernel function (call it A) and the host function that launches it (call it AA) in the .cu file, and then call AA from the .cpp file.

The code is as follows:

.cpp file


#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda_runtime_api.h"

#include "opencv2/cudacodec.hpp" //
//#include "opencv2/cudaimgproc.hpp" // 
//#include "opencv2/cudaobjdetect.hpp"
//#include "opencv2/cudaarithm.hpp"

using namespace cv;
using namespace std;

cv::cuda::Stream sdr2hdrStream;

cv::cuda::GpuMat dev_src_yuv8_GPU(height * 3/2, width, CV_8UC1);
cv::cuda::GpuMat dev_src_RGB_32F_GPU(height, width, CV_32FC3);

Mat src = imread("lena.jpg");
Mat sc_yuv;
cvtColor(src, sc_yuv, COLOR_BGR2YUV_I420);

dev_src_yuv8_GPU.upload(sc_yuv, sdr2hdrStream);
cv::cuda::PtrStep<uchar> psrc(, dev_src_yuv8_GPU.step);
cv::cuda::PtrStep<float3> pdst(dev_src_RGB_32F_GPU.ptr<float3>(0), dev_src_RGB_32F_GPU.step);

yuv2rgb420pCudaGPUMat(psrc, pdst, width, height, sdr2hdrStream);

//Waiting for stream to finish executing
Mat ttttt;;
cv::imwrite("tttt_dTmp_dst.png", ttttt * 255);

.cu file

#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda_device_runtime_api.h"

#include "cuda_runtime_api.h"
#include "cuda_texture_types.h"

#include "opencv2/cudev.hpp" //cuda 

// Threads per block along each axis: the launcher in this file uses a
// hlg_threads x hlg_threads block (16 x 16 = 256 threads).
int hlg_threads = 16;

// Kernel: converts one 8-bit planar YUV 4:2:0 (I420) frame to float RGB in [0, 1].
//
// Launch layout: 2D grid of 2D blocks, one thread per output pixel; threads
// that fall outside the w x h image do nothing.
//
// src : single-channel view of (h * 3 / 2) rows by w columns. Rows [0, h) hold
//       the Y plane; the U plane and then the V plane follow, each stored as
//       (h / 2) * (w / 2) contiguous samples, so one w-wide row of the chroma
//       region packs two half-width chroma rows.
// dst : h rows by w columns of float3; .x = R, .y = G, .z = B.
// w,h : luma width and height in pixels (assumed even, as I420 requires).
//
// Input is treated as limited range (Y offset 16 / scale 219, chroma offset
// 128 / scale 224).
__global__ void yuv2rgb420p_private_GPUMat(cv::cuda::PtrStep<uchar> src, cv::cuda::PtrStep<float3> dst, int w, int h) {
    const int x = threadIdx.x + blockIdx.x * blockDim.x;
    const int y = threadIdx.y + blockIdx.y * blockDim.y;

    if (x < w && y < h) {
        // Linear index of this pixel's chroma sample inside the U plane; the V
        // plane starts (w / 2) * (h / 2) samples later. Converting the linear
        // index back to (row, column) of the w-wide image selects the correct
        // half of the packed chroma row for both even and odd chroma rows.
        const int chromaWidth = w >> 1;
        const int uIndex = (y >> 1) * chromaWidth + (x >> 1);
        const int vIndex = uIndex + chromaWidth * (h >> 1);

        // __fdividef is the fast, reduced-accuracy single-precision division.
        const float y_ = __fdividef(static_cast<float>(src(y, x) - 16), 219.0F);
        const float u_ = __fdividef(static_cast<float>(src(h + uIndex / w, uIndex % w) - 128), 224.0F);
        const float v_ = __fdividef(static_cast<float>(src(h + vIndex / w, vIndex % w) - 128), 224.0F);

        // __saturatef clamps each channel to [0, 1]. G is derived from the
        // already-clamped R and B values.
        const float r = __saturatef(__fmul_rn(1.4746F, v_) + y_);
        const float b = __saturatef(__fmul_rn(1.8814F, u_) + y_);
        const float g = __saturatef(__fmul_rn(1.4749F, y_) - __fmul_rn(0.3875F, r) - __fmul_rn(0.0875F, b));

        dst(y, x) = make_float3(r, g, b);
    }
}

// Host-side launcher for yuv2rgb420p_private_GPUMat; this is the function the
// .cpp file calls.
//
// src, dst       : device views of the I420 input and the float3 RGB output.
// w, h           : image width and height in pixels.
// sdr2hdrStream_ : OpenCV CUDA stream on which the kernel is queued.
//
// The launch is asynchronous: the function returns as soon as the kernel has
// been queued, so the caller must synchronize the stream before reading dst.
void yuv2rgb420pCudaGPUMat(cv::cuda::PtrStep<uchar> src, cv::cuda::PtrStep<float3> dst,    int w, int h, cv::cuda::Stream& sdr2hdrStream_) {
    // Nothing to convert; a zero-sized grid would be an invalid launch configuration.
    if (w <= 0 || h <= 0) {
        return;
    }

    // One thread per pixel; round the grid up so partial tiles are covered.
    const dim3 threads(hlg_threads, hlg_threads);
    const dim3 blocks((w + hlg_threads - 1) / hlg_threads,
                      (h + hlg_threads - 1) / hlg_threads);

    // Unwrap the raw CUDA stream from the OpenCV stream object.
    cudaStream_t s = cv::cuda::StreamAccessor::getStream(sdr2hdrStream_);
    yuv2rgb420p_private_GPUMat<<<blocks, threads, 0, s>>>(src, dst, w, h);

    // Kernel launches do not return an error code; query it explicitly so a
    // bad launch configuration is reported instead of silently ignored.
    CV_CUDEV_SAFE_CALL(cudaGetLastError());
}


In the kernel function, the points that can be optimized are as follows:

  • When defining or using floating-point constants, write 0.9F instead of 0.9; this avoids a double-to-float conversion;
  • For division, use __fdividef, which takes about 20 clock cycles and is faster than an ordinary division expression such as "9/5" (about 36 clock cycles);
  • __saturatef clamps its input to the range [0, 1]: it returns 0 for inputs less than 0, 1 for inputs greater than 1, and the input value itself otherwise;


  • Planned follow-up: reading data through CUDA textures, and a comparison with the texture object API (cudaTextureObject_t);
  • Planned follow-up: using multiple streams when calling CUDA;

Keywords: C++ OpenCV github cmake less

Added by ultraslacker on Fri, 22 Nov 2019 04:11:58 +0200