To illustrate how to mix C++ and CUDA code on Linux, I will walk through implementing an element-wise modulo operation on a matrix.
1. Header files and file extension
To write CUDA code alongside C++, you need to include the following headers:
#include "cuda_runtime.h" #include "cublas_v2.h" #include "device_launch_parameters.h"
Then change the file extension from .cpp to .cu.
2. Setting up the program framework
The framework of a CUDA program is similar to that of an ordinary C++ program. Here is a simple skeleton:
// mod_test.cu
#include "cuda_runtime.h"
#include "cublas_v2.h"
#include "device_launch_parameters.h"

// CUDA kernel for the modulo operation, implemented by ourselves
__global__ void mod_kernel(args) {
    // ...
}

// Perform the element-wise modulo operation on a matrix
void mat_mod_cu(args) {
    // ...
    // Launch our CUDA modulo kernel
    mod_kernel <<< gridSize, blockSize >>> (args);
    // ...
}

int main(void) {
    // Create the cuBLAS handle
    cublasHandle_t cuHandle;
    cublasStatus_t status = cublasCreate(&cuHandle);
    // ...
    // Perform the matrix element modulo operation
    mat_mod_cu(args);
    // ...
    // Destroy the cuBLAS handle
    cublasDestroy(cuHandle);
    return 0;
}
In the skeleton above:
__global__ void mod_kernel(args)
is the code you write to be executed on the GPU. The __global__ qualifier marks it as a kernel function, which is launched with the <<< >>> syntax that configures its parallelization parameters; the return type must be void; mod_kernel is the kernel's name; args stands for its parameters.
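As a concrete illustration, here is a minimal, self-contained sketch of defining and launching a kernel (the kernel and all names here are hypothetical, not part of the program developed below):

// Hypothetical example: add 1.0 to each of the n elements of an array.
__global__ void add_one_kernel(double* d_data, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;  // global thread index
    if (idx < n)              // guard threads past the end of the array
        d_data[idx] += 1.0;
}

// Launch with 4 blocks of 256 threads, enough to cover n = 1000 elements:
// add_one_kernel <<< 4, 256 >>> (d_data, 1000);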
Next, we implement the modulo operation within this framework.
2.1. main() function
For good encapsulation, the main() function passes the matrix to be reduced, its row and column counts, the modulus, and the cuBLAS handle to mat_mod_cu(). (In this example the handle is not actually used inside mat_mod_cu(), but passing it keeps the interface ready for cuBLAS calls.) The main() function can then be written as follows:
int main(void) {
    // 1. Create the cuBLAS handle
    cublasHandle_t cuHandle;
    cublasStatus_t status = cublasCreate(&cuHandle);
    // 2. Define the matrix to be reduced
    long q = 3;        // modulus
    long rowSize = 4;  // number of rows
    long colSize = 6;  // number of columns
    // Generate a random rowSize*colSize matrix with elements drawn from [5, 9]
    long** M1 = uniformMat(rowSize, colSize, 5, 9);
    // 3. Perform the element-wise modulo operation
    long** M2 = mat_mod_cu(M1, rowSize, colSize, q, cuHandle);
    // 4. Print the original matrix M1 and the reduced matrix M2
    cout << "M1: " << endl;
    for (int i = 0; i < rowSize; i++) {
        for (int j = 0; j < colSize; j++) {
            cout << M1[i][j] << " ";
        }
        cout << endl;
    }
    cout << "M2: " << endl;
    for (int i = 0; i < rowSize; i++) {
        for (int j = 0; j < colSize; j++) {
            cout << M2[i][j] << " ";
        }
        cout << endl;
    }
    // 5. Destroy the cuBLAS handle
    cublasDestroy(cuHandle);
    return 0;
}
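One detail worth noting: cublasCreate() returns a status code that the code above stores but never inspects. A more defensive version (a sketch, not part of the original) would check it before continuing:

cublasHandle_t cuHandle;
cublasStatus_t status = cublasCreate(&cuHandle);
if (status != CUBLAS_STATUS_SUCCESS) {
    cout << "cublasCreate failed with status " << status << endl;
    return 1;  // no usable handle; abort
}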
The uniformMat() function is used to generate a random matrix. Its implementation is as follows:
long** uniformMat(long rowSize, long colSize, long minValue, long maxValue) {
    long** mat = new long* [rowSize];
    for (long i = 0; i < rowSize; i++)
        mat[i] = new long[colSize];
    srand((unsigned)time(NULL));
    for (long i = 0; i < rowSize; i++) {
        for (long j = 0; j < colSize; j++) {
            mat[i][j] = (long)(rand() % (maxValue - minValue + 1)) + minValue;
        }
    }
    return mat;
}
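Two caveats about uniformMat(): it seeds srand() on every call, so two calls within the same second return identical matrices, and rand() % range is slightly biased when the range does not evenly divide the generator's output range. If that matters, a sketch using C++ <random> (an alternative, not what this article's code uses) avoids both issues:

#include <random>

long** uniformMatCpp(long rowSize, long colSize, long minValue, long maxValue) {
    static std::mt19937 gen(std::random_device{}());               // seeded once per run
    std::uniform_int_distribution<long> dist(minValue, maxValue);  // unbiased
    long** mat = new long* [rowSize];
    for (long i = 0; i < rowSize; i++) {
        mat[i] = new long[colSize];
        for (long j = 0; j < colSize; j++)
            mat[i][j] = dist(gen);
    }
    return mat;
}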
2.2. mat_mod_cu() function
The mat_mod_cu() function mainly copies the matrix into GPU memory, launches the CUDA kernel to perform the operation, and then transfers the result back from the GPU. It is defined as follows:
long** mat_mod_cu(long** M1, long rowSize, long colSize, long q, cublasHandle_t cuHandle) {
    // 1. Allocate the result matrix that will be returned
    long** M2 = new long* [rowSize];
    for (long i = 0; i < rowSize; i++)
        M2[i] = new long[colSize];
    // 2. Allocate host (CPU) buffers
    double* h_M1 = (double*)malloc(rowSize * colSize * sizeof(double));
    double* h_M2 = (double*)malloc(rowSize * colSize * sizeof(double));
    // Flatten M1 into h_M1 in row-major order
    for (long i = 0; i < rowSize; i++) {
        for (long j = 0; j < colSize; j++) {
            h_M1[i * colSize + j] = (double)M1[i][j];
        }
    }
    // 3. Allocate device (GPU) buffers
    double* d_M1;
    double* d_M2;
    cudaMalloc((void**)&d_M1, rowSize * colSize * sizeof(double));
    cudaMalloc((void**)&d_M2, rowSize * colSize * sizeof(double));
    // Copy the host data to the GPU
    cudaMemcpy(d_M1, h_M1, rowSize * colSize * sizeof(double), cudaMemcpyHostToDevice);
    // 4. Define the kernel's execution configuration
    int blockSize;
    int minGridSize;
    int gridSize;
    // Query the GPU and pick a good block size automatically
    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, mod_kernel, 0, rowSize * colSize);
    gridSize = (rowSize * colSize + blockSize - 1) / blockSize;
    // 5. Launch the kernel to take the modulus
    mod_kernel <<< gridSize, blockSize >>> (d_M1, d_M2, q, rowSize * colSize);
    // 6. Copy the result from the GPU back to the CPU
    cudaMemcpy(h_M2, d_M2, rowSize * colSize * sizeof(double), cudaMemcpyDeviceToHost);
    // 7. Unflatten h_M2 into the result matrix
    for (int i = 0; i < rowSize; i++) {
        for (int j = 0; j < colSize; j++) {
            M2[i][j] = static_cast<long>(h_M2[i * colSize + j]);
        }
    }
    // 8. Free the host and device buffers
    free(h_M1);
    free(h_M2);
    cudaFree(d_M1);
    cudaFree(d_M2);
    return M2;
}
CUDA kernels operate on linear device memory, so the matrix is first flattened into a one-dimensional array (step 2), which is then copied to the GPU (step 3). Before the kernel can run, the parallelization parameters must be chosen; step 4 uses cudaOccupancyMaxPotentialBlockSize() to pick a suitable block size automatically, so no manual tuning is needed. The kernel is then launched to perform the modulo operation (step 5). Finally, the result is copied from the GPU back to the CPU (step 6), unflattened into a two-dimensional matrix (step 7), and returned.
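Note also that none of the cudaMalloc/cudaMemcpy calls above check their return values. For anything beyond a small demo, a common pattern (a sketch I am adding here, not part of the original code) is to wrap each call in an error-checking macro:

#include <cstdio>
#include <cstdlib>

// Print the failing call's error string and location, then abort.
#define CUDA_CHECK(call)                                            \
    do {                                                            \
        cudaError_t err = (call);                                   \
        if (err != cudaSuccess) {                                   \
            fprintf(stderr, "CUDA error: %s at %s:%d\n",            \
                    cudaGetErrorString(err), __FILE__, __LINE__);   \
            exit(EXIT_FAILURE);                                     \
        }                                                           \
    } while (0)

// Usage:
// CUDA_CHECK(cudaMalloc((void**)&d_M1, rowSize * colSize * sizeof(double)));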
2.3. mod_kernel() function
mod_kernel() is the function that ultimately performs the computation on the GPU. It is defined as follows:
__global__ void mod_kernel(double* d_M1, double* d_M2, long q, int n) {
    // Global thread index
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    long x;
    if (idx < n) {
        x = d_M1[idx];
        d_M2[idx] = x % q;
    }
}
Note that the kernel needs no for loop to traverse the matrix: the gridSize and blockSize given at launch spawn one thread per element, so the elements are processed in parallel on the GPU. Each thread computes its global index and handles that single element, with the idx < n check guarding threads that fall beyond the end of the data; there is no need to explicitly iterate over the matrix.
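For inputs so large that one thread per element would exceed the grid size you want to launch, a common alternative (not used in this article's code) is a grid-stride loop, in which each thread processes several elements:

// Grid-stride variant of the modulo kernel: each thread strides through
// the array by the total number of threads in the grid.
__global__ void mod_kernel_strided(double* d_M1, double* d_M2, long q, int n) {
    int stride = gridDim.x * blockDim.x;
    for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += stride) {
        long x = (long)d_M1[idx];
        d_M2[idx] = (double)(x % q);
    }
}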
If you want to output information from inside a kernel, use printf; cout is not available in device code.
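For example, a small debug kernel along these lines (hypothetical, not part of the final program) prints each thread's index and element:

#include <cstdio>

// Hypothetical debug kernel: each thread prints its index and element.
__global__ void debug_print_kernel(double* d_M1, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n)
        printf("idx=%d, value=%f\n", idx, d_M1[idx]);  // device-side printf
}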
3. Complete code and results
Full code:
/* mod_test.cu */
#include "cuda_runtime.h"
#include "cublas_v2.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>
#include <ctime>    // for time(), used to seed rand()
using namespace std;

__global__ void mod_kernel(double* d_M1, double* d_M2, long q, int n) {
    // Global thread index
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    long x;
    if (idx < n) {
        x = d_M1[idx];
        // printf("x: %ld\n", x);
        d_M2[idx] = x % q;
    }
}

long** mat_mod_cu(long** M1, long rowSize, long colSize, long q, cublasHandle_t cuHandle) {
    // 1. Allocate the result matrix that will be returned
    long** M2 = new long* [rowSize];
    for (long i = 0; i < rowSize; i++)
        M2[i] = new long[colSize];
    // 2. Allocate host (CPU) buffers
    double* h_M1 = (double*)malloc(rowSize * colSize * sizeof(double));
    double* h_M2 = (double*)malloc(rowSize * colSize * sizeof(double));
    // Flatten M1 into h_M1 in row-major order
    for (long i = 0; i < rowSize; i++) {
        for (long j = 0; j < colSize; j++) {
            h_M1[i * colSize + j] = (double)M1[i][j];
        }
    }
    // 3. Allocate device (GPU) buffers
    double* d_M1;
    double* d_M2;
    cudaMalloc((void**)&d_M1, rowSize * colSize * sizeof(double));
    cudaMalloc((void**)&d_M2, rowSize * colSize * sizeof(double));
    // Copy the host data to the GPU
    cudaMemcpy(d_M1, h_M1, rowSize * colSize * sizeof(double), cudaMemcpyHostToDevice);
    // 4. Define the kernel's execution configuration
    int blockSize;
    int minGridSize;
    int gridSize;
    // Query the GPU and pick a good block size automatically
    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, mod_kernel, 0, rowSize * colSize);
    gridSize = (rowSize * colSize + blockSize - 1) / blockSize;
    // 5. Launch the kernel to take the modulus
    mod_kernel <<< gridSize, blockSize >>> (d_M1, d_M2, q, rowSize * colSize);
    // 6. Copy the result from the GPU back to the CPU
    cudaMemcpy(h_M2, d_M2, rowSize * colSize * sizeof(double), cudaMemcpyDeviceToHost);
    // 7. Unflatten h_M2 into the result matrix
    for (int i = 0; i < rowSize; i++) {
        for (int j = 0; j < colSize; j++) {
            M2[i][j] = static_cast<long>(h_M2[i * colSize + j]);
        }
    }
    // 8. Free the host and device buffers
    free(h_M1);
    free(h_M2);
    cudaFree(d_M1);
    cudaFree(d_M2);
    return M2;
}

long** uniformMat(long rowSize, long colSize, long minValue, long maxValue) {
    long** mat = new long* [rowSize];
    for (long i = 0; i < rowSize; i++)
        mat[i] = new long[colSize];
    srand((unsigned)time(NULL));
    for (long i = 0; i < rowSize; i++) {
        for (long j = 0; j < colSize; j++) {
            mat[i][j] = (long)(rand() % (maxValue - minValue + 1)) + minValue;
        }
    }
    return mat;
}

int main(void) {
    // 1. Create the cuBLAS handle
    cublasHandle_t cuHandle;
    cublasStatus_t status = cublasCreate(&cuHandle);
    // 2. Define the matrix to be reduced
    long q = 3;        // modulus
    long rowSize = 4;  // number of rows
    long colSize = 6;  // number of columns
    // Generate a random rowSize*colSize matrix with elements drawn from [5, 9]
    long** M1 = uniformMat(rowSize, colSize, 5, 9);
    // 3. Perform the element-wise modulo operation
    long** M2 = mat_mod_cu(M1, rowSize, colSize, q, cuHandle);
    // 4. Print the original matrix M1 and the reduced matrix M2
    cout << "M1: " << endl;
    for (int i = 0; i < rowSize; i++) {
        for (int j = 0; j < colSize; j++) {
            cout << M1[i][j] << " ";
        }
        cout << endl;
    }
    cout << "M2: " << endl;
    for (int i = 0; i < rowSize; i++) {
        for (int j = 0; j < colSize; j++) {
            cout << M2[i][j] << " ";
        }
        cout << endl;
    }
    // 5. Destroy the cuBLAS handle
    cublasDestroy(cuHandle);
    return 0;
}
Compile and run from the command line (the -lcublas flag links the cuBLAS library that provides cublasCreate()):
nvcc -lcublas mod_test.cu -o mt
./mt
Output:
M1:
9 6 5 9 9 8
9 8 6 5 7 9
5 6 6 6 8 7
5 6 5 9 9 5
M2:
0 0 2 0 0 2
0 2 0 2 1 0
2 0 0 0 2 1
2 0 2 0 0 2
Success!