First, you need to install two required packages: tensorrt and torch2trt. For tensorrt, it is recommended to download the tar package from the official NVIDIA website and install from it; I downloaded version 7.2.3. torch2trt can be cloned from its GitHub project.
My environment (TensorRT currently seems to work only on Linux; I didn't get it working on Win10, and the tar package contains no corresponding whl file for it):
ubuntu18.04
python3.8
pytorch1.8+cu11
tensorrt7.2.3
pycuda2021
Unzip and install tensorrt
tar -zxvf TensorRT-7.2.3.4.Ubuntu-18.04.x86_64-gnu.cuda-11.1.cudnn8.1.tar.gz
cd TensorRT-7.2.3.4
# This directory contains four subdirectories: python, uff, graphsurgeon and onnx_graphsurgeon.
# Enter each one and install the corresponding whl file
pip install xxxxx.whl
# Configure environment variables
vim /etc/profile
# Add the following at the end. The path depends on your installation path.
# Note that the lib folder may be a symlink; I had to use the real path of lib because of an error
export TRT_RELEASE=`pwd`/TensorRT-7.2.3.4
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$TRT_RELEASE/lib
# Make the changes take effect immediately
source /etc/profile
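Before moving on, it is worth a quick sanity check that the Python bindings can be imported (a minimal sketch; it only confirms the wheel and the shared libraries are found):

# check that the TensorRT Python bindings load
import tensorrt as trt
print(trt.__version__)   # should print 7.2.3.x if the install succeeded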
Installing torch2trt
git clone https://github.com/NVIDIA-AI-IOT/torch2trt
cd torch2trt
python setup.py install
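For reference, the torch2trt route converts straight from PyTorch without going through ONNX. A minimal sketch of its documented usage follows (this is the path I could not get working, so treat it as untested here; torch_mnist.pth is produced by the training script below):

import torch
from torch2trt import torch2trt

# load the trained model; torch2trt needs it in eval mode on the GPU
model = torch.load('torch_mnist.pth').eval().cuda()
# an example input with the same shape as during training
x = torch.ones((1, 1, 28, 28)).cuda()
# torch2trt returns a wrapper module that executes through TensorRT
model_trt = torch2trt(model, [x])
y_trt = model_trt(x)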
Once these two are installed, you can convert the model. First we need a .pth file, i.e. a model trained with PyTorch. I use the MNIST dataset for training and testing. The training code is as follows:
import torch
import numpy as np
import torch.nn as nn
from torch.nn import Module
from torch.autograd import Variable
from torch import optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

batch_size = 64
learning_rate = 1e-2
num_epochs = 10

# Load the MNIST dataset and build iterators for the training and test sets
data_tf = transforms.Compose([transforms.ToTensor(), transforms.Normalize([0.5], [0.5])])
train_dataset = datasets.MNIST(root='./data', train=True, transform=data_tf, download=True)
test_dataset = datasets.MNIST(root='./data', train=False, transform=data_tf)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
# CNN network, input shape: (b, 1, 28, 28)
class SimpleCnn(Module):
    def __init__(self):
        super(SimpleCnn, self).__init__()
        self.layer1 = nn.Sequential()
        self.layer1.add_module('conv1', nn.Conv2d(1, 32, 3, 1, padding=1))
        self.layer1.add_module('relu', nn.ReLU(True))
        self.layer1.add_module('pool', nn.MaxPool2d(2, 2))
        self.layer2 = nn.Sequential()
        self.layer2.add_module('conv2', nn.Conv2d(32, 64, 3, 1, padding=1))
        self.layer2.add_module('relu', nn.ReLU(True))
        self.layer2.add_module('pool', nn.MaxPool2d(2, 2))
        self.layer3 = nn.Sequential()
        self.layer3.add_module('fc1', nn.Linear(3136, 64))   # 64 * 7 * 7 = 3136 after two 2x2 poolings
        self.layer3.add_module('relu', nn.ReLU(True))
        self.layer3.add_module('fc_out', nn.Linear(64, 10))

    def forward(self, x):
        conv1 = self.layer1(x)
        conv2 = self.layer2(conv1)
        fc_input = conv2.view(conv2.size(0), -1)
        fc_out = self.layer3(fc_input)
        return fc_out
# Loss function and optimizer
model_cnn = SimpleCnn()
model_cnn = model_cnn.cuda()
criterion_cnn = nn.CrossEntropyLoss()
optimizer_cnn = optim.SGD(model_cnn.parameters(), lr=learning_rate)
# Train
for epoch in range(num_epochs):
    acc = 0
    loss = 0
    i = 0
    for data in train_loader:
        img, label = data
        x_train = Variable(img).cuda()
        y_train = Variable(label).cuda()
        out = model_cnn(x_train)
        loss_batch = criterion_cnn(out, y_train)
        loss += loss_batch.item()          # accumulate the scalar, not the graph
        _, pred = torch.max(out, 1)
        num_correct = (pred == y_train).sum()
        acc += num_correct.item()
        optimizer_cnn.zero_grad()
        loss_batch.backward()              # backward pass
        optimizer_cnn.step()               # update parameters
        if i % 10 == 0:
            print("Train Loss: {:.6f}, Acc: {:.6f}".format(loss / len(train_dataset), acc / len(train_dataset)))
        i += 1
# The final training accuracy is 0.98
Train Loss: 0.000756, Acc: 0.985350
# Test
model_cnn.eval()   # switch to evaluation mode
eval_loss = 0
eval_acc = 0
i = 0
with torch.no_grad():                      # replaces the deprecated volatile=True
    for data in test_loader:
        img, label = data
        img = img.cuda()                   # the model lives on the GPU, so the data must too
        label = label.cuda()
        out = model_cnn(img)
        loss = criterion_cnn(out, label)
        eval_loss += loss.item() * label.size(0)
        _, pred = torch.max(out, 1)
        num_correct = (pred == label).sum()
        eval_acc += num_correct.item()
        if i % 10 == 0:
            print("Test Loss: {:.6f}, Acc: {:.6f}".format(eval_loss / len(test_dataset), eval_acc / len(test_dataset)))
        i += 1
# The test accuracy is 0.98
Test Loss: 0.043465, Acc: 0.985400
# Save the model as a pth file
torch.save(model_cnn, 'torch_mnist.pth')
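One note on the save format: torch.save(model_cnn, ...) pickles the whole module, so torch.load later needs the SimpleCnn class definition to be importable. A common alternative (not what I used here; the file name torch_mnist_weights.pth is just for illustration) is to save only the weights:

# alternative: save only the parameters and rebuild the module at load time
torch.save(model_cnn.state_dict(), 'torch_mnist_weights.pth')

model2 = SimpleCnn().cuda()
model2.load_state_dict(torch.load('torch_mnist_weights.pth'))
model2.eval()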
Now the .pth file has been generated. There are two ways to turn a .pth file into a .trt file. One is the method from TensorRT's official GitHub; I tried it, ran into many problems and failed, so I'll use the second method: the .pth -> .onnx -> .trt conversion.
Now for the .pth -> .onnx step.
# First install onnx
pip install onnx
Start conversion
import torch

batch_size = 1   # batch_size is set to 1 here; when we trained the pth model it was 64
model_path = './torch_mnist.pth'
dummy_input = torch.randn(batch_size, 1, 28, 28, device='cuda')
model = torch.load(model_path)
torch.onnx.export(model, dummy_input, "torch_mnist.onnx", verbose=False)
One point to explain: batch_size is 1 here, so it should also be 1 during inference later. The export also requires a dummy input, but it is only used to tell the exporter the shape of the model's input; a single random tensor is fine, as long as its shape matches the shape used during training.
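If you don't want to be locked to batch size 1, torch.onnx.export also accepts a dynamic_axes argument. A small sketch (the names "input" and "output" and the file name are my own choices, not from the original export):

# export with a dynamic batch dimension instead of a fixed one
torch.onnx.export(
    model, dummy_input, "torch_mnist_dynamic.onnx",
    input_names=["input"], output_names=["output"],
    dynamic_axes={"input": {0: "batch"}, "output": {0: "batch"}},
)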
After running the export, a .onnx file appears in the directory, which shows that this step succeeded.
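You can also verify the exported graph programmatically; a quick sketch using onnx's built-in checker:

import onnx

# load the exported graph and run the structural checker;
# check_model raises an exception if the model is malformed
onnx_model = onnx.load("torch_mnist.onnx")
onnx.checker.check_model(onnx_model)
print(onnx.helper.printable_graph(onnx_model.graph))   # optional: dump the graph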
Next step: .onnx -> .trt
There is a trtexec executable in the bin directory of the extracted TensorRT package, which can do the conversion directly:
./trtexec --onnx=torch_mnist.onnx --saveEngine=torch_mnist.trt
After it finishes, a .trt file is generated. If an error is reported, try the onnx-tensorrt module instead (see its GitHub project for reference).
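If you prefer to stay in Python instead of calling trtexec, the engine can also be built with TensorRT's own ONNX parser. A sketch against the TensorRT 7 Python API (the workspace size is an arbitrary choice of mine):

import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

def build_engine(onnx_path, engine_path):
    with trt.Builder(TRT_LOGGER) as builder, \
         builder.create_network(EXPLICIT_BATCH) as network, \
         trt.OnnxParser(network, TRT_LOGGER) as parser:
        config = builder.create_builder_config()
        config.max_workspace_size = 1 << 28   # 256 MiB of scratch space, an arbitrary choice
        with open(onnx_path, 'rb') as f:
            if not parser.parse(f.read()):    # parse the ONNX graph into the network
                for i in range(parser.num_errors):
                    print(parser.get_error(i))
                return None
        engine = builder.build_engine(network, config)
        with open(engine_path, 'wb') as f:    # serialize so the engine can be reloaded later
            f.write(engine.serialize())
        return engine

build_engine('torch_mnist.onnx', 'torch_mnist.trt')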
With this step, the model conversion is complete. All that remains is to verify whether the model actually runs faster, and to see how to load the .trt model. Let's look at the code!!!
import torch
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import time
import numpy as np

batch_size = 1   # the batch size here must be consistent with the one used during conversion
trt_model_name = "./torch_mnist.trt"

# Generic inference function
def infer(context, input_img, output_size, batch_size):
    # Convert the input data to float32; without this conversion there will be many errors
    input_img = input_img.astype(np.float32)
    # Create an output array to receive the data
    output = np.empty(output_size, dtype=np.float32)
    # Allocate device memory
    d_input = cuda.mem_alloc(batch_size * input_img.nbytes)
    d_output = cuda.mem_alloc(batch_size * output.nbytes)
    bindings = [int(d_input), int(d_output)]
    stream = cuda.Stream()
    # Transfer the input data to the device
    cuda.memcpy_htod_async(d_input, input_img, stream)
    # Execute the model (for explicit-batch engines, execute_async_v2(bindings, stream.handle) is the newer call)
    context.execute_async(batch_size, bindings, stream.handle, None)
    # Transfer the predictions back
    cuda.memcpy_dtoh_async(output, d_output, stream)
    stream.synchronize()
    # Return the predictions
    return output

# Test function
def do_test(context):
    # Read the MNIST dataset
    data_tf = transforms.Compose([transforms.ToTensor(), transforms.Normalize([0.5], [0.5])])
    train_dataset = datasets.MNIST(root='./data', train=True, transform=data_tf, download=True)
    test_dataset = datasets.MNIST(root='./data', train=False, transform=data_tf)
    # train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    print("mnist data load successful!!!")
    accuracy = 0
    start_time = time.time()
    for data in test_loader:
        img, label = data
        img = img.numpy()   # convert the data from torch Tensor to numpy format
        output = infer(context, img, 10, 1)
        # print(output)
        conf, pred = torch.max(torch.Tensor(output), -1)
        num_count = (pred == label).sum()
        accuracy += num_count.item()
    print("Test Acc is {:.6f}".format(accuracy / len(test_dataset)))
    return accuracy / len(test_dataset), time.time() - start_time

def trt_infer():
    # Read the trt file
    def loadEngine2TensorRT(filepath):
        G_LOGGER = trt.Logger(trt.Logger.WARNING)
        # Deserialize the engine
        with open(filepath, "rb") as f, trt.Runtime(G_LOGGER) as runtime:
            engine = runtime.deserialize_cuda_engine(f.read())
        return engine

    engine = loadEngine2TensorRT(trt_model_name)
    # Create the execution context
    context = engine.create_execution_context()
    print("Start TensorRT Test...")
    acc, times = do_test(context)
    # note: the engine was built with default FP32 precision; 'INT8' here is only the print label
    print('INT8 acc: {}, need time: {}'.format(acc, times))

if __name__ == '__main__':
    trt_infer()
Then run it and wait for the result~~
My test results (the data is the 10,000 images of the MNIST test set):

trt model results:
INT8 acc: 0.9855999946594238, need time: 4.600905656814575

pth model results:
Test Acc is 0.985600 run succes! time is 20.035008
From the results, we can see that the trt model is about 4-5 times faster, which is pretty good~