Accelerating a PyTorch model with TensorRT: converting .pth -> .onnx -> .trt, and measuring the inference speed of the resulting TRT model

First, you need to install two packages: tensorrt and torch2trt. For tensorrt, download the tar archive from the official website (installing from the tar package is recommended); I downloaded version 7.2.3. torch2trt can be cloned from its GitHub project.
My environment (TensorRT currently seems to work only on Linux; I did not succeed on Windows 10, and the tar package contains no corresponding whl file for it):

Unzip and install tensorrt

# The archive/directory name depends on the version you downloaded, so a glob is used here
tar -zxvf TensorRT-*.tar.gz
cd TensorRT-*/
# This directory contains four sub-directories: python, uff, graphsurgeon and onnx_graphsurgeon. Enter each of them and install the corresponding whl file
pip install xxxxx.whl
# Configure environment variables
vim /etc/profile
# Add the following lines at the end. The path depends on where you extracted TensorRT; use an absolute path,
# because `pwd` inside /etc/profile would resolve to whatever directory the shell starts in.
# Note that the lib folder may be a symbolic link; I had to use the real path of lib because of an error.
export TRT_RELEASE=/path/to/TensorRT-7.2.3.x
export LD_LIBRARY_PATH=$TRT_RELEASE/lib:$LD_LIBRARY_PATH

# Make the changes take effect immediately
source /etc/profile

Installing torch2trt

git clone https://github.com/NVIDIA-AI-IOT/torch2trt
cd torch2trt
python setup.py install

After these two are installed, you can convert the model. First, you need a .pth file, i.e. a model trained with PyTorch. I use the MNIST data set for training and testing. The training code is as follows:

import torch
import numpy as np
from torch.nn import Module
from torch.autograd import Variable
import torch.nn as nn
from torch import nn,optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# Training hyper-parameters
batch_size = 64
learning_rate = 1e-2
num_epochs = 10

# Load the MNIST data set and build the iterators for the training set and the test set.
# Each image is converted to a tensor and normalized with mean 0.5 and std 0.5.
data_tf = transforms.Compose([transforms.ToTensor(), transforms.Normalize([0.5], [0.5])])
# download=True on the training split fetches the data into ./data if it is not there yet
train_dataset = datasets.MNIST(root='./data', train=True, transform=data_tf, download=True)
test_dataset = datasets.MNIST(root='./data', train=False, transform=data_tf)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# cnn network
class SimpleCnn(Module):
    """Small convolutional classifier for single-channel 28x28 images (MNIST).

    Tensor shapes, with ``b`` the batch size:

        input    (b, 1, 28, 28)
        layer1   (b, 32, 14, 14)   3x3 conv (padding 1) -> ReLU -> 2x2 max-pool
        layer2   (b, 64, 7, 7)     3x3 conv (padding 1) -> ReLU -> 2x2 max-pool
        layer3   (b, 10)           Linear(3136, 64) -> ReLU -> Linear(64, 10)

    The forward pass returns raw class scores; no softmax is applied.
    Sub-module names (``layer1.conv1``, ``layer3.fc_out``, ...) are part of
    the saved-model format and must not be renamed.
    """

    def __init__(self):
        super().__init__()
        self.layer1 = nn.Sequential()
        self.layer1.add_module('conv1', nn.Conv2d(1, 32, 3, 1, padding=1))
        self.layer1.add_module('relu', nn.ReLU(True))
        self.layer1.add_module('pool', nn.MaxPool2d(2, 2))

        self.layer2 = nn.Sequential()
        self.layer2.add_module('conv2', nn.Conv2d(32, 64, 3, 1, padding=1))
        self.layer2.add_module('relu', nn.ReLU(True))
        self.layer2.add_module('pool', nn.MaxPool2d(2, 2))

        # 64 channels * 7 * 7 spatial positions after two 2x2 poolings of 28x28
        flat_features = 64 * 7 * 7  # == 3136
        self.layer3 = nn.Sequential()
        self.layer3.add_module('fc1', nn.Linear(flat_features, 64))
        self.layer3.add_module('relu', nn.ReLU(True))
        self.layer3.add_module('fc_out', nn.Linear(64, 10))

    def forward(self, x):
        """Return the (b, 10) class scores for a (b, 1, 28, 28) input batch."""
        conv1 = self.layer1(x)
        conv2 = self.layer2(conv1)
        # flatten(1) keeps the batch dimension and, unlike .view, also works
        # when the activation tensor is not contiguous in memory.
        fc_input = conv2.flatten(1)
        fc_out = self.layer3(fc_input)
        return fc_out
# Loss function and optimizer
model_cnn = SimpleCnn()
model_cnn = model_cnn.cuda()
criterion_cnn = nn.CrossEntropyLoss()
optimizer_cnn = optim.SGD(model_cnn.parameters(), lr=learning_rate)
# train
model_cnn.train()
for epoch in range(num_epochs):
    acc = 0       # number of correctly classified samples so far in this epoch
    loss = 0.0    # sum of per-sample losses so far in this epoch
    for i, data in enumerate(train_loader):
        img, label = data
        x_train = img.cuda()
        y_train = label.cuda()
        # Gradients accumulate across backward() calls, so clear them before each step
        optimizer_cnn.zero_grad()
        out = model_cnn(x_train)
        loss_batch = criterion_cnn(out, y_train)
        # .item() stores a plain float, so the autograd graph of every batch is not kept alive.
        # The criterion returns the batch mean; weighting by the batch size makes the
        # division by the data set size below a per-sample mean, as in the test loop.
        loss += loss_batch.item() * y_train.size(0)
        _, pred = torch.max(out, 1)
        num_correct = (pred == y_train).sum()
        acc += num_correct.item()
        loss_batch.backward()   # backward loss
        optimizer_cnn.step()    # update parameters
        # Running totals are divided by the full data set size, so the printed
        # values only reach the epoch mean/accuracy at the end of the epoch.
        if i % 10 == 0:
            print("Train Loss: {:.6f}, Acc: {:.6f}".format(loss/len(train_dataset), acc/len(train_dataset)))
# The final training accuracy is 0.98
Train Loss: 0.000756, Acc: 0.985350
# test
model_cnn.eval()   # Switch to test mode
eval_loss = 0.0    # sum of per-sample losses
eval_acc = 0       # number of correctly classified samples
# Feed the data on whatever device the model lives on
device = next(model_cnn.parameters()).device
# torch.no_grad() replaces the removed `volatile=True` flag: no graph is built during evaluation
with torch.no_grad():
    for i, data in enumerate(test_loader):
        img, label = data
        img = img.to(device)
        label = label.to(device)
        out = model_cnn(img)
        loss = criterion_cnn(out, label)
        # The criterion returns the batch mean, so weight it by the batch size
        eval_loss += loss.item() * label.size(0)
        _, pred = torch.max(out, 1)
        num_correct = (pred == label).sum()
        eval_acc += num_correct.item()
        if i % 10 == 0:
            print("Test Loss: {:.6f}, Acc: {:.6f}".format(eval_loss / len(test_dataset), eval_acc / len(test_dataset)))
# The test accuracy is 0.98
Test Loss: 0.043465, Acc: 0.985400
# Save the whole model object (not just its state_dict) as a pth file;
# the export step later loads it back with torch.load and uses it directly.
torch.save(model_cnn, 'torch_mnist.pth')

Now that the .pth file has been generated, there are two ways to convert it to a .trt file. One is the method from the official TensorRT website / GitHub; I tried this method and failed, running into many problems, so I'll use the second method: .pth -> .onnx -> .trt conversion.
Let's start with the .pth -> .onnx step.

# First install onnx
pip install onnx

Start conversion

import torch

batch_size = 1    # The export batch size is set to 1 here; the pth model was trained with batch size 64
model_path = './torch_mnist.pth'
# Random data is enough: the exporter only needs an input of the right shape to trace the model
dummy_input = torch.randn(batch_size, 1, 28, 28, device='cuda')
# torch.load unpickles the file, so the model class must be importable here and the
# file must come from a trusted source. map_location puts the weights on the same
# device as dummy_input.
# NOTE(review): recent PyTorch releases default to weights_only=True, which rejects
# whole-model pickles; pass weights_only=False there - confirm against your version.
model = torch.load(model_path, map_location='cuda')
model.eval()
torch.onnx.export(model, dummy_input, "torch_mnist.onnx", verbose=False)

One point to explain: batch_size=1 is used here, so it must also be 1 during inference later. In addition, an input is required during the conversion. This input only tells the exporter the size of the input data, and a single sample is enough, so random data is fine, but its shape must match the shape used during training.
After running, an .onnx file is generated in the directory, which shows that this step succeeded (see the referenced blog for details).
Next step: .onnx -> .trt
There is a trtexec executable in the bin directory of the extracted TensorRT package, which can do the conversion directly:

./trtexec --onnx=torch_mnist.onnx --saveEngine=torch_mnist.trt

After execution, a .trt file is generated. If an error is reported, try the onnx-tensorrt module instead (see the referenced blog).
At this point the model conversion is complete. What remains is to verify whether the model is actually accelerated and how to load it. Let's look at the code!

import torch
from torch.autograd import Variable
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import time
import numpy as np

batch_size = 1    # The batch size here must be consistent with the one used during conversion

# Path of the serialized TensorRT engine produced by trtexec
trt_model_name = "./torch_mnist.trt"     

# This operation is a general function
def infer(context, input_img, output_size, batch_size):
    """Run one inference pass through a TensorRT execution context.

    Args:
        context: TensorRT execution context used to run the engine.
        input_img: numpy array holding the input data; it is converted to
            float32 before being copied to the device.
        output_size: shape (int or tuple) of the host array that receives
            the result.
        batch_size: batch size passed to ``execute_async``; also used as a
            multiplier when sizing the device buffers.

    Returns:
        A float32 numpy array of shape ``output_size`` with the predictions.
    """
    # The data must be float32 (without this conversion there will be many errors)
    # and contiguous in memory so it can be copied to the device as one buffer.
    input_img = np.ascontiguousarray(input_img, dtype=np.float32)
    # Host array that receives the result
    output = np.empty(output_size, dtype=np.float32)

    # Allocate device memory
    d_input = cuda.mem_alloc(batch_size * input_img.nbytes)
    d_output = cuda.mem_alloc(batch_size * output.nbytes)

    # NOTE(review): assumes binding 0 is the input and binding 1 the output - confirm for other engines
    bindings = [int(d_input), int(d_output)]

    stream = cuda.Stream()

    # Transfer input data to device
    cuda.memcpy_htod_async(d_input, input_img, stream)
    # Execute model
    context.execute_async(batch_size, bindings, stream.handle, None)
    # Transfer predictions back
    cuda.memcpy_dtoh_async(output, d_output, stream)
    # All three calls above are only queued on the stream; wait for them to
    # finish before the host reads `output`.
    stream.synchronize()

    # Return predictions
    return output

# Execute test function
def do_test(context):
    """Evaluate the TensorRT engine on the MNIST test split.

    Args:
        context: TensorRT execution context, forwarded to ``infer``.

    Returns:
        A tuple ``(accuracy, seconds)``: the fraction of correctly classified
        test images and the wall-clock time spent in the inference loop.

    NOTE: ``infer`` is asked for a single 10-element score vector per call,
    so this assumes the module-level ``batch_size`` is 1.
    """
    # Read mnist dataset (same preprocessing as during training)
    data_tf = transforms.Compose([transforms.ToTensor(), transforms.Normalize([0.5], [0.5])])
    # Only the test split is needed; download=True fetches the data if ./data does not have it yet
    test_dataset = datasets.MNIST(root='./data', train=False, transform=data_tf, download=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    print("mnist data load successful!!!")

    correct = 0
    start_time = time.time()
    for img, label in test_loader:    # Start test
        # infer() works on numpy data, so convert the torch.Tensor first
        output = infer(context, img.numpy(), 10, batch_size)
        conf, pred = torch.max(torch.Tensor(output), -1)
        correct += (pred == label).sum().item()
    # Stop the clock before printing so console output is not part of the measurement
    elapsed = time.time() - start_time

    accuracy = correct / len(test_dataset)
    print("Test Acc is {:.6f}".format(accuracy))

    return accuracy, elapsed

def trt_infer():
    """Load the serialized engine at ``trt_model_name`` and benchmark it with ``do_test``.

    Prints the accuracy and the elapsed time reported by ``do_test``.

    Raises:
        RuntimeError: if the engine file cannot be deserialized.
    """
    # Read trt file
    def loadEngine2TensorRT(filepath):
        G_LOGGER = trt.Logger(trt.Logger.WARNING)
        # Deserialize the engine from the raw bytes of the file
        with open(filepath, "rb") as f, trt.Runtime(G_LOGGER) as runtime:
            engine = runtime.deserialize_cuda_engine(f.read())
            if engine is None:
                # Fail here with a clear message instead of an AttributeError further down
                raise RuntimeError("Failed to deserialize TensorRT engine: {}".format(filepath))
            return engine
    engine = loadEngine2TensorRT(trt_model_name)
    # Create context
    context = engine.create_execution_context()

    print("Start TensorRT Test...")
    acc, times = do_test(context)
    # NOTE(review): the 'INT8' label is fixed text; the engine's actual precision
    # depends on the options used when it was built.
    print('INT8 acc: {}, need time: {}'.format(acc, times))

if __name__ == '__main__':
    trt_infer()

Then run it and wait for the result.
My test results (the data is the 10,000 MNIST test images):
Results of the TRT model:

INT8 acc: 0.9855999946594238, need time: 4.600905656814575

Results of the .pth model:

Test Acc is 0.985600
run succes! time is 20.035008

From the results, we can see that the TRT model is 4-5 times faster, which is a decent speed-up.

