How is a PyTorch model converted to TensorRT?


PyTorch is an open-source machine learning framework, and TensorRT is NVIDIA's high-performance inference library for deep learning. Converting a PyTorch model to TensorRT can significantly improve its inference efficiency. The steps are as follows:

Step 1: Install TensorRT

Download the latest TensorRT release from the NVIDIA website, choosing the package that matches your CUDA (Compute Unified Device Architecture) version. After installation, perform the following steps:

  • (1) Add TensorRT to the environment variables
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/TensorRT-VERSION/lib:/usr/local/cuda-VERSION/lib64

where VERSION is the TensorRT or CUDA version number, respectively.

  • (2) Test TensorRT with trtexec
/usr/local/TensorRT-VERSION/bin/trtexec --onnx=<model_path> --saveEngine=<engine_path>

where <model_path> is the path to the ONNX model to convert and <engine_path> is the path where the generated TensorRT engine will be saved. (trtexec consumes ONNX files, so in practice this check is run after the export in Step 2.)
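To verify that the TensorRT Python bindings are importable as well, a one-line version check is enough:

python3 -c "import tensorrt; print(tensorrt.__version__)"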

Step 2: Export the PyTorch model to ONNX format

TensorRT supports the ONNX format, so the PyTorch model first needs to be converted to ONNX. Here is an example:

import torch
import torchvision
import onnx
import onnxoptimizer  # the optimizer was split out of onnx into the separate onnxoptimizer package

# Load a pretrained ResNet-50 and switch to inference mode before export
model = torchvision.models.resnet50(pretrained=True)
model.eval()
dummy_input = torch.randn(1, 3, 224, 224)
input_names = ["input1"]
output_names = ["output1"]

torch.onnx.export(model, dummy_input, "resnet50.onnx", verbose=True,
                  input_names=input_names, output_names=output_names)

# Apply a few graph-simplification passes to the exported model
onnx_model = onnx.load("resnet50.onnx")
passes = ["eliminate_unused_initializer", "eliminate_nop_dropout",
          "eliminate_nop_transpose", "eliminate_identity"]
opt_model = onnxoptimizer.optimize(onnx_model, passes)
onnx.save(opt_model, "resnet50_opt.onnx")
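Before handing the model to TensorRT, it is worth validating the export. Below is a minimal sketch, assuming the optional onnxruntime package is installed, that checks the graph structurally and compares ONNX Runtime's output against the original PyTorch model:

import numpy as np
import onnx
import onnxruntime as ort
import torch
import torchvision

# Structural validation of the optimized graph
onnx.checker.check_model(onnx.load("resnet50_opt.onnx"))

# Numerical comparison against the original PyTorch model
model = torchvision.models.resnet50(pretrained=True).eval()
sess = ort.InferenceSession("resnet50_opt.onnx")
x = np.random.randn(1, 3, 224, 224).astype(np.float32)
onnx_out = sess.run(None, {"input1": x})[0]
with torch.no_grad():
    torch_out = model(torch.from_numpy(x)).numpy()
print("max abs diff:", np.abs(onnx_out - torch_out).max())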

Step 3: Load and optimize the ONNX model

Load the ONNX model into TensorRT and configure the builder options to generate an efficient TensorRT engine. Here is an example using the TensorRT 8.x Python API:

import sys
import tensorrt as trt

# Paths and settings
model_path = "resnet50_opt.onnx"
engine_path = "resnet50.trt"

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
# The ONNX parser requires an explicit-batch network definition
network_creation_flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
with trt.Builder(TRT_LOGGER) as builder, builder.create_network(network_creation_flag) as network, \
      trt.OnnxParser(network, TRT_LOGGER) as parser, builder.create_builder_config() as config:
    # Cap the scratch memory the builder may use during optimization (1 GiB)
    config.max_workspace_size = 1 << 30
    with open(model_path, 'rb') as model:
        if not parser.parse(model.read()):
            print('ERROR: Failed to parse the ONNX file {}.'.format(model_path))
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            sys.exit(1)
    # The model was exported with a fixed input shape (1, 3, 224, 224), so no
    # optimization profile is required; dynamic shapes would need one
    engine = builder.build_engine(network, config)
    with open(engine_path, "wb") as f:
        f.write(engine.serialize())
        print("TensorRT engine has been saved to file: {}".format(engine_path))

Example

The following example walks through converting a custom PyTorch model to TensorRT end to end.

Suppose we have the following PyTorch model:

import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, 3)
        self.conv2 = nn.Conv2d(64, 128, 3)
        self.dropout1 = nn.Dropout2d(0.25)
        # For a 224x224 input the flattened feature map is 128*54*54 = 373248
        self.fc1 = nn.Linear(128 * 54 * 54, 256)
        self.fc2 = nn.Linear(256, 10)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(x, 2)
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = x.view(-1, 128 * 54 * 54)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)
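A quick forward pass confirms that the layer dimensions line up for a 224x224 input:

import torch

net = Net()
out = net(torch.randn(1, 3, 224, 224))
print(out.shape)  # torch.Size([1, 10])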

First, the model needs to be converted to ONNX format:

import torch
import onnx
import onnxoptimizer  # the optimizer lives in the separate onnxoptimizer package

# Instantiate the model and switch to inference mode before export
model = Net()
model.eval()
dummy_input = torch.randn(1, 3, 224, 224)
input_names = ["input1"]
output_names = ["output1"]

torch.onnx.export(model, dummy_input, "model.onnx", verbose=True,
                  input_names=input_names, output_names=output_names)

# Apply the same graph-simplification passes as before
onnx_model = onnx.load("model.onnx")
passes = ["eliminate_unused_initializer", "eliminate_nop_dropout",
          "eliminate_nop_transpose", "eliminate_identity"]
opt_model = onnxoptimizer.optimize(onnx_model, passes)
onnx.save(opt_model, "model_opt.onnx")
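To see what the optimizer passes did, you can list the node types left in the optimized graph; since the model was exported in eval mode, no Dropout node should remain:

import onnx

m = onnx.load("model_opt.onnx")
print([node.op_type for node in m.graph.node])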

Then, load the converted ONNX model into TensorRT and build the engine, exactly as in Step 3:

import sys
import tensorrt as trt

# Paths and settings
model_path = "model_opt.onnx"
engine_path = "model.trt"

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
# The ONNX parser requires an explicit-batch network definition
network_creation_flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
with trt.Builder(TRT_LOGGER) as builder, builder.create_network(network_creation_flag) as network, \
      trt.OnnxParser(network, TRT_LOGGER) as parser, builder.create_builder_config() as config:
    # Cap the scratch memory the builder may use during optimization (1 GiB)
    config.max_workspace_size = 1 << 30
    with open(model_path, 'rb') as model:
        if not parser.parse(model.read()):
            print('ERROR: Failed to parse the ONNX file {}.'.format(model_path))
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            sys.exit(1)
    # The model was exported with a fixed input shape (1, 3, 224, 224), so no
    # optimization profile is required; dynamic shapes would need one
    engine = builder.build_engine(network, config)
    with open(engine_path, "wb") as f:
        f.write(engine.serialize())
        print("TensorRT engine has been saved to file: {}".format(engine_path))

Once the model has been converted to a TensorRT engine, the following code runs inference with it (this uses the pycuda package for device memory management):

import numpy as np
import pycuda.autoinit  # initializes a CUDA context
import pycuda.driver as cuda
import tensorrt as trt

# Load the serialized TensorRT engine
engine_file = "model.trt"
with open(engine_file, "rb") as f, trt.Runtime(trt.Logger(trt.Logger.WARNING)) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())

    # Allocate device buffers for the input and output tensors
    input_shape = [1, 3, 224, 224]
    output_shape = [1, 10]
    itemsize = np.dtype(np.float32).itemsize
    device_input = cuda.mem_alloc(trt.volume(input_shape) * itemsize)
    device_output = cuda.mem_alloc(trt.volume(output_shape) * itemsize)
    stream = cuda.Stream()

    # Run inference
    with engine.create_execution_context() as context:
        input_data = np.random.random(size=input_shape).astype(np.float32)
        cuda.memcpy_htod_async(device_input, input_data, stream)
        context.execute_async_v2(bindings=[int(device_input), int(device_output)],
                                 stream_handle=stream.handle)
        output_data = np.empty(output_shape, dtype=np.float32)
        cuda.memcpy_dtoh_async(output_data, device_output, stream)
        stream.synchronize()
        print("Output shape: {}".format(output_data.shape))
        print("Output: {}".format(output_data))

Here PyCUDA is used to manage the input and output buffers and to launch the TensorRT inference. Note that TensorRT is stricter about input shapes than PyTorch: the shapes fixed when the engine is built determine both whether the build succeeds and whether inference produces correct results, so the shapes used at inference time must match those set when building the engine.
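If variable input shapes are needed, the ONNX model has to be exported with dynamic axes and the engine built with an optimization profile. The following sketch, reusing the builder, config and context objects from the blocks above, shows the idea (TensorRT 8.x API; the input name "input1" matches the export above):

# At export time, mark the batch dimension as dynamic:
# torch.onnx.export(..., dynamic_axes={"input1": {0: "batch"}, "output1": {0: "batch"}})

# At build time, declare min/opt/max shapes for the dynamic input
profile = builder.create_optimization_profile()
profile.set_shape("input1", (1, 3, 224, 224), (4, 3, 224, 224), (8, 3, 224, 224))
config.add_optimization_profile(profile)

# At inference time, fix the actual input shape before executing
context.set_binding_shape(0, (4, 3, 224, 224))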

This completes the walkthrough of converting PyTorch models to TensorRT, illustrated with the two examples above (ResNet-50 and a custom network).