Python api构建tensorrt加速模型的步骤详解

当我们使用TensorRT加速深度学习模型时，通常需要先通过Python API来构建TensorRT的加速模型。下面是构建TensorRT加速模型的步骤详解：

步骤一：将原始深度学习模型转换为TensorRT所支持的模型格式

TensorRT支持的模型格式有两种：UFF和ONNX。我们需要将原始深度学习模型转换为这两种格式之一，才能将其输入到TensorRT中加速运算。转换模型的工具有很多，比如TensorFlow自带的tf.contrib.tensorrt，NVIDIA推出的TensorRT inference server等。以tf.contrib.tensorrt为例，我们可以通过以下代码将一个TensorFlow模型转换为UFF格式：

import tensorflow as tf
from tensorflow.python.compiler.tensorrt import trt_convert as trt

# Directory the SavedModel is read from, and the directory the converted
# model is written to.
SAVED_MODEL_DIR = 'path/to/saved_model'
TRT_SAVED_MODEL_DIR = 'path/to/saved_model_trt'

# Point the converter at the SavedModel, run the conversion, save the result.
converter = trt.TrtGraphConverter(input_saved_model_dir=SAVED_MODEL_DIR)
converter.convert()
converter.save(TRT_SAVED_MODEL_DIR)

这个代码将在指定目录下输出一个.pb文件和一个.uff文件。.pb文件是输入模型，.uff文件是转换后的模型。

步骤二：使用TensorRT Python API构建加速模型

使用TensorRT Python API创建加速模型的过程包括三个主要步骤：创建TensorRT引擎、构建TensorRT网络、设置TensorRT运行时环境。

1. 创建TensorRT引擎

创建TensorRT引擎通过trt.Builder()来实现。在引擎创建时，需要定义最大批处理大小、最大工作空间大小等参数。

import tensorrt as trt

# Build-time limits that are applied to the builder below.
max_batch_size = 32
max_workspace_size = 1 << 30  # 1GB

# Logger handed to the builder, created with WARNING severity.
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

builder = trt.Builder(TRT_LOGGER)
builder.max_workspace_size = max_workspace_size
builder.max_batch_size = max_batch_size

2. 构建TensorRT网络

构建TensorRT网络需要用到.uff文件。我们需要将.uff文件中保存的网络结构和参数提取出来，并添加到TensorRT引擎中。

import os

uff_model_path = 'path/to/model.uff'

# Per-sample input shape in (channels, height, width) order, without the batch
# dimension. These are example values only -- replace them with the real input
# shape of your model.
input_channels, input_height, input_width = 3, 224, 224

# Read the serialized UFF model into memory.
with open(uff_model_path, 'rb') as f:
    uff_buffer = f.read()

# Tell the parser which graph nodes are the network's input and output.
parser = trt.UffParser()
parser.register_input('input', (input_channels, input_height, input_width))
parser.register_output('output')

# Populate an empty network definition from the UFF buffer.
network = builder.create_network()
# parse_buffer() reports failure through its boolean return value, so check it
# instead of silently continuing with an incomplete network.
if not parser.parse_buffer(uff_buffer, network):
    raise RuntimeError('Failed to parse UFF model: ' + uff_model_path)

这个代码将.uff文件中的网络结构和参数提取出来，并通过parser将输入和输出的名称注册到TensorRT网络中。然后，它将TensorRT网络构建出来。

3. 设置TensorRT运行时环境

设置TensorRT运行时环境通过trt.ICudaEngine()实现。这个过程将TensorRT引擎序列化，保存到硬盘上；在每次需要使用TensorRT加速时，读取引擎并确定输入和输出维度。

engine_file_path = 'path/to/engine.trt'

if os.path.isfile(engine_file_path):
    # A serialized engine already exists on disk: load it instead of rebuilding.
    with open(engine_file_path, 'rb') as f:
        engine_data = f.read()
    # Deserialization goes through a trt.Runtime object. It is kept in a
    # module-level name so that it stays alive for as long as the engine does.
    runtime = trt.Runtime(TRT_LOGGER)
    engine = runtime.deserialize_cuda_engine(engine_data)
    if engine is None:
        raise RuntimeError('Failed to deserialize TensorRT engine: ' + engine_file_path)
else:
    # No cached engine: build one from the parsed network and cache it.
    engine = builder.build_cuda_engine(network)
    if engine is None:
        # A failed build yields None; fail here rather than on engine.serialize().
        raise RuntimeError('Failed to build TensorRT engine')
    engine_data = engine.serialize()
    with open(engine_file_path, 'wb') as f:
        f.write(engine_data)

这个代码首先检查硬盘上是否已经有序列化的引擎文件。如果已经存在，则将其读取出来。如果不存在，则使用TensorRT引擎构建器builder创建一个新的引擎；并序列化，将其保存到硬盘上。

到此为止，我们已经成功使用Python API构建了一个加速模型。下面是两个示例说明：

示例一：使用MNIST数据集训练一个手写数字识别模型

import os

import numpy as np
import pycuda.autoinit  # noqa: F401 -- imported for its side effect: sets up a CUDA context
import pycuda.driver as cuda
import tensorflow as tf
import tensorrt as trt
from tensorflow.python.compiler.tensorrt import trt_convert as tf_trt

# Load MNIST dataset
mnist = tf.keras.datasets.mnist
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()
train_images, test_images = train_images / 255.0, test_images / 255.0

# Train a TensorFlow model
model = tf.keras.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=(28, 28)))
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(10, activation='softmax'))

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.fit(train_images, train_labels,
          epochs=10,
          validation_data=(test_images, test_labels))

# Convert the TensorFlow model with TF-TRT.
# TrtGraphConverter lives in TensorFlow's trt_convert module, not in the
# `tensorrt` package, hence the separate `tf_trt` alias.
# NOTE(review): nothing above writes the trained model to
# 'path/to/saved_model'; export it there before running this step.
# NOTE(review): TF-TRT saves a SavedModel, not a .uff file. The UFF file parsed
# below has to be produced separately with NVIDIA's UFF converter -- confirm
# the intended workflow.
converter = tf_trt.TrtGraphConverter(input_saved_model_dir='path/to/saved_model')
converter.convert()
converter.save('path/to/saved_model_trt')

# Use TensorRT to accelerate the model
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(TRT_LOGGER)
max_batch_size = 32
max_workspace_size = 1 << 30  # 1GB
builder.max_batch_size = max_batch_size
builder.max_workspace_size = max_workspace_size

uff_model_path = 'path/to/saved_model_trt.uff'

with open(uff_model_path, 'rb') as f:
    uff_buffer = f.read()

parser = trt.UffParser()
parser.register_input('flatten_input', (1, 28, 28))
parser.register_output('dense_1/Softmax')
network = builder.create_network()
# parse_buffer() signals failure through its boolean return value.
if not parser.parse_buffer(uff_buffer, network):
    raise RuntimeError('Failed to parse UFF model: ' + uff_model_path)

engine_file_path = 'path/to/engine.trt'

if os.path.isfile(engine_file_path):
    # Reuse the cached engine. Deserialization goes through a trt.Runtime,
    # which is kept in a module-level name so it outlives the engine's use.
    with open(engine_file_path, 'rb') as f:
        engine_data = f.read()
    runtime = trt.Runtime(TRT_LOGGER)
    engine = runtime.deserialize_cuda_engine(engine_data)
    if engine is None:
        raise RuntimeError('Failed to deserialize TensorRT engine: ' + engine_file_path)
else:
    engine = builder.build_cuda_engine(network)
    if engine is None:
        # A failed build yields None; fail here rather than on engine.serialize().
        raise RuntimeError('Failed to build TensorRT engine')
    engine_data = engine.serialize()
    with open(engine_file_path, 'wb') as f:
        f.write(engine_data)

# Use the TensorRT engine to classify images
# (random data stands in for real images here).
batch_size = 32
inputs = np.random.normal(0, 1, [batch_size, 28, 28]).astype(np.float32)
outputs = np.zeros([batch_size, 10], dtype=np.float32)

with engine.create_execution_context() as context:
    # Size each device buffer from the host array it mirrors, so the whole
    # batch fits and the output buffer is not sized from the input.
    inputs_buffer = cuda.mem_alloc(inputs.nbytes)
    outputs_buffer = cuda.mem_alloc(outputs.nbytes)
    # TensorRT takes the device pointers as plain integers.
    bindings = [int(inputs_buffer), int(outputs_buffer)]
    stream = cuda.Stream()
    cuda.memcpy_htod_async(inputs_buffer, inputs, stream)
    context.execute_async(batch_size, bindings, stream.handle, None)
    cuda.memcpy_dtoh_async(outputs, outputs_buffer, stream)
    # Wait for the copy back to finish before reading `outputs`.
    stream.synchronize()

print(outputs)

这个代码使用MNIST数据集训练了一个简单的手写数字识别模型，并将其转换为UFF格式。然后它使用TensorRT加速器将其加速，并最终用加速器对图像进行分类。

示例二：在Jetson Nano上部署一个目标检测模型

import os
import time

import cv2
import numpy as np
import pycuda.autoinit  # noqa: F401 -- imported for its side effect: sets up a CUDA context
import pycuda.driver as cuda
import tensorrt as trt

# Create TensorRT engine
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(TRT_LOGGER)
max_batch_size = 1
max_workspace_size = 1 << 30  # 1GB
builder.max_batch_size = max_batch_size
builder.max_workspace_size = max_workspace_size

uff_model_path = 'path/to/model.uff'

# Per-sample input shape in (channels, height, width) order. It matches the
# CHW layout the frames are transposed to before being uploaded below.
INPUT_SHAPE = (3, 300, 300)

with open(uff_model_path, 'rb') as f:
    uff_buffer = f.read()

parser = trt.UffParser()
parser.register_input('input_1', INPUT_SHAPE)
parser.register_output('filtered_detections/map/TensorArrayStack/TensorArrayGatherV3')
network = builder.create_network()
# parse_buffer() signals failure through its boolean return value.
if not parser.parse_buffer(uff_buffer, network):
    raise RuntimeError('Failed to parse UFF model: ' + uff_model_path)

engine_file_path = 'path/to/engine.trt'

if os.path.isfile(engine_file_path):
    # Reuse the cached engine. Deserialization goes through a trt.Runtime,
    # which is kept in a module-level name so it outlives the engine's use.
    with open(engine_file_path, 'rb') as f:
        engine_data = f.read()
    runtime = trt.Runtime(TRT_LOGGER)
    engine = runtime.deserialize_cuda_engine(engine_data)
    if engine is None:
        raise RuntimeError('Failed to deserialize TensorRT engine: ' + engine_file_path)
else:
    engine = builder.build_cuda_engine(network)
    if engine is None:
        # A failed build yields None; fail here rather than on engine.serialize().
        raise RuntimeError('Failed to build TensorRT engine')
    engine_data = engine.serialize()
    with open(engine_file_path, 'wb') as f:
        f.write(engine_data)

# Open camera and detect objects
LABELS = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat',
          'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird',
          'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe',
          'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis',
          'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard',
          'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife',
          'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
          'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed',
          'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard',
          'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book',
          'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush']

conf_threshold = 0.5

# Host array the detections are copied back into on every frame.
outputs = np.zeros([max_batch_size, 100, 7], dtype=np.float32)

# Device buffers and the stream are created once and reused for every frame.
# Each buffer is sized for the data it actually holds.
input_nbytes = max_batch_size * int(np.prod(INPUT_SHAPE)) * np.dtype(np.float32).itemsize
inputs_buffer = cuda.mem_alloc(input_nbytes)
outputs_buffer = cuda.mem_alloc(outputs.nbytes)
# TensorRT takes the device pointers as plain integers.
bindings = [int(inputs_buffer), int(outputs_buffer)]
stream = cuda.Stream()

cap = cv2.VideoCapture(0)

try:
    with engine.create_execution_context() as context:
        while True:
            ret, image = cap.read()
            if not ret:
                # No frame was delivered (camera missing or stream ended).
                break

            # HWC frame -> 1 x C x H x W float32. The explicit contiguous copy
            # matters: the transposed view is not C-contiguous, and the raw
            # upload below copies the array's memory as-is.
            resized = cv2.resize(image, (INPUT_SHAPE[2], INPUT_SHAPE[1]))
            batched = np.expand_dims(resized.transpose((2, 0, 1)), axis=0)
            preprocessed_image = np.ascontiguousarray(batched, dtype=np.float32)

            cuda.memcpy_htod_async(inputs_buffer, preprocessed_image, stream)
            context.execute_async(max_batch_size, bindings, stream.handle, None)
            cuda.memcpy_dtoh_async(outputs, outputs_buffer, stream)
            # Wait for the copy back to finish before reading `outputs`.
            stream.synchronize()

            # Columns as indexed below: 1 = label id, 2 = confidence,
            # 3..6 = x1, y1, x2, y2 scaled by the frame width/height.
            for detection in outputs[0]:
                if detection[2] < conf_threshold:
                    continue

                x1 = int(detection[3] * image.shape[1])
                y1 = int(detection[4] * image.shape[0])
                x2 = int(detection[5] * image.shape[1])
                y2 = int(detection[6] * image.shape[0])

                label = int(detection[1])
                # Fall back to the numeric id instead of raising IndexError
                # when the model emits an id outside LABELS.
                label_text = LABELS[label] if 0 <= label < len(LABELS) else str(label)

                cv2.rectangle(image, (x1, y1), (x2, y2), (0, 0, 255), 2)
                cv2.putText(image, label_text, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2)

            cv2.imshow('frame', image)

            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
finally:
    # Release the camera and close the window even if an error interrupts the loop.
    cap.release()
    cv2.destroyAllWindows()

这个代码使用TensorRT在Jetson Nano上部署一个目标检测模型，并从摄像头实时获取视频流，检测视频帧中的物体并标记。

步骤一：将原始深度学习模型转换为TensorRT所支持的模型格式

步骤二：使用TensorRT Python API构建加速模型

1. 创建TensorRT引擎

2. 构建TensorRT网络

3. 设置TensorRT运行时环境

示例一：使用MNIST数据集训练一个手写数字识别模型

示例二：在Jetson Nano上部署一个目标检测模型

你可能也喜欢

Numpy中np.random.rand()和np.random.randn() 用法和区别详解

python加速器numba使用详解

基于np.arange与np.linspace细微区别(数据溢出问题)