Course notes
Course goal: explain the PyTorch -> ONNX -> TensorRT pipeline, and how to use TensorRT's official plugins when exporting a model.
Software versions:
TensorRT: 8.6.1.6
Python: 3.8
Input and output data used in the exercises below:
Link: https://pan.baidu.com/s/14NQaxeTIXRi9YAbdSWNNtQ?pwd=y0jm
Extraction code: y0jm
Export principle
[1] TensorRT加速Deformable Detr实践 (CSDN blog): https://blog.csdn.net/blanokvaffy/article/details/128046413
[2] Tensorrt-Deformable-Detr: https://github.com/talebolano/Tensorrt-Deformable-Detr
[3] multiscaleDeformableAttnPlugin (NVIDIA TensorRT): https://github.com/NVIDIA/TensorRT/tree/main/plugin/multiscaleDeformableAttnPlugin
[4] 模型部署入门教程(四):在 PyTorch 中支持更多 ONNX 算子 (Zhihu): https://zhuanlan.zhihu.com/p/513387413
Under reference [4] there is a particularly good comment exchange.
Questioner: So ONNX only defines a standard, and to actually implement an operator like deform_conv, the inference engine (e.g. onnxruntime, TensorRT) has to implement the operator according to that standard. Is that the right understanding?
Author: Yes.
For example: if our model contains a resize operation, the exported ONNX graph will contain a Resize node, but how the Resize node is implemented internally is something ONNX no longer describes.
In other words, what an ONNX node covers can be coarse- or fine-grained: at the fine end, a node can describe the implementation details of a single function; at the coarse end, one ONNX operator can stand for an entire function module.
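A quick way to see this for yourself (a small sketch I am adding, not from the references): a model whose forward calls F.interpolate traces to a single Resize node, and the graph records only the operator and its attributes, not its implementation.

import torch
import torch.nn.functional as F

class UpsampleModel(torch.nn.Module):
    def forward(self, x):
        # This one call becomes a single ONNX `Resize` node.
        return F.interpolate(x, scale_factor=2.0, mode='nearest')

torch.onnx.export(UpsampleModel(), torch.randn(1, 3, 8, 8),
                  'resize_demo.onnx', opset_version=11)

import onnx
print([node.op_type for node in onnx.load('resize_demo.onnx').graph.node])
# The list contains 'Resize'; how Resize is computed is left to the engine.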
Now let's look at the concrete example below (from references [1] and [2]) to see the above in practice.
class Etmpy_MultiScaleDeformableAttnFunction(torch.autograd.Function):
    @staticmethod
    def symbolic(g, value, value_spatial_shapes, value_level_start_index,
                 sampling_locations, attention_weights, im2col_step):
        return g.op('com.microsoft::MultiscaleDeformableAttnPlugin_TRT',
                    value, value_spatial_shapes, value_level_start_index,
                    sampling_locations, attention_weights)

    @staticmethod
    def forward(ctx, value, value_spatial_shapes, value_level_start_index,
                sampling_locations, attention_weights, im2col_step):
        '''
        No real meaning, just for inference (trace-time placeholder).
        '''
        bs, _, num_heads, embed_dims_num_heads = value.shape
        bs, num_queries, _, _, _, _ = sampling_locations.shape
        return value.new_zeros(bs, num_queries, num_heads, embed_dims_num_heads)

    @staticmethod
    def backward(ctx, grad_output):
        pass
The above is what needs to be done in the PyTorch code to use the deformable attention plugin:
1 Replace the original MultiScaleDeformableAttnFunction with Etmpy_MultiScaleDeformableAttnFunction.
2 Define a symbolic function that implements the mapping from the PyTorch MultiScaleDeformableAttnFunction to the ONNX operator.
In the symbolic function, the first argument of g.op is the name of the ONNX operator. In this operator name, com.microsoft::MultiscaleDeformableAttnPlugin_TRT, the part before :: is the namespace (domain), which can be chosen freely and has no effect; the part after :: is the plugin name. Since we use NVIDIA's official plugin here, it must be identical to the name of NVIDIA's official plugin.
To stay consistent with the operator name NVIDIA provides: the plugin's cpp file contains the following code, in which "MultiscaleDeformableAttnPlugin_TRT" is the plugin name.
// https://github.com/NVIDIA/TensorRT/blob/main/plugin/multiscaleDeformableAttnPlugin/multiscaleDeformableAttnPlugin.cpp
namespace
{
static char const* DMHA_VERSION{"1"};
static char const* DMHA_NAME{"MultiscaleDeformableAttnPlugin_TRT"};
} // namespace
When building the engine with trtexec, running it with trtexec --verbose also prints in the log the names of the official custom plugins it loads, which is a quick way to confirm the name.
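You can also confirm the name from Python (a small sketch I am adding, assuming the TensorRT Python package is installed) by enumerating the plugin registry:

import tensorrt as trt

logger = trt.Logger(trt.Logger.WARNING)
# Register the official plugins shipped in libnvinfer_plugin.
trt.init_libnvinfer_plugins(logger, "")
registry = trt.get_plugin_registry()
for creator in registry.plugin_creator_list:
    print(creator.name, creator.plugin_version)
# The list should contain 'MultiscaleDeformableAttnPlugin_TRT' / '1'.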
With these two steps done, this module can be exported to ONNX via tracing.
Testing the GPU multi-scale deformable attention plugin
1 Write a piece of PyTorch code that defines a model which, when exported to ONNX, maps onto the TensorRT plugin.
2 Export this model to ONNX.
3 Convert the ONNX file to a TensorRT engine.
4 Run inference with the TensorRT engine file.
PyTorch-to-ONNX hello world demo
import torch
import torch.onnx

# Define a simple PyTorch model
class SimpleModel(torch.nn.Module):
    def __init__(self):
        super(SimpleModel, self).__init__()
        self.fc = torch.nn.Linear(10, 1)

    def forward(self, x):
        return self.fc(x)

# Create a model instance
model = SimpleModel()
# Create an example input
example_input = torch.randn(1, 10)
# Export the model to ONNX format
output_path = "simple_model.onnx"
torch.onnx.export(model, example_input, output_path)
print("Model exported to:", output_path)
"""
在这个示例中,我们首先定义了一个简单的 PyTorch 模型 SimpleModel,该模型包含一个线性层。然后,我们创建了一个模型实例,并准备了一个示例输入 example_input。最后,我们使用 torch.onnx.export 函数将模型导出为 ONNX 格式,并指定输出路径。
"""
PyTorch code
In the PyTorch model source, at the call to MSDeformAttnFunction:
try:
    output = MSDeformAttnFunction.apply(
        value, input_spatial_shapes, input_level_start_index,
        sampling_locations, attention_weights, self.im2col_step)
except:
    # CPU fallback
    output = ms_deform_attn_core_pytorch(
        value, input_spatial_shapes, sampling_locations, attention_weights)
Save the input variables:
torch.save(value, 'res/value.pt')
torch.save(input_spatial_shapes, 'res/input_spatial_shapes.pt')
torch.save(input_level_start_index, 'res/input_level_start_index.pt')
torch.save(sampling_locations, 'res/sampling_locations.pt')
torch.save(attention_weights, 'res/attention_weights.pt')
torch.save(im2col_step, 'res/im2col_step.pt')
torch.save(output, '/home/demo/assets/layer_05/output.pt')
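Since the plugin is sensitive to both shape and dtype, it is worth printing them when reloading the saved tensors (a small sketch I am adding; the exact sizes depend on the model and input):

import torch

for name in ['value', 'input_spatial_shapes', 'input_level_start_index',
             'sampling_locations', 'attention_weights']:
    t = torch.load(f'res/{name}.pt')
    print(name, tuple(t.shape), t.dtype)
# Expected ranks: value is 4-D [bs, num_value, num_heads, head_dim];
# sampling_locations is 6-D [bs, num_queries, num_heads, num_levels, num_points, 2].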
pytorch->onnx
Here is a demo program for testing this plugin:
import torch
from torch import nn
import numpy as np
import onnx

class Etmpy_MultiScaleDeformableAttnFunction(torch.autograd.Function):
    @staticmethod
    def symbolic(g, value, spatial_shapes, level_start_index, sampling_locations,
                 attention_weights, im2col_step):
        # return g.op('com.microsoft::MultiscaleDeformableAttnPlugin_TRT', value, value_spatial_shapes,
        #             value_level_start_index, sampling_locations, attention_weights)
        # multiscaleDeformableAttnPlugin takes 5 inputs, in this order:
        # value, spatial_shapes, level_start_index, sampling_locations, attention_weights.
        return g.op('nvinfer1.plugin::MultiscaleDeformableAttnPlugin_TRT',
                    value, spatial_shapes, level_start_index, sampling_locations,
                    attention_weights)

    @staticmethod
    def forward(ctx, value, value_spatial_shapes, value_level_start_index,
                sampling_locations, attention_weights, im2col_step):
        '''
        No real meaning, just for inference (trace-time placeholder).
        '''
        bs, _, num_heads, embed_dims_num_heads = value.shape
        bs, num_queries, _, _, _, _ = sampling_locations.shape
        return value.new_ones((bs, num_queries, num_heads, embed_dims_num_heads))

    @staticmethod
    def backward(ctx, grad_output):
        pass
class MyMSDeformAttnModel(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, value, input_spatial_shapes, input_level_start_index,
                sampling_locations, attention_weights, im2col_step):
        output = Etmpy_MultiScaleDeformableAttnFunction.apply(
            value, input_spatial_shapes, input_level_start_index,
            sampling_locations, attention_weights, im2col_step)
        return output

def export_py_model_to_onnx():
    my_ms_deform_attn_model = MyMSDeformAttnModel()
    value = torch.load('/home/demo/assets/layer_07/value.pt')
    input_spatial_shapes = torch.load('/home/demo/assets/layer_07/input_spatial_shapes.pt')
    input_level_start_index = torch.load('/home/demo/assets/layer_07/input_level_start_index.pt')
    sampling_locations = torch.load('/home/demo/assets/layer_07/sampling_locations.pt')
    attention_weights = torch.load('/home/demo/assets/layer_07/attention_weights.pt')
    im2col_step = torch.load('/home/demo/assets/layer_07/im2col_step.pt')
    # multiscaleDeformableAttnPlugin takes 5 inputs, in this order:
    # value, spatial_shapes, level_start_index, sampling_locations, attention_weights.
    with torch.no_grad():
        torch.onnx.export(my_ms_deform_attn_model,
                          (value, input_spatial_shapes, input_level_start_index,
                           sampling_locations, attention_weights, im2col_step),
                          "res/my_ms_deform_attn_model_v03.onnx",
                          opset_version=11,
                          input_names=['value', 'spatial_shapes', 'level_start_index',
                                       'sampling_locations', 'attention_weights', 'im2col_step'],
                          output_names=['output'])
def tensor_to_numpy():
    value = torch.load('/home/demo/assets/layer_07/value.pt')
    value_np = value.cpu().numpy()
    np.save('/home/demo/assets/layer_07/value_np.npy', value_np)
    # value_np_load = np.load('/home/demo/assets/layer_07/value_np.npy')
    # print(value_np_load)
    input_spatial_shapes = torch.load('/home/demo/assets/layer_07/input_spatial_shapes.pt')
    input_spatial_shapes_np = input_spatial_shapes.cpu().numpy()
    np.save('/home/demo/assets/layer_07/input_spatial_shapes.npy', input_spatial_shapes_np)
    input_level_start_index = torch.load('/home/demo/assets/layer_07/input_level_start_index.pt')
    input_level_start_index_np = input_level_start_index.cpu().numpy()
    np.save('/home/demo/assets/layer_07/input_level_start_index.npy', input_level_start_index_np)
    sampling_locations = torch.load('/home/demo/assets/layer_07/sampling_locations.pt')
    sampling_locations_np = sampling_locations.cpu().numpy()
    np.save('/home/demo/assets/layer_07/sampling_locations.npy', sampling_locations_np)
    attention_weights = torch.load('/home/demo/assets/layer_07/attention_weights.pt')
    attention_weights_np = attention_weights.cpu().numpy()
    np.save('/home/demo/assets/layer_07/attention_weights.npy', attention_weights_np)
    # im2col_step = torch.load('/home/demo/assets/layer_07/im2col_step.pt')
    # im2col_step_np = im2col_step.cpu().numpy()
    # np.save('/home/demo/assets/layer_07/im2col_step.npy', im2col_step_np)
"""
使用空的模块名与pytorch到onnx的映射(符号函数),符号函数return的是我们自定义的onnx算子名称,
测试这个导出的onnx文件,转成tensorrt需要的engine格式后,能否正常推理;
"""
if __name__ == "__main__":
    export_py_model_to_onnx()
    # tensor_to_numpy()
Given: Etmpy_MultiScaleDeformableAttnFunction holds the symbolic mapping from the MultiScaleDeformableAttnFunction module to the ONNX operator (NVIDIA's official plugin).
1 I defined MyMSDeformAttnModel, whose entire content is a call to Etmpy_MultiScaleDeformableAttnFunction.
2 Using the vscode debugger, break at the input/output locations of MultiScaleDeformableAttnFunction; there, torch.save can record the input and output variables and save them to disk.
3 With 1 and 2 in place, call torch.onnx.export to export this test model to an ONNX file, which gives us the ONNX model.
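To confirm the custom node really ended up in the graph (a small sketch I am adding, reusing the output path from the export code above):

import onnx

model = onnx.load('res/my_ms_deform_attn_model_v03.onnx')
for node in model.graph.node:
    print(node.domain, node.op_type)
# Expect a node with domain 'nvinfer1.plugin' and
# op_type 'MultiscaleDeformableAttnPlugin_TRT'.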
ONNX to TensorRT
trtexec --onnx=my_ms_deform_attn_model_v03.onnx --explicitBatch --workspace=4096 --saveEngine=my_ms_deform_attn_model_v03.engine
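The same build can be done with the TensorRT Python API (a small sketch I am adding, based on the TensorRT 8.6 API; the key point is that trt.init_libnvinfer_plugins must run before parsing so the parser can resolve the plugin node):

import tensorrt as trt

logger = trt.Logger(trt.Logger.WARNING)
trt.init_libnvinfer_plugins(logger, "")  # register the official plugins first

builder = trt.Builder(logger)
network = builder.create_network(
    1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, logger)
with open('my_ms_deform_attn_model_v03.onnx', 'rb') as f:
    if not parser.parse(f.read()):
        for i in range(parser.num_errors):
            print(parser.get_error(i))
        raise SystemExit('ONNX parse failed')

config = builder.create_builder_config()
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 4 << 30)  # 4 GiB
engine_bytes = builder.build_serialized_network(network, config)
with open('my_ms_deform_attn_model_v03.engine', 'wb') as f:
    f.write(engine_bytes)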
TensorRT inference test
4 With the ONNX model plus the saved input/output variables, we can write a TensorRT inference script and verify whether the plugin's inference result matches the saved output variable. The inference script is shown below:
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
import tensorrt as trt
from my_debug_tools import *

def load_engine(engine_path):
    with open(engine_path, "rb") as f, trt.Runtime(trt.Logger(trt.Logger.WARNING)) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
    return engine

def alloc_and_binding_input(np_data, bindings, inputs, inputs_d_mem):
    """
    Allocate GPU memory for one input and append it to the binding list.
    """
    # Allocate device-side memory
    input_memory = cuda.mem_alloc(np_data.nbytes)
    # Append to the binding list
    bindings.append(int(input_memory))   # device-side input memory
    inputs.append(np_data)               # host-side (CPU) input array
    inputs_d_mem.append(input_memory)    # device-side input allocation

def infer_entry():
    trt.init_libnvinfer_plugins(None, "")
    model_path = 'model/my_ms_deform_attn_model_v04.engine'
    engine = load_engine(model_path)
    context = engine.create_execution_context()
    print(context)
    # Prepare input data
    value = np.load('input/value_np.npy')
    input_spatial_shapes = np.load('input/input_spatial_shapes.npy')
    input_spatial_shapes = input_spatial_shapes.astype(np.int32)
    input_level_start_index = np.load('input/input_level_start_index.npy')
    input_level_start_index = input_level_start_index.astype(np.int32)
    sampling_locations = np.load('input/sampling_locations.npy')
    attention_weights = np.load('input/attention_weights.npy')
    im2col_step = 128  # not used by the engine: the symbolic function does not pass it to the plugin

    # Allocate GPU memory for the inputs and output, then append to the binding list
    bindings = []
    inputs = []           # host-side arrays, in binding order
    inputs_d_mem = []     # device-side input allocations
    output_buffer = None  # host-side output buffer
    output_memory = None  # device-side output allocation
    for binding in engine:
        # Index of the current binding
        binding_idx = engine.get_binding_index(binding)
        # Number of elements required by the current binding
        size = trt.volume(context.get_binding_shape(binding_idx))
        # Data type of the current binding, converted to the NumPy equivalent
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        if engine.binding_is_input(binding):
            # This binding is a model input:
            # value, spatial_shapes, level_start_index, sampling_locations, attention_weights
            if binding == 'value':
                alloc_and_binding_input(value, bindings, inputs, inputs_d_mem)
            elif binding == 'spatial_shapes':
                alloc_and_binding_input(input_spatial_shapes, bindings, inputs, inputs_d_mem)
            elif binding == 'level_start_index':
                alloc_and_binding_input(input_level_start_index, bindings, inputs, inputs_d_mem)
            elif binding == 'sampling_locations':
                alloc_and_binding_input(sampling_locations, bindings, inputs, inputs_d_mem)
            elif binding == 'attention_weights':
                alloc_and_binding_input(attention_weights, bindings, inputs, inputs_d_mem)
        else:
            # This binding is the model output
            output_buffer = cuda.pagelocked_empty(size, dtype)    # page-locked host buffer
            output_memory = cuda.mem_alloc(output_buffer.nbytes)  # device memory for the result
            bindings.append(int(output_memory))                   # device-side output memory

    stream = cuda.Stream()
    for i in range(len(inputs)):
        # Make the input contiguous in memory
        input_buffer = np.ascontiguousarray(inputs[i])
        # Copy the input from host (CPU) to device (GPU)
        cuda.memcpy_htod_async(inputs_d_mem[i], input_buffer, stream)
        stream.synchronize()
    # Run inference
    res = context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    print(res)
    # Transfer the prediction output back from the GPU
    cuda.memcpy_dtoh_async(output_buffer, output_memory, stream)
    # Synchronize the stream
    stream.synchronize()
    output_buffer = output_buffer.view(np.float32).reshape(1, 21000, 256)
    # Free memory
    for input_d_mem in inputs_d_mem:
        input_d_mem.free()
    output_memory.free()
    # context.destroy()
    engine.destroy()
    print('hi ~~~~~~~~~')

if __name__ == "__main__":
    infer_entry()
A note on this script: at first its inference output was all zeros. After consulting a lot of material, I guessed it might be related to data types and switched the integer inputs from int64 to int32 (TensorRT 8.x has no int64 tensor type, so the plugin's integer inputs need to be int32), i.e. the following code:
input_level_start_index = input_level_start_index.astype(np.int32)
input_spatial_shapes = input_spatial_shapes.astype(np.int32)
With that change, the inference output has real values, and the results match the PyTorch output closely. This verifies that the plugin implements the same functionality as the interface function used by the official Mask DINO repository, and that the way we define and use the plugin here is correct.
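A quick way to run that comparison (a small sketch I am adding, assuming the output saved earlier under layer_05, run at the end of infer_entry() after output_buffer has been reshaped):

import torch

torch_output = torch.load('/home/demo/assets/layer_05/output.pt').cpu().numpy()
print('max abs diff:', np.abs(output_buffer - torch_output).max())
print('allclose:', np.allclose(output_buffer, torch_output, atol=1e-3))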
Adjusting the PyTorch code for actual use
The code I actually use looks like this:
# At the call site, comment out the original MSDeformAttnFunction.apply code
# and replace it with Empty_MultiScaleDeformableAttnFunction
# try:
#     output = MSDeformAttnFunction.apply(
#         value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step)
# except:
#     # CPU
#     output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights)
# my: cuda version
output = Empty_MultiScaleDeformableAttnFunction.apply(
    value, input_spatial_shapes, input_level_start_index,
    sampling_locations, attention_weights, self.im2col_step)
# Produce the attention output in the multiscaleDeformableAttnPlugin shape [N, Lq, M, D],
# then flatten back to [N, Lq, M*D].
N, Lq, M, D = output.shape
output = output.view(N, Lq, M * D)
The contents of Empty_MultiScaleDeformableAttnFunction:
import torch
# from torch import nn
# import numpy as np
# import onnx
from .ms_deform_attn_func import MSDeformAttnFunction

class Empty_MultiScaleDeformableAttnFunction(torch.autograd.Function):
    @staticmethod
    def symbolic(g, value, spatial_shapes, level_start_index, sampling_locations,
                 attention_weights, im2col_step):
        # multiscaleDeformableAttnPlugin takes 5 inputs, in this order:
        # value, spatial_shapes, level_start_index, sampling_locations, attention_weights.
        return g.op('nvinfer1.plugin::MultiscaleDeformableAttnPlugin_TRT',
                    value, spatial_shapes, level_start_index, sampling_locations,
                    attention_weights)

    @staticmethod
    def forward(ctx, value, value_spatial_shapes, value_level_start_index,
                sampling_locations, attention_weights, im2col_step):
        '''
        No real meaning, just for inference.
        '''
        # bs, _, num_heads, embed_dims_num_heads = value.shape
        bs = value.shape[0]
        num_heads = value.shape[2]
        embed_dims_num_heads = value.shape[3]
        # bs, num_queries, _, _, _, _ = sampling_locations.shape
        num_queries = sampling_locations.shape[1]
        # return value.new_zeros(size=(bs, num_queries, num_heads, embed_dims_num_heads), device=value.device, dtype=value.dtype)
        output = MSDeformAttnFunction.apply(value, value_spatial_shapes, value_level_start_index,
                                            sampling_locations, attention_weights, im2col_step)
        output = output.view(bs, num_queries, num_heads, embed_dims_num_heads)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        pass
Since the ONNX graph is generated by tracing, we want the traced run to produce meaningful outputs. That is why the forward function of Empty_MultiScaleDeformableAttnFunction calls MSDeformAttnFunction.apply, reshapes that function's output, and returns it as the output of Empty_MultiScaleDeformableAttnFunction.