本文介绍基于EAS产品环境,使用Triton框架实现YOLOv12模型的服务化部署;本文暂时聚焦于单模型部署。
基于PAI平台的模型在线服务EAS自带的Triton镜像[eas-registry-vpc.cn-wulanchabu.cr.aliyuncs.com/pai-eas/tritonserver:25.03-py3]部署Triton;
为了避免模型pytorch与triton镜像支持的pytorch版本不一致问题,将模型文件转onnx;
from yolov12.ultralytics import YOLO
from pathlib import Path
import subprocess
import shutil
# Define the Triton model-repository layout:
#   <repo>/<model_name>/1/model.onnx  +  <repo>/<model_name>/config.pbtxt
model_name = "yolo"
triton_repo_path = Path("/mnt/data/xxx/xxx/models/triton/")
triton_model_path = triton_repo_path / model_name

# Create the version-1 directory (Triton requires at least one numeric version dir).
(triton_model_path / "1").mkdir(parents=True, exist_ok=True)

# Load the model from the local checkpoint.
model = YOLO('yolov12/yolov12n.pt')

# Retrieve metadata during export. Metadata needs to be added to config.pbtxt.
metadata = []

def export_cb(exporter):
    # Callback fired at the end of export; captures the exporter's metadata dict.
    metadata.append(exporter.metadata)

model.add_callback("on_export_end", export_cb)

# Export to ONNX (dynamic axes) so the Triton image's PyTorch version does not matter.
onnx_file = model.export(format="onnx", dynamic=True)

# Move the exported ONNX model into the Triton version directory.
shutil.move(onnx_file, triton_model_path / "1" / "model.onnx")

# Guard against the callback never firing (metadata[0] would raise IndexError).
exported_metadata = metadata[0] if metadata else {}

# Build config.pbtxt. NOTE(review): this template only contains the metadata
# parameter and the optional TensorRT accelerator section; name/platform/
# max_batch_size/input/output still have to be added by hand (see the full
# config.pbtxt example later in this post).
data = """
# Add metadata
parameters {
key: "metadata"
value {
string_value: "%s"
}
}
# (Optional) Enable TensorRT for GPU inference
# First run will be slow due to TensorRT engine conversion
optimization {
execution_accelerators {
gpu_execution_accelerator {
name: "tensorrt"
parameters {
key: "precision_mode"
value: "FP16"
}
parameters {
key: "max_workspace_size_bytes"
value: "3221225472"
}
parameters {
key: "trt_engine_cache_enable"
value: "1"
}
parameters {
key: "trt_engine_cache_path"
value: "/models/yolo/1"
}
}
}
}
""" % exported_metadata  # noqa

# open(..., "w") creates the file, so a separate touch() is unnecessary.
with open(triton_model_path / "config.pbtxt", "w") as f:
    f.write(data)

# Verify the exported model's output names afterwards (must match config.pbtxt):
#   python -c "import onnx; m=onnx.load('model.onnx'); print([n.name for n in m.graph.output])"
结果:['output0']
python -c "import onnx; m=onnx.load('model.onnx'); print([n.name for n in m.graph.input])"
结果:['images']
注意:该输入输出名称需要与Triton模型目录中的config.pbtxt配置相对应;
在OSS存储空间中创建模型存储目录,并根据模型存储目录格式要求配置模型文件与配置文件。
每个模型目录下都至少包含一个模型版本目录和一个模型配置文件:
config.pbtxt。假设模型存储目录在oss://examplebucket/models/triton/路径下,模型存储目录的格式如下:
triton
└──yolo
├── 1
│ └── model.onnx
├── 2
│ └── model.onnx
├── 3
│ └── model.onnx
└── config.pbtxt

其中:config.pbtxt 为配置文件,文件内容示例如下:
# Triton model configuration for the "yolo" ONNX model.
name: "yolo"
platform: "onnxruntime_onnx"
# With max_batch_size > 0, Triton automatically prepends the batch dimension,
# so the input/output dims below must NOT include it. A client request of
# shape [B, 3, 640, 640] therefore matches the input declared here.
max_batch_size: 512
parameters {
key: "metadata"
value {
string_value: "{'description': 'Ultralytics YOLOv12n model ', 'author': 'Ultralytics', 'date': '2025-06-09T11:14:01.872050', 'version': '8.3.63', 'license': 'AGPL-3.0 License (https://ultralytics.com/license)', 'docs': 'https://docs.ultralytics.com', 'stride': 32, 'task': 'detect', 'batch': 1, 'imgsz': [640, 640], 'names': {0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane', 5: 'bus', 6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light', 10: 'fire hydrant', 11: 'stop sign', 12: 'parking meter', 13: 'bench', 14: 'bird', 15: 'cat', 16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow', 20: 'elephant', 21: 'bear', 22: 'zebra', 23: 'giraffe', 24: 'backpack', 25: 'umbrella', 26: 'handbag', 27: 'tie', 28: 'suitcase', 29: 'frisbee', 30: 'skis', 31: 'snowboard', 32: 'sports ball', 33: 'kite', 34: 'baseball bat', 35: 'baseball glove', 36: 'skateboard', 37: 'surfboard', 38: 'tennis racket', 39: 'bottle', 40: 'wine glass', 41: 'cup', 42: 'fork', 43: 'knife', 44: 'spoon', 45: 'bowl', 46: 'banana', 47: 'apple', 48: 'sandwich', 49: 'orange', 50: 'broccoli', 51: 'carrot', 52: 'hot dog', 53: 'pizza', 54: 'donut', 55: 'cake', 56: 'chair', 57: 'couch', 58: 'potted plant', 59: 'bed', 60: 'dining table', 61: 'toilet', 62: 'tv', 63: 'laptop', 64: 'mouse', 65: 'remote', 66: 'keyboard', 67: 'cell phone', 68: 'microwave', 69: 'oven', 70: 'toaster', 71: 'sink', 72: 'refrigerator', 73: 'book', 74: 'clock', 75: 'vase', 76: 'scissors', 77: 'teddy bear', 78: 'hair drier', 79: 'toothbrush'}, 'args': {'batch': 1, 'half': False, 'dynamic': True, 'simplify': True, 'opset': None}}"
}
}
input [
{
name: "images"
data_type: TYPE_FP32
# Per-item shape (batch dim excluded because max_batch_size > 0).
dims: [3, 640, 640]
}
]
output [
{
name: "output0"
data_type: TYPE_FP32
# YOLOv12n detect head over 80 COCO classes: 4 box coords + 80 scores = 84
# rows (not 85) per 8400 candidate boxes; batch dim excluded as above.
dims: [84, 8400]
}
]
# Run inference on GPU
instance_group [
{
kind: KIND_GPU
}
]
# (Optional) Enable TensorRT for GPU inference
# First run will be slow due to TensorRT engine conversion
optimization {
execution_accelerators {
gpu_execution_accelerator {
name: "tensorrt"
parameters {
key: "precision_mode"
value: "FP16"
}
parameters {
key: "max_workspace_size_bytes"
value: "3221225472"
}
parameters {
key: "trt_engine_cache_enable"
value: "1"
}
parameters {
key: "trt_engine_cache_path"
value: "/models/yolo/1"
}
}
}
}
# Model version policy: serve only the newest numeric version directory.
# version_policy: { all { }}
version_policy: { latest: { num_versions: 1}}
# version_policy: { specific: { versions: [1,3]}}

import numpy as np
import tritonclient.http as httpclient
import cv2
from torchvision import transforms
import contextlib
import time
def preprocess(image_path, target_size=640):
    """Load an image and convert it to a YOLO-ready [1, 3, S, S] float tensor.

    Steps: aspect-ratio-preserving resize, BGR->RGB conversion, gray (114)
    padding to a square canvas (top-left aligned), and [0, 1] normalization.
    """
    # cv2.imread returns BGR, HWC, uint8.
    img = cv2.imread(image_path)
    # Aspect-ratio-preserving scale factor.
    h, w = img.shape[:2]
    scale = min(target_size / h, target_size / w)
    new_h, new_w = int(h * scale), int(w * scale)
    # Resize, then fix the channel order: Ultralytics YOLO models are trained
    # on RGB input, but cv2.imread yields BGR — converting here is the bug fix
    # (the original built the canvas directly from BGR data while the comment
    # claimed RGB).
    resized = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
    resized = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
    # 114 is the conventional YOLO letterbox fill value; the image is pasted
    # into the top-left corner (postprocessing must use the same convention).
    padded = np.full((target_size, target_size, 3), 114, dtype=np.uint8)
    padded[:new_h, :new_w] = resized
    # HWC uint8 -> CHW float tensor, automatically normalized to [0, 1].
    tensor = transforms.ToTensor()(padded)
    # Add the batch dimension (BCHW).
    return tensor.unsqueeze(0)  # shape: [1, 3, target_size, target_size]
# url is the endpoint generated after the EAS service is deployed.
url = '16XXXXXXXX.vpc.cn-wulanchabu.pai-eas.aliyuncs.com/api/predict/model_refer'
triton_client = httpclient.InferenceServerClient(url=url)
image_tensor = preprocess('data/1740969332599000000_50_251.jpg')
# Convert to a float32 numpy array; the shape stays [1, 3, 640, 640].
image = image_tensor.cpu().numpy().astype(np.float32)
# Sanity-check the request payload's shape and dtype.
print(f"Input shape: {image.shape}, dtype: {image.dtype}") # expect [1,3,640,640] and float32
inputs = []
# Input name "images" must match the input declared in config.pbtxt.
inputs.append(httpclient.InferInput('images', image.shape, "FP32"))
inputs[0].set_data_from_numpy(image, binary_data=False)
outputs = []
# Request "output0" — for this YOLO export that is the detection tensor,
# presumably [1, 84, 8400]; verify against the ONNX model / config.pbtxt.
outputs.append(httpclient.InferRequestedOutput('output0', binary_data=False))
# Specify the model name, request token, inputs and outputs.
# NOTE(review): with version_policy latest {num_versions: 1} only the newest
# version directory is loaded — confirm that model_version="1" is actually
# available, otherwise the request will fail.
results = triton_client.infer(
model_name="yolo",
model_version="1",
inputs=inputs,
outputs=outputs,
headers={"Authorization": "<test-token>"},
)
output_data0 = results.as_numpy('output0')
print(output_data0.shape)
print(output_data0)
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。