[Efficient Model Deployment] A deep dive into tensorrtx, with a hands-on YOLOv11 high-performance inference case study


tensorrtx is a collection of sample projects for NVIDIA TensorRT maintained by the developer wang-xinyu. It provides TensorRT inference implementations for a wide range of popular deep learning models (mainly covering object detection, image segmentation, and classification). These samples help developers quickly convert models trained in frameworks such as PyTorch or TensorFlow and deploy them on TensorRT, gaining low-latency, high-throughput inference.

Core value

Clean, intuitive example projects:

Each model has its own folder and project configuration, covering the complete pipeline: model conversion (typically .pth / .onnx → TensorRT engine), inference code, and post-processing.

A reference for efficient deployment:

tensorrtx demonstrates best practices: many optimization techniques (INT8/FP16, plugins, CUDA streams, and so on) show up in the samples, helping you validate and apply them quickly in real projects; a minimal FP16 sketch follows this list.

Actively maintained:

The project is updated as TensorRT and the major mainstream networks iterate, keeping pace with the deployment needs of cutting-edge models such as YOLOv11.

Multi-platform support:

tensorrtx is typically used on Linux with an NVIDIA GPU, but developers have also brought it up on Windows and on Jetson devices; switching platforms is mostly a matter of adjusting the CMake configuration.
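For example, enabling FP16 at engine-build time comes down to a single builder flag. A minimal sketch (assuming the TensorRT 8.x C++ API; creating the builder, network definition, and config is left to your own build code):

#include <NvInfer.h>

// Sketch only: request FP16 kernels when building an engine.
void enable_fp16(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config) {
    // Only ask for FP16 where the GPU actually has fast FP16 units.
    if (builder->platformHasFastFp16()) {
        config->setFlag(nvinfer1::BuilderFlag::kFP16);
    }
}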

Accelerating YOLOv11 inference with tensorrtx on Windows 10

1. Setting up the YOLOv11 Python environment

Base environment:

CUDA: cuda_11.8.0_522.06_windows

cuDNN: cudnn-windows-x86_64-8.6.0.163_cuda11-archive

  
## create the python environment
conda create --name yolov11 python=3.10 -y
## install pytorch
pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu118
## install yolov11 (ultralytics)
pip install ultralytics -i https://pypi.mirrors.ustc.edu.cn/simple/
## install the remaining dependencies
pip install -r requirements.txt -i https://pypi.mirrors.ustc.edu.cn/simple/

2. YOLOv11 tensorrtx inference acceleration on Windows 10

  
## download tensorrtx from GitHub
git clone https://github.com/wang-xinyu/tensorrtx.git

## go into the yolo11 folder and convert the .pt model to .wts
python gen_wts.py -w D:\code\ultralytics-main\yolo11n.pt -o yolo11n.wts -t detect


Note: PyTorch 2.6 changed the default of torch.load to weights_only=True, so you need to edit the torch.load(...) call inside gen_wts.py and add weights_only=False.
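For context, the .wts file written by gen_wts.py is a plain-text weight dump: the first line holds the number of tensors, and each following line stores one tensor as "<name> <float count> <hex-encoded words...>". Below is a loader sketch in the spirit of the repo's loadWeights() (illustrative only; see the model code in the yolo11 folder for the reference implementation):

#include <NvInfer.h>
#include <cstdint>
#include <fstream>
#include <map>
#include <string>

std::map<std::string, nvinfer1::Weights> load_wts(const std::string& path) {
    std::map<std::string, nvinfer1::Weights> weights;
    std::ifstream input(path);
    int32_t count = 0;
    input >> count;  // number of weight tensors in the file
    while (count-- > 0) {
        std::string name;
        uint32_t size = 0;
        input >> name >> std::dec >> size;
        // Hex words are reinterpreted as float32 by TensorRT at build time.
        auto* values = new uint32_t[size];  // freed after the engine build in real code
        for (uint32_t i = 0; i < size; ++i) {
            input >> std::hex >> values[i];
        }
        weights[name] = nvinfer1::Weights{nvinfer1::DataType::kFLOAT, values,
                                          static_cast<int64_t>(size)};
    }
    return weights;
}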


Modify the CMakeLists.txt file

Update the paths below to match where OpenCV, TensorRT, and dirent live on your machine.

  
cmake_minimum_required(VERSION 3.10)  
  
project(yolov11)  
  
add_definitions(-std=c++11)  
add_definitions(-DAPI_EXPORTS)  
add_compile_definitions(NOMINMAX)  
  
set(CMAKE_CXX_STANDARD 11)  
set(CMAKE_BUILD_TYPE Debug)  # note: ignored by multi-config generators such as Visual Studio
  
set(CMAKE_CUDA_ARCHITECTURES 70 75 80 86)  
set(CMAKE_CUDA_COMPILER "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.8/bin/nvcc.exe")  
enable_language(CUDA)  
  
include_directories(${PROJECT_SOURCE_DIR}/include)  
include_directories(${PROJECT_SOURCE_DIR}/plugin)  
  
# include and link dirs of cuda and tensorrt, you need adapt them if yours are different  
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")  
    message("embed_platform on")  
    include_directories(/usr/local/cuda/targets/aarch64-linux/include)  
    link_directories(/usr/local/cuda/targets/aarch64-linux/lib)  
else()  
    message("embed_platform off")  
    # cuda  
    find_package(CUDA REQUIRED)  
    include_directories(${CUDA_INCLUDE_DIRS})  
  
    # tensorrt  
    set(TRT_DIR "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\TensorRT-8.6.0.12")    
    set(TRT_INCLUDE_DIRS ${TRT_DIR}\\include)   
    set(TRT_LIB_DIRS ${TRT_DIR}\\lib)   
    include_directories(${TRT_INCLUDE_DIRS})  
    link_directories(${TRT_LIB_DIRS})  
  
    # opencv  
    set(OpenCV_DIR "D:\\Program Files\\opencv\\build")   
    set(OpenCV_INCLUDE_DIRS ${OpenCV_DIR}\\include)   
    set(OpenCV_LIB_DIRS ${OpenCV_DIR}\\x64\\vc16\\lib)   
    set(OpenCV_Debug_LIBS "opencv_world4110d.lib")   
    set(OpenCV_Release_LIBS "opencv_world4110.lib")   
    include_directories(${OpenCV_INCLUDE_DIRS})  
    link_directories(${OpenCV_LIB_DIRS})  
  
    # dirent  
    set(Dirent_INCLUDE_DIRS "D:\\Program Files\\dirent\\include")  
    include_directories(${Dirent_INCLUDE_DIRS})  
endif()  
  
add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/plugin/yololayer.cu)  
target_link_libraries(myplugins nvinfer cudart)  
  
file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/src/*.cpp ${PROJECT_SOURCE_DIR}/src/*.cu)  
  
# Link OpenCV per configuration: the *d library in Debug, the plain one in Release
add_executable(yolo11_det ${PROJECT_SOURCE_DIR}/yolo11_det.cpp ${SRCS})
target_link_libraries(yolo11_det nvinfer cudart myplugins)
target_link_libraries(yolo11_det debug ${OpenCV_Debug_LIBS} optimized ${OpenCV_Release_LIBS})

add_executable(yolo11_cls ${PROJECT_SOURCE_DIR}/yolo11_cls.cpp ${SRCS})
target_link_libraries(yolo11_cls nvinfer cudart myplugins)
target_link_libraries(yolo11_cls debug ${OpenCV_Debug_LIBS} optimized ${OpenCV_Release_LIBS})

add_executable(yolo11_seg ${PROJECT_SOURCE_DIR}/yolo11_seg.cpp ${SRCS})
target_link_libraries(yolo11_seg nvinfer cudart myplugins)
target_link_libraries(yolo11_seg debug ${OpenCV_Debug_LIBS} optimized ${OpenCV_Release_LIBS})

add_executable(yolo11_pose ${PROJECT_SOURCE_DIR}/yolo11_pose.cpp ${SRCS})
target_link_libraries(yolo11_pose nvinfer cudart myplugins)
target_link_libraries(yolo11_pose debug ${OpenCV_Debug_LIBS} optimized ${OpenCV_Release_LIBS})

add_executable(yolo11_obb ${PROJECT_SOURCE_DIR}/yolo11_obb.cpp ${SRCS})
target_link_libraries(yolo11_obb nvinfer cudart myplugins)
target_link_libraries(yolo11_obb debug ${OpenCV_Debug_LIBS} optimized ${OpenCV_Release_LIBS})

Build the project

  
mkdir build  
cd build  
cmake ..  


Open the generated solution in Visual Studio and build it (equivalently, run cmake --build . --config Release from the build directory).


Convert the .wts file to a .engine file


Before converting, adjust the configuration to match your own model (number of classes, input resolution, and so on); the constants live in the config header of the yolo11 folder (src/config.h in recent tensorrtx checkouts), as sketched below.
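For reference, the names below follow the constants used by the inference code later in this article; the values are illustrative, so check them against your own model and checkout:

// Illustrative values: adjust to your model before converting.
constexpr static int kNumClass = 80;        // number of classes your model detects
constexpr static int kBatchSize = 1;        // batch size baked into the engine
constexpr static int kInputH = 640;         // network input height
constexpr static int kInputW = 640;         // network input width
constexpr static float kConfThresh = 0.5f;  // confidence threshold for detections
constexpr static float kNmsThresh = 0.45f;  // IoU threshold used by NMS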


  
## arguments for yolo11_det (set them in VS under Project Properties ->
## Debugging -> Command Arguments); the trailing "n" selects the yolo11n scale
-s ..\yolo11n.wts yolo11n.engine n
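Under the hood, the -s path builds a TensorRT network from the .wts weights and serializes the engine to disk. A rough sketch of the plumbing (assuming TensorRT 8.x; build_det_network is a hypothetical stand-in for the repo's layer-by-layer network construction):

#include <NvInfer.h>
#include <fstream>

void serialize_engine_sketch(nvinfer1::ILogger& logger, const char* out_path) {
    using namespace nvinfer1;
    IBuilder* builder = createInferBuilder(logger);
    INetworkDefinition* network = builder->createNetworkV2(
        1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));
    IBuilderConfig* config = builder->createBuilderConfig();
    // build_det_network(network, ...);  // hypothetical: add layers from the .wts
    // NOTE: with no layers added, building would fail; this only shows the plumbing.
    IHostMemory* serialized = builder->buildSerializedNetwork(*network, *config);
    std::ofstream out(out_path, std::ios::binary);
    out.write(static_cast<const char*>(serialized->data()), serialized->size());
    delete serialized;
    delete config;
    delete network;
    delete builder;
}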

Run inference with the converted .engine

  
## arguments for yolo11_det; the trailing "g" selects GPU post-processing
## (decode + NMS on the GPU); pass "c" for CPU post-processing instead
-d yolo11n.engine D:\code\yolov5-6.1\data\images g


Calling tensorrtx-accelerated YOLOv11 from Qt for detection on Windows 10


Modify the CMakeLists.txt in the Qt project as follows:

  
cmake_minimum_required(VERSION 3.5)  
  
project(yolov11Test LANGUAGES CXX)  
  
add_definitions(-std=c++11)  
add_definitions(-DAPI_EXPORTS)  
add_compile_definitions(NOMINMAX)  
  
set(CMAKE_CXX_STANDARD 17)  
set(CMAKE_CXX_STANDARD_REQUIRED ON)  
  
set(CMAKE_CUDA_ARCHITECTURES 70 75 80 86)  
set(CMAKE_CUDA_COMPILER "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.8/bin/nvcc.exe")  
enable_language(CUDA)  
  
include_directories(${PROJECT_SOURCE_DIR}/include)  
include_directories(${PROJECT_SOURCE_DIR}/plugin)  
  
# cuda  
find_package(CUDA REQUIRED)  
include_directories(${CUDA_INCLUDE_DIRS})  
  
# tensorrt  
set(TRT_DIR "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\TensorRT-8.6.0.12")  
set(TRT_INCLUDE_DIRS ${TRT_DIR}\\include)  
set(TRT_LIB_DIRS ${TRT_DIR}\\lib)  
include_directories(${TRT_INCLUDE_DIRS})  
link_directories(${TRT_LIB_DIRS})  
  
# opencv  
set(OpenCV_DIR "D:\\Program Files\\opencv\\build")  
set(OpenCV_INCLUDE_DIRS ${OpenCV_DIR}\\include)  
set(OpenCV_LIB_DIRS ${OpenCV_DIR}\\x64\\vc16\\lib)  
set(OpenCV_Debug_LIBS "opencv_world4110d.lib")  
set(OpenCV_Release_LIBS "opencv_world4110.lib")  
include_directories(${OpenCV_INCLUDE_DIRS})  
link_directories(${OpenCV_LIB_DIRS})  
  
# dirent  
set(Dirent_INCLUDE_DIRS "D:\\Program Files\\dirent\\include")  
include_directories(${Dirent_INCLUDE_DIRS})  
  
add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/plugin/yololayer.cu)  
target_link_libraries(myplugins nvinfer cudart)  
  
file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/src/*.cpp ${PROJECT_SOURCE_DIR}/src/*.cu)  
  
add_executable(yolov11Test main.cpp ${SRCS})

target_link_libraries(yolov11Test nvinfer cudart myplugins)
# Link OpenCV per configuration: the *d library in Debug, the plain one in Release
target_link_libraries(yolov11Test debug ${OpenCV_Debug_LIBS} optimized ${OpenCV_Release_LIBS})

# GNUInstallDirs defines CMAKE_INSTALL_LIBDIR / CMAKE_INSTALL_BINDIR used below
include(GNUInstallDirs)
install(TARGETS yolov11Test
    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
)
  

The main.cpp code is as follows:

  
#include <fstream>  
#include <iostream>  
#include <opencv2/opencv.hpp>  
#include "cuda_utils.h"  
#include "logging.h"  
#include "model.h"  
#include "postprocess.h"  
#include "preprocess.h"  
#include "utils.h"  
  
Logger gLogger;  
using namespace nvinfer1;  
const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;  
  
void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, IExecutionContext** context)  
{  
    std::ifstream file(engine_name, std::ios::binary);  
    if (!file.good())  
    {  
        std::cerr << "read " << engine_name << " error!" << std::endl;  
        assert(false);  
    }  
    size_t size = 0;  
    file.seekg(0, file.end);  
    size = file.tellg();  
    file.seekg(0, file.beg);  
    char* serialized_engine = new char[size];  
    assert(serialized_engine);  
    file.read(serialized_engine, size);  
    file.close();  
  
    *runtime = createInferRuntime(gLogger);  
    assert(*runtime);  
    *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size);  
    assert(*engine);  
    *context = (*engine)->createExecutionContext();  
    assert(*context);  
    delete[] serialized_engine;  
}  
  
void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device, float** output_buffer_host, float** decode_ptr_host, float** decode_ptr_device, std::string cuda_post_process)  
{  
    assert(engine->getNbBindings() == 2);  
    // In order to bind the buffers, we need to know the names of the input and output tensors.  
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()  
    const int inputIndex = engine->getBindingIndex(kInputTensorName);  
    const int outputIndex = engine->getBindingIndex(kOutputTensorName);  
    assert(inputIndex == 0);  
    assert(outputIndex == 1);  
    // Create GPU buffers on device  
    CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float)));  
    CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float)));  
    if (cuda_post_process == "c") {  
        *output_buffer_host = new float[kBatchSize * kOutputSize];  
    } else if (cuda_post_process == "g") {  
        if (kBatchSize > 1) {  
            std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl;  
            exit(0);  
        }  
        // Allocate memory for decode_ptr_host and copy to device  
        *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element];  
        CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element)));  
    }  
}  
  
void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize,  
           float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) {  
    // infer on the batch asynchronously, and DMA output back to host  
    auto start = std::chrono::system_clock::now();  
    context.enqueueV2(buffers, stream, nullptr);  
    if (cuda_post_process == "c") {  
        CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost,  
                                   stream));  
        auto end = std::chrono::system_clock::now();  
        std::cout << "inference time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()  
                  << "ms" << std::endl;  
    } else if (cuda_post_process == "g") {  
        CUDA_CHECK(  
            cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream));  
        cuda_decode((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream);  
        cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream);  //cuda nms  
        CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device,  
                                   sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost,  
                                   stream));  
        auto end = std::chrono::system_clock::now();  
        std::cout << "inference and gpu postprocess time: "  
                  << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;  
    }  
  
    CUDA_CHECK(cudaStreamSynchronize(stream));  
}  
  
int main(int argc, char** argv) {  
    // yolo11_det -s ../models/yolo11n.wts ../models/yolo11n.fp32.trt n  
    // yolo11_det -d ../models/yolo11n.fp32.trt ../images c  
    cudaSetDevice(kGpuId);  
    std::string engine_name = "D:\\code\\tensorrtx\\yolo11\\build\\yolo11n.engine";  // path to the converted engine file
    std::string img_dir = "D:\\code\\yolov5-6.1\\data\\images\\";  // folder of images to run detection on
    std::string cuda_post_process = "g";  
    int model_bboxes;  
    float gd = 0, gw = 0;  // only used by the serialization path; unused in this flow
    int max_channels = 0;  // likewise unused here
  
    // Deserialize the engine from file
    IRuntime* runtime = nullptr;  
    ICudaEngine* engine = nullptr;  
    IExecutionContext* context = nullptr;  
    deserialize_engine(engine_name, &runtime, &engine, &context);  
    cudaStream_t stream;  
    CUDA_CHECK(cudaStreamCreate(&stream));  
    cuda_preprocess_init(kMaxInputImageSize);  
    auto out_dims = engine->getBindingDimensions(1);  
    model_bboxes = out_dims.d[0];  
  
    // Prepare CPU and GPU buffers
    float* device_buffers[2];  
    float* output_buffer_host = nullptr;  
    float* decode_ptr_host = nullptr;  
    float* decode_ptr_device = nullptr;  
  
    // Read images from the directory
    std::vector<std::string> file_names;  
    if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {  
        std::cerr << "read_files_in_dir failed." << std::endl;  
        return -1;  
    }  
  
    prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host,  
                   &decode_ptr_device, cuda_post_process);  
  
    // Batch predict
    for (size_t i = 0; i < file_names.size(); i += kBatchSize)  
    {  
        // Get a batch of images via OpenCV
        std::vector<cv::Mat> img_batch;  
        std::vector<std::string> img_name_batch;  
        for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++)  
        {  
            cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);  
            img_batch.push_back(img);  
            img_name_batch.push_back(file_names[j]);  
        }  
        // Preprocess  
        cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream);  
        // Run inference
        infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host,  
              decode_ptr_device, model_bboxes, cuda_post_process);  
        // (debug) dump the first 100 values of output_buffer_host, one per line
        //        std::ofstream out("../models/output.txt");  
        //        for (int j = 0; j < 100; j++) {  
        //            out << output_buffer_host[j] << std::endl;  
        //        }  
        //        out.close();  
        std::vector<std::vector<Detection>> res_batch;  
        if (cuda_post_process == "c")  
        {  
            // CPU non-maximum suppression (NMS)
            batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh);  
        } else if (cuda_post_process == "g")  
        {  
            // Process GPU-side decode and NMS results
            batch_process(res_batch, decode_ptr_host, img_batch.size(), bbox_element, img_batch);  
        }  
  
        // Draw bounding boxes
        draw_bbox(img_batch, res_batch);  
        // Show the images
        for (size_t j = 0; j < img_batch.size(); j++)  
        {  
            cv::imshow("results", img_batch[j]);  
            cv::waitKey(0);  
        }  
  
        // Save the images
        for (size_t j = 0; j < img_batch.size(); j++)  
        {  
            cv::imwrite("_" + img_name_batch[j], img_batch[j]);  
        }  
    }  
  
    // Release stream and buffers  
    cudaStreamDestroy(stream);  
    CUDA_CHECK(cudaFree(device_buffers[0]));  
    CUDA_CHECK(cudaFree(device_buffers[1]));  
    CUDA_CHECK(cudaFree(decode_ptr_device));  
    delete[] decode_ptr_host;  
    delete[] output_buffer_host;  
    cuda_preprocess_destroy();  
    // Destroy the engine  
    delete context;  
    delete engine;  
    delete runtime;  
  
  
    return 0;  
}  
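To show the annotated frames inside a Qt widget instead of cv::imshow, convert the BGR cv::Mat to a QImage first. A minimal sketch (matToQImage is a hypothetical helper, not part of tensorrtx; assumes a Qt Widgets project):

#include <QImage>
#include <opencv2/opencv.hpp>

// Convert an 8-bit 3-channel BGR cv::Mat into a deep-copied QImage.
QImage matToQImage(const cv::Mat& bgr) {
    cv::Mat rgb;
    cv::cvtColor(bgr, rgb, cv::COLOR_BGR2RGB);
    // QImage does not own the buffer; copy() detaches it from the cv::Mat.
    return QImage(rgb.data, rgb.cols, rgb.rows,
                  static_cast<int>(rgb.step), QImage::Format_RGB888).copy();
}

The returned QImage can then be shown on a QLabel via QPixmap::fromImage().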
  
