TensorRT模型部署推理-1 基本流程

pytorch的推理速度常被人诟病，在生产环境中会采用一些特殊的格式去部署，常见的有onnx的静态图，适用性广，在unity3D中也可以直接用，在此之上还可以进一步做优化，具体到硬件层面，有英伟达的GPU，华为的昇腾，各家在静态图的基础上又做了些工作，比如层融合。由于手头没有华为的板子，再结合上次玩过香橙派的经验，此次就尝试用tensorRT做OCR模型的部署推理，在此基础上用C++搭建web服务，正好接入前段时间做的档案系统。

开发环境

CUDA Toolkit + cudnn + tensorRT，注意版本对应即可，认准cuda版本。

安装过程可参考：

https://blog.csdn.net/weixin_44822312/article/details/148653179

https://www.cnblogs.com/wanqieddy/p/17581996.html

第一个博客中介绍了vllm在wsl中的部署方法，其中包括安装wsl与CUDA Toolkit的过程；进入wsl的终端后，可以跟着第二个博客继续安装后两个依赖，注意cudnn与tensorRT的安装方式需要一致（都用tar安装）。此时应该就能在wsl中运行样例了。

在wsl中开发确实不便，但是在Windows中安装开发环境更是一种折磨，所以在Windows上使用wsl的编译链就是一个可行的方案了。在项目实践中，我使用的是clion + wsl，配置方法可见下面的链接，配置完第一步即可使用wsl的编译工具链。

https://blog.csdn.net/u013250861/article/details/127778345

基本流程

为了更方便复用，我们对引擎相关的API都进行二次封装

头文件

在官方样例的构造函数中我们能看到两个很关键的指针，一个是用来反序列化引擎的运行时，一个是用来执行网络的引擎：

1
2

std::shared_ptr<nvinfer1::IRuntime> mRuntime;   //!< The TensorRT runtime used to deserialize the engine
std::shared_ptr<nvinfer1::ICudaEngine> mEngine; //!< The TensorRT engine used to run the network

在此基础上我们封装一个更完善的引擎类，并且实现两个监控性质的接口（Logger与Profiler分别继承ILogger与IProfiler）：其中ILogger是必须的，tensorRT所有的日志信息都需要手动传入日志对象，框架内部负责往该日志对象中写信息；IProfiler用于性能监控，查看每一层的耗时情况：

//
// TensorRTEngine.h - Improved TensorRT 10 Engine Wrapper
// Created by Altria on 2025/7/18.
//

#ifndef TENSORRTENGINE_H
#define TENSORRTENGINE_H

#pragma once

#include <algorithm>
#include <memory>
#include <vector>
#include <string>
#include <unordered_map>
#include <iostream>
#include <fstream>
#include <mutex>

#include <NvInfer.h>
#include <NvOnnxParser.h>
#include <cuda_runtime.h>

/// Logging sink passed to the TensorRT factory functions.
///
/// Messages whose severity compares less than or equal to
/// Severity::kWARNING are printed to stdout with a "[TensorRT] " prefix;
/// every other message is dropped.
class Logger final : public nvinfer1::ILogger {
public:
    /// Receive one log record from TensorRT.
    ///
    /// @param severity Severity attached to the message.
    /// @param msg      Message text as a C string.
    void log(const Severity severity, const char* msg) noexcept override {
        // Guard clause: anything ranked above kWARNING in the enum is ignored.
        // NOTE(review): presumably these are the info/verbose levels — confirm
        // against the ILogger::Severity ordering.
        if (severity > Severity::kWARNING) {
            return;
        }
        std::cout << "[TensorRT] " << msg << std::endl;
    }
};

/// Per-layer timing collector implementing nvinfer1::IProfiler.
///
/// For each layer name it keeps the most recently reported time (a later
/// report for the same name overwrites the earlier one), while the total is a
/// running sum of every reported time since construction or the last reset().
/// The two therefore diverge once the same layer has been reported twice.
class Profiler final : public nvinfer1::IProfiler {
public:

    /// Profiling callback: store @p ms under @p layerName, add it to the
    /// running total and echo the sample to stdout.
    ///
    /// @param layerName Layer name as a C string.
    /// @param ms        Reported layer time in milliseconds.
    void reportLayerTime(const char* layerName, const float ms) noexcept override {
        const std::string key(layerName);
        layer_times_[key] = ms;
        total_time_ += ms;

        // Optional: live per-layer timing output.
        std::cout << "[Profiler] Layer '" << layerName << "': " << ms << " ms" << std::endl;
    }

    // ---- convenience helpers ----

    /// Drop all recorded layer times and zero the running total.
    void reset() {
        total_time_ = 0.0f;
        layer_times_.clear();
    }

    /// @return Sum of all times reported since construction or the last reset().
    float getTotalTime() const {
        return total_time_;
    }

    /// @return Read-only view of the layer-name -> last-reported-time map.
    const std::unordered_map<std::string, float>& getLayerTimes() const {
        return layer_times_;
    }

    /// Return the layers with the largest recorded times, slowest first.
    ///
    /// @param top_n Maximum number of entries to return. A negative value
    ///              wraps to a huge unsigned limit, so every layer is returned.
    /// @return (layer name, time in ms) pairs sorted by descending time.
    std::vector<std::pair<std::string, float>> getTopLayers(int top_n = 5) const {
        using Entry = std::pair<std::string, float>;

        std::vector<Entry> ranked;
        ranked.reserve(layer_times_.size());
        for (const auto& entry : layer_times_) {
            ranked.emplace_back(entry.first, entry.second);
        }

        const auto slower_first = [](const auto& lhs, const auto& rhs) {
            return lhs.second > rhs.second;
        };
        std::sort(ranked.begin(), ranked.end(), slower_first);

        const auto limit = static_cast<size_t>(top_n);
        if (ranked.size() > limit) {
            ranked.resize(limit);
        }
        return ranked;
    }

private:
    std::unordered_map<std::string, float> layer_times_;  // layer name -> last reported ms
    float total_time_ = 0.0f;                             // running sum of all reported ms
};

/// Wrapper around a TensorRT runtime / engine / execution context triple.
///
/// Declares initialization from an ONNX model or a serialized engine file,
/// synchronous and asynchronous inference entry points, tensor metadata
/// queries, dynamic-shape helpers, profiling and error reporting.
/// The class is non-copyable.
///
/// Every scalar and pointer member carries an in-class initializer so that an
/// object (and each TensorInfo record) starts from a well-defined state even
/// before the out-of-line constructor or allocateBuffers() assigns it.
class TensorRTEngine {
public:
    /// Build-time options consumed by initializeFromOnnx().
    struct EngineConfig {
        size_t max_workspace_size = 1ULL << 30;  // 1 GB
        int max_batch_size = 1;
        bool use_fp16 = false;
        bool use_int8 = false;
        bool use_tf32 = true;  // TF32 is enabled by default in TensorRT 10
        int dla_core = -1;  // -1 means "do not use DLA"
        bool enable_dynamic_shapes = false;
        bool enable_timing_cache = true;
        std::string timing_cache_path = "";
        int optimization_level = 3;  // 0-5, higher means more aggressive optimization
    };

    /// Supported element data types.
    enum class DataType {
        FLOAT32,
        FLOAT16,
        INT8,
        INT32,
        BOOL
    };

    TensorRTEngine();
    ~TensorRTEngine();

    // Copy construction and copy assignment are disabled.
    TensorRTEngine(const TensorRTEngine&) = delete;
    TensorRTEngine& operator=(const TensorRTEngine&) = delete;

    /// Initialize from an ONNX model file.
    /// @param onnx_path Path of the ONNX model.
    /// @param config    Build options; defaults to a default-constructed EngineConfig.
    /// @return true on success, false otherwise.
    bool initializeFromOnnx(const std::string& onnx_path, const EngineConfig& config = EngineConfig());

    /// Initialize from a serialized engine file.
    /// @param engine_path Path of the serialized engine.
    /// @return true on success, false otherwise.
    bool initializeFromEngine(const std::string& engine_path);

    /// Save the engine to a file.
    /// @param engine_path Destination path.
    /// @return true on success, false otherwise.
    bool saveEngine(const std::string& engine_path);

    /// Run inference on float tensors.
    /// @param inputs  Tensor name -> input values.
    /// @param outputs Tensor name -> output values (filled by the call).
    /// @return true on success, false otherwise.
    bool infer(const std::unordered_map<std::string, std::vector<float>>& inputs,
               std::unordered_map<std::string, std::vector<float>>& outputs);

    /// Generic inference interface for other data types, working on raw
    /// buffers plus per-tensor sizes.
    /// @param inputs       Tensor name -> caller-provided input buffer.
    /// @param input_sizes  Tensor name -> size of the matching input buffer.
    /// @param outputs      Tensor name -> caller-provided output buffer.
    /// @param output_sizes Tensor name -> size of the matching output buffer.
    /// @return true on success, false otherwise.
    bool inferGeneric(const std::unordered_map<std::string, void*>& inputs,
                     const std::unordered_map<std::string, size_t>& input_sizes,
                     std::unordered_map<std::string, void*>& outputs,
                     const std::unordered_map<std::string, size_t>& output_sizes);

    /// Asynchronous inference.
    /// @param inputs  Tensor name -> input values.
    /// @param outputs Tensor name -> output values.
    /// @param stream  CUDA stream to use; defaults to nullptr.
    /// @return true on success, false otherwise.
    bool inferAsync(const std::unordered_map<std::string, std::vector<float>>& inputs,
                   std::unordered_map<std::string, std::vector<float>>& outputs,
                   cudaStream_t stream = nullptr);

    // Input / output tensor names.
    std::vector<std::string> getInputNames() const;
    std::vector<std::string> getOutputNames() const;
    std::vector<std::string> getAllTensorNames() const;

    // Tensor dimension queries.
    nvinfer1::Dims getTensorDims(const std::string& name) const;
    nvinfer1::Dims getInputDims(const std::string& name) const;
    nvinfer1::Dims getOutputDims(const std::string& name) const;

    // Tensor data type / IO mode queries.
    nvinfer1::DataType getTensorDataType(const std::string& name) const;
    nvinfer1::TensorIOMode getTensorIOMode(const std::string& name) const;

    // Dynamic-shape support.
    bool setInputShape(const std::string& name, const nvinfer1::Dims& dims);
    bool setOptimizationProfile(int profile_index = 0);

    /// Tensor size in bytes.
    size_t getTensorSize(const std::string& name) const;

    // Batch support.
    bool setBatchSize(int batch_size);
    int64_t getBatchSize() const;

    // State queries.
    bool isInitialized() const { return is_initialized_; }
    bool isDynamicShape() const { return has_dynamic_shapes_; }

    // Error handling: the last recorded error message and a way to clear it.
    const std::string& getLastError() const { return last_error_; }
    void clearError() { last_error_.clear(); }

    // Profiling.
    void enableProfiling(bool enable = true);
    std::string getProfilingInfo() const;

    // Memory management.
    size_t getUsedGPUMemory() const;
    void warmup(int num_iterations = 10);

private:
    /// Bookkeeping record for one engine I/O tensor.
    /// All fields are value-initialized so a default-constructed record never
    /// holds indeterminate pointers or flags.
    struct TensorInfo {
        std::string name;
        nvinfer1::Dims dims{};
        nvinfer1::DataType data_type{};
        nvinfer1::TensorIOMode io_mode{};
        size_t size = 0;  // size in bytes
        void* device_ptr = nullptr;
        void* host_ptr = nullptr;
        bool is_input = false;
        bool is_dynamic = false;  // whether the tensor has a dynamic shape
    };

    // Initialization helpers.
    bool buildEngineFromOnnx(const std::string& onnx_path, const EngineConfig& config);

    static bool loadTimingCache(const std::string& cache_path, nvinfer1::IBuilderConfig* config);
    static bool saveTimingCache(const std::string& cache_path, const nvinfer1::IBuilderConfig* config);

    // Buffer management.
    bool allocateBuffers();
    void deallocateBuffers();
    bool reallocateBuffers();  // re-allocate for dynamic shapes

    // Utility functions.
    static size_t getElementSize(nvinfer1::DataType data_type);
    static size_t getDimsSize(const nvinfer1::Dims& dims);
    static std::string dimsToString(const nvinfer1::Dims& dims);

    // Error handling.
    void setLastError(const std::string& error);

    // Validation helpers.
    bool validateInputs(const std::unordered_map<std::string, std::vector<float>>& inputs) const;
    bool validateTensorShape(const std::string& name, const nvinfer1::Dims& dims) const;

    // Member variables.
    Logger logger_;
    Profiler profiler_;
    std::unique_ptr<nvinfer1::IRuntime> runtime_;
    std::unique_ptr<nvinfer1::ICudaEngine> engine_;
    std::unique_ptr<nvinfer1::IExecutionContext> context_;

    std::vector<TensorInfo> tensors_;
    std::unordered_map<std::string, int> tensor_name_to_index_;

    bool is_initialized_ = false;
    bool has_dynamic_shapes_ = false;
    std::string last_error_;

    // CUDA state.
    cudaStream_t stream_ = nullptr;
    bool own_stream_ = false;  // whether this object owns stream_

    // Profiling / concurrency.
    bool profiling_enabled_ = false;
    mutable std::mutex mutex_;  // for thread safety

    // Configuration.
    EngineConfig current_config_;
    size_t total_gpu_memory_ = 0;
};

#endif //TENSORRTENGINE_H

从onnx模型初始化一个引擎并且保存文件

便利性与性能只能二选一了，追求性能就需要提前计算大小，申请内存，既然操作内存，就免不了使用指针，而且一般来说代码都是在内存里执行的，对于AI模型来说就需要用到GPU做计算，这里还涉及到内存到GPU的拷贝。扯远了……
我们仿照官方样例，实现bool buildEngineFromOnnx(const std::string& onnx_path, const EngineConfig& config);，加载一个模型进来。

我目前不打算深究tensorRT为什么这样设计，先用再说吧。onnx转引擎的过程主要由三个部分组成：IBuilder、INetworkDefinition、IParser。

// 1. Create the builder. The logger_ object declared in the header is passed in;
//    according to the official documentation this argument is taken by reference.
const std::unique_ptr<nvinfer1::IBuilder> builder(nvinfer1::createInferBuilder(logger_));

// 2. Create the network definition from the builder created above.
//    explicit_batch is the creation-flag bitmask selecting explicit-batch mode.
//    NOTE(review): TensorRT 10 reportedly treats kEXPLICIT_BATCH as deprecated/ignored — confirm.
constexpr auto explicit_batch = 1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
std::unique_ptr<nvinfer1::INetworkDefinition> network(builder->createNetworkV2(explicit_batch));

// 3. Create the ONNX parser for the network definition created above. `network` is
//    dereferenced here, so the parser receives the network object itself (not the
//    owning unique_ptr), together with the same logger.
const std::unique_ptr<nvonnxparser::IParser> parser(nvonnxparser::createParser(*network, logger_));

定义好以上三个组件，就可以开始解析onnx了，前面都是些定义，执行完下面的语句，大致可以认为onnx被load进来完成了解析。

1	const bool parsed = parser->parseFromFile(onnx_path.c_str(), static_cast<int>(nvinfer1::ILogger::Severity::kWARNING));

此时我们基本已经有了网络了，现在需要给网络进行一些设定，包括运行时的最大内存限制、模型精度、优化级别等，这些信息由builder进行初始化，然后构建引擎：

// Create the build configuration object from the builder.
std::unique_ptr<nvinfer1::IBuilderConfig> builder_config(builder->createBuilderConfig());

// Cap the workspace memory pool at the size requested in the engine config.
builder_config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, config.max_workspace_size);

// Clear the TF32 builder flag.
// NOTE(review): this is unconditional in this excerpt and does not consult
// config.use_tf32 — confirm that is intended.
builder_config->clearFlag(nvinfer1::BuilderFlag::kTF32);

// Build the engine from the parsed network and the configuration; engine_ takes ownership.
engine_.reset(builder->buildEngineWithConfig(*network, *builder_config));

到此为止，引擎就已经构建好了。此时可以导出为引擎文件，先序列化，再保存，下次启动就不用从onnx转了，会更快：

1
2

// Serialize the built engine; the returned host-memory blob is owned by the unique_ptr.
const std::unique_ptr<nvinfer1::IHostMemory> serialized_engine(engine_->serialize());
// Write the blob's bytes to `file`.
// NOTE(review): `file` is not declared in this excerpt — presumably a std::ofstream
// opened in binary mode on the target engine path; confirm.
file.write(static_cast<const char*>(serialized_engine->data()), static_cast<std::streamsize>(serialized_engine->size()));

给engine创建上下文并且申请输入输出的内存

在此之前我们先说一下：运行时runtime_相对比较特殊，它主要提供序列化与反序列化相关的工具方法，比如上面我们把序列化的引擎保存了下来，runtime_则提供了反序列化的功能，用来重新构建engine_。

创建context需要由引擎engine来实现：

1	context_.reset(engine_->createExecutionContext());

上下文现在也是个“毛坯房”，需要一些基础的设定，其中最重要的就是：输入输出tensor的位置。按目前的理解，engine_已经被放到了GPU显存中的某段位置，我们需要在GPU显存中再找两块位置，分别告诉它输入的数据放在哪里、输出的数据写到哪里。既然要知道位置，就说明在执行推理之前，我们需要提前申请内存。

所以接下来的工作就是找到输入输出的tensor数量，以及对应的信息。engine_提供了获取全部输入输出tensor数量的接口，根据数量我们需要给每一个tensor申请内存，在tensorRT 10的API中可以直接从索引拿到tensor名，相对8的API简单了不少，至少不用写tensor name与索引的map了。

/// Allocate one device buffer and one host buffer for every engine I/O tensor.
///
/// For each I/O tensor reported by the engine this records its name, shape,
/// data type and I/O mode in a TensorInfo, allocates `size` bytes on the GPU
/// (cudaMalloc) and on the host (malloc), then appends the record to tensors_
/// and registers it in tensor_name_to_index_.
///
/// A tensor is flagged dynamic when any of its dimensions is negative. A
/// dynamic tensor whose computed size is zero is registered with null buffers
/// and no allocation is attempted.
/// NOTE(review): such tensors presumably get their buffers later via
/// reallocateBuffers() once concrete shapes are set — confirm against its
/// implementation.
///
/// @return true when every tensor was processed; false on failure, in which
///         case the message is stored via setLastError() and buffers already
///         allocated by this call are released through deallocateBuffers().
bool TensorRTEngine::allocateBuffers(){
    if (!engine_) {
        setLastError("Cannot allocate buffers: engine is not initialized");
        return false;
    }

    const int num_tensors = engine_->getNbIOTensors();

    for (int i = 0; i < num_tensors; ++i) {
        // Value-initialize so pointers start as nullptr and flags as false.
        TensorInfo tensor{};

        tensor.name = engine_->getIOTensorName(i);
        tensor.dims = engine_->getTensorShape(tensor.name.c_str());
        tensor.data_type = engine_->getTensorDataType(tensor.name.c_str());
        tensor.io_mode = engine_->getTensorIOMode(tensor.name.c_str());
        tensor.is_input = (tensor.io_mode == nvinfer1::TensorIOMode::kINPUT);

        // A negative extent marks a dimension that is only known at runtime.
        tensor.is_dynamic = false;
        for (int d = 0; d < tensor.dims.nbDims; ++d) {
            if (tensor.dims.d[d] < 0) {
                tensor.is_dynamic = true;
                break;
            }
        }

        // Compute the byte size.
        const size_t element_size = getElementSize(tensor.data_type);
        const size_t dims_size = getDimsSize(tensor.dims);
        tensor.size = element_size * dims_size;

        if (tensor.size == 0 && !tensor.is_dynamic) {
            setLastError("Invalid tensor size for: " + tensor.name);
            // Release whatever earlier iterations already allocated.
            deallocateBuffers();
            return false;
        }

        // Zero-byte requests are skipped: malloc(0) may legitimately return
        // nullptr, which would otherwise be misreported as an allocation failure.
        if (tensor.size > 0) {
            // Allocate GPU memory.
            const cudaError_t err = cudaMalloc(&tensor.device_ptr, tensor.size);
            if (err != cudaSuccess) {
                setLastError("Failed to allocate GPU memory for tensor '" + tensor.name +
                        "': " + std::string(cudaGetErrorString(err)));
                // Nothing was allocated for this tensor, so there is nothing to
                // free here; only earlier tensors need cleaning up.
                deallocateBuffers();
                return false;
            }

            // Allocate CPU memory.
            tensor.host_ptr = malloc(tensor.size);
            if (!tensor.host_ptr) {
                setLastError("Failed to allocate CPU memory for tensor '" + tensor.name + "'");
                // This tensor is not in tensors_ yet, so free its device buffer here.
                cudaFree(tensor.device_ptr);
                deallocateBuffers();
                return false;
            }
        }

        total_gpu_memory_ += tensor.size;
        tensor_name_to_index_[tensor.name] = static_cast<int>(tensors_.size());
        tensors_.push_back(tensor);

        std::cout << "[Info] Allocated buffer for tensor '" << tensor.name
              << "' - Shape: " << dimsToString(tensor.dims)
              << ", Size: " << tensor.size << " bytes"
              << ", Dynamic: " << (tensor.is_dynamic ? "Yes" : "No") << std::endl;
    }

    return true;
}

是时候执行推理了

我们申请的输入输出tensor内存分为两个部分：一部分在CPU的内存，一部分在GPU的显存，两边都给输入输出申请了空间。推理时需要先将输入数据复制到预留的CPU内存中，再传输到GPU的显存上，然后执行推理。

此时就需要用：上下文context_。我们需要把张量的位置信息添加到上下文信息中：

// Copy the tensor's host buffer into its device buffer (host -> device), enqueued on stream_.
const cudaError_t err = cudaMemcpyAsync(tensor.device_ptr, tensor.host_ptr,tensor.size, cudaMemcpyHostToDevice, stream_);

// Tell the execution context where this tensor lives: bind the device buffer address to the tensor name.
const bool success = context_->setTensorAddress(tensor.name.c_str(), tensor.device_ptr);

调用官方接口执行推理，我们用的是stream_的方式：

1	const bool success = context_->enqueueV3(stream_);

推理完成后将数据从GPU内存中复制回来：函数名很像，但是有一个非常重要的标记位：cudaMemcpyDeviceToHost

1	const cudaError_t err = cudaMemcpyAsync(tensor.host_ptr, tensor.device_ptr, tensor.size, cudaMemcpyDeviceToHost, stream_);

之后我们将结果输出即可。