YOLOv8模型部署

这篇具有很好参考价值的文章主要介绍了YOLOv8模型部署。希望对大家有所帮助。如果存在错误或未考虑完全的地方,请大家不吝赐教,您也可以点击"举报违法"按钮提交疑问。

YOLOv8模型部署 (针对道路交通采用多样化场景数据训练的模型 来测试新的场景)

——包含yolov8导出onnx【input shape (1, 3, 544, 960) BCHW ,output shape(s) (1, 10710, 15)】

——包含使用trtexec生成engine文件

——包含使用engine文件部署推理的所有源码

参考:

(1)yolov8之导出onnx(二)_model.export(format=“onnx”)-CSDN博客

说明:

(1)本次针对多样化场景,不同场景取1000+张图片(1920*1080)。

(2)训练YOLOv8模型(包含n和s的模型)

1 训练得到的pt文件转onnx

模型是通过Rect=True, imgsz = 960训练的。

# Ultralytics YOLO 🚀, AGPL-3.0 license
# Equivalent CLI (imgsz order is height,width, matching the code below):
# yolo mode=export model=./weights/best.pt format=onnx imgsz=544,960

from ultralytics import YOLO

# Load the trained checkpoint (trained with Rect=True, imgsz=960 per the article).
model = YOLO("./weights/best.pt") 
# success = model.export(format="onnx") 
# Export with a fixed 544x960 (HxW) input -> ONNX input shape (1, 3, 544, 960) BCHW.
success = model.export(format="onnx", imgsz=[544,960]) 

上述,若不改变其它源码,则转化过程是input shape (1, 3, 544, 960) BCHW and output shape(s) (1, 15, 10710) 。

为了“ input shape (1, 3, 544, 960) BCHW and output shape(s) (1, 10710, 15) ”

需作源码调整:改变/home/user/anaconda3/envs/yolov8/lib/python3.10/site-packages/ultralytics/nn/modules/head.py中的源码如下,return前增加 'y = y.permute(0,2,1)'。

# Illustrative patch for ultralytics/nn/modules/head.py ("... ..." marks
# omitted original code) -- not runnable as-is.
class Detect(nn.Module):
    ... ...
    def forward(self, x):
        ... ...
        # Added line: transpose (1, 15, 10710) -> (1, 10710, 15) so each row of
        # the exported output is one candidate box (applied in export mode).
        y = y.permute(0,2,1)
        return y if self.export else (y, x)

2 onnx转engine

# trtexec --onnx=*.onnx --workspace=1024 --best --saveEngine=*.engine --calib=*.calib
# 示例:
/usr/src/tensorrt/bin/trtexec --onnx=best.onnx --workspace=2048 --saveEngine=best_dqjjsy.engine --fp16

3 部署测试源码

目录
code
——testimages  # 测试图片集
——testresult  # 检测结果(YOLO格式)
——src
————include
		src/include/cudaimgproc.cuh
		src/include/streamproc.h
————lib
		src/lib/cudaimgproc.cu
		src/lib/cudaimgproc.o
		src/lib/streamproc.cpp
		src/lib/streamproc.o
————main.cpp
————main.o
——weight     # 模型(engine格式)
——Makefile

Makefile

# Set VERBOSE=1 on the make command line to echo full compiler commands.
ifeq ($(VERBOSE), 1)
AT =
else
AT = @
endif

APP := main

CPP				= $(AT) g++
NVCC            = $(AT) /usr/local/cuda/bin/nvcc -ccbin $(filter-out $(AT), $(CPP))

# Generate code for SM 8.6 (Ampere); change both values for other GPUs.
NVCCFLAGS := -arch=compute_86 -code=sm_86

CPPFLAGS += -std=c++11 \
	-I/usr/local/cuda/targets/x86_64-linux/include \
	-I/usr/local/cuda/include \
	-I/usr/local/include \
	-I/usr/local/include/opencv4 \
	-I./src/include 

LDFLAGS += \
	-lpthread -lcuda -lcudart -lnvinfer \
	-L/usr/local/cuda/lib64 \
	-L/usr/local/cuda/targets/x86_64-linux/lib \
	-L/usr/local/lib \
	-L./src/lib \
	-L/home/dx/workspace/mylibs/hkv \
	-lopencv_core -lopencv_imgproc \
	-lopencv_videoio -lopencv_highgui -lopencv_imgcodecs


# Host-compiler flags are forwarded to nvcc via -Xcompiler; -std=c++11 is
# filtered out because nvcc handles the C++ standard flag itself.
ALL_CPPFLAGS := $(NVCCFLAGS)
ALL_CPPFLAGS += $(addprefix -Xcompiler ,$(filter-out -std=c++11, $(CPPFLAGS)))

SRCS := \
	./src/main.cpp \
	./src/lib/streamproc.cpp

# (renamed from the misspelled SCRCS_CU)
SRCS_CU := \
	./src/lib/cudaimgproc.cu \

OBJS := $(SRCS:.cpp=.o) $(SRCS_CU:.cu=.o)

%.o: %.cpp
	@echo "Compiling: $<"
	$(CPP) $(CPPFLAGS) -o $@ -c $<

%.o: %.cu
	@echo "Compiling: $<"
	$(NVCC) $(ALL_CPPFLAGS) -o $@ -c $<

$(APP): $(OBJS)
	@echo "Linking: $@"
	$(CPP) -o $@ $(OBJS) $(CPPFLAGS) $(LDFLAGS)

# "clean" is not a file; declare it phony so a file named "clean" in the
# working directory cannot shadow the target.
.PHONY: clean
clean:
	rm -f $(APP) $(OBJS)

main.cpp

#include <iostream>
#include <stdio.h>
#include <string>
#include <pthread.h>
#include <chrono>
#include <cuda_runtime.h>
#include <fstream>

#include "streamproc.h"

#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
 

void* video_stream()
{   
    size_t width = 1920, height = 1080;
    // objectdetecter detecter("model.engine");
    objectdetecter detecter("/weight/best.engine");
    
    unsigned char *original_image, *d_original_image;
    cv::Mat picBGR(height, width, CV_8UC3);

    std::vector<std::string> filesPath;
    cv::glob("testimages/*", filesPath);
    cudaMalloc(&d_original_image, width * height * 3 * sizeof(unsigned char));

    std::cout << filesPath.size() << std::endl;
    for (auto file : filesPath)
    {
        cv::Mat org_img = cv::imread(file);
        cudaMemcpy(d_original_image, org_img.data, width * height * 3 * sizeof(unsigned char), cudaMemcpyHostToDevice);
        std::vector<std::vector<double>> detect_result = detecter.detect(d_original_image, width, height);
        std::ofstream txtfile;

        file = file.substr(file.find("/") + 1);
        txtfile.open("testresults/" + file.substr(0, file.size() - 4) + ".txt", std::ios::trunc | std::ios::in);
        for (int i = 0; i < detect_result.size(); i++)
        {
            txtfile << detect_result[i][4] << " " << detect_result[i][0] / width << " " << detect_result[i][1] / height << " " 
                << detect_result[i][2] / width << " " << detect_result[i][3] / height << " " << detect_result[i][5] << "\n";
        }
        txtfile.close();
    }
    cudaFree(d_original_image);

    return 0;
}



// Entry point: execute the batch detection pass once, synchronously.
int main()
{
    (void)video_stream();  // return value (always null) is intentionally ignored
    return 0;
}

src/include/cudaimgproc.cuh

#ifndef __CUDAIMGPROC_CUH__
#define __CUDAIMGPROC_CUH__

void gpuConvertNV12touv420(unsigned char *UV, unsigned char *U, unsigned char *V,
		size_t width, size_t height);

void gpuConvertBGR_HWCtoNV12(unsigned char *src, unsigned char *d_src_pad, unsigned char *Y, unsigned char *UV, 
		size_t width, size_t height, int linesize);

void gpuConvertNV12toBGR_HWC(unsigned char *Y, unsigned char *UV, unsigned char *BGR, unsigned char *d_BGR, 
	size_t width, size_t height, size_t linesize0, size_t linesize1, cudaStream_t stream);

void gpuPreproc(unsigned char *d_BGR, float *d_small, 
	size_t width, size_t swidth, size_t sheight, size_t pad, cudaStream_t stream);

void gpuinfinit(float *img, size_t width, size_t height);

#endif

src/include/streamproc.h

#ifndef __HWSTREAMPROC_H__
#define __HWSTREAMPROC_H__

#include <iostream>
#include <string>
#include <vector>

// FFmpeg headers are C; wrap them so the symbols get C linkage.
extern "C"
{
#include "libavformat/avformat.h"
#include "libavutil/hwcontext.h"
#include "libavutil/opt.h"
#include "libavcodec/avcodec.h"
}

#include <cuda_runtime_api.h>
#include "NvInfer.h"
#include <chrono>


// TensorRT-based object detector: deserializes an engine file and runs
// inference on device-resident BGR images (implementation in streamproc.cpp).
class objectdetecter
{
private:
    cudaStream_t stream;                      // CUDA stream for async copies/inference
    nvinfer1::IExecutionContext *trt_context; // TensorRT execution context
    float *d_input;                           // device-side network input buffer
    float *output;                            // output buffer -- host or device? confirm in streamproc.cpp
    float *trt_output;                        // buffer bound to the engine's output
    float *trt_input;                         // buffer bound to the engine's input
    size_t in_channel;                        // network input channels
    size_t in_width;                          // network input width
    size_t in_height;                         // network input height
    size_t out_col;                           // output columns (presumably 15, per the exported (1,10710,15) shape)
    size_t out_row;                           // output rows (presumably 10710 candidate boxes)
    const char *in_name;                      // engine input binding name
    const char *out_name;                     // engine output binding name

public:
    // Loads and deserializes the engine at engine_path; allocates I/O buffers.
    objectdetecter(const char *engine_path);
    ~objectdetecter();
    // Runs detection on a device image of width x height; returns one row per
    // detection (row layout defined in streamproc.cpp -- confirm before use).
    std::vector<std::vector<double>> detect(unsigned char *input, size_t width, size_t height);
};


#endif

src/lib/cudaimgproc.cu

#include <cuda_runtime.h>
#include "cudaimgproc.cuh"


__global__ void gpuPreproc_kernel(unsigned char *BGR, float *RGB, size_t width, size_t swidth, size_t sheight, size_t pad)
{
    __shared__ float cache[1536];
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    float den = 255.0f * 4.0f;
    int idx_div = (threadIdx.x / 32) * 192;
    int idx_mod = threadIdx.x % 32;
    int dimxBGRsize = blockDim.x * 3 * 2;
    
    #pragma unroll 
    for (int i = 0; i < sheight - pad * 2; i += 4)
    {   
        #pr

到了这里,关于YOLOv8模型部署的文章就介绍完了。如果您还想了解更多内容,请在右上角搜索TOY模板网以前的文章或继续浏览下面的相关文章,希望大家以后多多支持TOY模板网!

本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若转载,请注明出处: 如若内容造成侵权/违法违规/事实不符,请点击违法举报进行投诉反馈,一经查实,立即删除!

领支付宝红包 赞助服务器费用

相关文章

觉得文章有用就打赏一下文章作者

支付宝扫一扫打赏

博客赞助

微信扫一扫打赏

请作者喝杯咖啡吧~博客赞助

支付宝扫一扫领取红包,优惠每天领

二维码1

领取红包

二维码2

领红包