RDD2022 数据格式转换与清洗-Toy模板网

这篇具有很好参考价值的文章主要介绍了RDD2022 数据格式转换与清洗。希望对大家有所帮助。如果存在错误或未考虑完全的地方，请大家不吝赐教，您也可以点击"举报违法"按钮提交疑问。

Rdd2022数据集，是关于道路损伤的数据集，与rdd2020相比增加了两万多张图片
但是由于格式不能被yolo直接使用且其中有大量的图片没有符合要求的标注，特写此文章记录数据清洗与格式化的过程

数据集下载

在开始前需要自己下载zip格式的RDD2022数据集，大小为12.4G 点击此处下载

之后，在桌面上新建一个名为my_file 的文件夹，将上面下载的压缩包放进去，将我的main.py放进去，文件夹结构如下

rdd2020数据集数据格式转化与清洗,日常练习,python,rdd2022,python
在pycharm种运行main文件即可
运行完成后my_file结构如下，其中的my_data就是你要的文件夹，其他的都没用：

注意事项

注意：

如果运行过程中出现任何bug使得程序没有进行到底，需要删除所有自动生成的文件，回到最开始的目录结构，从新开始运行main文件
如果报错说有缺了什么包，自己去安装即可
注意设置工作目录为你自己新建的那个my_file文件夹，一般情况下默认就是这个，如果报错找不到目录啥的就看下是不是这个问题
按照个人需求，以国家为单位对数据集进行了train:val=7:3的切割
特别注意：代码中将没有任何标注的图片直接剔除了，这可能会对你的训练产生影响

源代码

main.py的代码如下:文章来源地址https://www.toymoban.com/news/detail-740445.html

import zipfile
import os
import os
import xml.etree.ElementTree as ET
from shutil import copyfile
import shutil
import argparse
from pathlib import Path
import random
from collections import defaultdict
import random

work_dir = os.getcwd()
countries = ["China_Drone", "China_MotorBike", "Czech", "India", "Japan", "Norway", "United_States"]
labels = ["D00", "D10", "D20", "D40"]


# 解压最开始的12.4G的压缩包到工作目录
#    解压之后是一个名为 RDD2022_all_countries 的文件夹
def unzip_rdd2022():
    path = os.path.join(work_dir, 'RDD2022.zip')
    zip_file = zipfile.ZipFile(path)
    zip_list = zip_file.namelist()
    for f in zip_list:
        zip_file.extract(f, work_dir)
    zip_file.close()


# RDD2022_all_countries文件夹里面有6个以国家名称命名的压缩包
#    进入这个文件夹里面继续解压，注意是解压到了RDD2022_all_countries
#    这个文件夹里面，至此所有的压缩文件解压完毕
def unzip_RDD2022_all_countries():
    dir_path = os.path.join(work_dir, 'RDD2022_all_countries')
    all_countries_zip_file_name = os.listdir(dir_path)

    for name in all_countries_zip_file_name:
        print('正在解压{}'.format(name))
        all_countries_zip_file_path = os.path.join(dir_path, name)
        zip_file = zipfile.ZipFile(all_countries_zip_file_path)
        zip_list = zip_file.namelist()
        for f in zip_list:
            zip_file.extract(f, dir_path)
        zip_file.close()
        print('{}已解压完成'.format(name))


# 将所有有标签的图片以及对应的标注移动到一个新的文件夹中
#   然后后续操作都是针对这些有标签的图片进行的，其实就是变相去除了
#   没有标签的图片
def remove_useless_file():
    # 一共6个国家，一个国家一个国家的操作
    RDD2022_all_countries_path = os.path.join(work_dir, 'RDD2022_all_countries')
    for country in countries:
        print("开始对 {} 的标签与图片进行操作".format(country))
        annoFiles = os.listdir(os.path.join(RDD2022_all_countries_path, country + "/train/annotations/xmls/"))
        jpgFiles = os.listdir(os.path.join(RDD2022_all_countries_path, country + "/train/images/"))
        newCountry = "new_" + country
        # 在RDD2022_all_countries文件夹下面新建文件夹，new_countryname/Annotations
        #                                          new_countryname/JPEGImages
        annotations_dir = os.path.join(RDD2022_all_countries_path, newCountry, 'Annotations/')
        jpegimages_dir = os.path.join(RDD2022_all_countries_path, newCountry, 'JPEGImages/')
        os.makedirs(annotations_dir, exist_ok=True)
        os.makedirs(jpegimages_dir, exist_ok=True)

        for annoFile in annoFiles:
            tree = ET.parse(
                os.path.join(RDD2022_all_countries_path + "/" + country + "/train/annotations/xmls/", annoFile))
            root = tree.getroot()
            for obj in root.findall("object"):
                a = obj.find("name").text
                if a not in labels:
                    root.remove(obj)

            if len(root.findall("object")) > 0:
                country_path = os.path.join(RDD2022_all_countries_path, country)
                newCountry_path = os.path.join(RDD2022_all_countries_path, newCountry)
                tree.write(newCountry_path + "/Annotations/" + annoFile)
                copyfile(os.path.join(country_path + "/train/images/", annoFile.split(".")[0]) + ".jpg",
                         newCountry_path + "/JPEGImages/" + annoFile.split(".")[0] + ".jpg")
            else:
                # print(f'{annoFile} 没有标签文件')
                continue
        print("{} 的标签与图片操作完毕".format(country))


# 将所有的图片复制到工作目录下的new_train/jpegimages 文件夹下
# 将所有的标签复制到工作目录下的new_train/annotations 文件夹下


def copy_file_2_new_train_dir():
    # 首先创建new_train文件夹
    os.makedirs(work_dir + "new_train/", exist_ok=True)
    # 创建new_train文件夹下面的两个文件夹
    jpeg_path = os.path.join(work_dir, 'new_train', 'jpegimages/')
    annotation_path = os.path.join(work_dir, 'new_train', 'annotations/')
    os.makedirs(jpeg_path, exist_ok=True)
    os.makedirs(annotation_path, exist_ok=True)

    RDD2022_all_countries_path = os.path.join(work_dir, 'RDD2022_all_countries')
    for country in countries:
        print("{}正在复制".format(country))
        jpeg_dir_path = os.path.join(RDD2022_all_countries_path, 'new_' + country, 'JPEGImages')
        all_jpeg_names = os.listdir(jpeg_dir_path)
        annotation_dir_path = os.path.join(RDD2022_all_countries_path, 'new_' + country, 'Annotations')
        all_anno_names = os.listdir(annotation_dir_path)
        for name in all_jpeg_names:
            source = os.path.join(jpeg_dir_path, name)
            target = os.path.join(work_dir, 'new_train', 'jpegimages')
            shutil.copy(source, target)
        for name in all_anno_names:
            source = os.path.join(annotation_dir_path, name)
            target = os.path.join(work_dir, 'new_train', 'annotations')
            shutil.copy(source, target)
        print("{}复制完毕".format(country))


# 生成一个包含所有xml文件路径的txt文件以便 xml2yolo文件调用
def generate_txt_file():
    annoFiles = os.listdir(os.path.join(work_dir, "new_train/Annotations/"))
    yoloFile = open("./xml2yolo_damage.txt", "w")
    for i in range(len(annoFiles)):
        yoloFile.writelines(work_dir + "/new_train/Annotations/" + annoFiles[i] + "\n")
    yoloFile.close()


def xml2yolo():
    import argparse
    import os
    import xml.etree.ElementTree as ET
    from PIL import Image
    from collections import defaultdict

    # Type of image in Dataset
    imageType = ["jpeg", "png", "jpg", "JPEG", "JPG", "PNG"]
    # dictionary to store list of image paths in each class
    imageListDict = defaultdict(set)

    def convert(size, box):
        dw = 1. / size[0]
        dh = 1. / size[1]
        x = (box[0] + box[1]) / 2.0
        y = (box[2] + box[3]) / 2.0
        w = box[1] - box[0]
        h = box[3] - box[2]
        x = x * dw
        w = w * dw
        y = y * dh
        h = h * dh
        return [x, y, w, h]

    # convert minX,minY,maxX,maxY to normalized numbers required by Yolo
    def getYoloNumbers(imagePath, minX, minY, maxX, maxY):
        image = Image.open(imagePath)
        w = int(image.size[0])
        h = int(image.size[1])
        b = (minX, maxX, minY, maxY)
        bb = convert((w, h), b)
        image.close()
        return bb

    def getFileList3(filePath):
        xmlFiles = []
        with open(filePath, "r") as f:
            xmlFiles = f.readlines()
            for i in range(len(xmlFiles)):
                temp = xmlFiles[i].strip().rsplit('.', 1)[0]
                xmlFiles[i] = os.path.abspath(temp.replace("JPEGImages", "Annotations") + ".xml")
                labels_path = os.path.dirname(xmlFiles[i]).replace("Annotations", "labels")
                if not os.path.exists(labels_path):
                    os.mkdir(labels_path)
                assert (os.path.exists(xmlFiles[i]))

        return xmlFiles

    def main():
        parser = argparse.ArgumentParser(description='run phase2.')

        parser.add_argument('--input-file', type=str,
                            help='location to the list of images/xml files(absolute path). sample file at "./xml2yolo_damagee.txt"',
                            default='./xml2yolo_damage.txt')
        args = parser.parse_args()

        # assign each class of dataset to a number
        outputCtoId = {'D00': 0, 'D10': 1, 'D20': 2, 'D40': 3}

        # read the path of the directory where XML and images are present
        xmlFiles = getFileList3(args.input_file)

        print("total files:", len(xmlFiles))
        print('正在转换......')

        # loop over each file under dirPath
        for file in xmlFiles:
            filePath = file
            # print(filePath)
            tree = ET.parse(filePath)
            root = tree.getroot()

            i = 0
            imageFile = filePath[:-4].replace("Annotations", "JPEGImages") + "." + imageType[i]
            while (not os.path.isfile(imageFile) and i < 2):
                i += 1
                imageFile = filePath[:-4].replace("Annotations", "JPEGImages") + "." + imageType[i]

            if not os.path.isfile(imageFile):
                print("File not found:", imageFile)
                continue

            txtFile = filePath[:-4].replace("Annotations", "labels") + ".txt"
            yoloOutput = open(txtFile, "w")

            # loop over each object tag in annotation tag
            for objects in root.findall('object'):
                surfaceType = objects.find('name').text.replace(" ", "")
                if surfaceType == "D30":
                    continue
                bndbox = objects.find('bndbox')
                [minX, minY, maxX, maxY] = [int(float(child.text)) for child in bndbox]
                [x, y, w, h] = getYoloNumbers(imageFile, int(minX), int(minY), int(maxX), int(maxY))
                yoloOutput.write(
                    str(outputCtoId[surfaceType]) + " " + str(x) + " " + str(y) + " " + str(w) + " " + str(h) + "\n")
                imageListDict[outputCtoId[surfaceType]].add(imageFile)
            yoloOutput.close()
        for cl in imageListDict:
            print(labels[cl], ":", len(imageListDict[cl]))

    main()


def generate_my_data():
    # 首先在工作目录下创建 my_data文件夹，以及他下面的images文件夹,labels文件夹
    os.makedirs(work_dir + 'my_data/', exist_ok=True)
    images_path = os.path.join(work_dir, 'my_data', 'images/')
    labels_path = os.path.join(work_dir, 'my_data', 'labels/')
    os.makedirs(images_path, exist_ok=True)
    os.makedirs(labels_path, exist_ok=True)
    # images和labels文件夹下面各有一个train，val文件夹
    os.makedirs(os.path.join(images_path, 'train/'), exist_ok=True)
    os.makedirs(os.path.join(images_path, 'val/'), exist_ok=True)
    os.makedirs(os.path.join(labels_path, 'train/'), exist_ok=True)
    os.makedirs(os.path.join(labels_path, 'val/'), exist_ok=True)
    print("最终my_data文件夹基本结构创建完毕")

    # 将new_train中6个国家的图片的绝对路径分别放到6个列表中
    new_train_path = os.path.join(work_dir, 'new_train')
    jpeg_dir_path = os.path.join(new_train_path, 'jpegimages')
    labels_dir_path = os.path.join(new_train_path, 'labels')
    all_images_name = os.listdir(jpeg_dir_path)

    all_countries_images = defaultdict(lambda: [])
    for name in all_images_name:
        country_name = '_'.join(name.split('_')[:-1])
        all_countries_images[country_name].append(name)
    images_len = sum([len(i) for i in all_countries_images.values()])
    print("一共有{}张图片".format(images_len))
    for k, v in all_countries_images.items():
        print("{} 一共有 {}张图片".format(k, len(v)))
    print('*************************')
    print("开始切分数据集")
    for country in countries:
        image_len = len(all_countries_images[country])
        train_nums = int(image_len * 0.7)
        val_nums = image_len - train_nums
        print("{}一共{}张图片，训练集7/10一共是{}张，测试集3/10一共是{}张，正在切割".format(country, image_len, train_nums, val_nums))
        # 验证集一共val_nums张图片, 一共image_len张图片，索引  0~image_len-1 ,从里面抽取val_index个数
        val_index = random.sample(range(0, image_len), val_nums)
        for idx, name in enumerate(all_countries_images[country]):
            # 图片的复制
            source = os.path.join(jpeg_dir_path, name)
            # target有两种可能，一种是train,一种是val
            target = os.path.join(images_path, 'train') if idx not in val_index else os.path.join(images_path, 'val')
            shutil.copy(source, target)
            # 图片对应的label的复制
            #    label的名称就是图片的名称改掉后缀
            label_name = name.split('.')[0] + '.txt'
            label_source = os.path.join(work_dir, 'new_train', 'labels', label_name)
            label_target = os.path.join(work_dir, 'my_data', 'labels',
                                        'train') if idx not in val_index else os.path.join(work_dir, 'my_data',
                                                                                           'labels', 'val')
            shutil.copy(label_source, label_target)
    all_train_len = len(os.listdir(os.path.join(work_dir, 'my_data', 'images', 'train')))
    all_val_len = len(os.listdir(os.path.join(work_dir, 'my_data', 'images', 'val')))
    print("所有数据切分完毕,训练集一共{}条，验证集一共{}条".format(all_train_len, all_val_len))
    print("\n\n\n\n*************************")
    print("完成，目标文件夹就是my_data, 其他的文件都可以删除")
    print("注意：一共4种损伤类型，4种类型的名称以及对应的编号为")
    for idx, i in enumerate(labels):
        print("{}: {}".format(i, idx))


if __name__ == '__main__':
    print("正在解压12.4G大的最外面的压缩包")
    unzip_rdd2022()
    print("正在解压6个国家的压缩包")
    unzip_RDD2022_all_countries()
    print("对图片进行去除清洗操作")
    remove_useless_file()
    print("正在将所有的图片以及标签复制到统一的目录下")
    copy_file_2_new_train_dir()
    print("正在生成用于标注转换的txt文件")
    generate_txt_file()
    print("正在转换标签")
    xml2yolo()
    print('正在生成最终文件夹')
    generate_my_data()