数据集分配的一些小技巧

2024-01-01 About 600 words 3 minutes

Contents

数据集分配

本实验使用的是4.2k HRW yolo dataset [ https://github.com/Whiffe/SCB-dataset.git ]

目录结构

整理完成后的目录结构如下：


dataset/
|--- images/
|    |--- train/
|    |    |--- image1.jpg
|    |    |--- image2.jpg
|    |    |--- ...
|    |
|    |--- val/
|    |    |--- image3.jpg
|    |    |--- image4.jpg
|    |    |--- ...
|    |
|    |--- test/
|         |--- image5.jpg
|         |--- image6.jpg
|         |--- ...
|
|--- labels/
|    |--- train/
|    |    |--- label1.txt
|    |    |--- label2.txt
|    |    |--- ...
|    |
|    |--- val/
|    |    |--- label3.txt
|    |    |--- label4.txt
|    |    |--- ...
|    |
|    |--- test/
|         |--- label5.txt
|         |--- label6.txt
|         |--- ...
|
|--- train.txt
|--- val.txt
|--- test.txt

随机分配数据集

数据集比例：train:val:test = 6:2:2

split_dataset.py

说明：脚本假设原始图片直接放在 dataset/images/ 下、标签直接放在 dataset/labels/ 下，文件名为 0000001.png ~ 0004245.png 以及同名的 .txt；脚本会把图片和对应标签复制（而不是移动）到 train/val/test 子目录中。


import os
import shutil
import random

# Dataset locations. The un-split images and labels are read directly from
# these two folders.
dataset_path = "./dataset"
images_path = os.path.join(dataset_path, "images")
labels_path = os.path.join(dataset_path, "labels")

# Image sub-folders for the three splits.
train_path = os.path.join(images_path, "train")
val_path = os.path.join(images_path, "val")
test_path = os.path.join(images_path, "test")

# Create every image and label split folder once, up front, instead of
# re-checking them for each copied file.
for split_name in ("train", "val", "test"):
    os.makedirs(os.path.join(images_path, split_name), exist_ok=True)
    os.makedirs(os.path.join(labels_path, split_name), exist_ok=True)

# Inclusive range of image numbers; file names are the number zero-padded to
# seven digits, e.g. 0000001.png.
start = 1
end = 4245

# Keep only the numbered images whose image file AND label file both exist,
# so a gap in the numbering cannot abort the run halfway through copying.
image_list = []
missing_files = []
for i in range(start, end + 1):
    image_file = f"{i:07}.png"
    label_file = os.path.splitext(image_file)[0] + ".txt"
    if (os.path.isfile(os.path.join(images_path, image_file))
            and os.path.isfile(os.path.join(labels_path, label_file))):
        image_list.append(image_file)
    else:
        missing_files.append(image_file)

# Nothing usable at all almost certainly means a wrong working directory or
# naming scheme; fail loudly instead of reporting thousands of skipped files.
if not image_list:
    raise FileNotFoundError(
        f"No image/label pairs found in {images_path} and {labels_path}"
    )

for image_file in missing_files:
    print(f"Skipping {image_file}: image or label file is missing")

total_images = len(image_list)

# Shuffle so that the split membership is random.
random.shuffle(image_list)

# Split sizes for train:val:test = 6:2:2. The test split takes the remainder
# so that the three counts always add up to total_images.
train_count = int(total_images * 0.6)
val_count = int(total_images * 0.2)
test_count = total_images - train_count - val_count

# Copy each image and its label into the folder of the split it falls in.
for i, image_file in enumerate(image_list):
    label_file = os.path.splitext(image_file)[0] + ".txt"
    source_image_path = os.path.join(images_path, image_file)
    source_label_path = os.path.join(labels_path, label_file)

    # The position in the shuffled list decides the split.
    if i < train_count:
        folder = "train"
    elif i < train_count + val_count:
        folder = "val"
    else:
        folder = "test"

    destination_image_path = os.path.join(images_path, folder, image_file)
    destination_label_path = os.path.join(labels_path, folder, label_file)

    # Progress / debugging output.
    print(f"Copying {source_image_path} to {destination_image_path}")
    print(f"Copying {source_label_path} to {destination_label_path}")

    # Copy (not move) the image and its matching label; the originals stay
    # where they are.
    shutil.copyfile(source_image_path, destination_image_path)
    shutil.copyfile(source_label_path, destination_label_path)

将绝对路径写入 txt 文件

createPathTxt.py

说明：脚本中使用的是相对路径 ./dataset，因此需要在 dataset 文件夹所在的目录下运行。


import os

# Image folders: the image root plus one sub-folder per split.
dataset_folder = "./dataset/images"
train_folder, val_folder, test_folder = (
    "./dataset/images/train",
    "./dataset/images/val",
    "./dataset/images/test",
)

# Output text files, one per split.
train_txt_path, val_txt_path, test_txt_path = (
    "./dataset/train.txt",
    "./dataset/val.txt",
    "./dataset/test.txt",
)

def get_file_paths(folder):
    """Return the paths of all files under ``folder``, searched recursively.

    Each returned path is ``folder`` joined with the file's location beneath
    it, so it is relative or absolute exactly as ``folder`` is.

    Directories are visited and files are listed in sorted name order, so the
    same directory content always produces the same list; ``os.walk`` on its
    own yields entries in arbitrary order. A ``folder`` that does not exist
    gives an empty list, because ``os.walk`` ignores the listing error by
    default.
    """
    file_paths = []
    for root, dirs, files in os.walk(folder):
        # Sorting in place controls the order in which os.walk descends.
        dirs.sort()
        file_paths.extend(os.path.join(root, name) for name in sorted(files))
    return file_paths

# Gather the file paths of the train, val and test folders, in that order.
train_file_paths, val_file_paths, test_file_paths = (
    get_file_paths(split_folder)
    for split_folder in (train_folder, val_folder, test_folder)
)

def write_file_paths(file_paths, txt_path):
    """Write every path in ``file_paths`` to ``txt_path``, one per line.

    Each path is passed through ``os.path.abspath`` before it is written.
    ``txt_path`` is opened in ``'w'`` mode, so any existing content is
    replaced.
    """
    absolute_lines = (os.path.abspath(entry) + '\n' for entry in file_paths)
    with open(txt_path, 'w') as out:
        out.writelines(absolute_lines)

# Write the train, val and test path lists in turn, reporting the absolute
# location of each output file right after it has been written.
for split_paths, split_txt, done_prefix in (
    (train_file_paths, train_txt_path, "训练集文件路径已写入: "),
    (val_file_paths, val_txt_path, "验证集文件路径已写入: "),
    (test_file_paths, test_txt_path, "测试集文件路径已写入: "),
):
    write_file_paths(split_paths, split_txt)
    print(f"{done_prefix}{os.path.abspath(split_txt)}")