如何使用分层采样将图像文件夹分为测试/训练/验证集?

时间:2018-10-31 00:36:20

标签: python python-3.x

我有一个很大的图像文件夹,还有一个CSV文件,其中包含每个图像的类标签。因为它们都在一个巨大的文件夹中,所以我想将它们分成训练/测试/验证集;也许创建三个新文件夹,然后根据某种Python脚本将图像移动到每个文件夹中。我想进行分层抽样,这样我就可以在所有三个集合中保持相同的类百分比。

制作可以做到这一点的脚本的方法是什么?

4 个答案:

答案 0 :(得分:4)

在Abdul Mukit的答案基础上稍作扩展,使其支持两个以上的类,并遍历每个类,为每个类分别创建训练/验证/测试集。

import os
import numpy as np
import shutil
import random

# Split every class folder under root_dir into train / val / test copies.
#
# Expected layout before running:  root_dir/<class>/<image files>
# Layout created by this script:   root_dir/{train,val,test}/<class>/<image files>
#
# Meant to be run once: os.makedirs() raises FileExistsError when the split
# folders already exist, which stops a second run from re-shuffling the files
# and leaking images between the existing splits.
root_dir = '4_classes'
classes_dir = ['class1', 'class2', 'class3', 'class4']

val_ratio = 0.15
test_ratio = 0.05


def partition_file_names(file_names, val_ratio, test_ratio):
    """Cut a sequence of names into (train, val, test) lists, keeping order.

    The caller is responsible for shuffling ``file_names`` beforehand.

    Args:
        file_names: Names to distribute.
        val_ratio: Fraction of the names that goes to the validation list.
        test_ratio: Fraction of the names that goes to the test list.

    Returns:
        A ``(train, val, test)`` tuple of lists. The cut positions are
        truncated to whole items with ``int()``.

    Raises:
        ValueError: If a ratio is negative or both ratios sum to more than 1.
    """
    if val_ratio < 0 or test_ratio < 0 or val_ratio + test_ratio > 1:
        raise ValueError(
            'val_ratio and test_ratio must be >= 0 and sum to at most 1')

    names = list(file_names)
    total = len(names)
    # Train gets what is left after BOTH held-out shares. The parentheses
    # matter: "1 - val_ratio + test_ratio" would hand train 90% instead of 80%.
    train_end = int(total * (1 - (val_ratio + test_ratio)))
    val_end = int(total * (1 - test_ratio))
    return names[:train_end], names[train_end:val_end], names[val_end:]


def split_class(root_dir, cls, val_ratio, test_ratio):
    """Copy the files of one class folder into its train / val / test folders.

    Args:
        root_dir: Folder that contains the class folders and receives the
            ``train``, ``val`` and ``test`` folders.
        cls: Class folder name; a leading or trailing '/' is ignored.
        val_ratio: Fraction of the class copied to ``val``.
        test_ratio: Fraction of the class copied to ``test``.

    Raises:
        FileNotFoundError: If the class folder does not exist.
        FileExistsError: If a destination folder already exists.
    """
    cls = cls.strip('/')  # accept both 'class1' and '/class1'
    src = os.path.join(root_dir, cls)  # Folder to copy images from

    # List the source before creating anything, so a wrong class name does
    # not leave empty split folders behind. Sub-directories are skipped
    # because shutil.copy() only handles regular files. Sorting gives the
    # shuffle a fixed starting order regardless of os.listdir() ordering.
    allFileNames = sorted(
        name for name in os.listdir(src)
        if os.path.isfile(os.path.join(src, name))
    )
    np.random.shuffle(allFileNames)
    train_FileNames, val_FileNames, test_FileNames = partition_file_names(
        allFileNames, val_ratio, test_ratio)

    print('Total images: ', len(allFileNames))
    print('Training: ', len(train_FileNames))
    print('Validation: ', len(val_FileNames))
    print('Testing: ', len(test_FileNames))

    subsets = (
        ('train', train_FileNames),
        ('val', val_FileNames),
        ('test', test_FileNames),
    )
    for subset, names in subsets:
        destination = os.path.join(root_dir, subset, cls)
        os.makedirs(destination)
        for name in names:
            shutil.copy(os.path.join(src, name), destination)


if __name__ == '__main__':
    for cls in classes_dir:
        split_class(root_dir, cls, val_ratio, test_ratio)

答案 1 :(得分:3)

使用Python库split-folders。

pip install split-folders

假设所有图像都存储在Data文件夹中,然后按如下方式使用:

import split_folders

# Shares of the images for the train, val and test folders, in that order.
train_val_test = (.8, 0.1, 0.1)

# Split the images found under 'Data' into the "output" directory.
# NOTE(review): the fixed seed presumably makes the shuffle repeatable --
# confirm against the split-folders documentation.
split_folders.ratio(
    'Data',
    output="output",
    seed=1337,
    ratio=train_val_test,
)

在运行上述代码段时,它将在output目录中创建3个文件夹:

  • train
  • val
  • test

可以使用ratio自变量(train:val:test)中的值来改变每个文件夹中的图像数量。

答案 2 :(得分:2)

我自己也遇到了类似的问题。我所有的图像都存储在两个文件夹中:“Project/Data2/DPN+”和“Project/Data2/DPN-”。这是一个二分类问题,两个类别是“DPN+”和“DPN-”,这两个类文件夹中都是.png图像。我的目标是将数据集分配到训练、验证和测试文件夹中。这些新文件夹中的每一个都将再包含两个子文件夹——“DPN+”和“DPN-”——用于表示类别。划分时我使用70:15:15的比例。我是Python初学者,所以如果有任何错误,请告诉我。

以下是我的代码:

import os
import numpy as np
import shutil

# Split both class folders of a binary dataset into train / val / test copies
# using a 70 : 15 : 15 distribution.
#
# Expected layout before running:  root_dir/DPN+  and  root_dir/DPN-
# Layout created by this script:   root_dir/{train,val,test}/{DPN+,DPN-}
#
# One time use: os.makedirs() raises FileExistsError if the split folders are
# already there, so an existing split is never silently mixed with a new one.
root_dir = 'Data2'
posCls = '/DPN+'
negCls = '/DPN-'


def three_way_split(names, train_end=0.7, val_end=0.85):
    """Cut a sequence into (train, val, test) lists at two cumulative fractions.

    The caller is responsible for shuffling ``names`` beforehand.

    Args:
        names: Items to distribute.
        train_end: Fraction of the items where the train part ends.
        val_end: Fraction of the items where the validation part ends; the
            remainder becomes the test part.

    Returns:
        A ``(train, val, test)`` tuple of lists. The cut positions are
        truncated to whole items with ``int()``.

    Raises:
        ValueError: Unless ``0 <= train_end <= val_end <= 1``.
    """
    if not 0 <= train_end <= val_end <= 1:
        raise ValueError('expected 0 <= train_end <= val_end <= 1')

    items = list(names)
    first_cut = int(len(items) * train_end)
    second_cut = int(len(items) * val_end)
    return items[:first_cut], items[first_cut:second_cut], items[second_cut:]


def copy_class_split(root_dir, currentCls):
    """Copy one class folder's files into its train / val / test folders.

    Args:
        root_dir: Folder holding the class folders; the split folders are
            created inside it as well.
        currentCls: Class folder name with a leading '/', e.g. '/DPN+'.

    Raises:
        FileNotFoundError: If the class folder does not exist.
        FileExistsError: If a destination folder already exists.
    """
    src = root_dir + currentCls  # Folder to copy images from

    # Creating partitions of the data after shuffling
    allFileNames = os.listdir(src)
    np.random.shuffle(allFileNames)
    train_FileNames, val_FileNames, test_FileNames = three_way_split(allFileNames)

    print('Total images: ', len(allFileNames))
    print('Training: ', len(train_FileNames))
    print('Validation: ', len(val_FileNames))
    print('Testing: ', len(test_FileNames))

    # Copy-pasting images
    subsets = (
        ('/train', train_FileNames),
        ('/val', val_FileNames),
        ('/test', test_FileNames),
    )
    for subset, fileNames in subsets:
        destination = root_dir + subset + currentCls
        os.makedirs(destination)
        for name in fileNames:
            shutil.copy(src + '/' + name, destination)


if __name__ == '__main__':
    # Both classes are processed, so neither set of split folders stays empty.
    for currentCls in (posCls, negCls):
        copy_class_split(root_dir, currentCls)

答案 3 :(得分:0)

我有类似的任务。我的图像和XML格式的相应注释存储在一个文件夹中。 我制作了训练和测试文件夹,但在分割文件(请参阅脚本) 后,将原始文件夹用作验证文件夹。

这是我的脚本,用于将文件分成测试/训练/验证集:

import os
from random import choice
import shutil

# Move image + XML annotation pairs out of one flat folder into train and
# test folders; whatever stays behind becomes the validation set.

#setup dir names
trainPath = 'train'
valPath = 'val'
testPath = 'test'
crsPath = 'img' #dir where images and annotations stored

#setup ratio (val ratio = rest of the files in origin dir after splitting into train and test)
train_ratio = 0.8
test_ratio = 0.1


def annotation_name(image_name):
    """Return the .xml annotation file name that belongs to ``image_name``.

    Only the last extension is replaced, so extensions of any length
    ('.jpg', '.jpeg', '.png') are handled.
    """
    return os.path.splitext(image_name)[0] + '.xml'


def list_images(folder):
    """Return the sorted names of the image files directly inside ``folder``.

    Every regular file that does not end in '.xml' counts as an image.
    Sub-directories are ignored because the files are later addressed as
    ``folder/<name>``.

    Raises:
        FileNotFoundError: If an image has no matching annotation file. This
            is checked up front so that nothing is moved from an inconsistent
            folder.
    """
    entries = set(os.listdir(folder))
    images = sorted(
        name for name in entries
        if not name.endswith('.xml')
        and os.path.isfile(os.path.join(folder, name))
    )
    missing = [name for name in images if annotation_name(name) not in entries]
    if missing:
        raise FileNotFoundError(
            'images without annotation file: ' + ', '.join(missing))
    return images


def draw_random(pool, count):
    """Remove ``count`` randomly chosen items from ``pool`` and return them.

    ``pool`` is modified in place, so consecutive draws never overlap.

    Raises:
        ValueError: If ``count`` exceeds the number of items left in ``pool``.
    """
    if count > len(pool):
        raise ValueError('cannot draw more items than the pool contains')
    drawn = []
    for _ in range(count):
        picked = choice(pool)
        pool.remove(picked)
        drawn.append(picked)
    return drawn


def move_pairs(image_names, srcPath, dstPath):
    """Move each image and its annotation file from ``srcPath`` to ``dstPath``.

    ``dstPath`` is created when it does not exist yet.
    """
    os.makedirs(dstPath, exist_ok=True)
    for fileJpg in image_names:
        fileXml = annotation_name(fileJpg)
        shutil.move(os.path.join(srcPath, fileJpg), os.path.join(dstPath, fileJpg))
        shutil.move(os.path.join(srcPath, fileXml), os.path.join(dstPath, fileXml))


def count_images(folder):
    """Return how many entries in ``folder`` do not end in '.xml'."""
    return sum(1 for name in os.listdir(folder) if not name.endswith('.xml'))


if __name__ == '__main__':
    imgs = list_images(crsPath)

    #total count of imgs
    totalImgCount = len(imgs)

    # Both counts are taken from the full image list, before anything is drawn.
    countForTrain = int(len(imgs)*train_ratio)
    countForTest = int(len(imgs)*test_ratio)

    move_pairs(draw_random(imgs, countForTrain), crsPath, trainPath)
    move_pairs(draw_random(imgs, countForTest), crsPath, testPath)

    #rest of files will be validation files, so rename origin dir to val dir
    os.rename(crsPath, valPath)

    #summary information after splitting
    print('Total images: ', totalImgCount)
    print('Images in train dir:', count_images(trainPath))
    print('Images in test dir:', count_images(testPath))
    print('Images in validation dir:', count_images(valPath))