I need to detect hands in images. I am using Darkflow: https://github.com/thtrieu/darkflow.git
I use the tiny-yolo configuration with these weights: https://github.com/digitalbrain79/pyyolo/raw/master/tiny-yolo.weights
I copied tiny-yolo.cfg to tempCfg.cfg and changed the parameters as described in the README.
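For context, my understanding of what the README asks for in a single-class setup (this is a sketch of the idea, not an excerpt from my actual tempCfg.cfg): set classes=1 in the [region] layer, set filters in the [convolutional] layer right above it to num * (classes + 5) = 5 * (1 + 5) = 30, and put only the class name in labels.txt:

[convolutional]
size=1
stride=1
pad=1
# num * (classes + 5) = 5 * (1 + 5)
filters=30
activation=linear

[region]
# other [region] keys left unchanged
classes=1
num=5

and labels.txt contains a single line:

hand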
I use this dataset: http://www.robots.ox.ac.uk/~vgg/data/hands/
Here is the script I use to convert the annotation files:
import os
import xml.etree.cElementTree as ET

import cv2
import numpy as np
import scipy.io
from lxml import etree

annotDir = '/content/hand_dataset/training_dataset/training_data/annotations'
xmlDir = annotDir.replace('annotations', 'annotationsXML')
os.makedirs(xmlDir, exist_ok=True)  # make sure the output directory exists

for wrongAnnotation in os.scandir(annotDir):
    if not wrongAnnotation.path.endswith('.mat'):
        continue
    # derive the output XML path and the matching image path from the .mat path
    path = wrongAnnotation.path
    path = path.replace('.mat', '.xml')
    path = path.replace('annotations', 'annotationsXML')
    imgPath = path.replace('annotationsXML', 'images')
    imgPath = imgPath.replace('.xml', '.jpg')

    # read the image only to get its dimensions for the <size> element
    image = cv2.imread(imgPath)
    height, width, depth = image.shape

    annotation = ET.Element('annotation')
    ET.SubElement(annotation, 'folder').text = xmlDir
    ET.SubElement(annotation, 'filename').text = os.path.basename(imgPath)
    ET.SubElement(annotation, 'segmented').text = '0'
    size = ET.SubElement(annotation, 'size')
    ET.SubElement(size, 'width').text = str(width)
    ET.SubElement(size, 'height').text = str(height)
    ET.SubElement(size, 'depth').text = str(depth)

    data = scipy.io.loadmat(wrongAnnotation.path)
    for x in data.get('boxes')[0]:
        # each box is a (1, 1) structured array; its first four fields
        # (a, b, c, d) are the corner points, each stored as [[y, x]]
        corners = x[0][0].item()[:4]
        yMin = np.min([p[0][0] for p in corners])
        xMin = np.min([p[0][1] for p in corners])
        xMax = np.max([p[0][1] for p in corners])
        yMax = np.max([p[0][0] for p in corners])

        ob = ET.SubElement(annotation, 'object')
        ET.SubElement(ob, 'name').text = 'hand'
        ET.SubElement(ob, 'pose').text = 'Unspecified'
        ET.SubElement(ob, 'truncated').text = '0'
        ET.SubElement(ob, 'difficult').text = '0'
        bbox = ET.SubElement(ob, 'bndbox')
        ET.SubElement(bbox, 'xmin').text = str(xMin)
        ET.SubElement(bbox, 'ymin').text = str(yMin)
        ET.SubElement(bbox, 'xmax').text = str(xMax)
        ET.SubElement(bbox, 'ymax').text = str(yMax)

    # round-trip through lxml only to get pretty-printed output
    xml_str = ET.tostring(annotation)
    root = etree.fromstring(xml_str)
    xml_str = etree.tostring(root, pretty_print=True)
    with open(path, 'wb') as newXML:
        newXML.write(xml_str)
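To sanity-check the conversion, I can draw the converted boxes back onto an image with a quick script like this (a rough sketch; the file name is just an example):

import xml.etree.cElementTree as ET
import cv2

# example paths -- point these at any converted file
xmlPath = '/content/hand_dataset/training_dataset/training_data/annotationsXML/VOC2007_100.xml'
imgPath = xmlPath.replace('annotationsXML', 'images').replace('.xml', '.jpg')

image = cv2.imread(imgPath)
root = ET.parse(xmlPath).getroot()
for ob in root.iter('object'):
    bbox = ob.find('bndbox')
    xMin = int(float(bbox.find('xmin').text))
    yMin = int(float(bbox.find('ymin').text))
    xMax = int(float(bbox.find('xmax').text))
    yMax = int(float(bbox.find('ymax').text))
    cv2.rectangle(image, (xMin, yMin), (xMax, yMax), (0, 255, 0), 2)
cv2.imwrite('check.jpg', image)  # the boxes should sit on the hands

The boxes land on the hands, so I assume the conversion itself is correct.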
It converts annotations from this form (each corner point a-d is stored as [[y, x]], i.e. row first, column second, which is why the script reads index 0 as y and index 1 as x):
{'__header__': b'MATLAB 5.0 MAT-file, Platform: MACI, Created on: Mon Jul 18 17:53:23 2011', '__version__': '1.0', '__globals__': [], 'boxes': array([[array([[(array([[215.63338254, 412.85084076]]), array([[223.47936255, 421.93052125]]), array([[250.44014687, 398.63303021]]), array([[242.59416686, 389.55334972]]), array(['L'], dtype='<U1'), array([], shape=(0, 0), dtype=uint8))]],
dtype=[('a', 'O'), ('b', 'O'), ('c', 'O'), ('d', 'O'), ('handtype', 'O'), ('truncated', 'O')]),
array([[(array([[194.93452622, 273.71437979]]), array([[178.00174829, 275.22468831]]), array([[179.83998359, 295.8340073 ]]), array([[196.77276151, 294.32369879]]), array(['R'], dtype='<U1'), array([], shape=(0, 0), dtype=uint8))]],
dtype=[('a', 'O'), ('b', 'O'), ('c', 'O'), ('d', 'O'), ('handtype', 'O'), ('truncated', 'O')]),
array([[(array([[174.40487672, 310.17948749]]), array([[183.82551544, 317.61262721]]), array([[193.01669191, 305.96388169]]), array([[183.59605319, 298.53074197]]), array(['L'], dtype='<U1'), array([], shape=(0, 0), dtype=uint8))]],
dtype=[('a', 'O'), ('b', 'O'), ('c', 'O'), ('d', 'O'), ('handtype', 'O'), ('truncated', 'O')])]],
dtype=object)}
to this:
<annotation>
<folder>/content/hand_dataset/training_dataset/training_data/annotationsXML</folder>
<filename>VOC2007_100.jpg</filename>
<segmented>0</segmented>
<size>
<width>500</width>
<height>375</height>
<depth>3</depth>
</size>
<object>
<name>hand</name>
<pose>Unspecified</pose>
<truncated>0</truncated>
<difficult>0</difficult>
<bndbox>
<xmin>389.5533497217231</xmin>
<ymin>215.6333825448044</ymin>
<xmax>421.9305212460187</xmax>
<ymax>250.44014686696025</ymax>
</bndbox>
</object>
<object>
<name>hand</name>
<pose>Unspecified</pose>
<truncated>0</truncated>
<difficult>0</difficult>
<bndbox>
<xmin>273.7143797917917</xmin>
<ymin>178.00174829348282</ymin>
<xmax>295.8340073049824</xmax>
<ymax>196.77276151043867</ymax>
</bndbox>
</object>
<object>
<name>hand</name>
<pose>Unspecified</pose>
<truncated>0</truncated>
<difficult>0</difficult>
<bndbox>
<xmin>298.53074196966594</xmin>
<ymin>174.40487671681234</ymin>
<xmax>317.6126272059612</xmax>
<ymax>193.01669191063854</ymax>
</bndbox>
</object>
</annotation>
After that, I start training with this command:
flow --model cfg/tempCfg.cfg --load bin/tiny-yolo.weights --train --annotation /content/hand_dataset/training_dataset/training_data/annotationsXML --dataset /content/hand_dataset/training_dataset/training_data/images --gpu 1.0 --epoch 30 --batch 10 --verbalise false
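As an aside: as far as I understand the darkflow README, training writes checkpoints into the ckpt/ directory, and --load -1 loads the most recent one, e.g.:

flow --model cfg/tempCfg.cfg --load -1 --train --annotation /content/hand_dataset/training_dataset/training_data/annotationsXML --dataset /content/hand_dataset/training_dataset/training_data/images --gpu 1.0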
Then I try:
flow --imgdir /content/hand_dataset/test_dataset/test_data/demoImages --model cfg/tempCfg.cfg --load bin/tiny-yolo.weights --gpu 1.0
The results are completely random: it detects lots of empty areas and misses the hands. Is this because I trained for only 30 epochs, or am I doing something wrong? Training already takes several hours on the GPU, so I expected the output to be at least roughly sensible by that point, but all I get is noise.