I am trying to use multiple GPUs in parallel with TensorFlow to speed up inference for YOLO object detection. I would like to know whether there is a better way to use all the resources of all GPUs when running inference!
I read frames from a video and split them into chunks by the number of GPUs. Then I use multithreading to run the different chunks on different GPUs (currently 2 GPUs).
This improves the total inference time, but the average per-frame time on each GPU gets slower. For example, with only 1 GPU the average time per frame is about 60 ms and the total inference time is 60 seconds. With multithreading, however, the average time per frame in each thread rises to about 110 ms, while the total inference time drops to 50 seconds.
I would like to know why the average per-frame time increases with multithreading, and whether there is a way to improve it.
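As a rough sanity check on those figures (treating 60 ms and 110 ms as per-frame latencies, which is how I measured them), the aggregate throughput with two threads is still a bit higher than with one GPU, which is consistent with the total time dropping from 60 s to 50 s:

# Back-of-the-envelope throughput check using the figures quoted above
# (assuming 60 ms and 110 ms are per-frame latencies).
single_gpu_ms_per_frame = 60.0    # ~60 ms per frame with 1 GPU
threaded_ms_per_frame = 110.0     # ~110 ms per frame in each of the 2 threads
num_threads = 2

single_gpu_fps = 1000.0 / single_gpu_ms_per_frame               # ~16.7 frames/s
threaded_fps = num_threads * (1000.0 / threaded_ms_per_frame)   # ~18.2 frames/s aggregate

print(single_gpu_fps, threaded_fps)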
run_inference_for_single_image and detection_gpu are the functions used for inference:
def run_inference_for_single_image(frame, lbox_resize, sess, input_data, inp_dim, boxes, scores, labels):
    if lbox_resize:
        img, resize_ratio, dw, dh = letterbox_resize(frame, inp_dim, inp_dim)
    else:
        height_ori, width_ori = frame.shape[:2]
        img = cv2.resize(frame, tuple([inp_dim, inp_dim]))

    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = np.asarray(img, np.float32)
    img = img[np.newaxis, :] / 255.

    boxes_, scores_, labels_ = sess.run([boxes, scores, labels], feed_dict={input_data: img})

    # rescale the coordinates to the original image
    if lbox_resize:
        boxes_[:, [0, 2]] = (boxes_[:, [0, 2]] - dw) / resize_ratio
        boxes_[:, [1, 3]] = (boxes_[:, [1, 3]] - dh) / resize_ratio
    else:
        boxes_[:, [0, 2]] *= (width_ori / float(inp_dim))
        boxes_[:, [1, 3]] *= (height_ori / float(inp_dim))

    return boxes_, scores_, labels_
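The letterbox_resize helper is not included here; it is assumed to be the usual YOLO letterbox function that scales the image while preserving the aspect ratio, pads the borders, and returns the padded image plus the resize ratio and padding offsets (resize_ratio, dw, dh) used above. A rough sketch of what it is assumed to do:

def letterbox_resize(img, new_width, new_height, interp=cv2.INTER_LINEAR):
    # Assumed behaviour; the original helper is not shown in this post.
    ori_height, ori_width = img.shape[:2]
    resize_ratio = min(float(new_width) / ori_width, float(new_height) / ori_height)
    resize_w = int(resize_ratio * ori_width)
    resize_h = int(resize_ratio * ori_height)
    img = cv2.resize(img, (resize_w, resize_h), interpolation=interp)

    # center the resized image on a gray canvas and keep the padding offsets
    image_padded = np.full((new_height, new_width, 3), 128, np.uint8)
    dw = (new_width - resize_w) // 2
    dh = (new_height - resize_h) // 2
    image_padded[dh: resize_h + dh, dw: resize_w + dw, :] = img

    return image_padded, resize_ratio, dw, dh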
def detection_gpu(frame_list, device_name,
                  letterbox, sess, input_data,
                  inp_dim, boxes, scores, labels, classes):
    frame_with_rect = []

    with tf.device(device_name):
        for frame in frame_list:
            start = time.time()
            boxes_, scores_, labels_ = run_inference_for_single_image(frame,
                                                                      letterbox,
                                                                      sess,
                                                                      input_data,
                                                                      inp_dim,
                                                                      boxes,
                                                                      scores,
                                                                      labels)
            vis.visualize_boxes_and_labels_yolo(frame,
                                                boxes_,
                                                classes,
                                                labels_,
                                                scores_,
                                                use_normalized_coordinates=False)
            end = time.time()

            cv2.putText(frame, '{:.2f}ms'.format((end - start) * 1000), (40, 40),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.75, (255, 0, 0), 2)

            frame_with_rect.append(frame)

            cv2.imshow(device_name, frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

    return frame_with_rect
The main function loads the model and the video:
def main():
    args = arg_parse()

    PATH_TO_LABELS = 'labels/coco.names'

    # anchors and class labels
    anchors = parse_anchors(args.anchor_path)
    classes = read_class_names(PATH_TO_LABELS)
    num_classes = len(classes)

    VIDEO_PATH = args.video
    inp_dim = args.resol

    try:
        # Read Video file
        cap = cv2.VideoCapture(VIDEO_PATH)
    except IOError:
        print("Input video file", VIDEO_PATH, "doesn't exist")
        sys.exit(1)

    # find number of gpus that is available
    gpus = tf.config.experimental.list_logical_devices('GPU')

    frame_length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    # divide frames of video by number of gpus
    div = frame_length // len(gpus)
    divide_point = [i for i in range(frame_length) if i != 0 and i % div == 0]
    divide_point.pop()

    frame_list = []
    fragments = []
    count = 0
    while cap.isOpened():
        hasFrame, frame = cap.read()
        if not hasFrame:
            frame_list.append(fragments)
            break
        if count in divide_point:
            frame_list.append(fragments)
            fragments = []
        fragments.append(frame)
        count += 1
    cap.release()

    with tf.Session() as sess:
        input_data = tf.placeholder(tf.float32, [1, inp_dim, inp_dim, 3], name='input_data')
        model = Darknet(num_classes, anchors)
        with tf.variable_scope('yolov3'):
            pred_feature_maps = model.forward(input_data, False)
        pred_boxes, pred_confs, pred_probs = model.predict(pred_feature_maps)
        pred_scores = pred_confs * pred_probs

        boxes, scores, labels = gpu_nms(pred_boxes, pred_scores, num_classes,
                                        max_boxes=200, score_thresh=args.confidence,
                                        nms_thresh=args.nmsThreshold)

        saver = tf.train.Saver()
        saver.restore(sess, args.ckpt)

        # Process object detection using threading
        thread_detection = [ThreadWithReturnValue(target=detection_gpu,
                                                  args=(frame_list[i], gpu.name, args.letterbox_resize, sess,
                                                        input_data, inp_dim, boxes, scores, labels, classes))
                            for i, gpu in enumerate(gpus)]

        final_list = []
        # Begin operating threads
        for th in thread_detection:
            th.start()

        # Once tasks are completed get return value (frames) and put to new list
        for th in thread_detection:
            final_list.extend(th.join())
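ThreadWithReturnValue is not shown above; it is assumed to be the usual threading.Thread subclass whose join() hands back the target function's return value, roughly:

from threading import Thread

class ThreadWithReturnValue(Thread):
    # Assumed helper (not shown in the post): a Thread whose join() returns
    # whatever the target function returned.
    def __init__(self, group=None, target=None, name=None, args=(), kwargs=None, daemon=None):
        Thread.__init__(self, group, target, name, args,
                        kwargs if kwargs is not None else {}, daemon=daemon)
        self._return = None

    def run(self):
        if self._target is not None:
            self._return = self._target(*self._args, **self._kwargs)

    def join(self, timeout=None):
        Thread.join(self, timeout)
        return self._return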