I am trying to use multiple GPUs in parallel with TensorFlow to speed up inference for YOLO object detection. I would like to know whether there is a better way to use all the resources of all GPUs when running inference!
I read frames from a video and split them into chunks by the number of GPUs. Then I use multithreading to run the different chunks on different GPUs (currently 2 GPUs).
This improves the total inference time, but the average per-frame time on each GPU gets slower. For example, with only 1 GPU the average time per frame is about 60 ms and the total inference time is 60 seconds. With multithreading, however, the average time per frame in each thread rises to about 110 ms, while the total inference time drops to 50 seconds.
I would like to know why the average per-frame time increases with multithreading, and whether there is a way to improve it.
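As a rough sanity check on those figures (treating 60 ms and 110 ms as per-frame latencies, which is how I measured them), the aggregate throughput with two threads is still a bit higher than with one GPU, which is consistent with the total time dropping from 60 s to 50 s:

# Back-of-the-envelope throughput check using the figures quoted above
# (assuming 60 ms and 110 ms are per-frame latencies).
single_gpu_ms_per_frame = 60.0    # ~60 ms per frame with 1 GPU
threaded_ms_per_frame = 110.0     # ~110 ms per frame in each of the 2 threads
num_threads = 2

single_gpu_fps = 1000.0 / single_gpu_ms_per_frame               # ~16.7 frames/s
threaded_fps = num_threads * (1000.0 / threaded_ms_per_frame)   # ~18.2 frames/s aggregate

print(single_gpu_fps, threaded_fps)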
run_inference_for_single_image and detection_gpu are the functions used for inference:
def run_inference_for_single_image(frame, lbox_resize, sess, input_data, inp_dim, boxes, scores, labels):
    if lbox_resize:
        img, resize_ratio, dw, dh = letterbox_resize(frame, inp_dim, inp_dim)
    else:
        height_ori, width_ori = frame.shape[:2]
        img = cv2.resize(frame, tuple([inp_dim, inp_dim]))

    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = np.asarray(img, np.float32)
    img = img[np.newaxis, :] / 255.

    boxes_, scores_, labels_ = sess.run([boxes, scores, labels], feed_dict={input_data: img})

    # rescale the coordinates to the original image
    if lbox_resize:
        boxes_[:, [0, 2]] = (boxes_[:, [0, 2]] - dw) / resize_ratio
        boxes_[:, [1, 3]] = (boxes_[:, [1, 3]] - dh) / resize_ratio
    else:
        boxes_[:, [0, 2]] *= (width_ori / float(inp_dim))
        boxes_[:, [1, 3]] *= (height_ori / float(inp_dim))

    return boxes_, scores_, labels_
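The letterbox_resize helper is not included here; it is assumed to be the usual YOLO letterbox function that scales the image while preserving the aspect ratio, pads the borders, and returns the padded image plus the resize ratio and padding offsets (resize_ratio, dw, dh) used above. A rough sketch of what it is assumed to do:

def letterbox_resize(img, new_width, new_height, interp=cv2.INTER_LINEAR):
    # Assumed behaviour; the original helper is not shown in this post.
    ori_height, ori_width = img.shape[:2]
    resize_ratio = min(float(new_width) / ori_width, float(new_height) / ori_height)
    resize_w = int(resize_ratio * ori_width)
    resize_h = int(resize_ratio * ori_height)
    img = cv2.resize(img, (resize_w, resize_h), interpolation=interp)

    # center the resized image on a gray canvas and keep the padding offsets
    image_padded = np.full((new_height, new_width, 3), 128, np.uint8)
    dw = (new_width - resize_w) // 2
    dh = (new_height - resize_h) // 2
    image_padded[dh: resize_h + dh, dw: resize_w + dw, :] = img

    return image_padded, resize_ratio, dw, dh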
def detection_gpu(frame_list, device_name,
                  letterbox, sess, input_data,
                  inp_dim, boxes, scores, labels, classes):
    frame_with_rect = []

    with tf.device(device_name):
        for frame in frame_list:
            start = time.time()
            boxes_, scores_, labels_ = run_inference_for_single_image(frame,
                                                                      letterbox,
                                                                      sess,
                                                                      input_data,
                                                                      inp_dim,
                                                                      boxes,
                                                                      scores,
                                                                      labels)
            vis.visualize_boxes_and_labels_yolo(frame,
                                                boxes_,
                                                classes,
                                                labels_,
                                                scores_,
                                                use_normalized_coordinates=False)
            end = time.time()

            cv2.putText(frame, '{:.2f}ms'.format((end - start) * 1000), (40, 40),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.75, (255, 0, 0), 2)

            frame_with_rect.append(frame)

            cv2.imshow(device_name, frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

    return frame_with_rect
The main function loads the model and the video:
def main():
    args = arg_parse()

    PATH_TO_LABELS = 'labels/coco.names'

    # anchors and class labels
    anchors = parse_anchors(args.anchor_path)
    classes = read_class_names(PATH_TO_LABELS)
    num_classes = len(classes)

    VIDEO_PATH = args.video
    inp_dim = args.resol

    try:
        # Read Video file
        cap = cv2.VideoCapture(VIDEO_PATH)
    except IOError:
        print("Input video file", VIDEO_PATH, "doesn't exist")
        sys.exit(1)

    # find number of gpus that is available
    gpus = tf.config.experimental.list_logical_devices('GPU')

    frame_length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    # divide frames of video by number of gpus
    div = frame_length // len(gpus)
    divide_point = [i for i in range(frame_length) if i != 0 and i % div == 0]
    divide_point.pop()

    frame_list = []
    fragments = []
    count = 0
    while cap.isOpened():
        hasFrame, frame = cap.read()
        if not hasFrame:
            frame_list.append(fragments)
            break
        if count in divide_point:
            frame_list.append(fragments)
            fragments = []
        fragments.append(frame)
        count += 1
    cap.release()

    with tf.Session() as sess:
        input_data = tf.placeholder(tf.float32, [1, inp_dim, inp_dim, 3], name='input_data')
        model = Darknet(num_classes, anchors)
        with tf.variable_scope('yolov3'):
            pred_feature_maps = model.forward(input_data, False)
        pred_boxes, pred_confs, pred_probs = model.predict(pred_feature_maps)
        pred_scores = pred_confs * pred_probs

        boxes, scores, labels = gpu_nms(pred_boxes, pred_scores, num_classes,
                                        max_boxes=200, score_thresh=args.confidence,
                                        nms_thresh=args.nmsThreshold)

        saver = tf.train.Saver()
        saver.restore(sess, args.ckpt)

        # Process object detection using threading
        thread_detection = [ThreadWithReturnValue(target=detection_gpu,
                                                  args=(frame_list[i], gpu.name, args.letterbox_resize, sess,
                                                        input_data, inp_dim, boxes, scores, labels, classes))
                            for i, gpu in enumerate(gpus)]

        final_list = []
        # Begin operating threads
        for th in thread_detection:
            th.start()

        # Once tasks are completed get return value (frames) and put to new list
        for th in thread_detection:
            final_list.extend(th.join())
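ThreadWithReturnValue is not shown above; it is assumed to be the usual threading.Thread subclass whose join() hands back the target function's return value, roughly:

from threading import Thread

class ThreadWithReturnValue(Thread):
    # Assumed helper (not shown in the post): a Thread whose join() returns
    # whatever the target function returned.
    def __init__(self, group=None, target=None, name=None, args=(), kwargs=None, daemon=None):
        Thread.__init__(self, group, target, name, args,
                        kwargs if kwargs is not None else {}, daemon=daemon)
        self._return = None

    def run(self):
        if self._target is not None:
            self._return = self._target(*self._args, **self._kwargs)

    def join(self, timeout=None):
        Thread.join(self, timeout)
        return self._return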