OpenCV Python face detection lags the video

Asked: 2018-07-23 06:02:31

Tags: python opencv webrtc

I have a video stream with face detection using Python and WebRTC, but whenever face detection runs on the video the stream lags badly and the frame rate drops. I am basing my project on the aiortc example code, shown here:

server.py:

import argparse
import asyncio
import json
import logging
import math
import os
import time
import wave

import cv2
import numpy
from aiohttp import web

from aiortc import RTCPeerConnection, RTCSessionDescription
from aiortc.mediastreams import (AudioFrame, AudioStreamTrack, VideoFrame,
                                 VideoStreamTrack)

from scipy.spatial import distance as dist
from imutils.video import FileVideoStream
from imutils.video import VideoStream
from imutils import face_utils
import dlib
import imutils
from collections import OrderedDict
import numpy as np

ROOT = os.path.dirname(__file__)
AUDIO_OUTPUT_PATH = os.path.join(ROOT, 'output.wav')
AUDIO_PTIME = 0.020  # 20ms audio packetization

FACIAL_LANDMARKS_IDXS = OrderedDict([
    ("mouth", (48, 68)),
    ("right_eyebrow", (17, 22)),
    ("left_eyebrow", (22, 27)),
    ("right_eye", (36, 42)),
    ("left_eye", (42, 48)),
    ("nose", (27, 36)),
    ("jaw", (0, 17))
])



def eye_aspect_ratio(eye):
    A = dist.euclidean(eye[1], eye[5])
    B = dist.euclidean(eye[2], eye[4])

    C = dist.euclidean(eye[0], eye[3])

    ear = (A + B) / (2.0 * C)

    return ear

def turn_aspect_ratio(x1,x2,x3):
    A = dist.euclidean(x1, x2)
    B = dist.euclidean(x2, x3)

    return A/B

def open_mouth_detection(x1,x2,x3,x4):
    A = dist.euclidean(x1, x2)
    B = dist.euclidean(x3, x4)

    return A/B

def rect_to_bb(rect):
    x = rect.left()
    y = rect.top()
    w = rect.right() - x
    h = rect.bottom() - y
    return (x, y, w, h)

def shape_to_np(shape, dtype="int"):
    coords = np.zeros((68, 2), dtype=dtype)

    for i in range(0, 68):
        coords[i] = (shape.part(i).x, shape.part(i).y)

    return coords

def frame_from_bgr(data_bgr):
    data_yuv = cv2.cvtColor(data_bgr, cv2.COLOR_BGR2YUV_YV12)
    return VideoFrame(width=data_bgr.shape[1], height=data_bgr.shape[0], data=data_yuv.tobytes())


def frame_from_gray(data_gray):
    data_bgr = cv2.cvtColor(data_gray, cv2.COLOR_GRAY2BGR)
    data_yuv = cv2.cvtColor(data_bgr, cv2.COLOR_BGR2YUV_YV12)
    return VideoFrame(width=data_bgr.shape[1], height=data_bgr.shape[0], data=data_yuv.tobytes())


def frame_to_bgr(frame):
    data_flat = numpy.frombuffer(frame.data, numpy.uint8)
    data_yuv = data_flat.reshape((math.ceil(frame.height * 12 / 8), frame.width))
    return cv2.cvtColor(data_yuv, cv2.COLOR_YUV2BGR_YV12)


class AudioFileTrack(AudioStreamTrack):
    def __init__(self, path):
        self.last = None
        self.reader = wave.open(path, 'rb')
        self.frames_per_packet = int(self.reader.getframerate() * AUDIO_PTIME)

        self.w = 800
        self.h = 600
        # set crop factor
        #        self.cam.set(cv2.cv.CV_CAP_PROP_FRAME_HEIGHT, self.h)
        #        self.cam.set(cv2.cv.CV_CAP_PROP_FRAME_WIDTH, self.w)
        # load cascade file
        self.face_cascade = cv2.CascadeClassifier('face.xml')
        self.EYE_AR_THRESH = 0.21
        self.RATIO_THRESH = 0.0017
        self.EYE_AR_CONSEC_FRAMES = 3
        self.COUNTER = 0
        self.eyeCount = 0
        self.mouthCount = 0
        self.leftSide = 0
        self.rightSide = 0
        self.PASS = 0
        self.sign = 0
        self.detector = dlib.get_frontal_face_detector()
        self.predictor = dlib.shape_predictor("shape_predictor_68_face_landmarks.dat")

        (self.lStart, self.lEnd) = face_utils.FACIAL_LANDMARKS_IDXS["left_eye"]
        (self.rStart, self.rEnd) = face_utils.FACIAL_LANDMARKS_IDXS["right_eye"]

    async def recv(self):
        # as we are reading audio from a file and not using a "live" source,
        # we need to control the rate at which audio is sent
        if self.last:
            now = time.time()
            await asyncio.sleep(self.last + AUDIO_PTIME - now)
        self.last = time.time()

        return AudioFrame(
            channels=self.reader.getnchannels(),
            data=self.reader.readframes(self.frames_per_packet),
            sample_rate=self.reader.getframerate())


class VideoTransformTrack(VideoStreamTrack):
    def __init__(self, transform):
        self.counter = 0
        self.received = asyncio.Queue(maxsize=1)
        self.transform = transform

        self.w = 800
        self.h = 600
        # set crop factor
        #        self.cam.set(cv2.cv.CV_CAP_PROP_FRAME_HEIGHT, self.h)
        #        self.cam.set(cv2.cv.CV_CAP_PROP_FRAME_WIDTH, self.w)
        # load cascade file
        self.face_cascade = cv2.CascadeClassifier('face.xml')
        self.EYE_AR_THRESH = 0.21
        self.RATIO_THRESH = 0.0017
        self.EYE_AR_CONSEC_FRAMES = 3
        self.COUNTER = 0
        self.eyeCount = 0
        self.mouthCount = 0
        self.leftSide = 0
        self.rightSide = 0
        self.PASS = 0
        self.sign = 0
        self.detector = dlib.get_frontal_face_detector()
        self.predictor = dlib.shape_predictor("shape_predictor_68_face_landmarks.dat")

        (self.lStart, self.lEnd) = face_utils.FACIAL_LANDMARKS_IDXS["left_eye"]
        (self.rStart, self.rEnd) = face_utils.FACIAL_LANDMARKS_IDXS["right_eye"]

    async def recv(self):
        frame = await self.received.get()

        self.counter += 1
        if (self.counter % 100) > 50:
            # apply image processing to frame
            if self.transform == 'edges':
                img = frame_to_bgr(frame)
                edges = cv2.Canny(img, 100, 200)

                return frame_from_gray(edges)
            elif self.transform == 'face':
                img = frame_to_bgr(frame)
                rows, cols, _ = img.shape

                gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
                rects = self.detector(img, 1)

                # face cascade detector
                faces = self.face_cascade.detectMultiScale(gray)
                # draw rect on face areas
                scale = float(self.w / 320.0)
                count = 0

                for f in faces:
                    font = cv2.FONT_HERSHEY_SIMPLEX
                    x, y, z, t = [int(float(v) * scale) for v in f]
                    cv2.putText(img, str(x) + ' ' + str(y), (0, (self.h - 10 - 25 * count)), font, 1, (0, 0, 0), 2)
                    count += 1
                    cv2.rectangle(img, (x, y), (x + z, y + t), (255, 255, 255), 2)

                return frame_from_bgr(img)
            elif self.transform == 'green':
                return VideoFrame(width=frame.width, height=frame.height)
            else:
                #return VideoFrame(width=frame.width, height=frame.height)
                #return liveness_detection(frame)
                return frame
        else:
            # return raw frame
            return frame




async def consume_video(track, local_video):
    """
    Drain incoming video, and echo it back.
    """
    while True:
        frame = await track.recv()

        # we are only interested in the latest frame
        if local_video.received.full():
            await local_video.received.get()

        await local_video.received.put(frame)
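

# NOTE: consume_audio() is referenced in offer() below but was missing from this
# paste; this minimal drain coroutine is an assumption added so the listing runs
# as-is (the original aiortc demo does more with the incoming audio).
async def consume_audio(track):
    """
    Drain incoming audio.
    """
    while True:
        await track.recv()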


async def index(request):
    content = open(os.path.join(ROOT, 'index.html'), 'r').read()
    return web.Response(content_type='text/html', text=content)


async def javascript(request):
    content = open(os.path.join(ROOT, 'client.js'), 'r').read()
    return web.Response(content_type='application/javascript', text=content)


async def offer(request):
    params = await request.json()
    offer = RTCSessionDescription(
        sdp=params['sdp'],
        type=params['type'])

    pc = RTCPeerConnection()
    pc._consumers = []
    pcs.append(pc)

    # prepare local media
    local_audio = AudioFileTrack(path=os.path.join(ROOT, 'demo-instruct.wav'))
    local_video = VideoTransformTrack(transform=params['video_transform'])

    @pc.on('datachannel')
    def on_datachannel(channel):
        @channel.on('message')
        async def on_message(message):
            await channel.send('pong')

    @pc.on('track')
    def on_track(track):
        if track.kind == 'audio':
            pc.addTrack(local_audio)
            pc._consumers.append(asyncio.ensure_future(consume_audio(track)))
        elif track.kind == 'video':
            pc.addTrack(local_video)
            pc._consumers.append(asyncio.ensure_future(consume_video(track, local_video)))

    await pc.setRemoteDescription(offer)
    answer = await pc.createAnswer()
    await pc.setLocalDescription(answer)

    return web.Response(
        content_type='application/json',
        text=json.dumps({
            'sdp': pc.localDescription.sdp,
            'type': pc.localDescription.type
        }))


pcs = []


async def on_shutdown(app):
    # stop audio / video consumers
    for pc in pcs:
        for c in pc._consumers:
            c.cancel()

    # close peer connections
    coros = [pc.close() for pc in pcs]
    await asyncio.gather(*coros)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='WebRTC audio / video / data-channels demo')
    parser.add_argument('--port', type=int, default=8080,
                        help='Port for HTTP server (default: 8080)')
    parser.add_argument('--verbose', '-v', action='count')
    args = parser.parse_args()

    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)

    app = web.Application()
    app.on_shutdown.append(on_shutdown)
    app.router.add_get('/', index)
    app.router.add_get('/client.js', javascript)
    app.router.add_post('/offer', offer)
    web.run_app(app, port=args.port)

Here is the part that does the face detection:

img = frame_to_bgr(frame)
rows, cols, _ = img.shape

gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
rects = self.detector(img, 1)

# face cascade detector
faces = self.face_cascade.detectMultiScale(gray)
# draw rect on face areas
scale = float(self.w / 320.0)
count = 0

for f in faces:
    font = cv2.FONT_HERSHEY_SIMPLEX
    x, y, z, t = [int(float(v) * scale) for v in f]
    cv2.putText(img, str(x) + ' ' + str(y), (0, (self.h - 10 - 25 * count)), font, 1, (0, 0, 0), 2)
    count += 1
    cv2.rectangle(img, (x, y), (x + z, y + t), (255, 255, 255), 2)

return frame_from_bgr(img)

Why does this happen, and is there any clue how to fix it? I ask because when I run the same OpenCV face detection in plain Python, without WebRTC, it runs fine.
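For comparison, this is roughly the standalone loop I mean (a minimal sketch; the webcam index 0 and the window name are assumptions, the cascade is the same face.xml), which does not lag:

import cv2

# same Haar cascade as in server.py
face_cascade = cv2.CascadeClassifier('face.xml')
cap = cv2.VideoCapture(0)  # assumed local webcam

while True:
    ret, img = cap.read()
    if not ret:
        break

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    faces = face_cascade.detectMultiScale(gray)

    # draw rect on face areas
    for (x, y, w, h) in faces:
        cv2.rectangle(img, (x, y), (x + w, y + h), (255, 255, 255), 2)

    cv2.imshow('face detection', img)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()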

0 Answers:

There are no answers yet.