I have a video stream that does face detection using Python and WebRTC, but when face detection runs on the video the stream lags and the frame rate drops badly. I am using this project code from aiortc:
server.py:
import argparse
import asyncio
import json
import logging
import math
import os
import time
import wave
import cv2
import numpy
from aiohttp import web
from aiortc import RTCPeerConnection, RTCSessionDescription
from aiortc.mediastreams import (AudioFrame, AudioStreamTrack, VideoFrame,
VideoStreamTrack)
from scipy.spatial import distance as dist
from imutils.video import FileVideoStream
from imutils.video import VideoStream
from imutils import face_utils
import dlib
import imutils
from collections import OrderedDict
import numpy as np
ROOT = os.path.dirname(__file__)
AUDIO_OUTPUT_PATH = os.path.join(ROOT, 'output.wav')
AUDIO_PTIME = 0.020 # 20ms audio packetization
FACIAL_LANDMARKS_IDXS = OrderedDict([
("mouth", (48, 68)),
("right_eyebrow", (17, 22)),
("left_eyebrow", (22, 27)),
("right_eye", (36, 42)),
("left_eye", (42, 48)),
("nose", (27, 36)),
("jaw", (0, 17))
])
def eye_aspect_ratio(eye):
A = dist.euclidean(eye[1], eye[5])
B = dist.euclidean(eye[2], eye[4])
C = dist.euclidean(eye[0], eye[3])
ear = (A + B) / (2.0 * C)
return ear
def turn_aspect_ratio(x1,x2,x3):
A = dist.euclidean(x1, x2)
B = dist.euclidean(x2, x3)
return A/B
def open_mouth_detection(x1,x2,x3,x4):
A = dist.euclidean(x1, x2)
B = dist.euclidean(x3, x4)
return A/B
def rect_to_bb(rect):
x = rect.left()
y = rect.top()
w = rect.right() - x
h = rect.bottom() - y
return (x, y, w, h)
def shape_to_np(shape, dtype="int"):
coords = np.zeros((68, 2), dtype=dtype)
for i in range(0, 68):
coords[i] = (shape.part(i).x, shape.part(i).y)
return coords
def frame_from_bgr(data_bgr):
data_yuv = cv2.cvtColor(data_bgr, cv2.COLOR_BGR2YUV_YV12)
return VideoFrame(width=data_bgr.shape[1], height=data_bgr.shape[0], data=data_yuv.tobytes())
def frame_from_gray(data_gray):
data_bgr = cv2.cvtColor(data_gray, cv2.COLOR_GRAY2BGR)
data_yuv = cv2.cvtColor(data_bgr, cv2.COLOR_BGR2YUV_YV12)
return VideoFrame(width=data_bgr.shape[1], height=data_bgr.shape[0], data=data_yuv.tobytes())
def frame_to_bgr(frame):
data_flat = numpy.frombuffer(frame.data, numpy.uint8)
data_yuv = data_flat.reshape((math.ceil(frame.height * 12 / 8), frame.width))
return cv2.cvtColor(data_yuv, cv2.COLOR_YUV2BGR_YV12)
class AudioFileTrack(AudioStreamTrack):
def __init__(self, path):
self.last = None
self.reader = wave.open(path, 'rb')
self.frames_per_packet = int(self.reader.getframerate() * AUDIO_PTIME)
self.w = 800
self.h = 600
# set crop factor
# self.cam.set(cv2.cv.CV_CAP_PROP_FRAME_HEIGHT, self.h)
# self.cam.set(cv2.cv.CV_CAP_PROP_FRAME_WIDTH, self.w)
# load cascade file
self.face_cascade = cv2.CascadeClassifier('face.xml')
self.EYE_AR_THRESH = 0.21
self.RATIO_THRESH = 0.0017
self.EYE_AR_CONSEC_FRAMES = 3
self.COUNTER = 0
self.eyeCount = 0
self.mouthCount = 0
self.leftSide = 0
self.rightSide = 0
self.PASS = 0
self.sign = 0
self.detector = dlib.get_frontal_face_detector()
self.predictor = dlib.shape_predictor("shape_predictor_68_face_landmarks.dat")
(self.lStart, self.lEnd) = face_utils.FACIAL_LANDMARKS_IDXS["left_eye"]
(self.rStart, self.rEnd) = face_utils.FACIAL_LANDMARKS_IDXS["right_eye"]
async def recv(self):
# as we are reading audio from a file and not using a "live" source,
# we need to control the rate at which audio is sent
if self.last:
now = time.time()
await asyncio.sleep(self.last + AUDIO_PTIME - now)
self.last = time.time()
return AudioFrame(
channels=self.reader.getnchannels(),
data=self.reader.readframes(self.frames_per_packet),
sample_rate=self.reader.getframerate())
class VideoTransformTrack(VideoStreamTrack):
def __init__(self, transform):
self.counter = 0
self.received = asyncio.Queue(maxsize=1)
self.transform = transform
self.w = 800
self.h = 600
# set crop factor
# self.cam.set(cv2.cv.CV_CAP_PROP_FRAME_HEIGHT, self.h)
# self.cam.set(cv2.cv.CV_CAP_PROP_FRAME_WIDTH, self.w)
# load cascade file
self.face_cascade = cv2.CascadeClassifier('face.xml')
self.EYE_AR_THRESH = 0.21
self.RATIO_THRESH = 0.0017
self.EYE_AR_CONSEC_FRAMES = 3
self.COUNTER = 0
self.eyeCount = 0
self.mouthCount = 0
self.leftSide = 0
self.rightSide = 0
self.PASS = 0
self.sign = 0
self.detector = dlib.get_frontal_face_detector()
self.predictor = dlib.shape_predictor("shape_predictor_68_face_landmarks.dat")
(self.lStart, self.lEnd) = face_utils.FACIAL_LANDMARKS_IDXS["left_eye"]
(self.rStart, self.rEnd) = face_utils.FACIAL_LANDMARKS_IDXS["right_eye"]
async def recv(self):
frame = await self.received.get()
self.counter += 1
if (self.counter % 100) > 50:
# apply image processing to frame
if self.transform == 'edges':
img = frame_to_bgr(frame)
edges = cv2.Canny(img, 100, 200)
return frame_from_gray(edges)
elif self.transform == 'face':
img = frame_to_bgr(frame)
rows, cols, _ = img.shape
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
rects = self.detector(img, 1)
# face cascade detector
faces = self.face_cascade.detectMultiScale(gray)
# draw rect on face areas
scale = float(self.w / 320.0)
count = 0
for f in faces:
font = cv2.FONT_HERSHEY_SIMPLEX
x, y, z, t = [int(float(v) * scale) for v in f]
cv2.putText(img, str(x) + ' ' + str(y), (0, (self.h - 10 - 25 * count)), font, 1, (0, 0, 0), 2)
count += 1
cv2.rectangle(img, (x, y), (x + z, y + t), (255, 255, 255), 2)
return frame_from_bgr(img)
elif self.transform == 'green':
return VideoFrame(width=frame.width, height=frame.height)
else:
#return VideoFrame(width=frame.width, height=frame.height)
#return liveness_detection(frame)
return frame
else:
# return raw frame
return frame
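async def consume_audio(track):
    """
    Drain incoming audio (a minimal drain loop; consume_audio is referenced
    by offer() below).
    """
    while True:
        await track.recv()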
async def consume_video(track, local_video):
"""
Drain incoming video, and echo it back.
"""
while True:
frame = await track.recv()
# we are only interested in the latest frame
if local_video.received.full():
await local_video.received.get()
await local_video.received.put(frame)
async def index(request):
content = open(os.path.join(ROOT, 'index.html'), 'r').read()
return web.Response(content_type='text/html', text=content)
async def javascript(request):
content = open(os.path.join(ROOT, 'client.js'), 'r').read()
return web.Response(content_type='application/javascript', text=content)
async def offer(request):
params = await request.json()
offer = RTCSessionDescription(
sdp=params['sdp'],
type=params['type'])
pc = RTCPeerConnection()
pc._consumers = []
pcs.append(pc)
# prepare local media
local_audio = AudioFileTrack(path=os.path.join(ROOT, 'demo-instruct.wav'))
local_video = VideoTransformTrack(transform=params['video_transform'])
@pc.on('datachannel')
def on_datachannel(channel):
@channel.on('message')
async def on_message(message):
await channel.send('pong')
@pc.on('track')
def on_track(track):
if track.kind == 'audio':
pc.addTrack(local_audio)
pc._consumers.append(asyncio.ensure_future(consume_audio(track)))
elif track.kind == 'video':
pc.addTrack(local_video)
pc._consumers.append(asyncio.ensure_future(consume_video(track, local_video)))
await pc.setRemoteDescription(offer)
answer = await pc.createAnswer()
await pc.setLocalDescription(answer)
return web.Response(
content_type='application/json',
text=json.dumps({
'sdp': pc.localDescription.sdp,
'type': pc.localDescription.type
}))
pcs = []
async def on_shutdown(app):
# stop audio / video consumers
for pc in pcs:
for c in pc._consumers:
c.cancel()
# close peer connections
coros = [pc.close() for pc in pcs]
await asyncio.gather(*coros)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='WebRTC audio / video / data-channels demo')
parser.add_argument('--port', type=int, default=8080,
help='Port for HTTP server (default: 8080)')
parser.add_argument('--verbose', '-v', action='count')
args = parser.parse_args()
if args.verbose:
logging.basicConfig(level=logging.DEBUG)
app = web.Application()
app.on_shutdown.append(on_shutdown)
app.router.add_get('/', index)
app.router.add_get('/client.js', javascript)
app.router.add_post('/offer', offer)
web.run_app(app, port=args.port)
Here is the face detection code (inside VideoTransformTrack.recv(); it runs both the dlib frontal-face detector and the Haar cascade on every processed frame):
img = frame_to_bgr(frame)
rows, cols, _ = img.shape
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
rects = self.detector(img, 1)
# face cascade detector
faces = self.face_cascade.detectMultiScale(gray)
# draw rect on face areas
scale = float(self.w / 320.0)
count = 0
for f in faces:
font = cv2.FONT_HERSHEY_SIMPLEX
x, y, z, t = [int(float(v) * scale) for v in f]
cv2.putText(img, str(x) + ' ' + str(y), (0, (self.h - 10 - 25 * count)), font, 1, (0, 0, 0), 2)
count += 1
cv2.rectangle(img, (x, y), (x + z, y + t), (255, 255, 255), 2)
return frame_from_bgr(img)
Why does this happen, and are there any clues on how to fix it? I ask because when I run the same OpenCV detection in plain Python without WebRTC, it runs fine.
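For reference, the plain-Python test I compared against looks roughly like this (a minimal sketch, not my exact script: camera index 0, the window name, and reusing the same face.xml cascade are assumptions):

import cv2

# same Haar cascade file as in VideoTransformTrack
face_cascade = cv2.CascadeClassifier('face.xml')
cap = cv2.VideoCapture(0)

while True:
    ok, img = cap.read()
    if not ok:
        break
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # same detectMultiScale call as in recv()
    faces = face_cascade.detectMultiScale(gray)
    for (x, y, w, h) in faces:
        cv2.rectangle(img, (x, y), (x + w, y + h), (255, 255, 255), 2)
    cv2.imshow('faces', img)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()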