我制作了一个爬虫程序,以从网站上获得一些弹幕。出现了一些问题。出乎我的意料,发送所有请求仅用30秒,但等待响应需要9分钟。...这是正常情况吗?
另外,我几天前才刚开始学习 trio 这个库。
这里是代码
# coding=utf8
import argparse
import json
import logging
import os
import time
from pprint import pprint
import asks
import trio
from pyquery import PyQuery as jq
# Module-level setup: logger, command-line arguments, and the shared async HTTP session.
logger = logging.getLogger('TencentVideoDanmu')

parser = argparse.ArgumentParser()
parser.add_argument(
    '-u', '--url', default="https://v.qq.com/x/cover/p69wlzli02uqwms/d0023ka5gj7.html", type=str, help='TencentVideo URL')
args = parser.parse_args()

asks.init('trio')
# BUG FIX: asks.Session() defaults to connections=1, which forces every request
# through a single connection one after another.  That is why sending all
# requests took only 30 seconds while the responses trickled in over ~9
# minutes.  Give the session a real connection pool so requests overlap.
AsyncSession = asks.Session(connections=50)
AsyncSession.headers = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
    "cache-control": "max-age=0",
    # "cookie": "pgv_info=ssid=s2580931560; pgv_pvid=5076745702; pgv_pvi=4425950208; pgv_si=s1770592256; ptisp=ctc; ptui_loginuin=2669082133; RK=nEJ5woVFyj; ptcz=50e9cdd6ca5afc08869af7a57c608b471a0f19bf39b7aa9925998682e417bfed; ts_uid=8763758775; pac_uid=1_2669082133; tvfe_boss_uuid=c12202d727ca41e8; ts_refer=www.google.ca/; tvfe_search_uid=2db4e25f-aeb5-49cd-b1c6-f66810ab9ea2; login_remember=qq; uin=o0418022662; skey=@RSqEKv4JE; pt2gguin=o0418022662; luin=o0418022662; lskey=000100006089c7a41592fa402e43fd746cc963aae100ef17d5b32db028640097f8dcd9ba33b3f9c6aa3344af; main_login=qq; vuserid=603827039; vusession=d91d873ac7e989b47e1c07f3f2a0; encuin=17cb5e5a4732f6fdbcfe344f11a106c5|418022662; lw_nick=%E9%A2%96|418022662|//thirdqq.qlogo.cn/g?b=sdk&k=2vGdibPiatOuMmNT6L1ZSicvA&s=40&t=1524547387|1; uid=685097243; o_cookie=418022662; ts_last=v.qq.com/x/cover/p69wlzli02uqwms/d0023ka5gj7.html; ptag=www_google_ca|videolist:pagetab; ad_play_index=93",
    "referer": "http://v.qq.com/x/search/?q=%E4%BA%BA%E6%B0%91%E7%9A%84%E5%90%8D%E4%B9%89&stag=0&ses=qid%3DlIOeJHL9it2N8x2HEX5AW_jz6cSNMuHjy7qtz4apw3YIDc0J4F7l2w%26last_query%3D%E4%BA%BA%E6%B0%91%E7%9A%84%E5%90%8D%E4%B9%89%26tabid_list%3D0%7C2%7C1%7C12%7C7%7C13%7C3%7C11%7C5%26tabname_list%3D%E5%85%A8%E9%83%A8%7C%E7%94%B5%E8%A7%86%E5%89%A7%7C%E7%94%B5%E5%BD%B1%7C%E5%A8%B1%E4%B9%90%7C%E5%85%B6%E4%BB%96%7C%E8%B4%A2%E7%BB%8F%7C%E7%BB%BC%E8%89%BA%7C%E6%96%B0%E9%97%BB%7C%E9%9F%B3%E4%B9%90%26resolution_tabid_list%3D0%7C1%7C2%7C3%7C4%7C5%26resolution_tabname_list%3D%E5%85%A8%E9%83%A8%7C%E6%A0%87%E6%B8%85%7C%E9%AB%98%E6%B8%85%7C%E8%B6%85%E6%B8%85%7C%E8%93%9D%E5%85%89%7CVR",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
}
# Shared mutable state, filled in as the crawl progresses.
TaskIds = []                        # every vid discovered on the cover page
Results = {}                        # vid -> {"targetid": ..., "datas": {...}, "SingleMaxCount": int}
Status = {"starts": 0, "gets": 0}   # simple request counters


def AddStart():
    """Record that one more request has been started."""
    Status["starts"] = Status["starts"] + 1


def AddGet():
    """Record that one more response has been received."""
    Status["gets"] = Status["gets"] + 1
# demo: https://v.qq.com/x/cover/p69wlzli02uqwms/d0023ka5gj7.html
async def GetVideoList(url):
    """Fetch the cover page and append every episode vid to TaskIds."""
    response = await AsyncSession.get(url)
    html = response.text
    # The episode vids sit in an inline script: var LIST_INFO = {"vid":[...]}
    vid_section = html.split('var LIST_INFO = {"vid":[')[1]
    raw_vids = vid_section.split(']')[0].replace('"', "")
    TaskIds.extend(raw_vids.split(","))
    # The first clip's vid lives in a separate JSON fragment on the same page.
    first_clip = html.split('firstClipListVid":"')[1]
    TaskIds.append(first_clip.split('"}')[0])
async def GetRegID(vid):
    """Register `vid` with the danmaku backend and store its targetid in Results."""
    callback = f"jQuery_{str(int(time.time()))}"
    url = (
        f"https://bullet.video.qq.com/fcgi-bin/target/regist?callback={callback}"
        f"&otype=json&vid={vid}&cid=p69wlzli02uqwms&lid=&g_tk=1115548900"
        f"&_={str(int(time.time()))}"
    )
    response = await AsyncSession.get(url)
    # Strip the JSONP wrapper "callback(...)" before parsing.
    payload = response.text[len(callback) + 1:-1]
    targetid = json.loads(payload).get('targetid')
    if not targetid:
        return
    Results[vid] = {
        "targetid": targetid,
        "datas": {},
        "SingleMaxCount": 0,
    }
async def GetSigleDanmu(item):
    """Download every danmaku window for one video.

    Args:
        item: a (vid, dataset) pair from Results.items(); dataset holds the
            registered targetid plus bookkeeping fields.

    A first probe at timestamp 0 learns the video length
    (single_max_count); the remaining 30-second windows are then fetched
    concurrently and merged into Results[vid]["datas"].
    """
    key, dataset = item
    JqueryID = f"jQuery_{str(int(time.time()))}"

    async def GetItem(data, point=0):
        SingleMaxCount = data['SingleMaxCount']
        # BUG FIX: the original URL contained "×tamp=" — an HTML-entity
        # mangling of "&timestamp=" — so a literal '×' was sent to the server.
        response = await AsyncSession.get(
            f"https://mfm.video.qq.com/danmu?otype=json&callback={JqueryID}"
            f"&timestamp={point}&target_id={data['targetid']}&count=80"
            f"&second_count=5&session_key=942368%2C2609%2C1536055685&_=1536055683361")
        # Strip the JSONP wrapper; strict=False tolerates control chars in comments.
        jsondata = json.loads(
            response.text[len(JqueryID) + 1:-1], strict=False)
        print(point, len(response.text))
        # BUG FIX: the original reused the name `item` as the loop variable,
        # shadowing the enclosing parameter — use a distinct name.
        for comment in jsondata['comments']:
            Results[key]["datas"][comment["commentid"]] = comment
        if SingleMaxCount == 0:
            # First probe: record the server-reported totals for this video.
            Results[key].update({i: jsondata[i]
                                 for i in ['tol_up', 'single_max_count']})
            Results[key]['SingleMaxCount'] = jsondata['single_max_count']

    # Probe once at point=0 to populate SingleMaxCount (its comments are stored too).
    async with trio.open_nursery() as nursery:
        nursery.start_soon(GetItem, dataset)
    # BUG FIX: start at 30, not 0 — the point=0 window was already fetched by
    # the probe above, so starting at 0 duplicated one request per video.
    async with trio.open_nursery() as nursery:
        for stamp in range(30, dataset['SingleMaxCount'] + 30, 30):
            nursery.start_soon(GetItem, dataset, stamp)
async def GetALLDanmus(vid_list):
    """Register every vid, then fetch the danmaku for each registered one."""
    # Phase 1: register all vids concurrently; successful ones land in Results.
    async with trio.open_nursery() as nursery:
        for vid in vid_list:
            nursery.start_soon(GetRegID, vid)
    logger.info(f"Get Nums: [{len(Results.keys())}]")
    # Phase 2: pull the danmaku for every registered vid concurrently.
    async with trio.open_nursery() as nursery:
        for entry in Results.items():
            nursery.start_soon(GetSigleDanmu, entry)
def SaveRes(data, filename="res.json"):
    """Serialize `data` to `filename` as pretty-printed UTF-8 JSON.

    Args:
        data: any JSON-serializable object (here the global Results dict).
        filename: output path; overwritten if it already exists.
    """
    # BUG FIX: the file was opened without an explicit encoding while dumping
    # with ensure_ascii=False, so the raw CJK characters in the danmaku could
    # raise UnicodeEncodeError on platforms whose default encoding is not
    # UTF-8 (e.g. Windows).  Pin the encoding.
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    if os.path.exists(filename):
        logger.info("Saved Success")
def main():
    """Entry point: scrape the vid list, fetch all danmaku, persist results."""
    # Two separate trio runs: discovery must finish before the fan-out starts.
    trio.run(GetVideoList, args.url)
    trio.run(GetALLDanmus, TaskIds)
    SaveRes(Results)


if __name__ == '__main__':
    main()