我正在使用 Python Flask 做网页抓取:把查询拼接到 'https://www.google.com/search?q=' 后面,再用 BeautifulSoup 解析结果。它在本地运行正常,但部署到 Heroku 之后只对部分查询有效,其余查询返回 500(内部服务器错误)。例如,"键盘"、"椅子"等查询可以正常工作,而"鞋子"、"香水"等查询则不行。
我尝试过重新启动 dyno。下面是 Heroku 日志中失败请求的记录。
2019-05-03T16:24:08.092876+00:00 heroku[router]: at=info method=POST path="/labelsearch" host=ris-app.herokuapp.com request_id=4a65fac0-5ebf-4088-ae85-285ee89c14ce fwd="49.206.8.241" dyno=web.1 connect=2ms service=1923ms status=500 bytes=455 protocol=https
这是我的代码:
def label_search():
    """Flask view: accept a JSON POST with a "q" field, run a Google search
    for that label and return the parsed scrape results.

    Returns a plain error string when the request is not JSON.
    """
    # Use .get() so a request without a Content-Type header returns the
    # error string instead of raising KeyError (which Flask turns into a 500).
    if request.headers.get('Content-Type') != 'application/json':
        return "Request must be JSON format"
    # request.json is already a parsed dict; the original
    # json.dumps(request.json) -> json.loads(...) round trip was redundant.
    client_data = request.json
    # quote() percent-encodes the query so multi-word terms form a valid URL.
    code = doImageSearch(SEARCH_LABEL + quote(client_data['q']))
    return parseLabelResults(code)
下面是网页抓取部分的代码——
soup = BeautifulSoup(code, 'html.parser')
label_results = {
'links': [],
'titles': [],
'maps': '',
'images': [],
'sources':[],
'shop':[],
'buy_link':[]
}
for div in soup.findAll('div', attrs={'class':'rc'}):
sLink = div.find('a')
label_results['links'].append(sLink['href'])
for buy in soup.findAll('div', attrs={'class':'mnr-c pla-unit'}):
if type(buy)!='NoneType':
blink=buy.find('a').find_next_sibling('a')
if type(blink)!='NoneType':
label_results['buy_link'].append(blink['href'])
for title in soup.findAll('div', attrs={'class':'rc'}):
title_name=title.find('h3')
label_results['titles'].append(title_name.get_text())
for shopping in soup.findAll('div', attrs={'class':'hdtb-mitem hdtb-imb'}):
if type(shopping)!='NoneType':
if shopping.find('a').contents[0]=='Shopping':
a=shopping.find('a')
label_results['shop'].append('https://www.google.com' + a['href'])
for map_link in soup.findAll('div', attrs={'class':'xERobd'}):
if type(map_link)!='NoneType':
mlink=map_link.find('a')
if type(mlink)!='NoneType':
label_results['maps']='https://www.google.com' + mlink['href']
for image in soup.findAll('div', attrs={'id':'imagebox_bigimages'}):
if type(image)!='NoneType':
image_link=image.find('a')
if type(image_link)!='NoneType':
label_results['images'].append('https://www.google.com'+image_link['href'])
for image in soup.findAll('div', attrs={'class':'PFaeqe'}):
if type(image)!='NoneType':
image_link=image.find('a')
if type(image_link)!='NoneType':
label_results['images'].append('https://www.google.com'+image_link['href'])
if len(label_results['images'][0])!=0:
imgcode=doImageSearch(label_results['images'][0])
imgsoup = BeautifulSoup(imgcode, 'html.parser')
for a in imgsoup.findAll('a', attrs={'class':'rg_l'}):
if type(a) != 'NoneType':
img=a.find('img', attrs={'class':'rg_ic rg_i'})
if type(img)!='NoneType':
attrslist=img.attrs
for k in attrslist:
if k=='data-src':
label_results['sources'].append(attrslist[k])
print(label_results)
sys.stdout.flush()
print("Successful search")
return json.dumps(label_results)
请告诉我问题出在哪里。谢谢。