I want to scrape data from a website, but I'm getting an error. Since I'm new to web scraping, please guide me on how to fix it. This is the error I'm facing: UnboundLocalError: local variable 'soup' referenced before assignment
Here is my code:
import requests
from bs4 import BeautifulSoup
import csv

def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'html.parser')  # 1. html, 2. parser
    return soup

def get_detail_page(soup):
    try:
        title = soup.find('h1', class_="ProductTitle-title", id=False).text
    except:
        title = 'Empty Title'
    try:
        title = soup.find('h1', class_="TopBar-perUnit TopBar-perUnitTop", id=False).text
    except:
        price = 'Empty price'
    try:
        img = soup.find('img', class_="ViewSelectorItem-image", id=False).get('src')
    except:
        img = 'Empty img'
    data = {
        'Title': title,
        'Price': price,
        'Img': img
    }
    print(data)

def main():
    url = "https://www.zazzle.com/60th_silver_diamond_anniversary_photo_invitations-161837951427094549"
    get_detail_page(get_page(url))

if __name__ == '__main__':
    main()
Answer 0 (score: 0)
I've added a user agent to your code. The UnboundLocalError happens because the site rejects the default Python client: the response comes back non-OK, so the else branch never runs, soup is never assigned, and get_page still tries to return it. Sending a browser-like User-Agent gets the server to serve the page:
import urllib.request as urllib2
from bs4 import BeautifulSoup
import csv

# Pretend to be a regular browser so the server does not reject the request.
REQUEST_HEADER = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 \
(KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"}

def get_page(url):
    req = urllib2.Request(url, headers=REQUEST_HEADER)
    page = urllib2.urlopen(req, timeout=20).read()
    soup = BeautifulSoup(page, "html.parser")
    return soup

def get_detail_page(soup):
    try:
        title = soup.find('h1', class_="ProductTitle-title", id=False).text
    except:
        title = 'Empty Title'
    try:
        # Note: this must assign price, not title (as in your original code),
        # or the data dict below raises a NameError whenever the element is found.
        price = soup.find('h1', class_="TopBar-perUnit TopBar-perUnitTop", id=False).text
    except:
        price = 'Empty price'
    try:
        img = soup.find('img', class_="ViewSelectorItem-image", id=False).get('src')
    except:
        img = 'Empty img'
    data = {
        'Title': title,
        'Price': price,
        'Img': img
    }
    print(data)

def main():
    url = "https://www.zazzle.com/60th_silver_diamond_anniversary_photo_invitations-161837951427094549"
    get_detail_page(get_page(url))

if __name__ == '__main__':
    main()
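If you'd rather stay with requests, here is a minimal sketch of the same fix; the header dict and the early return on failure are my additions, not part of the original answer:

import requests
from bs4 import BeautifulSoup

# Hypothetical browser-like header; any current browser User-Agent string works.
HEADERS = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                         "(KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"}

def get_page(url):
    response = requests.get(url, headers=HEADERS, timeout=20)
    if not response.ok:
        # Fail explicitly instead of falling through to an unassigned variable.
        print('server responded:', response.status_code)
        return None
    return BeautifulSoup(response.text, 'html.parser')

Returning None (or raising an exception) on failure means a caller like get_detail_page can check the result explicitly instead of hitting an UnboundLocalError deep inside get_page.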
Also, a very interesting read: Google Chrome: Change the User-Agent String