我有一个遍历网站多个页面的脚本。对于某些页面上可能不存在的项目,我希望跳过它们或填入空值。例如,有些页面不包含书籍的描述;遇到这样的页面时,脚本会抛出 AttributeError。下面的脚本可以顺利处理前两个页面,但到第三个页面时就会中止。以下是回溯(traceback)信息:
item['description'] = about.h2.nextSibling.nextSibling.nextSibling.text File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/beautifulsoup4-4.6.0-py3.6.egg/bs4/element.py", line 737, in __getattr__ AttributeError: 'NavigableString' object has no attribute 'text'
我该如何解决?这是我的脚本:
from bs4 import BeautifulSoup as soup
import requests
import json
# Scrape textbook detail pages from the Open Textbook Library and collect one
# dict of metadata per book, then dump the collected list to a JSON file.
#
# NOTE(review): this listing has lost its indentation, so it is not runnable
# as shown. The nesting implied by the comments below (what belongs to each
# `for` / `if` / `else`) is presumed from statement order -- confirm against
# the original script.

# Root URL of the site; the book-detail path is appended to it per request.
base_url = "https://open.umn.edu/opentextbooks/"
# Accumulates the per-book dicts; this is what gets written by json.dump below.
data = []
# Upper bound of the bookId range (range(4, n+1) includes n itself).
n = 30
# Request the detail page for every bookId from 4 through n.
for i in range(4, n+1):
response = requests.get(base_url + "BookDetail.aspx?bookId=" + str(i))
# Parse the response body with BeautifulSoup using the html5lib parser.
page_soup = soup(response.content, "html5lib")
# Page-level lookups reused for each container below:
#   containers - every <div class="LongDescription">
#   author     - every <p> element on the page (indexed with [3] below)
#   about      - the <div id="AboutBook"> element
containers = page_soup.findAll("div",{"class":"LongDescription"})
author = page_soup.select("p")
about = page_soup.find("div",{"id":"AboutBook"})
for container in containers:
# One record per container.
item = {}
item['type'] = "Textbook"
# Title is the text of the <h1> inside the container's <div class="twothird">.
item['title'] = container.find("div",{"class":"twothird"}).h1.text
# Author text comes from the fourth <p> on the page, with the text pieces
# joined by ', '.
item['author'] = author[3].get_text(separator=', ')
# A value of exactly one space is replaced by a default publisher name.
if item['author'] == " ":
item['author'] = "University of Minnesota Libraries Publishing"
item['link'] = "https://open.umn.edu/opentextbooks/BookDetail.aspx?bookId=" + str(i)
# Publisher name and URL are read from the node that follows the
# "Publisher: " text; both are set to "" when that text is not found.
if not container.find(string="Publisher: "):
item['publisher_url'] = item['publisher'] = ""
else:
item['publisher'] = container.find(text="Publisher: ").nextSibling.text
item['publisher_url'] = container.find(text="Publisher: ").nextSibling['href']
item['source'] = "Open Textbook Library"
# The description is taken from the third sibling after the <h2> inside the
# AboutBook div. The `if not` check only tests truthiness of that sibling.
if not about.h2.nextSibling.nextSibling.nextSibling:
item['description'] = ""
else:
# NOTE(review): this is the line the reported traceback points at. When the
# third sibling is a NavigableString it has no `.text` attribute, so
# AttributeError is raised and the script stops.
item['description'] = about.h2.nextSibling.nextSibling.nextSibling.text
item['base_url'] = "https://open.umn.edu/opentextbooks/"
# License name: text of the link inside <p class="Badge-Condition">, set to ''
# when an <img class="ctl00_maincontent_imgLicence"> is found.
# NOTE(review): if these are two independent `if`s, 'license' stays unset when
# neither condition holds -- confirm.
if container.find("p",{"class":"Badge-Condition"}).a:
item['license'] = container.find("p",{"class":"Badge-Condition"}).a.text
if container.find("img",{"class":"ctl00_maincontent_imgLicence"}):
item['license'] = ''
# License URL: same two checks as above, reading the link's href instead.
if container.find("p",{"class":"Badge-Condition"}).a:
item['license_url'] = container.find("p",{"class":"Badge-Condition"}).a["href"]
if container.find("img",{"class":"ctl00_maincontent_imgLicence"}):
item['license_url'] = ''
# Review text: the first <p> inside the "twothird" div, or '' without one.
if container.find("div",{"class":"twothird"}).p:
item['review'] = container.find("div",{"class":"twothird"}).p.text
else:
item['review'] = ''
# Remove every '(' when the text starts with one, and every ' reviews)' when
# the text ends with it.
if item['review'].startswith('('):
item['review'] = item['review'].replace('(', '')
if item['review'].endswith(' reviews)'):
item['review'] = item['review'].replace(' reviews)', '')
# Both operands are strings, so this is a lexicographic comparison with "0",
# not a numeric one.
if item['review'] > str(0):
item['review'] = "Reviewed Resource"
else:
item['review'] = ''
# Image URL: site root plus the src of the first <img> in the container.
item['image_url'] = "https://open.umn.edu/opentextbooks/" + container.img["src"]
data.append(item) # add the item to the list
# Write the collected list to ./json/otl-1.json; ensure_ascii=False keeps
# non-ASCII characters unescaped in the output.
# NOTE(review): whether this runs once after the loops or inside them cannot
# be determined from the flattened listing -- confirm.
with open("./json/otl-1.json", "w") as writeJSON:
json.dump(data, writeJSON, ensure_ascii=False)
答案 0 :(得分:0)
我不建议使用 `item['description'] = about.h2.nextSibling.nextSibling.nextSibling.text`
来解析描述,因为这种写法过于依赖页面的具体结构,很容易失效。我写了下面这段代码:
from bs4 import BeautifulSoup as soup
import requests
import json
from pprint import pprint
# Scrape book detail pages from the Open Textbook Library, printing one
# metadata dict per book and collecting all of them in `data`.
#
# Pages that cannot be fetched or that have no <h1> title are skipped, and a
# missing author or description yields a default / empty value instead of an
# exception.

# Root URL of the site; the book-detail path is appended to it per request.
base_url = "https://open.umn.edu/opentextbooks/"
# One dict per successfully parsed book (kept as a list so that every book
# survives the loop, not just the last one).
data = []
# Upper bound of the bookId range (inclusive).
n = 30

for i in range(4, n + 1):
    # Build the page URL once; it is both requested and stored in the record.
    link = base_url + "BookDetail.aspx?bookId=" + str(i)
    response = requests.get(link)
    if not response.ok:
        # Error responses are not book pages; do not try to parse them.
        continue

    page_soup = soup(response.content, "lxml")

    # select_one returns None instead of raising when there is no <h1>.
    title_tag = page_soup.select_one('h1')
    if title_tag is None:
        continue

    # The author line is the fourth <p> sibling following the <h1>; guard the
    # index so pages with fewer paragraphs fall back to the default below.
    paragraphs = page_soup.select('h1 ~ p')
    author = paragraphs[3].get_text(', ') if len(paragraphs) > 3 else ''

    # Join every non-empty paragraph directly under div#AboutBook; a page
    # without a description simply produces an empty string.
    about_texts = (p.text.strip() for p in page_soup.select('div#AboutBook > p'))
    description = '\n'.join(text for text in about_texts if text)

    item = {
        'type': "Textbook",
        'title': title_tag.text,
        'author': author if author.strip() else "University of Minnesota Libraries Publishing",
        'link': link,
        'source': "Open Textbook Library",
        'description': description,
    }
    data.append(item)
    pprint(item)

# To save the results, dump the whole list once after the loop, e.g.:
# with open("./json/otl-1.json", "w") as writeJSON:
#     json.dump(data, writeJSON, ensure_ascii=False)
打印:
{'author': 'University of Minnesota Libraries Publishing',
'description': 'This book is intended for an undergraduate or MBA level '
'Financial Accounting course. It covers the standard topics in '
'a standard sequence, utilizing the Socratic method of asking '
'and answering questions.',
'link': 'https://open.umn.edu/opentextbooks/BookDetail.aspx?bookId=4',
'source': 'Open Textbook Library',
'title': 'Financial Accounting',
'type': 'Textbook'}
...and so on (for each book)
答案 1 :(得分:-1)
无论何时出现AttributeError,都可以使用以下代码:
# Template: wrap the statement(s) that may raise AttributeError and decide
# what to do when the attribute is missing.
try:
    ...  # your code here
except AttributeError:
    pass  # or other handling code, e.g. assign an empty default value