from bs4 import BeautifulSoup
import re
import urllib2
import urllib
list_open = open("weblist.txt")
read_list = list_open.read()
line_in_list = read_list.split("\n")
for url in line_in_list:
Beautiful = urllib2.urlopen(url).read()
beautiful
soup = bs4.BeautifulSoup(beautiful)
for news in soup:
print soup.getText()
以下代码可帮助我从多个网站(weblist.txt)中提取文本
但是当我的网站列表包含任何未使用此代码打开的链接或网站时,它会立即停止并且不检查其他链接。假设我有10个链接,而第二个链接未打开或无法解析,则会导致错误并在不检查其他链接的情况下停止该链接。我希望它应检查Web列表中的每个链接(从开始到结束)并从中提取文本所有那些真实的或能够解析的链接。
答案 0（得分：0）
只需按如下方式添加 try/except 语句：
for url in line_in_list:
try:
Beautiful = urllib2.urlopen(url).read()
beautiful
soup = bs4.BeautifulSoup(beautiful)
for news in soup:
print soup.getText()
except Exception as e:
#Error handling
print(e)