import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl
import re
import csv
# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
# text = input ('Enter Text - ') - In-case the user wants to manually put-in
some text to evaluate
#print ('\n')
#print (len(lst))
# Take 'Content' input from a csv file
file = open("Test_1.CSV", "r", encoding='utf-8')
reader = csv.reader(file)
for line in reader:
text = line[5]
lst = re.findall('(http.?://[^\s]+)', text)
if not lst: print(line[0], 'Empty List')
else:
try:
for url in lst:
try:
try:
html = urllib.request.urlopen(url, context=ctx).read()
#html = urllib.request.urlopen(urllib.parse.quote(url, errors='ignore'), context=ctx).read()
soup = BeautifulSoup(html, 'html.parser')
title = soup.title.string
str_title = str (title)
if 'Twitter' in str_title:
if len(lst) > 1: break
else: continue
else:
print (line[0], str_title, ',', url)
except UnicodeEncodeError as e:
#print("Incorrect URL {}".format(url.encode('ascii', errors='ignore')))
b_url = url.encode('ascii', errors='ignore')
n_url = b_url.decode("utf-8")
#print (n_url)
html = urllib.request.urlopen(n_url, context=ctx).read()
#html = urllib.request.urlopen(urllib.parse.quote(url, errors='ignore'), context=ctx).read()
soup = BeautifulSoup(html, 'html.parser')
title = soup.title.string
str_title = str (title)
if 'Twitter' in str_title:
if len(lst) > 1: break
else: continue
else:
print (line[0], str_title, ',', url)
except urllib.error.URLError:
print ('Invalid DNS Link')
except urllib.error.HTTPError as err:
if err.code == 404:
print (line[0], 'Invalid Twitter Link')
上面提到的代码读取一个csv文件,选择一个列,然后使用regex进行解析以在一行中获取所有超链接,然后使用BeautifulSoup通过Hyperlink进行解析以获取页面的“标题字符串”
在运行这段代码时,我首先遇到UnicodeEncodeError
并解决了它;然后,我遇到了urllib.error.URLError
并也解决了这个问题。现在,我遇到了另一个
"Traceback (most recent call last): File "C:\Users\asaxena\Desktop\py4e\Gartner\crawler_new.py", line 32, in <modu le> title = soup.title.string AttributeError: 'NoneType' object has no attribute 'string'".
我真的有什么办法可以绕过出现的任何类型的错误?即使是未预见的?我知道BeautifulSoup倾向于抛出意想不到的错误,部分原因是网络上漫游的内容种类繁多。
答案 0 :(得分:0)
我终于解决了它,方法是将整个代码放在try / except块下:
尝试: #将我所有的代码放在这里 例外,例如e: 打印(“忽略错误”)
该代码将能够处理所有类型的异常。