import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl
import re
import csv
# Ignore SSL certificate errors
# NOTE(review): this disables certificate validation entirely — acceptable for
# a throwaway scraping experiment, unsafe for anything production-facing.
# check_hostname must be set False BEFORE verify_mode = CERT_NONE, otherwise
# ssl raises ValueError; keep this assignment order.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
# text = input('Enter Text - ')  # in case the user wants to manually enter some text to evaluate
#print ('\n')
#print (len(lst))
# Take 'Content' input from a csv file
# Read the 'Content' column (index 5) from a CSV file, extract every hyperlink
# with a regex, fetch each page, and print its <title> string (skipping pages
# whose title mentions 'Twitter').

def _page_title(url):
    """Fetch *url* and return its <title> string, or None when the page has no title.

    Retries once with non-ASCII characters stripped from the URL when the
    original URL raises UnicodeEncodeError (same workaround as before).
    Propagates urllib.error.HTTPError / URLError to the caller.
    """
    try:
        html = urllib.request.urlopen(url, context=ctx).read()
    except UnicodeEncodeError:
        ascii_url = url.encode('ascii', errors='ignore').decode("utf-8")
        html = urllib.request.urlopen(ascii_url, context=ctx).read()
    soup = BeautifulSoup(html, 'html.parser')
    # soup.title is None when the document has no <title> tag; guard against
    # AttributeError: 'NoneType' object has no attribute 'string'.
    return soup.title.string if soup.title is not None else None


# 'with' guarantees the file is closed even if a row raises.
with open("Test_1.CSV", "r", encoding='utf-8') as file:
    reader = csv.reader(file)
    for line in reader:
        text = line[5]
        # Raw string for the regex; 'http.?://' matches http:// and https://.
        lst = re.findall(r'(http.?://[^\s]+)', text)
        if not lst:
            print(line[0], 'Empty List')
            continue
        for url in lst:
            try:
                title = _page_title(url)
            # HTTPError MUST be caught before URLError: HTTPError subclasses
            # URLError, so the original ordering made the 404 branch
            # unreachable. Non-404 HTTP errors are (deliberately) ignored,
            # matching the original best-effort behaviour.
            except urllib.error.HTTPError as err:
                if err.code == 404:
                    print(line[0], 'Invalid Twitter Link')
            except urllib.error.URLError:
                print('Invalid DNS Link')
            else:
                str_title = str(title)
                if 'Twitter' in str_title:
                    # Skip Twitter titles; with several links on the row,
                    # stop scanning the row early (original break/continue).
                    if len(lst) > 1:
                        break
                    continue
                print(line[0], str_title, ',', url)
上面提到的代码读取一个csv文件,选择一个列,然后使用regex进行解析以在一行中获取所有超链接,然后使用BeautifulSoup通过Hyperlink进行解析以获取页面的“标题字符串”
在运行这段代码时,我首先遇到UnicodeEncodeError
并解决了它;然后,我遇到了urllib.error.URLError
并也解决了这个问题。现在,我遇到了另一个
"Traceback (most recent call last): File "C:\Users\asaxena\Desktop\py4e\Gartner\crawler_new.py", line 32, in <modu le> title = soup.title.string AttributeError: 'NoneType' object has no attribute 'string'".
我真的有什么办法可以绕过出现的任何类型的错误?即使是未预见的?我知道BeautifulSoup倾向于抛出意想不到的错误,部分原因是网络上漫游的内容种类繁多。
答案 0（得分：0）
我终于解决了它，方法是将整个代码放在 try / except 块下：

try:
    # 将我所有的代码放在这里
except Exception as e:
    print("忽略错误")
该代码将能够处理所有类型的异常。