My code is:
import os, inspect
import urllib.request
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import csv

# Get files which are to be read
for root, dirs, files in os.walk(os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) + "/converted_html_files"):
    # print(files)
    filenames = files

# Extracting reasons from the files (based on detection of keywords such as 'because', 'due to' etc.)
reasons_list = []
for file in filenames:
    url = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) + "/converted_html_files/" + file
    if url.split('.')[-1] == 'html':
        HtmlFile = open(url, 'r', encoding='utf-8')
        source_code = HtmlFile.read()
        soup = BeautifulSoup(source_code, "html5lib")
        # kill all script and style elements
        for script in soup(["script", "style"]):
            script.extract()  # rip it out
        # get text
        text = soup.get_text()
    elif url.split('.')[-1] == 'htm':
        source_code = open(url).read()
        soup = BeautifulSoup(source_code, "html5lib")
        # kill all script and style elements
        for script in soup(["script", "style"]):
            script.extract()  # rip it out
        # get text
        text = soup.get_text()
    else:
        text = open(url).read()

    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)
    text = text.replace('The', '. The')
    text = text.replace('Quest', '. Quest')
    text = text.replace('\n', ' ')
    sentences = sent_tokenize(text)
    my_sentence = []
    for sent in sentences:
        if 'because' in word_tokenize(sent):
            my_sentence += [sent]
        elif 'due' in word_tokenize(sent):
            my_sentence += [sent]
        elif 'arisen' in word_tokenize(sent):
            my_sentence += [sent]
        elif 'raised' in word_tokenize(sent):
            my_sentence += [sent]
    reasons_list += [(int(file.split('_')[0]), my_sentence)]
    # print([int(file.split('_')[0]), my_sentence])

# Sorting the list of reasons
reasons_list = sorted(reasons_list, key=lambda x: x[0])

# Writing the reasons in the Excel file that was created while downloading
# the company files and saving it in a new CSV file
in_file = open("output.csv", "rt")
reader = csv.reader(in_file)
out_file = open("output2.csv", "wt")
writer = csv.writer(out_file)
i = 1
j = 0
for row in reader:
    # print(row)
    # writer.writerow(row)
    if i is 1:
        writer.writerow([row[0], row[1], row[2], row[3], "Reasons"])
    elif i > 1 and (i % 2 is 1):
        # writer.writerow(row)
        writer.writerow([row[0], row[1], row[2], row[3], ' '.join(reasons_list[j][1])])
        j = j + 1
    i = i + 1
in_file.close()
out_file.close()
I get this error:
Traceback (most recent call last):
  File "/Users/Rahul/Downloads/3. Extracting_reasons.py", line 31, in <module>
    source_code = open(url).read()
  File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/codecs.py", line 321, in decode
    (result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa0 in position 4639: invalid start byte

Process finished with exit code 1
How can I fix this?
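From the traceback I guess the failing call is the bare open(url).read() in the .htm branch (the else branch has the same problem): those files are opened without an explicit encoding, and byte 0xa0 is a non-breaking space in Latin-1/Windows-1252, so the file is probably not valid UTF-8. Below is a minimal sketch of the fix I am considering, assuming it is acceptable to replace undecodable bytes rather than detect each file's real encoding (the read_text helper is my own invention, not something already in my script):

def read_text(path):
    # My assumption: errors='replace' substitutes U+FFFD for bytes that are
    # not valid UTF-8 (such as 0xa0), so read() no longer raises
    # UnicodeDecodeError
    with open(path, 'r', encoding='utf-8', errors='replace') as f:
        return f.read()

source_code = read_text(url)  # instead of: source_code = open(url).read()

Or, if the files are known to be Windows-1252, would opening them with encoding='cp1252' be the more correct fix?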