Python Unicodedecode错误

时间:2017-07-13 05:37:00

标签: python error-handling


import os, inspect
import urllib.request
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import csv

# Get files which are to be read
for root, dirs, files in os.walk(os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) + "/converted_html_files"):
    # print(files)
    filenames = files

# Extracting reasons from the files (based on detection of keywords such as 'because', 'due to' etc.)

reasons_list = []

for file in filenames:

 url = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) + "/converted_html_files/" + file

if url.split('.')[-1] == 'html':
        HtmlFile = open(url, 'r',encoding= 'utf-8' )
        source_code =
        soup = BeautifulSoup(source_code, "html5lib")

        # kill all script and style elements
        for script in soup(["script", "style"]):
            script.extract()    # rip it out

        # get text
        text = soup.get_text()

    elif url.split('.')[-1] == 'htm':
        source_code = open(url).read()
        soup = BeautifulSoup(source_code, "html5lib")

        # kill all script and style elements
        for script in soup(["script", "style"]):
            script.extract()    # rip it out

        # get text
        text = soup.get_text()

        text = open(url).read()

    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)

    text = text.replace('The', '. The')
    text = text.replace('Quest', '. Quest')
    text = text.replace('\n', ' ')


    for sent in sentences:
        if 'because' in word_tokenize(sent):
            my_sentence += [sent]
        elif 'due' in word_tokenize(sent):
            my_sentence += [sent]
        elif 'arisen' in word_tokenize(sent):
            my_sentence += [sent]
        elif 'raised' in word_tokenize(sent):
            my_sentence += [sent]

    reasons_list += [(int(file.split('_')[0]), my_sentence)]
    # print([int(file.split('_')[0]), my_sentence])

   # Sorting the list of reasons
   reasons_list = sorted(reasons_list, key=lambda x: x[0])

   # Writing the reasons in the excel file that was created while 
  downloading the company files and saving it in new CSV file

  in_file = open("output.csv", "rt")
  reader = csv.reader(in_file)
  out_file = open("output2.csv", "wt")
   writer = csv.writer(out_file)
  for row in reader:
    # print(row)
    # writer.writerow(row)
     if i is 1:
        writer.writerow([row[0], row[1], row[2], row[3], "Reasons"])
     elif i > 1 and (i%2 is 1):
        # writer.writerow(row)
        writer.writerow([row[0], row[1], row[2], row[3], ' '.join(reasons_list[j][1])])


Traceback (most recent call last):
  File "/Users/Rahul/Downloads/3.", line 31, in <module>
    source_code = open(url).read()
  File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/", line 321, in decode
    (result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa0 in position 4639: invalid start byte

Process finished with exit code 1


0 个答案:
