Question

我正在尝试将HTML链接解析为代码，并将其源代码作为字符串列表。因为我必须使用从中获取一些相关数据，我将所有内容解码为UTF-8方案。

我也在使用beautifulsoup4，它以解码的形式提取文本。

这是我用过的代码。

def do_underline(line,mistakes):
    last = u'</u></font>'
    first = u"<u><font color='red'>"
    a = [i.decode(encoding='UTF-8', errors='ignore') for i in line]
    lenm = len(mistakes)
    for i in range(lenm):
        a.insert(mistakes[lenm-i-1][2],last)
        a.insert(mistakes[lenm-i-1][1],first)
    b = u''
    return b.join(a)

def readURL(u):
    """
    URL -> List

    Opens a webpage's source code and extract it text
    along with blank and new lines.
    enumerate all lines.(including blank and new lines

    """
    global line_dict,q
    line_dict = {}
    p = opener.open(u)
    p1 = p.readlines()
    q = [i.decode(encoding = 'UTF-8',errors='ignore') for i in p1]
    q1 = [BeautifulSoup(i).get_text() for i in q]
    q2 = list(enumerate(q1))
    line_dict = {i:j for (i,j) in enumerate(q)}
    return q2

def process_file(f):
    """
    (.html file) -> List of Spelling Mistakes
    """
    global line_dict
    re = readURL(f)
    de = del_blankempty(re)
    fd = form_dict(de)

    fflist = []
    chklst = []

    for i in fd:
        chklst = chklst + list_braces(i,line_dict)
        fflist = fflist + find_index_mistakes(i,fd)

    final_list = list(set(is_inside_braces_or_not(chklst,fflist)))

    final_dict = {i:sorted(list(set([final_list[j] for j in range(len(final_list)) if final_list[j][0] == i])),key=lambda student: student[1]) for i in fd}

    for i in line_dict:
        if i in fd:
            line_dict[i] = do_underline(line_dict[i],final_dict[i])
        else:
            line_dict[i] = line_dict[i]

    create_html_file(line_dict)
    print "Your Task is completed"

def create_html_file(a):
    import io
    fl = io.open('Spellcheck1.html','w', encoding='UTF-8')
    for i in a:
        fl.write(a[i])
    print "Your HTML text file is created"

每次运行脚本时都会出现以下错误。

Traceback (most recent call last):
  File "checker.py", line 258, in <module>
    process_file('https://www.fanfiction.net/s/9421614/1/The-Night-Blooming-Flower')
  File "checker.py", line 243, in process_file
    line_dict[i] = do_underline(line_dict[i],final_dict[i])
  File "checker.py", line 89, in do_underline
    a = [i.decode(encoding='UTF-8', errors='ignore') for i in line]
  File "/usr/lib/python2.7/encodings/utf_8.py", line 16, in decode
    return codecs.utf_8_decode(input, errors, True)
UnicodeEncodeError: 'ascii' codec can't encode character u'\xf3' in position 0: ordinal not in range(128)

有关我如何删除此错误的任何建议。如果有一种方法可以将evrything解码为来自给定链接的UTF-8，那么我认为它将解决问题。

＆＃39; ASCII＆＃39;编解码器无法对字符进行编码

0 个答案: