我正在尝试将HTML链接解析为代码,并将其源代码作为字符串列表。因为我必须使用从中获取一些相关数据,我将所有内容解码为UTF-8方案。
我也在使用beautifulsoup4,它以解码的形式提取文本。
这是我用过的代码。
def do_underline(line,mistakes):
last = u'</u></font>'
first = u"<u><font color='red'>"
a = [i.decode(encoding='UTF-8', errors='ignore') for i in line]
lenm = len(mistakes)
for i in range(lenm):
a.insert(mistakes[lenm-i-1][2],last)
a.insert(mistakes[lenm-i-1][1],first)
b = u''
return b.join(a)
def readURL(u):
"""
URL -> List
Opens a webpage's source code and extract it text
along with blank and new lines.
enumerate all lines.(including blank and new lines
"""
global line_dict,q
line_dict = {}
p = opener.open(u)
p1 = p.readlines()
q = [i.decode(encoding = 'UTF-8',errors='ignore') for i in p1]
q1 = [BeautifulSoup(i).get_text() for i in q]
q2 = list(enumerate(q1))
line_dict = {i:j for (i,j) in enumerate(q)}
return q2
def process_file(f):
"""
(.html file) -> List of Spelling Mistakes
"""
global line_dict
re = readURL(f)
de = del_blankempty(re)
fd = form_dict(de)
fflist = []
chklst = []
for i in fd:
chklst = chklst + list_braces(i,line_dict)
fflist = fflist + find_index_mistakes(i,fd)
final_list = list(set(is_inside_braces_or_not(chklst,fflist)))
final_dict = {i:sorted(list(set([final_list[j] for j in range(len(final_list)) if final_list[j][0] == i])),key=lambda student: student[1]) for i in fd}
for i in line_dict:
if i in fd:
line_dict[i] = do_underline(line_dict[i],final_dict[i])
else:
line_dict[i] = line_dict[i]
create_html_file(line_dict)
print "Your Task is completed"
def create_html_file(a):
import io
fl = io.open('Spellcheck1.html','w', encoding='UTF-8')
for i in a:
fl.write(a[i])
print "Your HTML text file is created"
每次运行脚本时都会出现以下错误。
Traceback (most recent call last):
File "checker.py", line 258, in <module>
process_file('https://www.fanfiction.net/s/9421614/1/The-Night-Blooming-Flower')
File "checker.py", line 243, in process_file
line_dict[i] = do_underline(line_dict[i],final_dict[i])
File "checker.py", line 89, in do_underline
a = [i.decode(encoding='UTF-8', errors='ignore') for i in line]
File "/usr/lib/python2.7/encodings/utf_8.py", line 16, in decode
return codecs.utf_8_decode(input, errors, True)
UnicodeEncodeError: 'ascii' codec can't encode character u'\xf3' in position 0: ordinal not in range(128)
有关我如何删除此错误的任何建议。 如果有一种方法可以将evrything解码为来自给定链接的UTF-8,那么我认为它将解决问题。