所以这就是重点,在Aptana 3中我没有例外,一切似乎都很好。
但是当我尝试直接在python中运行时,bucle会在某些时候停止,我猜它可能与某些特殊字符匹配,例如ó或类似的东西。
class Spider(HTMLParser):
def __init__(self, url):
HTMLParser.__init__(self)
string=[]
string2=[]
self.feed(urlopen(url).read()
def handle_starttag(self, tag, attrs):
if tag == 'a' and attrs:
title= attrs[1][1]
link= attrs[0][1]
string.append(title)
string2.append(link)
我在python中有点新手,所以我无法获得比这更好的异常信息:
line 33, in __init__ self.feed(urlopen(url).read()) File "/usr/lib/python2.6/HTMLParser.py", line 108,
in feed self.goahead(0) File "/usr/lib/python2.6/HTMLParser.py", line 148,
in goahead k = self.parse_starttag(i) File "/usr/lib/python2.6/HTMLParser.py", line 252,
in parse_starttag attrvalue = self.unescape(attrvalue)'])
加注输出:
File "parse.py", line 65, in <module>
Spider("http://...")
File "parse.py", line 33, in __init__
self.feed(urlopen(url).read())
File "/usr/lib/python2.6/HTMLParser.py", line 108, in feed
self.goahead(0)
File "/usr/lib/python2.6/HTMLParser.py", line 148, in goahead
k = self.parse_starttag(i)
File "/usr/lib/python2.6/HTMLParser.py", line 252, in parse_starttag
attrvalue = self.unescape(attrvalue)
File "/usr/lib/python2.6/HTMLParser.py", line 390, in unescape
return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)
File "/usr/lib/python2.6/re.py", line 151, in sub
return _compile(pattern, 0).sub(repl, string, count)
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 2: ordinal not in range(128)
File "parse.py", line 65, in <module>
Spider("http://...")
File "parse.py", line 33, in __init__
self.feed(urlopen(url).read())
File "/usr/lib/python2.6/HTMLParser.py", line 108, in feed
self.goahead(0)
File "/usr/lib/python2.6/HTMLParser.py", line 148, in goahead
k = self.parse_starttag(i)
File "/usr/lib/python2.6/HTMLParser.py", line 252, in parse_starttag
attrvalue = self.unescape(attrvalue)
File "/usr/lib/python2.6/HTMLParser.py", line 390, in unescape
return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)
File "/usr/lib/python2.6/re.py", line 151, in sub
return _compile(pattern, 0).sub(repl, string, count)
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 2: ordinal not in range(128)
现在我可以通过更改此内容来跳过错误:
但仍然是一个丑陋的解决方案,我需要在文中加入口音。
我试过
self.feed(unicode(urlopen(url).read(),errors='replace'))
self.feed(unicode(urlopen(url).read(),errors='replace'))
但没有改变。
答案 0 :(得分:1)
答案 1 :(得分:0)
不幸的是,Max的解决方案无效。
AttributeError:'module'对象没有属性'setdefaultencoding'
我终于通过添加:
来实现它z=urlopen(url).read().decode('utf-8')
self.feed(z)