我在下面发布的代码产生了如下错误,我无法弄清楚原因,也不知道该如何解决。如果有人能提供帮助,我将非常感激。谢谢!
Traceback (most recent call last):
File "C:\Users\Robert\Documents\j-a-c-o-b\newlc.py", line 99, in <module>
main()
File "C:\Users\Robert\Documents\j-a-c-o-b\newlc.py", line 76, in main
for final_url in pool.imap(handle_listing, listings):
File "C:\Python27\lib\site-packages\eventlet-0.9.16-py2.7.egg\eventlet\greenpool.py", line 232, in next
val = self.waiters.get().wait()
File "C:\Python27\lib\site-packages\eventlet-0.9.16-py2.7.egg\eventlet\greenthread.py", line 166, in wait
return self._exit_event.wait()
File "C:\Python27\lib\site-packages\eventlet-0.9.16-py2.7.egg\eventlet\event.py", line 120, in wait
current.throw(*self._exc)
File "C:\Python27\lib\site-packages\eventlet-0.9.16-py2.7.egg\eventlet\greenthread.py", line 192, in main
result = function(*args, **kwargs)
File "C:\Users\Robert\Documents\j-a-c-o-b\newlc.py", line 48, in handle_listing
yellow_page = BeautifulSoup(download(yellow_page_url))
File "build\bdist.win32\egg\BeautifulSoup.py", line 1519, in __init__
BeautifulStoneSoup.__init__(self, *args, **kwargs)
File "build\bdist.win32\egg\BeautifulSoup.py", line 1144, in __init__
self._feed(isHTML=isHTML)
File "build\bdist.win32\egg\BeautifulSoup.py", line 1168, in _feed
smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
File "build\bdist.win32\egg\BeautifulSoup.py", line 1770, in __init__
self._detectEncoding(markup, isHTML)
File "build\bdist.win32\egg\BeautifulSoup.py", line 1915, in _detectEncoding
'^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
TypeError: expected string or buffer
我不明白这个错误想要表达什么,也不知道它意味着什么……
这是我的代码:
from gzip import GzipFile
from cStringIO import StringIO
import re
import webbrowser
import time
from difflib import SequenceMatcher
import os
import sys
from BeautifulSoup import BeautifulSoup
import eventlet
from eventlet.green import urllib2
import urllib2
import urllib
def download(url):
print "Downloading:", url
s = urllib2.urlopen(url).read()
if s[:2] == '\x1f\x8b':
ifh = GzipFile(mode='rb', fileobj=StringIO(s))
s = ifh.read()
print "Downloaded: ", url
return s
def replace_chars(text, replacements):
    """Return *text* with each character swapped for its entry in the
    *replacements* mapping; characters without an entry pass through."""
    pieces = []
    for ch in text:
        pieces.append(replacements.get(ch, ch))
    return ''.join(pieces)
def handle_listing(listing_url):
    """Resolve one locationary listing page to a proxy URL, or None.

    Downloads the listing, extracts the business title and address from
    the page title, searches yellowpages.com for a matching business and,
    when the street addresses are sufficiently similar, builds the final
    locationary proxy URL.

    Fix: the original fell through when TITLE_MATCH or ADDRESS_MATCH
    failed, leaving `title`/`address` unbound and raising NameError when
    the yellowpages URL was built.  Guard clauses now return None early
    on every failed step instead.
    """
    listing_document = BeautifulSoup(download(listing_url))
    # Ignore pages that already link to yellowpages.
    if listing_document.find(
            "a",
            href=re.compile(re.escape("http://www.yellowpages.com/") + ".*")):
        return None

    listing_title = listing_document.title.text
    alfa = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    # NOTE(review): alfa[1-26] is alfa[-25] == 'B', so this key is the
    # literal '-Suite-B'.  A slice (or one key per letter) was probably
    # intended -- confirm before relying on this behavior.
    reps = {' ': '-', ',': '', '\'': '', '[': '', ']': '',
            '-Suite-' + alfa[1-26]: ''}

    title_match = TITLE_MATCH.match(listing_title)
    if title_match is None:
        return None
    title, = title_match.groups()

    address_match = ADDRESS_MATCH.match(listing_title)
    if address_match is None:
        return None
    address, = address_match.groups()

    yellow_page_url = "http://www.yellowpages.com/%s/%s?order=distance" % (
        replace_chars(address, reps),
        replace_chars(title, reps),
    )
    yellow_page = BeautifulSoup(download(yellow_page_url))
    page_url = yellow_page.find("h3", {"class": "business-name fn org"})
    if not page_url:
        return None
    page_url = page_url.a["href"]

    page = BeautifulSoup(download(page_url))
    yellow_page_address = page.find("span", {"class": "street-address"})
    if not yellow_page_address:
        return None
    # Require at least 50% similarity between the two street addresses.
    if SequenceMatcher(None, address, yellow_page_address.text).ratio() < 0.5:
        return None

    pid, = re.search(r'p(\d{5,20})\.jsp', listing_url).groups(0)
    page_escaped = replace_chars(
        page_url, {':': '%3A', '/': '%2F', '?': '%3F', '=': '%3D'})
    final_url = "http://www.locationary.com/access/proxy.jsp?ACTION_TOKEN=proxy_jsp$JspView$SaveAction&inPlaceID=%s&xxx_c_1_f_987=%s" % (
        pid, page_escaped)
    return final_url
def main():
pool = eventlet.GreenPool()
listings_document = BeautifulSoup(download(START_URL))
listings = listings_document.findAll("a", href = LOCATION_LISTING)
listings = [listing['href'] for listing in listings]
for final_url in pool.imap(handle_listing, listings):
print final_url
"""
if str(final_url) is not None:
url = str(final_url)
req = urllib2.Request(url)
response = urllib2.urlopen(req)
page = response.read()
time.sleep(2)
"""
# Module-level crawl configuration.
# Fix: the original wrapped these assignments in `for a in range(0, 1):`,
# a single-iteration loop with no effect; plain module-level assignments
# are equivalent and clearer.
START_URL = 'http://www.locationary.com/place/en/US/Arkansas/Fayetteville-page2/?ACTION_TOKEN=NumericAction'
# Page title -> business name, e.g. "Joe's Diner (12345 reviews)".
TITLE_MATCH = re.compile(r'(.*) \(\d{1,10}.{1,100}\)$')
# Page title -> street address inside "(addr, State, United States)".
ADDRESS_MATCH = re.compile(r'.{1,100}\((.*), .{4,14}, United States\)$')
# Matches locationary place-listing URLs.
LOCATION_LISTING = re.compile(r'http://www\.locationary\.com/place/en/US/.{1,50}/.{1,50}/.{1,100}\.jsp')
# Standard script entry guard: run the crawler only when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    main()
答案 0(得分:3)
新手使用任何支持异常的语言所犯的一个非常常见的错误是他们捕获了他们实际上没有处理的异常。这会导致难以调试的错误,因为它会破坏程序的正常流程。
具体而言,在 `download()` 中捕获 `urllib2.HTTPError` 会阻止实际问题传播到程序的其余部分。要么完全删除该异常处理程序,要么在处理程序的末尾加上 `raise` 重新抛出异常,以保持正常的控制流程。