我在下面发布的代码产生了如下错误,我无法弄清楚原因,也不知道该如何解决。如果有人能提供帮助,我将非常感激。谢谢!
Traceback (most recent call last):
File "C:\Users\Robert\Documents\j-a-c-o-b\newlc.py", line 99, in <module>
main()
File "C:\Users\Robert\Documents\j-a-c-o-b\newlc.py", line 76, in main
for final_url in pool.imap(handle_listing, listings):
File "C:\Python27\lib\site-packages\eventlet-0.9.16-py2.7.egg\eventlet\greenpool.py", line 232, in next
val = self.waiters.get().wait()
File "C:\Python27\lib\site-packages\eventlet-0.9.16-py2.7.egg\eventlet\greenthread.py", line 166, in wait
return self._exit_event.wait()
File "C:\Python27\lib\site-packages\eventlet-0.9.16-py2.7.egg\eventlet\event.py", line 120, in wait
current.throw(*self._exc)
File "C:\Python27\lib\site-packages\eventlet-0.9.16-py2.7.egg\eventlet\greenthread.py", line 192, in main
result = function(*args, **kwargs)
File "C:\Users\Robert\Documents\j-a-c-o-b\newlc.py", line 48, in handle_listing
yellow_page = BeautifulSoup(download(yellow_page_url))
File "build\bdist.win32\egg\BeautifulSoup.py", line 1519, in __init__
BeautifulStoneSoup.__init__(self, *args, **kwargs)
File "build\bdist.win32\egg\BeautifulSoup.py", line 1144, in __init__
self._feed(isHTML=isHTML)
File "build\bdist.win32\egg\BeautifulSoup.py", line 1168, in _feed
smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
File "build\bdist.win32\egg\BeautifulSoup.py", line 1770, in __init__
self._detectEncoding(markup, isHTML)
File "build\bdist.win32\egg\BeautifulSoup.py", line 1915, in _detectEncoding
'^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
TypeError: expected string or buffer
我不明白这个错误想要表达什么,也不知道它意味着什么……
这是我的代码:
from gzip import GzipFile
from cStringIO import StringIO
import re
import webbrowser
import time
from difflib import SequenceMatcher
import os
import sys
from BeautifulSoup import BeautifulSoup
import eventlet
from eventlet.green import urllib2
import urllib2
import urllib
def download(url):
print "Downloading:", url
s = urllib2.urlopen(url).read()
if s[:2] == '\x1f\x8b':
ifh = GzipFile(mode='rb', fileobj=StringIO(s))
s = ifh.read()
print "Downloaded: ", url
return s
def replace_chars(text, replacements):
    """Return *text* with each character swapped for its entry in the
    *replacements* mapping; characters without an entry pass through."""
    pieces = []
    for ch in text:
        pieces.append(replacements.get(ch, ch))
    return ''.join(pieces)
def handle_listing(listing_url):
    """Resolve one locationary listing page to a proxy URL, or None.

    Downloads the listing, extracts the business title and address from
    the page title, searches yellowpages.com for a matching business and,
    when the street addresses are sufficiently similar, builds the final
    locationary proxy URL.

    Fix: the original fell through when TITLE_MATCH or ADDRESS_MATCH
    failed, leaving `title`/`address` unbound and raising NameError when
    the yellowpages URL was built.  Guard clauses now return None early
    on every failed step instead.
    """
    listing_document = BeautifulSoup(download(listing_url))
    # Ignore pages that already link to yellowpages.
    if listing_document.find(
            "a",
            href=re.compile(re.escape("http://www.yellowpages.com/") + ".*")):
        return None

    listing_title = listing_document.title.text
    alfa = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    # NOTE(review): alfa[1-26] is alfa[-25] == 'B', so this key is the
    # literal '-Suite-B'.  A slice (or one key per letter) was probably
    # intended -- confirm before relying on this behavior.
    reps = {' ': '-', ',': '', '\'': '', '[': '', ']': '',
            '-Suite-' + alfa[1-26]: ''}

    title_match = TITLE_MATCH.match(listing_title)
    if title_match is None:
        return None
    title, = title_match.groups()

    address_match = ADDRESS_MATCH.match(listing_title)
    if address_match is None:
        return None
    address, = address_match.groups()

    yellow_page_url = "http://www.yellowpages.com/%s/%s?order=distance" % (
        replace_chars(address, reps),
        replace_chars(title, reps),
    )
    yellow_page = BeautifulSoup(download(yellow_page_url))
    page_url = yellow_page.find("h3", {"class": "business-name fn org"})
    if not page_url:
        return None
    page_url = page_url.a["href"]

    page = BeautifulSoup(download(page_url))
    yellow_page_address = page.find("span", {"class": "street-address"})
    if not yellow_page_address:
        return None
    # Require at least 50% similarity between the two street addresses.
    if SequenceMatcher(None, address, yellow_page_address.text).ratio() < 0.5:
        return None

    pid, = re.search(r'p(\d{5,20})\.jsp', listing_url).groups(0)
    page_escaped = replace_chars(
        page_url, {':': '%3A', '/': '%2F', '?': '%3F', '=': '%3D'})
    final_url = "http://www.locationary.com/access/proxy.jsp?ACTION_TOKEN=proxy_jsp$JspView$SaveAction&inPlaceID=%s&xxx_c_1_f_987=%s" % (
        pid, page_escaped)
    return final_url
def main():
pool = eventlet.GreenPool()
listings_document = BeautifulSoup(download(START_URL))
listings = listings_document.findAll("a", href = LOCATION_LISTING)
listings = [listing['href'] for listing in listings]
for final_url in pool.imap(handle_listing, listings):
print final_url
"""
if str(final_url) is not None:
url = str(final_url)
req = urllib2.Request(url)
response = urllib2.urlopen(req)
page = response.read()
time.sleep(2)
"""
# Module-level crawl configuration.
# Fix: the original wrapped these assignments in `for a in range(0, 1):`,
# a single-iteration loop with no effect; plain module-level assignments
# are equivalent and clearer.
START_URL = 'http://www.locationary.com/place/en/US/Arkansas/Fayetteville-page2/?ACTION_TOKEN=NumericAction'
# Page title -> business name, e.g. "Joe's Diner (12345 reviews)".
TITLE_MATCH = re.compile(r'(.*) \(\d{1,10}.{1,100}\)$')
# Page title -> street address inside "(addr, State, United States)".
ADDRESS_MATCH = re.compile(r'.{1,100}\((.*), .{4,14}, United States\)$')
# Matches locationary place-listing URLs.
LOCATION_LISTING = re.compile(r'http://www\.locationary\.com/place/en/US/.{1,50}/.{1,50}/.{1,100}\.jsp')
# Standard script entry guard: run the crawler only when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    main()
答案 0(得分:3)
新手使用任何支持异常的语言所犯的一个非常常见的错误是他们捕获了他们实际上没有处理的异常。这会导致难以调试的错误,因为它会破坏程序的正常流程。
具体而言,在 `download()` 中捕获 `urllib2.HTTPError` 会阻止实际问题传播到程序的其余部分。要么完全删除该异常处理程序,要么在处理程序的末尾加上 `raise` 重新抛出异常,以保持正常的控制流程。