当我运行以下代码时,我不断收到此错误:
Traceback (most recent call last):
File "C:\Users\Robert\Documents\j-a-c-o-b\newlc.py", line 94, in <module>
main()
File "C:\Users\Robert\Documents\j-a-c-o-b\newlc.py", line 71, in main
for final_url in pool.imap(handle_listing, listings):
File "C:\Python27\lib\site-packages\eventlet-0.9.16-py2.7.egg\eventlet\greenpool.py", line 232, in next
val = self.waiters.get().wait()
File "C:\Python27\lib\site-packages\eventlet-0.9.16-py2.7.egg\eventlet\greenthread.py", line 166, in wait
return self._exit_event.wait()
File "C:\Python27\lib\site-packages\eventlet-0.9.16-py2.7.egg\eventlet\event.py", line 120, in wait
current.throw(*self._exc)
File "C:\Python27\lib\site-packages\eventlet-0.9.16-py2.7.egg\eventlet\greenthread.py", line 192, in main
result = function(*args, **kwargs)
File "C:\Users\Robert\Documents\j-a-c-o-b\newlc.py", line 35, in handle_listing
title, = TITLE_MATCH.match(listing_title).groups()
AttributeError: 'NoneType' object has no attribute 'groups'
有什么问题?
它与Title匹配有关,但我不知道如何修复它!
如果你能帮助我,我会非常感激!
谢谢!
from gzip import GzipFile
from cStringIO import StringIO
import re
import webbrowser
import time
from difflib import SequenceMatcher
import os
import sys
from BeautifulSoup import BeautifulSoup
import eventlet
from eventlet.green import urllib2
import urllib2
import urllib
def download(url):
print "Downloading:", url
s = urllib2.urlopen(url).read()
if s[:2] == '\x1f\x8b':
ifh = GzipFile(mode='rb', fileobj=StringIO(s))
s = ifh.read()
print "Downloaded: ", url
return s
def replace_chars(text, replacements):
return ''.join(replacements.get(x,x) for x in text)
def handle_listing(listing_url):
listing_document = BeautifulSoup(download(listing_url))
# ignore pages that link to yellowpages
if not listing_document.find("a", href=re.compile(re.escape("http://www.yellowpages.com/") + ".*")):
listing_title = listing_document.title.text
reps = {' ':'-', ',':'', '\'':'', '[':'', ']':''}
title, = TITLE_MATCH.match(listing_title).groups()
address, = ADDRESS_MATCH.match(listing_title).groups()
yellow_page_url = "http://www.yellowpages.com/%s/%s?order=distance" % (
replace_chars(address, reps),
replace_chars(title, reps),
)
yellow_page = BeautifulSoup(download(yellow_page_url))
page_url = yellow_page.find("h3", {"class" : "business-name fn org"})
if page_url:
page_url = page_url.a["href"]
business_name = title[:title.index(",")]
page = BeautifulSoup(download(page_url))
yellow_page_address = page.find("span", {"class" : "street-address"})
if yellow_page_address:
if SequenceMatcher(None, address, yellow_page_address.text).ratio() >= 0.5:
pid, = re.search(r'p(\d{5,20})\.jsp', listing_url).groups(0)
page_escaped = replace_chars(page_url, {':':'%3A', '/':'%2F', '?':'%3F', '=':'%3D'})
final_url = "http://www.locationary.com/access/proxy.jsp?ACTION_TOKEN=proxy_jsp$JspView$SaveAction&inPlaceID=%s&xxx_c_1_f_987=%s" % (
pid, page_escaped)
return final_url
def main():
pool = eventlet.GreenPool()
listings_document = BeautifulSoup(download(START_URL))
listings = listings_document.findAll("a", href = LOCATION_LISTING)
listings = [listing['href'] for listing in listings]
for final_url in pool.imap(handle_listing, listings):
print final_url
if str(final_url) is not None:
url = str(final_url)
req = urllib2.Request(url)
response = urllib2.urlopen(req)
page = response.read()
time.sleep(2)
for a in range(2,3):
START_URL = 'http://www.locationary.com/place/en/US/New_Jersey/Randolph-page' + str(a) + '/?ACTION_TOKEN=NumericAction'
TITLE_MATCH = re.compile(r'(.*) \(\d{1,10}.{1,100}\)$')
ADDRESS_MATCH = re.compile(r'.{1,100}\((.*), .{4,14}, United States\)$')
LOCATION_LISTING = re.compile(r'http://www\.locationary\.com/place/en/US/.{1,50}/.{1,50}/.{1,100}\.jsp')
if __name__ == '__main__':
main()
答案 0 :(得分:4)
引用您的错误:
title,= TITLE_MATCH.match(listing_title).groups()
AttributeError:&#39; NoneType&#39;对象没有属性&#39; groups&#39;
TITLE_MATCH.match(listing_title)
会返回None
,因此您无法致电.groups()
。
答案 1 :(得分:2)
当re
.match
找不到匹配的内容时,会返回None
。由于您无法在.groups()
上致电None
,因此您必须先检查匹配项。要做到这一点:
改变这个:
title, = TITLE_MATCH.match(listing_title).groups()
address, = ADDRESS_MATCH.match(listing_title).groups()
对此:
titleMatch = TITLE_MATCH.match(listing_title)
if titleMatch:
title, = titleMatch.groups()
else:
# handle it
addressMatch = ADDRESS_MATCH.match(listing_title)
if addressMatch:
address, = addressMatch.groups()
else:
# handle it