我使用的模式是获取纬度,经度,网址等等:
pattern = "<article.*?latitude="(.*?)".*?longitude="(.*?)">.*?<a href="(.*?)".*?<figcaption.*?>(.*?)</figcaption>.*?</a>.*?<dt class="listing-type zsg-content_collapsed"><span.*?></span>(.*?)</dt>"
有时它运行良好,但有时它会挂起
re.findall(pattern, page)
。
代码段为:
def getPage(strUrl):
socket.setdefaulttimeout(60)
try:
request = urllib2.Request(strUrl)
#mock browser
request.add_header("User-Agent","Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20100101 Firefox/21.0")
response = urllib2.urlopen(request)
except urllib2.URLError, e:
print "Bad Url or timeout"
print type(e)
print e
page = ''
except socket.timeout,e:
print "socket timeout"
print type(e)
print e
page = ''
else:
page = response.read().decode('utf8')
print "Get page contents successfully"
return m_page
def getHouseInfo(self,strRegularExpr,page):
if strRegularExpr=='' or page=='':
print "regular expression is null or page is null."
return False
pattern = re.compile(strRegularExpr,re.S)
items = re.findall(pattern,page)
return items
def getHomeDetailLinks(page):
print "<<<<<<Get links starts>>>>>>"
items = getHouseInfo(mapRe['homeDetailLinks'],page)
print items
print "<<<<<<Get links ends>>>>>>"
return items
page = getPage("http://www.zillow.com/homes/for_rent/02138_rb/1_p")
temp = getHomeDetailLinks(page)
print temp
当它挂起时,我必须使用CTRL + C来阻止它,它会抛出异常:
^CTraceback (most recent call last):
...
items = self.getHouseInfo(mapRe['homeDetailLinks'],page)
File ".../crawlbase.py", line 59, in getHouseInfo
items = re.findall(pattern,page)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/re.py", line 177, in findall
return _compile(pattern, flags).findall(string)
KeyboardInterrupt
有人能给我一些线索来解决这个问题吗?