I am new to Python and am trying to write a very simple web crawler. It mostly works, but it sometimes gets stuck on a single link for a long time. How can I set a timeout?
Also, how should I handle urllib2.HTTPError? Is my except clause correct?
def get_link(page):
    start = page.find('<a href=')
    if start == -1:
        return None, 0
    startp = page.find('"', start)
    endp = page.find('"', startp + 1)
    url = page[startp + 1:endp]
    return url, endp

def get_all_link(page):
    allurl = []
    while True:
        url, endp = get_link(page)
        if url:
            page = page[endp:]
            allurl.append(url)
        else:
            return allurl
            break

def get_page(page, tocrawl):
    import urllib2
    try:
        page_source = urllib2.urlopen(page)
        return page_source.read()
    except:
        page = tocrawl.pop()
        raise

def validate(page):
    valid = page.find('http')
    if valid == -1:
        return 0
    return 1

def crawler(seed):
    tocrawl = [seed]
    crawled = []
    i = 0
    while tocrawl:
        page = tocrawl.pop()
        valid = validate(page)
        if valid:
            if page not in crawled:
                tocrawl = set(tocrawl) | set(get_all_link(get_page(page, tocrawl)))
                crawled.append(page)
                i = i + 1
                f = open("crawled.txt", "a")
                f.write(repr(i) + " : " + repr(page) + "\n")
                f.close()
    return crawled

crawler("http://google.com")