How do I set a timeout in a web crawler?

Time: 2014-10-28 05:55:54

Tags: python timeout web-crawler urllib2

I'm new to Python and am trying to write a very simple web crawler. The crawler mostly works, but it sometimes gets stuck on a single link for a long time. How can I set a timeout?
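
Is something like this the right way to do it? (Just a rough sketch I put together; the function name fetch and the 10-second value are only examples, and I'm assuming an expired timeout can show up either as socket.timeout or wrapped in urllib2.URLError, so I catch both.)

import socket
import urllib2

def fetch(url, timeout_seconds=10):
    # Rough sketch: urlopen has accepted an optional timeout (in seconds) since Python 2.6.
    try:
        return urllib2.urlopen(url, timeout=timeout_seconds).read()
    except socket.timeout:
        # the read itself took too long
        print "timed out:", url
        return None
    except urllib2.URLError as e:
        # connection failure, including a timeout while connecting
        print "failed:", url, e.reason
        return None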

Also, how should I handle urllib2.HTTPError? Is the except clause in my code below correct?

def get_link(page):
    # Find the first '<a href=' in the page and return (url, end_position);
    # returns (None, 0) when there are no more links.
    start = page.find('<a href=')
    if start == -1:
        return None, 0
    startp = page.find('"', start)
    endp = page.find('"', startp + 1)
    url = page[startp + 1:endp]
    return url, endp

def get_all_link(page):
    # Collect every href on the page by repeatedly taking the first link
    # and continuing from the end of it.
    allurl = []
    while True:
        url, endp = get_link(page)
        if url:
            page = page[endp:]
            allurl.append(url)
        else:
            return allurl

def get_page(page, tocrawl):
    import urllib2
    try:
        page_source = urllib2.urlopen(page)
        return page_source.read()
    except:
        # on any error, pull the next page off the frontier and re-raise
        page = tocrawl.pop()
        raise

def validate(page):
    # Crude check that the string looks like a URL (it merely contains 'http').
    valid = page.find('http')
    if valid == -1:
        return 0
    return 1


def crawler(seed):
    tocrawl = [seed]
    crawled = []
    i = 0

    while tocrawl:
        page = tocrawl.pop()
        valid = validate(page)
        if valid:
            if page not in crawled:
                # merge the links found on this page into the frontier
                # (note: tocrawl becomes a set after the first iteration)
                tocrawl = set(tocrawl) | set(get_all_link(get_page(page, tocrawl)))
                crawled.append(page)
                i = i + 1
                f = open("crawled.txt", "a")
                f.write(repr(i) + " : " + repr(page) + "\n")
                f.close()
    return crawled

crawler("http://google.com")

0 Answers:

No answers