I'm running a script that is supposed to scrape about 150,000 pages from a website and write the information to an Excel sheet. I've found that the program tends to stop working every 50 pages or so, which is why I output 20 pages at a time. How can I make my script detect that it has been inactive for, say, a minute, and then restart from where it left off?
Here is my code:
def main():
    pass

if __name__ == '__main__':
    main()

import urllib2
import re
import time

def get_next_target(page):
    # Find the next question link in the listing page's HTML.
    start_link = page.find('<tr onclick=' + '"openurl(')
    if start_link == -1:
        return None, 0
    start_quote = page.find("'", start_link)
    end_quote = page.find("</tr", start_quote + 1)
    url = page[start_quote + 1:end_quote]
    url = "http://www.kipa.co.il/" + url
    return url, end_quote

def get_all_links(page):
    # Collect every link on the page by scanning forward repeatedly.
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url:
            links.append(url)
            page = page[endpos:]
        else:
            break
    return links

def split_qa(qa):
    # Split the extracted text into question and answer around the banner script.
    splitfrom = qa.find('show_banner')
    split = qa.find("s", splitfrom)
    q = qa[0:split]
    split = qa.find(");", splitfrom)
    a = qa[split + 2:]
    return q, a

def clear_line(page):
    # Strip HTML tags character by character, keeping only the text between them.
    newpage = ''
    add = 'yes'
    for extract in page:
        if extract == '<':
            add = 'no'
        if add == 'yes':
            newpage = newpage + extract
        if extract == '>':
            add = 'yes'
    q, a = split_qa(newpage)
    return q, a
def get_content(url):
    # Download a question page and pull out the question/answer text.
    response = urllib2.urlopen(url)
    page = response.read()
    page = page.decode('utf-8')
    start_link = page.find('<p class="padd10">')
    if start_link == -1:
        return None, 0
    start_quote = page.find("<strong>", start_link)
    end_quote = page.find('<p class="padd10 ravName">', start_quote + 1)
    content = page[start_quote:end_quote]
    q, a = clear_line(content)
    return q, a
import xlsxwriter

print('where to start?')
i = int(raw_input())

for sheet in range(i, 6760):
    # One workbook per listing page, so finished pages survive a crash.
    workbook = xlsxwriter.Workbook('kipa' + str(sheet) + '.xlsx')
    worksheet = workbook.add_worksheet()
    bold = workbook.add_format({'bold': 1})
    worksheet.set_column('A:A', 20)
    worksheet.set_column('A:C', 10)
    worksheet.set_column('A:D', 30)
    worksheet.set_column('A:E', 30)
    worksheet.set_column('A:F', 30)
    worksheet.write('A1', 'Link', bold)
    worksheet.write('B1', 'Date', bold)
    worksheet.write('C1', 'Rabbi', bold)
    worksheet.write('D1', 'Title', bold)
    worksheet.write('E1', 'Question', bold)
    worksheet.write('F1', 'Answer', bold)
    xlplace = 0
    qa_page = "http://www.kipa.co.il/ask/page/" + str(i)
    i = i + 1
    response = urllib2.urlopen(qa_page)
    page_source = response.read()
    page_source = page_source.decode('utf-8')
    biglist = get_all_links(page_source)
    qnumber = 1
    for extract in biglist:
        xlplace = xlplace + 1
        # Link
        end_quote = extract.find("'", 0)
        url = extract[0:end_quote]
        worksheet.write(xlplace, 0, url)
        # Date
        start_link = extract.find('<td')
        start_quote = extract.find(">", start_link)
        end_quote = extract.find("</td>", start_quote + 1)
        date = extract[start_quote + 1:end_quote]
        worksheet.write(xlplace, 1, date)
        # Rabbi
        start_link = extract.find('<td', end_quote)
        start_quote = extract.find(">", start_link)
        end_quote = extract.find("</td>", start_quote + 1)
        rabbi = extract[start_quote + 1:end_quote]
        worksheet.write(xlplace, 2, rabbi)
        # Title
        start_link = extract.find('">', end_quote)
        start_quote = extract.find(">", start_link)
        end_quote = extract.find("<", start_quote + 1)
        title = extract[start_quote + 1:end_quote]
        worksheet.write(xlplace, 3, title)
        # Question and answer body
        q, a = get_content(url)
        worksheet.write(xlplace, 4, q)
        worksheet.write(xlplace, 5, a)
        print(qnumber)
        qnumber = qnumber + 1
    print(qa_page)
    workbook.close()
Answer 0 (score: 0)
I think what you want is to set a timeout on the HTTP request via urllib2. You can make the request like this:
response = urllib2.urlopen(url, timeout=30)  # Set a timeout of 30 seconds, or 60 for a minute.
When the timeout expires, urlopen raises an exception instead of hanging forever, so if you want to retry opening a particular url you can use something like the following:
import socket
import urllib2

def try_url_open(url, timeout=5, times=1):
    """
    Try to open url; if it fails with a timeout error, try again up to <times> times.
    """
    response = None
    while times != 0:
        times -= 1
        try:
            response = urllib2.urlopen(url, timeout=timeout)
            break  # Opened successfully, so stop retrying.
        except socket.timeout, error:
            # Timed out, so try again until <times> reaches 0.
            continue
    return response
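
As a rough sketch of how this could plug into the scraping loop from the question (assuming the try_url_open above and the question's get_all_links; i is the page counter from the outer loop, and the timeout/retry values are just illustrative), you could guard each page fetch like this:

# A minimal sketch, assuming try_url_open from above and the helpers
# from the question; i is the page counter from the outer loop.
qa_page = "http://www.kipa.co.il/ask/page/" + str(i)
response = try_url_open(qa_page, timeout=60, times=3)  # wait up to a minute, three attempts
if response is None:
    # Every attempt timed out; note the page so it can be redone later.
    print('giving up on ' + qa_page)
else:
    page_source = response.read().decode('utf-8')
    biglist = get_all_links(page_source)

Since the script already writes one workbook per listing page (kipa<n>.xlsx) and asks "where to start?" on startup, feeding it the last printed page number effectively resumes from where it stopped.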