我想使用 PycURL 执行多请求(multi-request)。代码是: m.add_handle(handle) requests.append((handle, response))
# Perform multi-request.
# Keeps looping until the handle count returned by m.perform() drops to zero.
# `m` and `requests` are created earlier in the asker's program (not shown).
SELECT_TIMEOUT = 1.0  # timeout handed to every m.select() call (presumably seconds)
num_handles = len(requests)
while num_handles:
    ret = m.select(SELECT_TIMEOUT)
    # NOTE(review): m.perform() is never called before the first m.select();
    # whenever select() returns -1 the `continue` below skips perform(), so
    # num_handles cannot change on that pass. If select() keeps returning -1,
    # this loop never ends - verify against the pycurl CurlMulti documentation.
    if ret == -1:
        continue
    while 1:
        ret, num_handles = m.perform()
        print("In while loop of multicurl")
        # Repeat perform() for as long as it asks to be called again.
        if ret != pycurl.E_CALL_MULTI_PERFORM:
            break
问题是,这个循环一直在运行,永远不会终止。有谁能告诉我这段代码做了什么,以及可能的问题出在哪里?
答案 0 :(得分:5)
您是否看过 PycURL 官方的示例代码?下面的代码实现了 multi(并发多请求)功能,我试着运行过,能够在 300 秒内并行抓取大约 10,000 个 URL。我想这正是您想要实现的目标?如果我理解错了,请指正。
#! /usr/bin/env python
# -*- coding: iso-8859-1 -*-
# vi:ts=4:et
# $Id: retriever-multi.py,v 1.29 2005/07/28 11:04:13 mfx Exp $
#
# Usage: python retriever-multi.py <file with URLs to fetch> [<# of
# concurrent connections>]
#
import sys
import pycurl
# We should ignore SIGPIPE when using pycurl.NOSIGNAL - see
# the libcurl tutorial for more info.
try:
    import signal
    from signal import SIGPIPE, SIG_IGN
except ImportError:
    # The signal module or SIGPIPE is unavailable (presumably a platform
    # without SIGPIPE, such as Windows), so there is nothing to ignore.
    pass
else:
    # Install the handler only after both imports succeeded.
    signal.signal(SIGPIPE, SIG_IGN)
# Get args
# argv[1] names the file holding the URLs ("-" reads them from stdin); the
# optional argv[2] overrides the default number of concurrent connections.
num_conn = 10
try:
    if sys.argv[1] == "-":
        urls = sys.stdin.readlines()
    else:
        # Close the URL file as soon as its lines have been read.
        with open(sys.argv[1]) as url_file:
            urls = url_file.readlines()
    if len(sys.argv) >= 3:
        num_conn = int(sys.argv[2])
except (IndexError, ValueError, EnvironmentError):
    # Missing argument, non-integer connection count, or unreadable file.
    # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
    print("Usage: %s <file with URLs to fetch> [<# of concurrent connections>]" % sys.argv[0])
    raise SystemExit
# Make a queue with (url, filename) tuples
# Blank lines and lines starting with "#" are skipped; output files are
# numbered from 1 in the order their URLs are queued.
queue = []
for url in urls:
    url = url.strip()
    if not url or url.startswith("#"):
        continue
    filename = "doc_%03d.dat" % (len(queue) + 1)
    queue.append((url, filename))
# Check args
# Explicit checks instead of assert, so validation still runs under
# "python -O" (which strips assert statements).
if not queue:
    raise SystemExit("no URLs given")
num_urls = len(queue)
# Never open more connections than there are URLs to fetch.
num_conn = min(num_conn, num_urls)
if not 1 <= num_conn <= 10000:
    raise SystemExit("invalid number of concurrent connections")
print("PycURL %s (compiled against 0x%x)" % (pycurl.version, pycurl.COMPILE_LIBCURL_VERSION_NUM))
print("----- Getting %s URLs using %s connections -----" % (num_urls, num_conn))
# Pre-allocate a list of curl objects
# One pycurl.Curl object per allowed connection; the objects are kept on
# m.handles so later code can reuse and finally close them.
m = pycurl.CurlMulti()
m.handles = []
for _ in range(num_conn):
    c = pycurl.Curl()
    c.fp = None  # output file of the transfer in progress; None while idle
    c.setopt(pycurl.FOLLOWLOCATION, 1)
    c.setopt(pycurl.MAXREDIRS, 5)
    # NOTE(review): both timeouts are presumably in seconds - confirm against
    # the libcurl option documentation.
    c.setopt(pycurl.CONNECTTIMEOUT, 30)
    c.setopt(pycurl.TIMEOUT, 300)
    c.setopt(pycurl.NOSIGNAL, 1)
    m.handles.append(c)
# Main loop
# Runs until every queued URL has been reported as succeeded or failed.
freelist = m.handles[:]  # curl objects not currently attached to the multi stack
num_processed = 0
while num_processed < num_urls:
    # If there is an url to process and a free curl object, add to multi stack
    while queue and freelist:
        url, filename = queue.pop(0)
        c = freelist.pop()
        c.fp = open(filename, "wb")
        c.setopt(pycurl.URL, url)
        c.setopt(pycurl.WRITEDATA, c.fp)
        m.add_handle(c)
        # store some info
        c.filename = filename
        c.url = url
    # Run the internal curl state machine for the multi stack
    while 1:
        ret, num_handles = m.perform()
        if ret != pycurl.E_CALL_MULTI_PERFORM:
            break
    # Check for curl objects which have terminated, and add them to the freelist
    while 1:
        num_q, ok_list, err_list = m.info_read()
        for c in ok_list:
            c.fp.close()
            c.fp = None
            m.remove_handle(c)
            print("Success: %s %s %s" % (c.filename, c.url, c.getinfo(pycurl.EFFECTIVE_URL)))
            freelist.append(c)
        for c, errno, errmsg in err_list:
            c.fp.close()
            c.fp = None
            m.remove_handle(c)
            # Two spaces after the colon reproduce the output of the original
            # comma-separated print statement.
            print("Failed:  %s %s %s %s" % (c.filename, c.url, errno, errmsg))
            freelist.append(c)
        num_processed = num_processed + len(ok_list) + len(err_list)
        # info_read() reports how many messages are still queued; stop once
        # nothing is left to read.
        if num_q == 0:
            break
    # Currently no more I/O is pending, could do something in the meantime
    # (display a progress bar, etc.).
    # We just call select() to sleep until some more data is available.
    m.select(1.0)
# Cleanup
# Close any output file that is still open, then every curl object, and
# finally the multi object itself.
for c in m.handles:
    if c.fp is not None:
        c.fp.close()
        c.fp = None
    c.close()
m.close()
答案 1 :(得分:0)
我认为这是因为你的 break 只跳出了一层 while 循环:
# Perform multi-request.
# Annotated copy of the asker's loop: the two while loops are numbered and the
# '**' marker shows where execution continues after the inner `break`.
SELECT_TIMEOUT = 1.0
num_handles = len(requests)
while num_handles: # while nr.1
    ret = m.select(SELECT_TIMEOUT)
    if ret == -1: continue
    while 1: # while nr.2
        ret, num_handles = m.perform()
        print("In while loop of multicurl")
        if ret != pycurl.E_CALL_MULTI_PERFORM: break  # leaves only while nr.2
    '**'  # marker only: last statement of the body of while nr.1
使用 break 时,你只会跳出当前所在的 while 循环(执行 break 时,你位于第二个 while 循环中)。程序的下一步会来到上面标有 '**' 的那一行;由于它是外层循环体的最后一行,程序会跳回到第一行(while num_handles),再往下 3 行又进入 'while 1:',如此往复……这就是无限循环的由来。
所以解决这个问题的方法是:
# Perform multi-request.
# Variant proposed in this answer: a second `break` leaves the outer loop as
# soon as the inner perform() loop has finished.
# NOTE(review): the trailing `break` is unconditional, so the outer loop stops
# after its first pass that reaches perform(), even if num_handles is still
# non-zero - confirm that ending at that point is really intended.
SELECT_TIMEOUT = 1.0
num_handles = len(requests)
while num_handles: # while nr.1
    ret = m.select(SELECT_TIMEOUT)
    if ret == -1: continue
    while 1: # while nr.2
        ret, num_handles = m.perform()
        print("In while loop of multicurl")
        if ret != pycurl.E_CALL_MULTI_PERFORM:
            break
    break
这里的效果是:一旦从嵌套的 while 循环中跳出,它就会紧接着跳出第一个(外层)循环。
(并且由于while
和之前使用的continue