我得到了大约10K的网址,在收到来自~8k网址的回复后,我的脚本挂了。我使用线程同时下载并使用一组正则表达式解析结果,我已禁用正则表达式来隔离问题。
这是我的下载代码
from threading import Thread
import urllib2, sys
from Queue import Queue
from parse import run_parse
concurrent = 200
def doWork():
while True:
url = q.get()
status, url = getStatus(url)
#if status != None:
#doSomethingWithResult(status, url)
q.task_done()
def getStatus(url):
try:
req = urllib2.Request(url, headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.64 Safari/537.31'})
resp = urllib2.urlopen(req, timeout=10)
data = resp.read()
if resp.code == 200 and len(data) < 1000000:
return data, url
else:
raise Exception("invalid response, url: %s, code: %s, size: %s" % (url, resp.code, len(data)))
except urllib2.HTTPError, e:
print "Error: %s, Reason: %s" % e.code, e.reason
return None, None
except Exception, e:
print "Error: %s" % repr(e)
return None, None
def doSomethingWithResult(content, url):
print "status: %s, url: %s" % (run_parse(content), url)
#print status, url
def download(urls):
global q
q = Queue(concurrent * 2)
for i in range(concurrent):
t = Thread(target=doWork)
t.daemon = True
t.start()
try:
for url in urls:
q.put(url.strip())
q.join()
except Exception, e:
print repr(e)
我从主代码中调用此脚本,将生成器传递给下载函数,然后启动该过程。它运行在运行ubuntu 12.04和python 2.7.3的linux机器上。
我导入了(在主脚本中)一个snippet,它打印了挂起线程的堆栈跟踪,这里是堆栈:
我有500次这个错误:
--------------------Thread 140015572993792--------------------
File "/usr/lib/python2.7/threading.py", line 524, in __bootstrap
self.__bootstrap_inner()
File "/usr/lib/python2.7/threading.py", line 551, in __bootstrap_inner
self.run()
File "/usr/lib/python2.7/threading.py", line 504, in run
self.__target(*self.__args, **self.__kwargs)
File "/home/ubuntu/r/threaded_downloader.py", line 10, in doWork
url = q.get()
File "/usr/lib/python2.7/Queue.py", line 168, in get
self.not_empty.wait()
File "/usr/lib/python2.7/threading.py", line 243, in wait
waiter.acquire()
更多堆栈错误:
--------------------Thread 139761473361664--------------------
File "/usr/lib/python2.7/threading.py", line 524, in __bootstrap
self.__bootstrap_inner()
File "/usr/lib/python2.7/threading.py", line 551, in __bootstrap_inner
self.run()
File "/usr/lib/python2.7/threading.py", line 504, in run
self.__target(*self.__args, **self.__kwargs)
File "/home/ubuntu/r/threaded_downloader.py", line 10, in doWork
url = q.get()
File "/usr/lib/python2.7/Queue.py", line 68, in task_done
self.all_tasks_done.release()
--------------------Thread 139761473361664--------------------
File "/usr/lib/python2.7/threading.py", line 524, in __bootstrap
self.__bootstrap_inner()
File "/usr/lib/python2.7/threading.py", line 551, in __bootstrap_inner
self.run()
File "/usr/lib/python2.7/threading.py", line 504, in run
self.__target(*self.__args, **self.__kwargs)
File "/home/ubuntu/r/threaded_downloader.py", line 10, in doWork
url = q.get()
File "/usr/lib/python2.7/Queue.py", line 168, in get
self.not_empty.wait()
File "/usr/lib/python2.7/threading.py", line 274, in wait
self._acquire_restore(saved_state)
HTTPError()
--------------------Thread 139761473361664--------------------
File "/usr/lib/python2.7/threading.py", line 524, in __bootstrap
self.__bootstrap_inner()
File "/usr/lib/python2.7/threading.py", line 551, in __bootstrap_inner
self.run()
File "/usr/lib/python2.7/threading.py", line 504, in run
self.__target(*self.__args, **self.__kwargs)
File "/home/ubuntu/r/threaded_downloader.py", line 11, in doWork
status, url = getStatus(url)
File "/home/ubuntu/r/threaded_downloader.py", line 20, in getStatus
data = resp.read()
File "/usr/lib/python2.7/socket.py", line 351, in read
data = self._sock.recv(rbufsize)
File "/usr/lib/python2.7/httplib.py", line 541, in read
return self._read_chunked(amt)
File "/usr/lib/python2.7/httplib.py", line 594, in _read_chunked
return ''.join(value)
File "/usr/lib/python2.7/httplib.py", line 652, in _safe_read
return ''.join(s)
File "/usr/lib/python2.7/socket.py", line 404, in read
return buf.getvalue()
来自具有更高阈值时间(15秒&#39;)的运行的堆栈跟踪,用于挂起线程脚本。
--------------------Thread 140019171399424--------------------
File "/usr/lib/python2.7/threading.py", line 524, in __bootstrap
self.__bootstrap_inner()
File "/usr/lib/python2.7/threading.py", line 551, in __bootstrap_inner
self.run()
File "/usr/lib/python2.7/threading.py", line 504, in run
self.__target(*self.__args, **self.__kwargs)
File "/home/ubuntu/r/threaded_downloader.py", line 10, in doWork
url = q.get()
File "/usr/lib/python2.7/Queue.py", line 68, in task_done
self.all_tasks_done.release()
--------------------Thread 140019171399424--------------------
File "/usr/lib/python2.7/threading.py", line 524, in __bootstrap
self.__bootstrap_inner()
File "/usr/lib/python2.7/threading.py", line 551, in __bootstrap_inner
self.run()
File "/usr/lib/python2.7/threading.py", line 504, in run
self.__target(*self.__args, **self.__kwargs)
File "/home/ubuntu/r/threaded_downloader.py", line 10, in doWork
url = q.get()
File "/usr/lib/python2.7/Queue.py", line 168, in get
self.not_empty.wait()
File "/usr/lib/python2.7/threading.py", line 274, in wait
self._acquire_restore(saved_state)
File "/usr/lib/python2.7/threading.py", line 223, in _acquire_restore
self.__lock.acquire() # Ignore saved state
--------------------Thread 140019171399424--------------------
File "/usr/lib/python2.7/threading.py", line 524, in __bootstrap
self.__bootstrap_inner()
File "/usr/lib/python2.7/threading.py", line 551, in __bootstrap_inner
self.run()
File "/usr/lib/python2.7/threading.py", line 504, in run
self.__target(*self.__args, **self.__kwargs)
File "/home/ubuntu/r/threaded_downloader.py", line 10, in doWork
url = q.get()
File "/usr/lib/python2.7/Queue.py", line 182, in get
self.not_empty.release()
File "/usr/lib/python2.7/threading.py", line 274, in wait
self._acquire_restore(saved_state)
--------------------Thread 140019171399424--------------------
File "/home/ubuntu/r/r.py", line 185, in <module>
threaded_downloader.download(all_urls)
File "/home/ubuntu/r/threaded_downloader.py", line 47, in download
q.put(url.strip())
File "/usr/lib/python2.7/Queue.py", line 140, in put
self.not_full.release()
似乎可能是我为urllib2请求设置的超时?或者是套接字问题?
我尝试使用像gevent和tornado这样的异步库来异步下载发送请求,但是在将结果处理与提取过程同步时遇到了问题,所以我使用了线程。