python线程在使用urllib2获取10K url时挂起

时间:2015-03-31 08:25:46

标签: python multithreading sockets urllib2 deadlock

我有大约 10K 个网址需要抓取,在收到其中约 8K 个网址的响应后,我的脚本挂起了(卡住不再继续)。我使用多线程并发下载,并用一组正则表达式解析结果;为了隔离问题,我已暂时禁用了正则表达式解析。

这是我的下载代码

from threading import Thread
import urllib2, sys
from Queue import Queue
from parse import run_parse

# Number of worker threads that download() starts; download() also sizes the
# shared queue to twice this value.
concurrent = 200

def doWork():
    """Worker loop: fetch URLs taken from the shared queue ``q``, forever.

    Intended as a ``Thread`` target. Each iteration takes one URL from the
    module-level queue ``q`` (created by ``download``), passes it to
    ``getStatus`` and then acknowledges the item with ``q.task_done()``.

    The acknowledgement lives in a ``finally`` clause and unexpected
    exceptions are reported instead of propagated. Otherwise a single
    exception escaping ``getStatus`` would terminate this thread without
    calling ``task_done()``, and ``q.join()`` in ``download`` would then
    block forever.
    """
    while True:
        url = q.get()
        try:
            status, url = getStatus(url)
            # Result handling is currently disabled:
            #if status != None:
                #doSomethingWithResult(status, url)
        except Exception as e:
            # Keep the worker alive: with a bounded queue, dead workers would
            # eventually leave the producer blocked in q.put().
            print("Error: %s" % repr(e))
        finally:
            # Always acknowledge the item so q.join() can return.
            q.task_done()

def getStatus(url):
    """Download ``url`` and return ``(body, url)``, or ``(None, None)`` on failure.

    The request is sent with a browser-like User-Agent header and
    ``timeout=10``. A response counts as valid only when its status code is
    200 and its body is shorter than 1,000,000 bytes; anything else is
    reported as an error.

    No exception is allowed to escape: every failure is printed and turned
    into ``(None, None)``.

    Parameters:
        url: the address to fetch.

    Returns:
        ``(data, url)`` on success, where ``data`` is the raw response body;
        ``(None, None)`` on any error.
    """
    try:
        req = urllib2.Request(url, headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.64 Safari/537.31'})
        # NOTE(review): the timeout presumably bounds each blocking socket
        # operation rather than the whole download - confirm against the docs.
        resp = urllib2.urlopen(req, timeout=10)
        try:
            data = resp.read()
            code = resp.code
        finally:
            # Release the connection even when read() fails part-way.
            resp.close()
        if code == 200 and len(data) < 1000000:
            return data, url
        raise Exception("invalid response, url: %s, code: %s, size: %s" % (url, code, len(data)))
    except urllib2.HTTPError as e:
        # The format arguments must be a tuple. The previous
        # `"..." % e.code, e.reason` applied % to e.code alone and raised
        # TypeError from inside this handler, so the error escaped.
        # NOTE(review): HTTPError.reason appears to be missing on older 2.7
        # releases, hence the fallback to .msg - confirm.
        reason = getattr(e, 'reason', e.msg)
        print("Error: %s, Reason: %s" % (e.code, reason))
        return None, None
    except Exception as e:
        print("Error: %s" % repr(e))
        return None, None


def doSomethingWithResult(content, url):
    """Run ``run_parse`` on ``content`` and print its result next to ``url``.

    Parameters:
        content: the value handed to ``run_parse``.
        url: the address printed alongside the parse result.
    """
    parsed = run_parse(content)
    message = "status: %s, url: %s" % (parsed, url)
    print(message)


def download(urls):
    """Feed every URL in ``urls`` to a pool of daemon worker threads.

    Rebinds the module-level queue ``q`` to a new bounded ``Queue`` holding at
    most ``concurrent * 2`` items, starts ``concurrent`` daemon threads that
    run ``doWork``, enqueues each URL with surrounding whitespace stripped,
    and finally blocks in ``q.join()`` until every enqueued item has been
    acknowledged.

    Parameters:
        urls: an iterable of URL strings.

    Any ``Exception`` raised while enqueueing or joining is printed via
    ``repr`` and swallowed.
    """
    global q
    q = Queue(concurrent * 2)

    # Build the pool first, then launch every worker as a daemon thread.
    workers = [Thread(target=doWork) for _ in range(concurrent)]
    for worker in workers:
        worker.daemon = True
        worker.start()

    try:
        for raw_url in urls:
            cleaned = raw_url.strip()
            q.put(cleaned)
        q.join()
    except Exception as err:
        print(repr(err))

我从主代码中调用此脚本,把一个生成器传给 download 函数来启动整个过程。脚本运行在一台装有 Ubuntu 12.04 和 Python 2.7.3 的 Linux 机器上。

我在主脚本中导入了一个代码片段(snippet),它会打印挂起线程的堆栈跟踪,以下是堆栈:

下面这个堆栈跟踪出现了 500 次:

--------------------Thread 140015572993792--------------------
  File "/usr/lib/python2.7/threading.py", line 524, in __bootstrap
    self.__bootstrap_inner()
  File "/usr/lib/python2.7/threading.py", line 551, in __bootstrap_inner
    self.run()
  File "/usr/lib/python2.7/threading.py", line 504, in run
    self.__target(*self.__args, **self.__kwargs)
  File "/home/ubuntu/r/threaded_downloader.py", line 10, in doWork
    url = q.get()
  File "/usr/lib/python2.7/Queue.py", line 168, in get
    self.not_empty.wait()
  File "/usr/lib/python2.7/threading.py", line 243, in wait
    waiter.acquire()

更多堆栈错误:

    --------------------Thread 139761473361664--------------------
      File "/usr/lib/python2.7/threading.py", line 524, in __bootstrap
        self.__bootstrap_inner()
      File "/usr/lib/python2.7/threading.py", line 551, in __bootstrap_inner
        self.run()
      File "/usr/lib/python2.7/threading.py", line 504, in run
        self.__target(*self.__args, **self.__kwargs)
      File "/home/ubuntu/r/threaded_downloader.py", line 10, in doWork
        url = q.get()
      File "/usr/lib/python2.7/Queue.py", line 68, in task_done
        self.all_tasks_done.release()

    --------------------Thread 139761473361664--------------------
      File "/usr/lib/python2.7/threading.py", line 524, in __bootstrap
        self.__bootstrap_inner()
      File "/usr/lib/python2.7/threading.py", line 551, in __bootstrap_inner
        self.run()
      File "/usr/lib/python2.7/threading.py", line 504, in run
        self.__target(*self.__args, **self.__kwargs)
      File "/home/ubuntu/r/threaded_downloader.py", line 10, in doWork
        url = q.get()
      File "/usr/lib/python2.7/Queue.py", line 168, in get
        self.not_empty.wait()
      File "/usr/lib/python2.7/threading.py", line 274, in wait
        self._acquire_restore(saved_state)
    HTTPError()
    --------------------Thread 139761473361664--------------------
      File "/usr/lib/python2.7/threading.py", line 524, in __bootstrap
        self.__bootstrap_inner()
      File "/usr/lib/python2.7/threading.py", line 551, in __bootstrap_inner
        self.run()
      File "/usr/lib/python2.7/threading.py", line 504, in run
        self.__target(*self.__args, **self.__kwargs)
      File "/home/ubuntu/r/threaded_downloader.py", line 11, in doWork
        status, url = getStatus(url)
      File "/home/ubuntu/r/threaded_downloader.py", line 20, in getStatus
        data = resp.read()
      File "/usr/lib/python2.7/socket.py", line 351, in read
        data = self._sock.recv(rbufsize)
      File "/usr/lib/python2.7/httplib.py", line 541, in read
        return self._read_chunked(amt)
      File "/usr/lib/python2.7/httplib.py", line 594, in _read_chunked
        return ''.join(value)
      File "/usr/lib/python2.7/httplib.py", line 652, in _safe_read
        return ''.join(s)
      File "/usr/lib/python2.7/socket.py", line 404, in read
        return buf.getvalue()

以下堆栈跟踪来自另一次运行,其中挂起线程检测脚本使用了更高的阈值时间(15 秒)。

--------------------Thread 140019171399424--------------------
  File "/usr/lib/python2.7/threading.py", line 524, in __bootstrap
    self.__bootstrap_inner()
  File "/usr/lib/python2.7/threading.py", line 551, in __bootstrap_inner
    self.run()
  File "/usr/lib/python2.7/threading.py", line 504, in run
    self.__target(*self.__args, **self.__kwargs)
  File "/home/ubuntu/r/threaded_downloader.py", line 10, in doWork
    url = q.get()
  File "/usr/lib/python2.7/Queue.py", line 68, in task_done
    self.all_tasks_done.release()

--------------------Thread 140019171399424--------------------
  File "/usr/lib/python2.7/threading.py", line 524, in __bootstrap
    self.__bootstrap_inner()
  File "/usr/lib/python2.7/threading.py", line 551, in __bootstrap_inner
    self.run()
  File "/usr/lib/python2.7/threading.py", line 504, in run
    self.__target(*self.__args, **self.__kwargs)
  File "/home/ubuntu/r/threaded_downloader.py", line 10, in doWork
    url = q.get()
  File "/usr/lib/python2.7/Queue.py", line 168, in get
    self.not_empty.wait()
  File "/usr/lib/python2.7/threading.py", line 274, in wait
    self._acquire_restore(saved_state)
  File "/usr/lib/python2.7/threading.py", line 223, in _acquire_restore
    self.__lock.acquire()           # Ignore saved state
--------------------Thread 140019171399424--------------------
  File "/usr/lib/python2.7/threading.py", line 524, in __bootstrap
    self.__bootstrap_inner()
  File "/usr/lib/python2.7/threading.py", line 551, in __bootstrap_inner
    self.run()
  File "/usr/lib/python2.7/threading.py", line 504, in run
    self.__target(*self.__args, **self.__kwargs)
  File "/home/ubuntu/r/threaded_downloader.py", line 10, in doWork
    url = q.get()
  File "/usr/lib/python2.7/Queue.py", line 182, in get
    self.not_empty.release()
  File "/usr/lib/python2.7/threading.py", line 274, in wait
    self._acquire_restore(saved_state)
--------------------Thread 140019171399424--------------------
  File "/home/ubuntu/r/r.py", line 185, in <module>
    threaded_downloader.download(all_urls)
  File "/home/ubuntu/r/threaded_downloader.py", line 47, in download
    q.put(url.strip())
  File "/usr/lib/python2.7/Queue.py", line 140, in put
    self.not_full.release()

似乎可能是我为urllib2请求设置的超时?或者是套接字问题?

我尝试过用 gevent 和 tornado 这类异步库来异步发送请求,但在把结果处理与抓取过程同步起来时遇到了问题,所以改用了线程。

0 个答案:

没有答案