在下面的代码中我试图首先检查URL状态代码然后启动相关线程并执行相同操作将其添加到队列中,
但是如果网址太多,我会收到TimeOut错误。 以下添加的所有代码 但只是发现了另一个 bug 如果我传递一个mp3文件和一些jpeg图像,那么下载的正确大小的mp3文件就会打开,因为其中一个图像是通过的。
_fdUtils
def getParser():
parser = argparse.ArgumentParser(prog='FileDownloader',
description='Utility to download files from internet')
parser.add_argument('-v', '--verbose', default=logging.DEBUG,
help='by default its on, pass None or False to not spit in shell')
parser.add_argument('-st', '--saveTo', default=None, action=FullPaths,
help='location where you want files to download to')
parser.add_argument('-urls', nargs='*',
help='urls of files you want to download.')
parser.add_argument('-se', nargs='*', default=[1], help='Split each url passed to urls by the'\
" respective split order, if a url doesn't have a split default is taken 1 ")
return parser.parse_args()
def getResponse(url):
return requests.head(url, allow_redirects=True, timeout=10, headers={'Accept-Encoding': 'identity'})
def isWorkingURL(url):
response = getResponse(url)
return response.status_code in [302, 200, 100, 204, 300]
def getUrl(url):
""" gets the actual url to download file from.
"""
response = getResponse(url)
return response.headers.get('location', url)
错误堆栈跟踪:
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/threading.py", line 810, in __bootstrap_inner
self.run()
File "python/file_download.py", line 181, in run
_grabAndWriteToDisk(self, split, url, self.__saveTo, 0, self.queue)
File "python/file_download.py", line 70, in _grabAndWriteToDisk
resp = requests.get(url, headers={'Range': 'bytes=%s' % irange}, stream=True)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/requests-2.1.0-py2.7.egg/requests/api.py", line 55, in get
return request('get', url, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/requests-2.1.0-py2.7.egg/requests/api.py", line 44, in request
return session.request(method=method, url=url, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/requests-2.1.0-py2.7.egg/requests/sessions.py", line 382, in request
resp = self.send(prep, **send_kwargs)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/requests-2.1.0-py2.7.egg/requests/sessions.py", line 505, in send
history = [resp for resp in gen] if allow_redirects else []
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/requests-2.1.0-py2.7.egg/requests/sessions.py", line 167, in resolve_redirects
allow_redirects=False,
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/requests-2.1.0-py2.7.egg/requests/sessions.py", line 485, in send
r = adapter.send(request, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/requests-2.1.0-py2.7.egg/requests/adapters.py", line 381, in send
raise Timeout(e)
Timeout: HTTPConnectionPool(host='ia600506.us.archive.org', port=80): Read timed out. (read timeout=<object object at 0x1002b40b0>)
我们再去一次:
import argparse
import logging
import Queue
import os
import requests
import signal
import socket
import sys
import time
import threading
import utils as _fdUtils
from collections import OrderedDict
from itertools import izip_longest
from socket import error as SocketError, timeout as SocketTimeout
# timeout in seconds
TIMEOUT = 10
socket.setdefaulttimeout(TIMEOUT)
DESKTOP_PATH = os.path.expanduser("~/Desktop")
appName = 'FileDownloader'
logFile = os.path.join(DESKTOP_PATH, '%s.log' % appName)
_log = _fdUtils.fdLogger(appName, logFile, logging.DEBUG, logging.DEBUG, console_level=logging.DEBUG)
queue = Queue.Queue()
STOP_REQUEST = threading.Event()
maxSplits = threading.BoundedSemaphore(3)
threadLimiter = threading.BoundedSemaphore(5)
lock = threading.Lock()
pulledSize = 0
dataDict = {}
def _grabAndWriteToDisk(threadName, url, saveTo, first=None, queue=None, mode='wb', irange=None):
""" Function to download file..
Args:
url(str): url of file to download
saveTo(str): path where to save file
first(int): starting byte of the range
queue(Queue.Queue): queue object to set status for file download
mode(str): mode of file to be downloaded
irange(str): range of byte to download
"""
fileName = _fdUtils.getFileName(url)
filePath = os.path.join(saveTo, fileName)
fileSize = _fdUtils.getUrlSizeInBytes(url)
downloadedFileSize = 0 if not first else first
block_sz = 8192
resp = requests.get(url, headers={'Range': 'bytes=%s' % irange}, stream=True)
for fileBuffer in resp.iter_content(block_sz):
if not fileBuffer:
break
with open(filePath, mode) as fd:
downloadedFileSize += len(fileBuffer)
fd.write(fileBuffer)
mode = 'a'
status = r"%10d [%3.2f%%]" % (downloadedFileSize, downloadedFileSize * 100. / fileSize)
status = status + chr(8)*(len(status)+1)
sys.stdout.write('%s\r' % status)
time.sleep(.01)
sys.stdout.flush()
if downloadedFileSize == fileSize:
STOP_REQUEST.set()
queue.task_done()
_log.debug("Downloaded %s %s%% using %s and saved to %s", fileName,
downloadedFileSize * 100. / fileSize, threadName.getName(), saveTo)
def _downloadChunk(url, idx, irange, fileName, sizeInBytes):
_log.debug("Downloading %s for first chunk %s of %s " % (irange, idx+1, fileName))
pulledSize = irange[-1]
try:
resp = requests.get(url, allow_redirects=False, timeout=TIMEOUT,
headers={'Range': 'bytes=%s-%s' % (str(irange[0]), str(irange[-1]))},
stream=True)
except (SocketTimeout, requests.exceptions), e:
_log.error(e)
return
chunk_size = str(irange[-1])
for chunk in resp.iter_content(chunk_size):
status = r"%10d [%3.2f%%]" % (pulledSize, pulledSize * 100. / int(chunk_size))
status = status + chr(8)*(len(status)+1)
sys.stdout.write('%s\r' % status)
sys.stdout.flush()
pulledSize += len(chunk)
dataDict[idx] = chunk
time.sleep(.03)
if pulledSize == sizeInBytes:
_log.info("%s downloaded %3.0f%%", fileName, pulledSize * 100. / sizeInBytes)
class ThreadedFetch(threading.Thread):
""" docstring for ThreadedFetch
"""
def __init__(self, saveTo, queue):
super(ThreadedFetch, self).__init__()
self.queue = queue
self.__saveTo = saveTo
def run(self):
threadLimiter.acquire()
try:
items = self.queue.get()
url = items[0]
split = items[-1]
fileName = _fdUtils.getFileName(url)
# grab split chunks in separate thread.
if split > 1:
maxSplits.acquire()
try:
sizeInBytes = _fdUtils.getUrlSizeInBytes(url)
byteRanges = _fdUtils.getRangeSegements(sizeInBytes, split)
filePath = os.path.join(self.__saveTo, fileName)
downloaders = [
threading.Thread(
target=_downloadChunk,
args=(url, idx, irange, fileName, sizeInBytes),
)
for idx, irange in enumerate(byteRanges)
]
# start threads, let run in parallel, wait for all to finish
for th in downloaders:
th.start()
# this makes the wait for all thread to finish
# which confirms the dataDict is up-to-date
for th in downloaders:
th.join()
downloadedSize = 0
with open(filePath, 'wb') as fh:
for _idx, chunk in sorted(dataDict.iteritems()):
downloadedSize += len(chunk)
status = r"%10d [%3.2f%%]" % (downloadedSize, downloadedSize * 100. / sizeInBytes)
status = status + chr(8)*(len(status)+1)
fh.write(chunk)
sys.stdout.write('%s\r' % status)
time.sleep(.04)
sys.stdout.flush()
if downloadedSize == sizeInBytes:
_log.info("%s, saved to %s", fileName, self.__saveTo)
self.queue.task_done()
finally:
maxSplits.release()
else:
while not STOP_REQUEST.isSet():
self.setName("primary_%s_thread" % fileName.split(".")[0])
# if downlaod whole file in single chunk no need
# to start a new thread, so directly download here.
_grabAndWriteToDisk(self, url, self.__saveTo, 0, self.queue)
finally:
threadLimiter.release()
def main(appName):
args = _fdUtils.getParser()
saveTo = args.saveTo if args.saveTo else DESKTOP_PATH
# spawn a pool of threads, and pass them queue instance
# each url will be downloaded concurrently
unOrdUrls = dict(izip_longest(args.urls, args.se, fillvalue=1))
ordUrls = OrderedDict([(k, unOrdUrls[k]) for k in sorted(unOrdUrls, key=unOrdUrls.get, reverse=False) if _fdUtils.isWorkingURL(k, _log) and _fdUtils.notOnDisk(k, saveTo)])
print "length: %s " % len(ordUrls)
for i in xrange(len(ordUrls)):
t = ThreadedFetch(saveTo, queue)
t.daemon = True
t.start()
try:
# populate queue with data
for url, split in ordUrls.iteritems():
url = _fdUtils.getUrl(url)
print url
queue.put((url, int(split)))
# wait on the queue until everything has been processed
queue.join()
_log.info('All tasks completed.')
except (KeyboardInterrupt, SystemExit):
_log.critical('! Received keyboard interrupt, quitting threads.')
if __name__ == "__main__":
# change the name of MainThread.
threading.currentThread().setName("FileDownloader")
myapp = threading.currentThread().getName()
main(myapp)
答案 0 :(得分:2)
我在你的代码中看到两个问题。由于它不完整,我不确定它应该如何工作,所以我不能保证一个是你遇到的特定的一个,但我很确定你需要修复它们。
首先:
queue.put((_fdUtils.getUrl(url), int(split)))
这将在主线程中调用_fdUtils.getUrl(url)
,并将结果放在队列中。您的意见明确暗示您打算在后台线程上进行下载。
如果要传递要调用的函数,只需将函数及其参数作为元组的单独成员传递,或将其包装在闭包或部分中:
queue.put((lambda: _fdUtils.getUrl(url), int(split)))
第二
t = ThreadedFetch(saveTo, queue)
t.daemon = True
t.start()
这将为每个URL启动一个线程。这几乎不是一个好主意。通常,下载程序一次不使用超过4-16个线程,并且同一站点不超过2-4个线程。您可以轻松地超时,因为您发送垃圾邮件有些人坐得太快,而且它的服务器或路由器让您暂时退出。或者,对于巨大的数量的请求,您可能会充斥自己的网络并阻止ACK甚至重新启动路由器(特别是如果您有廉价的家庭WiFi路由器或ADSL与糟糕的提供商)。
另外,更简单的方法是使用智能池,例如multiprocessing.dummy.Pool
(multiprocessing.dummy
表示它的行为类似于multiprocessing
模块但使用线程)或者,更好的是concurrent.futures.ThreadPoolExecutor
。事实上,如果您查看文档,则并行下载程序为the first example for ThreadPoolExecutor
。