I'm trying to write a program that can download a large number of web pages concurrently. I wrote a small test script to see whether I have a handle on multithreading. It seems to work, but it isn't as stable or as fast as I expected. What am I doing wrong, and how can I improve it?
import httplib
import socket
import threading
import time
import urllib2
import Queue


class UrlMiner(threading.Thread):
    def __init__(self, url_queue, targets, visited, timeout=5):
        threading.Thread.__init__(self)
        self.queue = url_queue
        self.targets = targets
        self.visited = visited
        self.timeout = timeout

    def run(self):
        while True:
            url = self.queue.get()
            web_page = ''
            try:
                # The same fetch could be wrapped in
                # contextlib.closing(urllib2.urlopen(...)) so the
                # connection is closed even if read() fails.
                data = urllib2.urlopen(url, None, self.timeout)
                web_page = data.read()
                data.close()
            except (urllib2.HTTPError, urllib2.URLError, socket.error,
                    socket.timeout, httplib.IncompleteRead,
                    httplib.BadStatusLine, httplib.InvalidURL):
                pass  # any fetch failure just leaves web_page empty
            if web_page:
                # print 'Successfully scraped', url
                self.visited.good()
            self.visited.add(url)
            self.queue.task_done()
def main():
    urls = []
    with open('unvisited.txt') as infile:
        for line in infile:
            urls.append(line.strip())  # strip the newline, or it ends up in the URL

    visited = SetWrapper()
    targets = SetWrapper()

    queue = Queue.Queue()
    for url in urls:
        queue.put(url)

    # worker daemons
    for i in range(100):
        t = UrlMiner(queue, targets, visited, timeout=14)
        t.setDaemon(True)
        t.start()

    start_time = time.time()
    queue.join()
    ttime = time.time() - start_time
    print '{} sites were scraped in {} seconds'.format(len(urls), ttime)
    print '{} requests were fulfilled.'.format(visited.goodv)

if __name__ == '__main__':
    main()
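The SetWrapper class isn't shown above. For completeness, here is a minimal thread-safe sketch consistent with how it's used (add(), good(), and the goodv counter); this is an assumed reconstruction, not the original class:

    import threading

    class SetWrapper(object):
        """Assumed reconstruction: a set plus a success counter, guarded
        by a lock so that many worker threads can update it safely."""
        def __init__(self):
            self._lock = threading.Lock()
            self._items = set()
            self.goodv = 0  # number of successful fetches

        def add(self, item):
            with self._lock:
                self._items.add(item)

        def good(self):
            with self._lock:
                self.goodv += 1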
With 100 threads and 1001 sites, the results are:
    $ python test.py
    1001 sites were scraped in 138.261109114 seconds
    262 requests were fulfilled.
The fewer threads I use, the faster it runs and the more requests succeed. I have a good internet connection, and I've tested on both OS X and Linux (8 cores, 8 GB of RAM) with the same results.
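To put numbers on that observation, one option is a small harness that repeats the run with several pool sizes. This is only a sketch built on the code above: timed_run is a hypothetical helper, urls is assumed to be loaded as in main(), and daemon workers from earlier runs simply stay blocked on their old, empty queues.

    def timed_run(urls, num_threads, timeout=14):
        # Hypothetical helper: scrape urls with num_threads workers and
        # return (elapsed seconds, number of fulfilled requests).
        visited = SetWrapper()
        targets = SetWrapper()
        queue = Queue.Queue()
        for url in urls:
            queue.put(url)
        for _ in range(num_threads):
            t = UrlMiner(queue, targets, visited, timeout=timeout)
            t.setDaemon(True)
            t.start()
        start = time.time()
        queue.join()
        return time.time() - start, visited.goodv

    for n in (10, 25, 50, 100):
        elapsed, good = timed_run(urls, n)
        print '{} threads: {:.1f}s, {} fulfilled'.format(n, elapsed, good)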