How can I make the script below download multiple links at once with urllib2, instead of one at a time?

Python:
from BeautifulSoup import BeautifulSoup
import lxml.html as html
import urlparse
import os, sys
import urllib2
import re

print("downloading and parsing Bibles...")
root = html.parse(open('links.html'))
for link in root.findall('//a'):
    url = link.get('href')
    name = urlparse.urlparse(url).path.split('/')[-1]
    dirname = urlparse.urlparse(url).path.split('.')[-1]
    f = urllib2.urlopen(url)
    s = f.read()
    if not os.path.isdir(dirname):
        os.mkdir(dirname)
    soup = BeautifulSoup(s)
    articleTag = soup.html.body.article
    converted = str(articleTag)
    full_path = os.path.join(dirname, name)
    open(full_path, 'w').write(converted)
    print(name)
print("DOWNLOADS COMPLETE!")
links.html
<a href="http://www.youversion.com/bible/gen.1.nmv-fas">http://www.youversion.com/bible/gen.1.nmv-fas</a>
<a href="http://www.youversion.com/bible/gen.2.nmv-fas">http://www.youversion.com/bible/gen.2.nmv-fas</a>
<a href="http://www.youversion.com/bible/gen.3.nmv-fas">http://www.youversion.com/bible/gen.3.nmv-fas</a>
<a href="http://www.youversion.com/bible/gen.4.nmv-fas">http://www.youversion.com/bible/gen.4.nmv-fas</a>
<a href="http://www.youversion.com/bible/gen.5.nmv-fas">http://www.youversion.com/bible/gen.5.nmv-fas</a>
<a href="http://www.youversion.com/bible/gen.6.nmv-fas">http://www.youversion.com/bible/gen.6.nmv-fas</a>
Answer 0 (score: 1)
Blainer, try threading.

Here is a good practical example:

http://www.ibm.com/developerworks/aix/library/au-threadingpython/

and the reference in the Python standard library:

http://docs.python.org/library/threading.html

The practical example actually includes a threaded version of concurrent downloads with urllib2. I went ahead and took you one step further; you will have to fill in the part marked "FIX THIS PART" below to do your own HTML parsing.
#!/usr/bin/env python
import Queue
import threading
import urllib2
import time
import htmllib, formatter

class LinksExtractor(htmllib.HTMLParser):
    # derived HTML parser that collects hyperlinks
    def __init__(self, formatter):
        htmllib.HTMLParser.__init__(self, formatter)  # base class constructor
        self.links = []  # empty list for storing hyperlinks

    def start_a(self, attrs):  # override handler of <A ...>...</A> tags
        # process the attributes, ignoring everything but HREF
        for attr in attrs:
            if attr[0] == "href":
                self.links.append(attr[1])  # save the link in the list

    def get_links(self):
        # return the list of extracted links
        return self.links

format = formatter.NullFormatter()
htmlparser = LinksExtractor(format)
data = open("links.html")
htmlparser.feed(data.read())
htmlparser.close()
hosts = htmlparser.links

queue = Queue.Queue()

class ThreadUrl(threading.Thread):
    """Threaded URL grab"""
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        while True:
            # grab a host from the queue
            host = self.queue.get()
            ####################################
            ############FIX THIS PART###########
            #VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV#
            url = urllib2.urlopen(host)
            morehtml = url.read()  # you're on your own with this
            # signal to the queue that the job is done
            self.queue.task_done()

start = time.time()

def main():
    # spawn a pool of threads and pass them the queue instance
    for i in range(5):
        t = ThreadUrl(queue)
        t.setDaemon(True)
        t.start()
    # populate the queue with data
    for host in hosts:
        queue.put(host)
    # wait on the queue until everything has been processed
    queue.join()

main()
print "Elapsed Time: %s" % (time.time() - start)