python urllib2 multiple downloads at once

Time: 2012-04-26 16:51:08

Tags: python html urllib

How can I make the script below download multiple links at once with urllib2, rather than one at a time?

Python:

from BeautifulSoup import BeautifulSoup
import lxml.html as html
import urlparse
import os, sys
import urllib2
import re

print ("downloading and parsing Bibles...")
root = html.parse(open('links.html'))
for link in root.findall('//a'):
  url = link.get('href')
  name = urlparse.urlparse(url).path.split('/')[-1]     # file name: last path segment
  dirname = urlparse.urlparse(url).path.split('.')[-1]  # directory: last dot-separated token
  f = urllib2.urlopen(url)  # blocking: links are downloaded one at a time
  s = f.read()
  if not os.path.isdir(dirname):
    os.mkdir(dirname)
  soup = BeautifulSoup(s)
  articleTag = soup.html.body.article
  converted = str(articleTag)
  full_path = os.path.join(dirname, name)
  open(full_path, 'w').write(converted)
  print(name)
print("DOWNLOADS COMPLETE!")

links.html

<a href="http://www.youversion.com/bible/gen.1.nmv-fas">http://www.youversion.com/bible/gen.1.nmv-fas</a>

<a href="http://www.youversion.com/bible/gen.2.nmv-fas">http://www.youversion.com/bible/gen.2.nmv-fas</a>

<a href="http://www.youversion.com/bible/gen.3.nmv-fas">http://www.youversion.com/bible/gen.3.nmv-fas</a>

<a href="http://www.youversion.com/bible/gen.4.nmv-fas">http://www.youversion.com/bible/gen.4.nmv-fas</a>

<a href="http://www.youversion.com/bible/gen.5.nmv-fas">http://www.youversion.com/bible/gen.5.nmv-fas</a>

<a href="http://www.youversion.com/bible/gen.6.nmv-fas">http://www.youversion.com/bible/gen.6.nmv-fas</a>

1 Answer:

Answer 0: (score: 1)

Blainer, try threading.

Here is a good practical example:

http://www.ibm.com/developerworks/aix/library/au-threadingpython/

Then refer to the Python standard library docs:

http://docs.python.org/library/threading.html

The practical example actually includes a threaded version of concurrent downloads with urllib2. I've gone ahead and taken you a step further; you'll have to fill in the section marked FIX THIS PART below with your own HTML parsing.

#!/usr/bin/env python

import Queue
import threading
import urllib2
import time
import htmllib, formatter

class LinksExtractor(htmllib.HTMLParser):
    # derive new HTML parser

    def __init__(self, formatter):
        # call the base class constructor
        htmllib.HTMLParser.__init__(self, formatter)
        # create an empty list for storing hyperlinks
        self.links = []

    def start_a(self, attrs):  # override handler of <a ...> tags
        # process the attributes, ignoring everything but href
        for attr in attrs:
            if attr[0] == "href":
                self.links.append(attr[1])  # save the link info in the list

    def get_links(self):
        # return the list of extracted links
        return self.links

format = formatter.NullFormatter()
htmlparser = LinksExtractor(format)

data = open("links.html")
htmlparser.feed(data.read())
htmlparser.close()

hosts = htmlparser.links

queue = Queue.Queue()

class ThreadUrl(threading.Thread):
    """Threaded Url Grab"""
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        while True:
            #grabs host from queue
            host = self.queue.get()

            ####################################
            ############FIX THIS PART###########
            #VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV#

            url = urllib2.urlopen(host)
            morehtml = url.read()  # you're on your own with this part

            #signals to queue job is done
            self.queue.task_done()

start = time.time()
def main():
    #spawn a pool of threads, and pass them queue instance 
    for i in range(5):
        t = ThreadUrl(queue)
        t.setDaemon(True)
        t.start()

    # populate queue with data
    for host in hosts:
        queue.put(host)

    #wait on the queue until everything has been processed     
    queue.join()

main()
print "Elapsed Time: %s" % (time.time() - start)