I wrote a simple website crawler using a thread pool. The problem is: once the crawler has traversed the whole site it should finish, but in reality it keeps waiting for something at the end and the script never terminates. Why does this happen?
from Queue import Queue
from threading import Thread
import sys
from urllib import urlopen
from BeautifulSoup import BeautifulSoup, SoupStrainer
import re
from Queue import Queue, Empty
from threading import Thread

visited = set()
queue = Queue()

class Worker(Thread):
    """Thread executing tasks from a given tasks queue"""
    def __init__(self, tasks):
        Thread.__init__(self)
        self.tasks = tasks
        self.daemon = True
        self.start()

    def run(self):
        while True:
            func, args, kargs = self.tasks.get()
            print "startcall in thread", self
            print args
            try:
                func(*args, **kargs)
            except Exception, e:
                print e
            print "stopcall in thread", self
            self.tasks.task_done()

class ThreadPool:
    """Pool of threads consuming tasks from a queue"""
    def __init__(self, num_threads):
        self.tasks = Queue(num_threads)
        for _ in range(num_threads):
            Worker(self.tasks)

    def add_task(self, func, *args, **kargs):
        """Add a task to the queue"""
        self.tasks.put((func, args, kargs))

    def wait_completion(self):
        """Wait for completion of all the tasks in the queue"""
        self.tasks.join()

def process(pool, host, url):
    try:
        print "get url", url
        #content = urlopen(url).read().decode(charset)
        content = urlopen(url).read()
    except UnicodeDecodeError:
        return

    for link in BeautifulSoup(content, parseOnlyThese=SoupStrainer('a')):
        #print "link", link
        try:
            href = link['href']
        except KeyError:
            continue
        if not href.startswith('http://'):
            href = 'http://%s%s' % (host, href)
        if not href.startswith('http://%s%s' % (host, '/')):
            continue
        if href not in visited:
            visited.add(href)
            pool.add_task(process, pool, host, href)
            print href

def start(host, charset):
    pool = ThreadPool(7)
    pool.add_task(process, pool, host, 'http://%s/' % (host))
    pool.wait_completion()

start('simplesite.com', 'utf8')
Answer 0 (score: 1)
The problem I see is that you never exit the while loop in run(), so it will block forever. You need to break out of that loop once the work is done.

You could try:

1) Insert

if not func: break

after self.tasks.get(...) in run().

2) Append

pool.add_task(None, None, None)

at the end of process().

This is a way for process() to notify the pool that it has no more tasks to handle (see the sketch below).
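A minimal sketch of those two changes, assuming the rest of the original code stays exactly as posted; the (None, None, None) tuple acts as a sentinel that tells a worker to stop. The sketch also calls task_done() before breaking, otherwise the tasks.join() inside wait_completion() could never return.

    # Sketch: Worker.run() stops when it receives the (None, None, None) sentinel.
    class Worker(Thread):
        """Thread executing tasks from a given tasks queue"""
        def __init__(self, tasks):
            Thread.__init__(self)
            self.tasks = tasks
            self.daemon = True
            self.start()

        def run(self):
            while True:
                func, args, kargs = self.tasks.get()
                if not func:                    # 1) sentinel received: leave the loop
                    self.tasks.task_done()      # mark the sentinel as done so join() can return
                    break
                try:
                    func(*args, **kargs)
                except Exception, e:
                    print e
                self.tasks.task_done()

    def process(pool, host, url):
        # ... fetch the page and queue newly found links exactly as before ...
        pool.add_task(None, None, None)         # 2) tell the pool this task produced no more work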