I have a program that is supposed to be a web crawler that processes and traverses URLs from different pages concurrently. When the URL array gets too full, the program splits the array into four quarters and works through them recursively with threads. The program is meant to keep up this mechanism until every page on the site has been traversed, as bounded by the maximum size of the visited array (10,000). The problem I'm running into is that, for some reason, only the last thread in the recursive sequence actually runs. Here is the part of the code I'm having trouble with:
def WebScraper(urls, threadID):
    global visited
    while len(urls) > 0 and len(visited) <= 10000:
        try:
            htmltext = urllib.urlopen(urls[0]).read()
            soup = BeautifulSoup(htmltext)
        except:
            print urls[0]
        url = urls[0]
        urls.pop(0)
        print "Thread #" + str(threadID) + " - " + str(len(urls)) + "\n"
        if len(urls) >= 150:
            print "Number visited " + str(len(visited)) + "\n"
            queueLock = threading.Lock()
            workQueue = Queue.Queue(10)
            threads = []
            thread = ""
            newThread1 = myThread(urls[:len(urls)/4])
            newThread1.start()
            threads.append(newThread1)
            newThread2 = myThread(urls[len(urls)/4:len(urls)/2])
            newThread2.start()
            threads.append(newThread2)
            newThread3 = myThread(urls[len(urls)/2:3 * len(urls)/4])
            newThread3.start()
            threads.append(newThread3)
            newThread4 = myThread(urls[3 * len(urls)/4:])
            newThread4.start()
            threads.append(newThread4)
            queueLock.acquire()
            for thread in threads:
                workQueue.put(thread)
            while not workQueue.empty():
                pass
            for t in threads:
                t.join()
            break
        else:
            for tags in soup.findAll("a", href=True):
                tags["href"] = urlparse.urljoin(url, tags["href"])
                #print tags["href"]
                if url in tags["href"] and tags["href"] not in visited:
                    TotalUrls.append(tags["href"])
                    urls.append(tags["href"])
                    visited.append(tags["href"])
This is the output I get:

Thread #0 - 0
Thread #0 - 153
Number visited 154
Thread #:1 Thread #:2 Thread #:3 Thread #:4
Thread #4 - 37
Thread #4 - 37
Thread #4 - 38
Thread #4 - 37
Thread #4 - 36
Thread #4 - 36
Thread #4 - 37
Thread #4 - 37
Thread #4 - 48

Any help is appreciated.
Answer (score: 0)
If you don't want to use Scrapy for some reason, the following code will give you a good start.
In general, multithreaded programs are easier to implement using the multiprocessing.Pool() facility: you create a pool, send it a big list of jobs, and it hands you back each result.
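For instance, here is a minimal sketch of the Pool pattern, assuming the full job list is known up front (fetch_length is a hypothetical worker, not part of the crawler above):

import multiprocessing
import urllib

def fetch_length(url):
    """worker: download one page, return (url, size in bytes)"""
    return url, len(urllib.urlopen(url).read())

if __name__ == "__main__":
    pool = multiprocessing.Pool(4)         # four worker processes
    jobs = ["http://example.com"] * 3      # the fixed job list, known up front
    for url, size in pool.map(fetch_length, jobs):
        print url, size

pool.map() blocks until every job is finished and returns the results in order, which is exactly why it needs the whole job list before it starts.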
That doesn't fit crawler-type problems, though, where there is no fixed-size list of input jobs at the start. For this class of problem I developed the following solution:
- start the threads, giving each one the input queue and the other args
- each thread does its work, then adds more jobs (URLs) to the same input queue that all of the threads work from
- no one thread ends up with all of the URLs
- when a thread can't find a new job within 5 seconds, it exits

The technique above gives a simple but powerful multithreaded queue solution.

Have fun!
#!/bin/env python

import multiprocessing, threading, time, urllib, urlparse
import logging, os, Queue, sys
from BeautifulSoup import BeautifulSoup

logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)-4s %(levelname)s %(threadName)s %(message)s",
    datefmt="%H:%M:%S",
    stream=sys.stderr,
)

def scan_page(url, soup):
    """return link URLs from HTML page"""
    for tags in soup.findAll("a", href=True):
        yield urlparse.urljoin(url, tags["href"])

def scan_url(url):
    """fetch page, return URLs, print timing"""
    start = time.time()
    page = urllib.urlopen(url).read()
    res = list( scan_page(url, BeautifulSoup(page)) )
    logging.debug(
        '- %s: %.2f, %d links',
        url, time.time()-start, len(res)
    )
    return res

def scan_queue(inqueue, visited):
    """worker: pull URLs from the shared queue, feed new links back in;
    exits once the visited cap is reached or the queue stays empty for 5s"""
    logging.info('start')
    while True:
        try:
            url = inqueue.get(timeout=5)
            if len(visited) > 10:
                logging.info('exiting, visited 10 URLs')
                break
            if url in visited:
                logging.info('%s: ignoring visited URL', url)
                continue
            logging.info('url: %s', url)
            visited.add( url )
        except Queue.Empty:
            break
        links = scan_url(url)
        for link in set(links) - visited:
            inqueue.put( link )
    logging.info('done')

def master(root):
    """seed the queue and run one worker thread per CPU"""
    # native Python objects are thread-safe
    # TODO: quote manual
    visited = set()
    url_queue = Queue.Queue()  # pylint: disable=E1101
    url_queue.put(root)
    threads = [
        threading.Thread(
            target=scan_queue, args=[
                url_queue, visited,
            ],
        )
        for _ in range(multiprocessing.cpu_count())
    ]
    for th in threads:
        th.start()
    for th in threads:
        th.join()

if __name__ == "__main__":
    master('http://example.com')
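Sample output from a run against http://example.com: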
13:26:57 INFO Thread-1 start
13:26:57 INFO Thread-1 url: http://example.com
13:26:57 INFO Thread-2 start
13:26:57 INFO Thread-3 start
13:26:57 INFO Thread-4 start
13:26:57 DEBUG Thread-1 - http://example.com: 0.12, 1 links
13:26:57 INFO Thread-1 url: http://www.iana.org/domains/example
13:26:58 DEBUG Thread-1 - http://www.iana.org/domains/example: 0.62, 64 links
13:26:58 INFO Thread-2 url: http://www.iana.org/contact
13:26:58 INFO Thread-1 url: http://www.iana.org/time-zones
13:26:58 INFO Thread-4 url: http://www.iana.org/domains/root/db/xn--kgbechtv.html
13:26:58 INFO Thread-3 url: http://www.iana.org/domains/root/db/xn--0zwm56d.html
13:26:58 DEBUG Thread-2 - http://www.iana.org/contact: 0.23, 34 links
13:26:58 INFO Thread-2 url: http://www.iana.org/domains/int/manage
13:26:58 DEBUG Thread-1 - http://www.iana.org/time-zones: 0.30, 39 links
13:26:58 INFO Thread-1 url: http://www.iana.org/go/rfc2606
13:26:58 DEBUG Thread-4 - http://www.iana.org/domains/root/db/xn--kgbechtv.html: 0.33, 49 links
13:26:58 INFO Thread-4 url: http://www.iana.org/numbers
13:26:59 DEBUG Thread-2 - http://www.iana.org/domains/int/manage: 0.23, 51 links
13:26:59 INFO Thread-2 url: http://www.iana.org/domains/arpa
13:26:59 DEBUG Thread-4 - http://www.iana.org/numbers: 0.21, 62 links
13:26:59 INFO Thread-4 url: http://www.icann.org/en/registries/agreements.htm
13:26:59 DEBUG Thread-2 - http://www.iana.org/domains/arpa: 0.19, 59 links
13:26:59 INFO Thread-2 exiting, visited 10 URLs
13:26:59 INFO Thread-2 done
13:26:59 DEBUG Thread-1 - http://www.iana.org/go/rfc2606: 0.42, 37 links
13:26:59 INFO Thread-1 exiting, visited 10 URLs
13:26:59 INFO Thread-1 done
13:26:59 DEBUG Thread-3 - http://www.iana.org/domains/root/db/xn--0zwm56d.html: 0.75, 49 links
13:26:59 INFO Thread-3 exiting, visited 10 URLs
13:26:59 INFO Thread-3 done
13:27:01 DEBUG Thread-4 - http://www.icann.org/en/registries/agreements.htm: 2.83, 1098 links
13:27:01 INFO Thread-4 exiting, visited 10 URLs
13:27:01 INFO Thread-4 done