Python threads give no performance improvement

Time: 2015-06-16 13:29:45

Tags: python multithreading mongodb web-crawler

I am writing a parallel crawler in Python and storing some of the information in MongoDB. After testing, I realized that my code is not actually parallel even though it uses threads: it makes no difference whether I use a single thread or 10 or 50 threads. I can't figure out why.

EDIT: As far as I can tell, most of the processing time is taken by soup = BeautifulSoup(html). Could it be that this call cannot be parallelized using threads?
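
A minimal standalone test of this, parsing the same HTML with one thread and then with four threads and comparing the wall-clock time, could look like the sketch below (sample.html and the repeat counts are just placeholders):

from threading import Thread
from BeautifulSoup import BeautifulSoup
import time

html = open("sample.html").read()   # placeholder: any reasonably large HTML file

def parse_n_times(n):
    # CPU-bound work: parse the same document n times
    for _ in range(n):
        BeautifulSoup(html)

def timed_run(num_threads, parses_per_thread):
    threads = [Thread(target=parse_n_times, args=(parses_per_thread,))
               for _ in range(num_threads)]
    start = time.time()
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return time.time() - start

# the total amount of parsing work is the same in both runs
print "1 thread: ", timed_run(1, 40)
print "4 threads:", timed_run(4, 10)

If the four-thread run is not noticeably faster, the parsing step itself gains nothing from threads. The full crawler code is below: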

from threading import Thread
import Queue
import urllib2
import re
from BeautifulSoup import *
from urlparse import urljoin
from pymongo import MongoClient
from urlparse import urlparse
import time
import hashlib

start_time = time.time()

level = 1
client = MongoClient()
db = client.crawler
visited = {}

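# Worker: take a URL from q_start, download and parse it, store the page in MongoDB,
# and push newly discovered links onto q_new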
def doWork():
    while True:
        try:
            myUrl = q_start.get()
        except:
            continue
        try:
            c=urllib2.urlopen(myUrl)
        except:
            q_start.task_done()
            continue

        parsed_url = urlparse(myUrl)

        html=c.read()
        try:
            soup = BeautifulSoup(html)
        except:
            q_start.task_done()
            continue
        txt = soup.prettify()
        links = soup('a')
        m = hashlib.md5(myUrl)

        db.urls.insert(
                {
                    "url":myUrl,
                    "HTML":txt,
                    "level":level,
                    "domain":parsed_url.netloc,
                    "md5":m.hexdigest()
                }
        )

        for link in links:

            if('href' in dict(link.attrs)):
                url = urljoin(myUrl,link['href'])
                if url.find("'")!=-1:
                    continue
                url=url.split('#')[0]
                if url[0:4] == 'http':
                    if url in visited:
                        continue
                    else:
                        visited[url]=True
                        q_new.put(url)
        q_start.task_done() 

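# q_start holds the URLs of the depth level currently being crawled;
# q_new collects the links discovered for the next level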
q_start = Queue.Queue()

q_new = Queue.Queue()

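# start 50 daemon worker threads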
for i in range(50):
    t = Thread(target=doWork)
    t.daemon = True
    t.start()

q_start.put("http://google.com")
q_start.join()

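# breadth-first crawl: after each level finishes, move the newly found URLs
# into q_start and crawl the next level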
for i in range(2,5):
    print "Depth: "
    print i
    print time.time() - start_time
    level += 1
    print q_new.qsize()
    q_aux = q_new
    q_new = Queue.Queue()
    while q_aux.empty() != True:
        x = q_aux.get()
        q_start.put(x)
    q_start.join()

print "end"

print time.time() - start_time
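
If the parsing really is CPU-bound and serialized by the GIL, one possible workaround would be to keep the threads for downloading but hand the HTML over to a multiprocessing.Pool for parsing. This is only a rough sketch, not integrated with the crawler above; parse_page and the placeholder input are hypothetical:

from multiprocessing import Pool
from BeautifulSoup import BeautifulSoup

def parse_page(html):
    # runs in a worker process, so it is not serialized by the GIL
    soup = BeautifulSoup(html)
    links = [dict(a.attrs).get('href') for a in soup('a')]
    return soup.prettify(), links

if __name__ == '__main__':
    # htmls would be the raw pages already downloaded by the I/O threads
    htmls = [open("sample.html").read()]   # placeholder input

    pool = Pool(processes=4)
    for txt, links in pool.map(parse_page, htmls):
        print len(txt), len(links)
    pool.close()
    pool.join()

Separate processes are not limited by the GIL, so the parsing can use several CPU cores, at the cost of pickling the HTML and the results between processes.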

0 Answers:

No answers yet