为什么我的多线程解析器不是多线程的?

时间:2016-02-24 12:19:20

标签: python xpath lxml elementtree python-multithreading

我有一个使用ElementTree Path Evaluator解析xml文件的脚本。它工作得很好,但它需要很长时间才能完成。所以我尝试制作多线程实现:

import fnmatch
import operator
import os
import lxml.etree
from nltk import FreqDist
from nltk.corpus import stopwords
from collections import defaultdict
from datetime import datetime
import threading
import Queue

STOPWORDS = stopwords.words('dutch')
STOPWORDS.extend(stopwords.words('english'))
DIR_NAME = 'A_DIRNAME'
PATTERN = '*.A_PATTERN'

def loadData(dir_name, pattern):
    """Recursively walk *dir_name* and collect files matching *pattern*.

    Returns a 3-tuple:
      - list of full paths of files whose basename matches the fnmatch pattern
      - list of the subdirectory-name lists reported by os.walk per visited root
      - list of every root directory visited
    """
    matched_paths = []
    subdir_lists = []
    visited_roots = []
    for root, subdirs, names in os.walk(dir_name):
        subdir_lists.append(subdirs)
        visited_roots.append(root)
        matched_paths.extend(
            os.path.join(root, name) for name in fnmatch.filter(names, pattern))
    return matched_paths, subdir_lists, visited_roots

def freq(element_list, descending=True):
    """Tally how often each element occurs.

    Returns a list of (element, count) pairs ordered by count —
    most frequent first when *descending* is True.
    """
    counts = defaultdict(int)
    for item in element_list:
        counts[item] += 1
    return sorted(counts.items(), key=lambda pair: pair[1], reverse=descending)

def lexDiv(amount_words):
    """Return the lexical diversity of *amount_words*: unique / total.

    Fix: the original raised ZeroDivisionError for an empty sequence;
    an empty text now yields 0.0.
    """
    if not amount_words:
        return 0.0
    # 1.0* forces float division under Python 2's integer '/'.
    return 1.0 * len(set(amount_words)) / len(amount_words)

def anotherFreq(list_types, list_words):
    fd = FreqDist(list_types)
    print 'top 10 most frequent types:'
    for t, freq in fd.items()[:10]:
        print t, freq
    print '\ntop 10 most frequent words:'
    agglomerated = defaultdict(int)
    for w in list_words:
        if not w.lower() in STOPWORDS:
            agglomerated[w] += 1
    sorted_dict = sorted(agglomerated.items(), key=operator.itemgetter(1),reverse=True)
    print sorted_dict[:10]

def extractor(f):
    """Parse one XML file and derive entity/word/sentence statistics.

    The XPath queries suggest a KAF/NAF-style annotation format
    (entity/externalRef, text/wf, term elements) -- TODO confirm
    against an actual input file.

    All results are bound to locals and then discarded: the function
    returns None, so its only observable effects are the prints.
    """
    print "check file: {}".format(f)
    try:
        doc = lxml.etree.ElementTree(file=f)
    except lxml.etree.XMLSyntaxError, e:
        # Unparseable file: report it and skip, rather than abort the run.
        print e
        return
    doc_evaluator = lxml.etree.XPathEvaluator(doc)
    # DBpedia-style reference URIs for all entities / places / people.
    entities = doc_evaluator('//entity/*/externalRef/@reference')
    places_dbpedia = doc_evaluator('//entity[contains(@type, "Schema:Place")]/*/externalRef/@reference')
    non_people_dbpedia = set(doc_evaluator('//entity[not(contains(@type, "Schema:Person"))]'))
    people = doc_evaluator('//entity[contains(@type, "Schema:Person")]/*/externalRef/@reference')
    # Word-form tokens containing at least one letter/hyphen (EXSLT regex).
    words = doc.xpath('text/wf[re:match(text(), "[A-Za-z-]")]/text()',\
        namespaces={"re": "http://exslt.org/regular-expressions"})
    unique_words = set(words)
    # Everything else: punctuation, digits, symbols.
    other_tokens = doc.xpath('text/wf[re:match(text(), "[^A-Za-z-]")]/text()',\
        namespaces={"re": "http://exslt.org/regular-expressions"})
    # Last @sent attribute -- presumably the highest sentence id; raises
    # IndexError on a document with no text/wf elements (TODO confirm).
    amount_of_sentences = doc_evaluator('text/wf/@sent')[-1]
    types = doc_evaluator('//term/@morphofeat')
    # Sentence id with the most word tokens (freq() sorts descending).
    longest_sentence = freq(doc.xpath('text/wf[re:match(text(), "[A-Za-z-]")]/@sent',\
        namespaces={"re": "http://exslt.org/regular-expressions"}))[0]

    # Keep only the final path segment of each reference URI, then rank.
    top_people = freq([e.split('/')[-1] for e in people])[:10]
    top_entities = freq([e.split('/')[-1] for e in entities])[:10]
    top_places = freq([e.split('/')[-1] for e in places_dbpedia])[:10]

def worker(job_queue=None):
    """Pull file paths from the queue forever and run extractor() on each.

    job_queue defaults to the module-level ``q`` so the existing
    ``threading.Thread(target=worker)`` call sites keep working unchanged.

    Fix: the original called q.task_done() only after extractor()
    returned normally -- a single exception left the job unmarked and
    made q.join() in the main thread block forever (and killed the
    worker thread). task_done() now runs unconditionally.
    """
    jobs = q if job_queue is None else job_queue
    while True:
        path = jobs.get()
        try:
            extractor(path)
        finally:
            # Mark the job finished no matter what, so join() can return.
            jobs.task_done()

if __name__ =='__main__':
    startTime = datetime.now()
    files, dirs, path = loadData(DIR_NAME, PATTERN)
    startTime = datetime.now()

q = Queue.Queue()# job queue

for f in files:
    q.put(f)

for i in range(20): #make 20 workerthreads ready
    worker_thread = threading.Thread(target=worker)
    worker_thread.daemon = True
    worker_thread.start()

q.join()
print datetime.now() - startTime

这确实能运行,但计时后发现它并不比单线程版本快。我认为这与打开和读取文件的方式有关,使得各线程并没有真正并行执行。如果我换一个函数——不解析 xml 文件,只是睡几秒钟并打印一些东西——多线程就确实有效,而且速度快得多。要实现真正并行的多线程 XML 解析器,我需要考虑什么?

1 个答案:

答案 0 :(得分:0)

Python 中的线程并不像在其他语言中那样工作。它受全局解释器锁(Global Interpreter Lock, GIL)的限制:GIL 确保同一时刻只有一个线程处于活动状态(准确地说,同一时刻只有一个线程在执行 Python 字节码)。
对于这种 CPU 密集型任务,你想要的是使用多进程库(multiprocessing)。您可以在下面的链接中阅读有关 GIL 和线程的更多信息:
Digital Ocean page on Debian 8
https://docs.python.org/2/glossary.html#term-global-interpreter-lock