I have a script that parses XML files with an ElementTree XPath evaluator. It works fine, but it takes a long time to finish, so I tried to make a multithreaded implementation:
import fnmatch
import operator
import os
import lxml.etree
from nltk import FreqDist
from nltk.corpus import stopwords
from collections import defaultdict
from datetime import datetime
import threading
import Queue
STOPWORDS = stopwords.words('dutch')
STOPWORDS.extend(stopwords.words('english'))
DIR_NAME = 'A_DIRNAME'
PATTERN = '*.A_PATTERN'
def loadData(dir_name, pattern):
    nohyphen_files = []
    dir_names = []
    dir_paths = []
    for root, dirnames, filenames in os.walk(dir_name):
        dir_names.append(dirnames)
        dir_paths.append(root)
        for filename in fnmatch.filter(filenames, pattern):
            nohyphen_files.append(os.path.join(root, filename))
    return nohyphen_files, dir_names, dir_paths
def freq(element_list, descending=True):
    agglomerated = defaultdict(int)
    for e in element_list:
        agglomerated[e] += 1
    return sorted(agglomerated.items(), key=operator.itemgetter(1), reverse=descending)
def lexDiv(amount_words):
    return 1.0 * len(set(amount_words)) / len(amount_words)
def anotherFreq(list_types, list_words):
    fd = FreqDist(list_types)
    print 'top 10 most frequent types:'
    for t, count in fd.items()[:10]:  # 'count' avoids shadowing the freq() function above
        print t, count
    print '\ntop 10 most frequent words:'
    agglomerated = defaultdict(int)
    for w in list_words:
        if not w.lower() in STOPWORDS:
            agglomerated[w] += 1
    sorted_dict = sorted(agglomerated.items(), key=operator.itemgetter(1), reverse=True)
    print sorted_dict[:10]
def extractor(f):
    print "check file: {}".format(f)
    try:
        # doc = lxml.etree.ElementTree(lxml.etree.XML(f))
        doc = lxml.etree.ElementTree(file=f)
    except lxml.etree.XMLSyntaxError, e:
        print e
        return
    doc_evaluator = lxml.etree.XPathEvaluator(doc)
    entities = doc_evaluator('//entity/*/externalRef/@reference')
    places_dbpedia = doc_evaluator('//entity[contains(@type, "Schema:Place")]/*/externalRef/@reference')
    non_people_dbpedia = set(doc_evaluator('//entity[not(contains(@type, "Schema:Person"))]'))
    people = doc_evaluator('//entity[contains(@type, "Schema:Person")]/*/externalRef/@reference')
    words = doc.xpath('text/wf[re:match(text(), "[A-Za-z-]")]/text()',
                      namespaces={"re": "http://exslt.org/regular-expressions"})
    unique_words = set(words)
    other_tokens = doc.xpath('text/wf[re:match(text(), "[^A-Za-z-]")]/text()',
                             namespaces={"re": "http://exslt.org/regular-expressions"})
    amount_of_sentences = doc_evaluator('text/wf/@sent')[-1]
    types = doc_evaluator('//term/@morphofeat')
    longest_sentence = freq(doc.xpath('text/wf[re:match(text(), "[A-Za-z-]")]/@sent',
                                      namespaces={"re": "http://exslt.org/regular-expressions"}))[0]
    top_people = freq([e.split('/')[-1] for e in people])[:10]
    top_entities = freq([e.split('/')[-1] for e in entities])[:10]
    top_places = freq([e.split('/')[-1] for e in places_dbpedia])[:10]
def worker():
    while True:
        job_number = q.get()
        extractor(job_number)
        q.task_done()  # this job is done, move on to the next
if __name__ == '__main__':
    files, dirs, paths = loadData(DIR_NAME, PATTERN)
    startTime = datetime.now()  # start timing after the file list is built
    q = Queue.Queue()  # job queue
    for f in files:
        q.put(f)
    for i in range(20):  # start 20 worker threads
        worker_thread = threading.Thread(target=worker)
        worker_thread.daemon = True
        worker_thread.start()
    q.join()
    print datetime.now() - startTime
This does make a difference, but when I time it, it is no faster than the plain version. I suspect it has to do with opening and reading the files, which keeps the threads from actually running in parallel. If I swap in a function that, instead of parsing the XML file, just sleeps for a few seconds and prints something, the threads do run concurrently and it is much faster. What do I need to keep in mind to get a multithreaded XML parser?
Answer 0 (score: 0)
Threading in Python does not work the way it does in other languages. It relies on the Global Interpreter Lock, which ensures that only one thread is active (that is, actually running bytecode) at a time.
What you want to do instead is use the multiprocessing library.
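As a minimal sketch of that approach (reusing extractor() and loadData() from the question; the pool size of 4 is an illustrative assumption, not a tuned value), the main block could be replaced with a process pool:

import multiprocessing

if __name__ == '__main__':
    startTime = datetime.now()
    files, dirs, paths = loadData(DIR_NAME, PATTERN)
    pool = multiprocessing.Pool(processes=4)  # roughly one process per CPU core
    pool.map(extractor, files)  # each file is parsed in a separate process
    pool.close()
    pool.join()
    print datetime.now() - startTime

Unlike threads, each process gets its own interpreter and its own GIL, so the CPU-bound parsing can genuinely run in parallel; the arguments handed to the pool (here, file path strings) just need to be picklable.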
You can read more about the GIL and threading here:
Digital Ocean page on Debian 8
https://docs.python.org/2/glossary.html#term-global-interpreter-lock