我正在尝试使用 Python 的 multiprocessing 库,利用计算机的多个核心来处理数百万个文件中的文本。以下代码展示了 master 函数和 worker 函数:master 以路径作为输入,返回该路径下所有文件中出现次数最多的(10000 个或更少)单词(函数本身没有错误,仅供参考)。
import multiprocessing as mp
def worker(fileNameList):
    '''Count word occurrences across the given files and put the result on ``output``.

    Each file is read and parsed with ``html.fromstring``; the text nodes
    selected by the XPath expression below are concatenated and handed to
    ``preprocess_pipeline``, and the words it returns are tallied.

    Args:
        fileNameList: iterable of file names to open and read.

    Returns:
        None. A single dict mapping word -> count (accumulated over every
        file in ``fileNameList``) is put on the module-level ``output`` queue.
    '''
    vacob = dict()
    for fileName in fileNameList:
        # ``with`` closes the file even if parsing raises; the original
        # only reached close() on the success path.
        with open(fileName) as xmlfile:
            tree = html.fromstring(xmlfile.read())
        paras = tree.xpath("//title/text()|//headline/text()|//text/p/text()")
        docString = "".join(paras)
        wordList = preprocess_pipeline(docString)
        for word in wordList:
            # dict.get replaces has_key(), which no longer exists in Python 3.
            vacob[word] = vacob.get(word, 0) + 1
    # One result per worker, sent after all of its files are processed.
    output.put(vacob)
def master(path,n=8):
    '''Return up to 10000 of the most frequent words in the ``.xml`` files in ``path``.

    The files found directly in ``path`` are divided among ``n`` processes
    running ``worker``. Each worker puts one word -> count dict on the
    module-level ``output`` queue; those dicts are summed here.

    Args:
        path: directory to scan (only its direct entries are considered).
        n: number of worker processes to start.

    Returns:
        A list of words ordered by descending total count, at most 10000 long.
    '''
    vacob = defaultdict(int)
    # Pass full paths so the workers can open the files regardless of the
    # current working directory (listdir returns bare names).
    xmlFiles = [join(path, f) for f in listdir(path)
                if isfile(join(path, f)) and os.path.splitext(f)[1] == '.xml']
    # Striding hands every file to exactly one worker. The original
    # ``length/n`` chunking skipped the last ``length % n`` files and yields
    # a float slice index on Python 3.
    processes = [mp.Process(target=worker, args=(xmlFiles[i::n],))
                 for i in range(n)]
    for proc in processes:
        proc.start()
    # Drain the queue BEFORE joining: a process that has put data on a
    # multiprocessing.Queue may not exit until that data has been consumed,
    # so join-then-get can deadlock on large results.
    for _ in processes:
        results = output.get()
        for word in results:
            # Sum the per-worker counts; adding 1 would only count how many
            # workers saw the word.
            vacob[word] += results[word]
    for proc in processes:
        proc.join()
    ranked = sorted(vacob, key=vacob.get, reverse=True)
    # Slicing already copes with fewer than 10000 entries.
    return ranked[:10000]
# Module-level queue: worker() puts its word-count dict here and master() reads it back.
# NOTE(review): child processes only share this queue if they inherit module
# globals (fork start method) - confirm the platform, or pass the queue to
# the workers explicitly.
output=mp.Queue()
# Runs the whole pipeline when the module is executed. `path` is not defined
# in the code shown here - presumably set elsewhere; verify.
vocab=master(path)
这应该利用我计算机的所有8个核心。但是所有进程只共享我的cpu的一个核心。下图显示我的textprocessing.py脚本生成的所有进程只使用一个核心。如何让脚本使用所有可用的核心?
当我尝试通过打印调试信息来查看每个 worker 进程正在处理哪些文件时,程序似乎用上了所有核心。但我仍然不明白,为什么仅仅加了一条简单的 print 语句就能让所有核心都被利用。
以下是带有调试打印的修改后的代码。
import multiprocessing as mp
def worker(fileNameList,no):
    '''Count word occurrences across the given files and put the result on ``output``.

    Debug variant: prints one progress line per file, tagged with the worker
    number. Each file is read and parsed with ``html.fromstring``; the text
    nodes selected by the XPath expression below are concatenated and handed
    to ``preprocess_pipeline``, and the words it returns are tallied.

    Args:
        fileNameList: iterable of file names to open and read.
        no: identifier of this worker, used only in the progress output.

    Returns:
        None. A single dict mapping word -> count (accumulated over every
        file in ``fileNameList``) is put on the module-level ``output`` queue.
    '''
    vacob = dict()
    for fileName in fileNameList:
        # Single pre-formatted string: emits the same text as the original
        # Python 2 print statement, and is also valid on Python 3.
        print("processing  %s  worker %s" % (fileName, no))
        # ``with`` closes the file even if parsing raises; the original
        # only reached close() on the success path.
        with open(fileName) as xmlfile:
            tree = html.fromstring(xmlfile.read())
        paras = tree.xpath("//title/text()|//headline/text()|//text/p/text()")
        docString = "".join(paras)
        wordList = preprocess_pipeline(docString)
        for word in wordList:
            # dict.get replaces has_key(), which no longer exists in Python 3.
            vacob[word] = vacob.get(word, 0) + 1
    # One result per worker, sent after all of its files are processed.
    output.put(vacob)
def master(path,n=8):
    '''Return up to 10000 of the most frequent words in the ``.xml`` files in ``path``.

    The files found directly in ``path`` are divided among ``n`` processes
    running ``worker``, each of which also receives its index for debug
    output. Each worker puts one word -> count dict on the module-level
    ``output`` queue; those dicts are summed here.

    Args:
        path: directory to scan (only its direct entries are considered).
        n: number of worker processes to start.

    Returns:
        A list of words ordered by descending total count, at most 10000 long.
    '''
    vacob = defaultdict(int)
    # Pass full paths so the workers can open the files regardless of the
    # current working directory (listdir returns bare names).
    xmlFiles = [join(path, f) for f in listdir(path)
                if isfile(join(path, f)) and os.path.splitext(f)[1] == '.xml']
    # Striding hands every file to exactly one worker. The original
    # ``length/n`` chunking skipped the last ``length % n`` files and yields
    # a float slice index on Python 3.
    processes = [mp.Process(target=worker, args=(xmlFiles[i::n], i))
                 for i in range(n)]
    for proc in processes:
        proc.start()
    # Drain the queue BEFORE joining: a process that has put data on a
    # multiprocessing.Queue may not exit until that data has been consumed,
    # so join-then-get can deadlock on large results.
    for _ in processes:
        results = output.get()
        for word in results:
            # Sum the per-worker counts; adding 1 would only count how many
            # workers saw the word.
            vacob[word] += results[word]
    for proc in processes:
        proc.join()
    ranked = sorted(vacob, key=vacob.get, reverse=True)
    # Slicing already copes with fewer than 10000 entries.
    return ranked[:10000]
# Module-level queue: worker() puts its word-count dict here and master() reads it back.
# NOTE(review): child processes only share this queue if they inherit module
# globals (fork start method) - confirm the platform, or pass the queue to
# the workers explicitly.
output=mp.Queue()
# Runs the whole pipeline when the module is executed. `path` is not defined
# in the code shown here - presumably set elsewhere; verify.
vocab=master(path)
以下是htop和控制台的屏幕截图: