Question

我正在尝试使用 Python 的 multiprocessing 库，利用计算机的多个核心来处理数百万个文件中的文本。以下代码展示了 master 函数和 worker 函数：master 以一个路径作为输入，返回该路径下所有文件中出现次数最多的（10000 个或更少）单词（函数本身没有报错，仅供参考）。

    import multiprocessing as mp
    def worker(fileNameList):
        '''Build a word-frequency dict for a list of files and queue it.

        Every file is read in full and parsed with ``html.fromstring``.  The
        text nodes selected by the XPath below (title, headline and text/p)
        are concatenated and passed to ``preprocess_pipeline``; each item it
        yields is tallied as one occurrence of a word.

        fileNameList -- iterable of file names/paths that ``open`` can resolve.

        Nothing is returned: the ``{word: count}`` dict is put on the
        module-level ``output`` queue.
        '''
        vacob=dict()
        for fileName in fileNameList:
            # ``with`` closes the file exactly once per file, even when the
            # document yields no words or reading/parsing raises.
            with open(fileName) as xmlfile:
                tree=html.fromstring(xmlfile.read())
            paras=tree.xpath("//title/text()|//headline/text()|//text/p/text()")
            docString="".join(paras)
            wordList=preprocess_pipeline(docString)
            for word in wordList:
                # dict.get replaces the has_key()/else pair with one lookup.
                vacob[word]=vacob.get(word,0)+1
        output.put(vacob)


    def master(path,n=8):
        '''Return up to 10000 of the most frequent words in the .xml files of ``path``.

        path -- directory to scan (not recursive); only regular files whose
                extension is exactly ``.xml`` are used.
        n    -- number of worker processes to start (default 8).

        The files are dealt out to ``n`` ``worker`` processes.  Each worker
        puts one ``{word: count}`` dict on the module-level ``output`` queue;
        the counts are summed here.

        Returns a list of words ordered by total count, highest first,
        truncated to 10000 entries.
        '''
        vacob=defaultdict(int)
        # Hand the workers full paths: bare names from listdir() can only be
        # opened when the current working directory happens to be ``path``.
        xmlFiles=[join(path,f) for f in listdir(path) if isfile(join(path,f)) and os.path.splitext(f)[1]=='.xml']
        processes=list()
        for i in range(n):
            # Strided slices (i, i+n, i+2n, ...) cover every file; equal-sized
            # contiguous chunks of len//n would drop the last len % n files.
            processes.append(mp.Process(target=worker,args=[xmlFiles[i::n]]))
        for proc in processes:
            proc.start()

        # Drain the queue BEFORE joining.  A process that has put data on a
        # multiprocessing.Queue may not exit until that data has been consumed,
        # so joining first can deadlock on large result dicts.
        for _ in range(n):
            results=output.get()
            for word in results:
                # Add the worker's count, not 1, so ranking uses frequencies.
                vacob[word]+=results[word]
        for proc in processes:
            proc.join()

        ranked=sorted(vacob,key=vacob.get,reverse=True)
        # Slicing already copes with lists shorter than 10000.
        return ranked[:10000]
    # Queue shared through the global name ``output``: worker() puts its
    # word-count dict on it and master() reads one item per worker, so it has
    # to exist before master() is called.
    output=mp.Queue()
    # NOTE(review): ``path`` is not defined in this listing -- presumably the
    # directory holding the .xml files; confirm where it is set.
    vocab=master(path)

这应该利用我计算机的所有8个核心。但是所有进程只共享我的cpu的一个核心。下图显示我的textprocessing.py脚本生成的所有进程只使用一个核心。如何让脚本使用所有可用的核心？

Htop

当我为了调试而打印每个 worker 正在处理的文件时，脚本似乎就用上了所有核心。但我仍然无法理解，为什么仅仅加了一条简单的 print 语句就能让所有核心都被用上。

以下是带有调试打印的修改后的代码。

    import multiprocessing as mp
    def worker(fileNameList,no):
        '''Takes a file name list and reruns a word frequency map of all the files in a dict'''
        vacob=dict()
        for fileName in fileNameList:
            print "processing ",fileName," worker",no
            xmlfile=open(fileName)
            tree=html.fromstring(xmlfile.read())
            paras=tree.xpath("//title/text()|//headline/text()|//text/p/text()")
            docString="".join(paras)
            wordList=preprocess_pipeline(docString)
            for word in wordList:
                if vacob.has_key(word):
                    vacob[word]=vacob[word]+1
                else:
                    vacob[word]=1
            xmlfile.close()
        output.put(vacob)


    def master(path,n=8):
        '''Return up to 10000 of the most frequent words in the .xml files of ``path``.

        path -- directory to scan (not recursive); only regular files whose
                extension is exactly ``.xml`` are used.
        n    -- number of worker processes to start (default 8).

        The files are dealt out to ``n`` ``worker`` processes, each of which
        also receives its index for the debug output.  Each worker puts one
        ``{word: count}`` dict on the module-level ``output`` queue; the
        counts are summed here.

        Returns a list of words ordered by total count, highest first,
        truncated to 10000 entries.
        '''
        vacob=defaultdict(int)
        # Hand the workers full paths: bare names from listdir() can only be
        # opened when the current working directory happens to be ``path``.
        xmlFiles=[join(path,f) for f in listdir(path) if isfile(join(path,f)) and os.path.splitext(f)[1]=='.xml']
        processes=list()
        for i in range(n):
            # Strided slices (i, i+n, i+2n, ...) cover every file; equal-sized
            # contiguous chunks of len//n would drop the last len % n files.
            processes.append(mp.Process(target=worker,args=[xmlFiles[i::n],i]))
        for proc in processes:
            proc.start()

        # Drain the queue BEFORE joining.  A process that has put data on a
        # multiprocessing.Queue may not exit until that data has been consumed,
        # so joining first can deadlock on large result dicts.
        for _ in range(n):
            results=output.get()
            for word in results:
                # Add the worker's count, not 1, so ranking uses frequencies.
                vacob[word]+=results[word]
        for proc in processes:
            proc.join()

        ranked=sorted(vacob,key=vacob.get,reverse=True)
        # Slicing already copes with lists shorter than 10000.
        return ranked[:10000]
    # Queue shared through the global name ``output``: worker() puts its
    # word-count dict on it and master() reads one item per worker, so it has
    # to exist before master() is called.
    output=mp.Queue()
    # NOTE(review): ``path`` is not defined in this listing -- presumably the
    # directory holding the .xml files; confirm where it is set.
    vocab=master(path)

以下是htop和控制台的屏幕截图： enter image description here

无法在python中使用多处理来利用多个核心

0 个答案: