How can we write a program using Python's multiprocessing module?

Date: 2016-06-08 06:11:22

Tags: python parallel-processing multiprocessing

# -*- coding: utf-8 -*-
from __future__ import print_function
import os, codecs, re, string
import mysql.connector

'''Reading files with txt extension'''
y_ = ""
for root, dirs, files in os.walk("/Users/Documents/source-document/part1"):
    for file in files:
        if file.endswith(".txt"):
            # use a context manager so every file handle is closed again
            with codecs.open(os.path.join(root, file), "r", "utf-8-sig") as x_:
                y_ += x_.read()

'''Tokenizing sentences of the text files'''

from nltk.tokenize import sent_tokenize
tokenized_docs = sent_tokenize(y_)  # one list entry per sentence

'''Removing stop words'''

from nltk.corpus import stopwords
stopset = set(stopwords.words("english"))  # NLTK expects the lowercase language name
stopword_removed_sentences = []
for sentence in tokenized_docs:
    filtered = ' '.join(word for word in sentence.split() if word not in stopset)
    stopword_removed_sentences.append(filtered)

''' Removing punctuation marks'''

regex = re.compile('[%s]' % re.escape(string.punctuation)) #see documentation here: http://docs.python.org/2/library/string.html
nw = [regex.sub(u'', review) for review in stopword_removed_sentences]

'''Lowercasing letters after removing punctuation marks.'''

lw = [i.lower() for i in nw]  # lw stands for lowercased words.

'''Replacing numbers with a dummy symbol'''
# let "#" be the dummy symbol
bracket_regex = r'[^\[\]]+(?=\])'  # also masks any text that sits inside square brackets
nr = [re.sub(bracket_regex, '#', j) for j in lw]

# collapse every run of ASCII digits into a single '#'
nrfinal = [re.sub(r'[0-9]+', '#', j) for j in nr]

'''Inserting into database'''
def connect():
    # open one connection and reuse it instead of reconnecting per sentence
    conn = mysql.connector.connect(user='root', password='',
                                   unix_socket="/tmp/mysql.sock", database='Thesis')
    cursor = conn.cursor()
    for j in nrfinal:
        # let sentence_id be filled in by the table's AUTO_INCREMENT;
        # cursor.lastrowid is only set *after* an INSERT, so it cannot
        # serve as the id of the row being inserted
        cursor.execute("""INSERT INTO splitted_sentences(splitted_sentences) VALUES(%s)""", (j,))
    conn.commit()
    conn.close()

if __name__ == '__main__':
    connect()

I do not get any errors from this code, and it works fine on the text files. The only problem is execution time: I have many text files (close to 6 GB), and the program takes far too long. On inspection I found it is CPU-bound, so multiprocessing is needed to solve it. Please help me write the code with the multiprocessing module so the work runs in parallel. Thank you all.

1 Answer:

Answer 0 (score: 1):

An example in the python docs demonstrates the use of multiprocessing:

from multiprocessing import Pool

def f(x):
    return x*x

if __name__ == '__main__':
    with Pool(5) as p:
        print(p.map(f, [1, 2, 3]))

You can use this to adapt your code. Once you have the text files, use the map function of a Pool to execute the rest in parallel. You have to define a function that encapsulates the code you want to execute on multiple cores, as sketched below.
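A minimal sketch of such a worker function, assuming the per-sentence steps from the question (stop-word removal, punctuation stripping, lowercasing, number masking) are bundled into one function; the name process_text and the exact ordering of the steps are illustrative, not part of the original answer:

import re, string
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize

stopset = set(stopwords.words("english"))
punct_regex = re.compile('[%s]' % re.escape(string.punctuation))

def process_text(raw_text):
    '''CPU-bound pipeline for one file's text; safe to run in a worker process.'''
    results = []
    for sentence in sent_tokenize(raw_text):
        s = ' '.join(w for w in sentence.split() if w not in stopset)
        s = punct_regex.sub('', s).lower()
        s = re.sub(r'[0-9]+', '#', s)  # mask each run of digits with the dummy symbol
        results.append(s)
    return results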

However, reading the files in parallel is likely to degrade performance, and adding things to the database asynchronously may not work either. So you may want to perform those two tasks sequentially in the main thread.
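Putting those constraints together, one possible arrangement (a sketch under the assumptions above: the hypothetical process_text worker, the questioner's directory path, and an AUTO_INCREMENT sentence_id column) keeps the file reading and the database writes in the main process and parallelizes only the CPU-bound text processing:

import os, codecs
from multiprocessing import Pool
import mysql.connector

def read_all(top):
    '''Read every .txt file sequentially in the main process.'''
    texts = []
    for root, dirs, files in os.walk(top):
        for name in files:
            if name.endswith(".txt"):
                with codecs.open(os.path.join(root, name), "r", "utf-8-sig") as fh:
                    texts.append(fh.read())
    return texts

if __name__ == '__main__':
    texts = read_all("/Users/Documents/source-document/part1")
    with Pool() as pool:  # one worker per CPU core by default (context manager needs Python 3.3+)
        per_file = pool.map(process_text, texts)
    conn = mysql.connector.connect(user='root', password='',
                                   unix_socket="/tmp/mysql.sock", database='Thesis')
    cursor = conn.cursor()
    for sentences in per_file:
        for s in sentences:
            cursor.execute("INSERT INTO splitted_sentences(splitted_sentences) VALUES(%s)", (s,))
    conn.commit()
    conn.close()

With close to 6 GB of text, holding every file in memory at once may be impractical; passing file paths to pool.map and letting each worker read its own file would trade that memory for the concurrent disk reads the answer warns about.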