I am doing word-frequency counting in Python. Here is the single-process version:
#coding=utf-8
import string
import time
from collections import Counter

starttime = time.clock()
origin = open("document.txt", 'r').read().lower()
for_split = [',', '\n', '\t', '\'', '.', '\"', '!', '?', '-', '~']
# the words below will be ignored when counting
ignored = ['the', 'and', 'i', 'to', 'of', 'a', 'in', 'was', 'that', 'had',
           'he', 'you', 'his', 'my', 'it', 'as', 'with', 'her', 'for', 'on']

i = 0
for ch in for_split:
    origin = string.replace(origin, ch, ' ')
words = string.split(origin)
result = Counter(words).most_common(40)
for word, frequency in result:
    if word not in ignored and i < 10:
        print "%s : %d" % (word, frequency)
        i = i + 1
print time.clock() - starttime
And here is the multiprocessing version:
#coding=utf-8
import time
import multiprocessing
from collections import Counter

for_split = [',', '\n', '\t', '\'', '.', '\"', '!', '?', '-', '~']
ignored = ['the', 'and', 'i', 'to', 'of', 'a', 'in', 'was', 'that', 'had',
           'he', 'you', 'his', 'my', 'it', 'as', 'with', 'her', 'for', 'on']
result_list = []

def worker(substr):
    result = Counter(substr)
    return result

def log_result(result):
    result_list.append(result)

def main():
    pool = multiprocessing.Pool(processes=5)
    origin = open("document.txt", 'r').read().lower()
    for ch in for_split:
        origin = origin.replace(ch, ' ')
    words = origin.split()
    step = len(words) / 4
    substrs = [words[pos:pos + step] for pos in range(0, len(words), step)]
    for substr in substrs:
        pool.apply_async(worker, args=(substr,), callback=log_result)
    pool.close()
    pool.join()
    result = Counter()
    for item in result_list:
        result = result + item
    result = result.most_common(40)
    i = 0
    for word, frequency in result:
        if word not in ignored and i < 10:
            print "%s : %d" % (word, frequency)
            i = i + 1

if __name__ == "__main__":
    starttime = time.clock()
    main()
    print time.clock() - starttime
Answer 0 (score: 4)
I think what you are seeing is the overhead of distributing the individual strings to the workers and receiving the results. If I run the parallel code given above on a sample document (Dostoevsky's Crime and Punishment), it takes about 0.32 s, while the single-process version needs just 0.09 s. If I modify the worker function to just process the string "test" instead of the real document (still passing the real string as an argument), the runtime drops to 0.22 s. However, if I pass "test" as the argument to the map_async function, the runtime decreases to 0.06 s. Hence, I would say that in your case the runtime of the program is limited by the inter-process communication overhead.
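To make that measurement concrete, the modification amounts to something like the following sketch (a rough illustration, not the exact code I benchmarked): the real substr is still pickled and shipped to the child process, only the actual work is replaced.

def worker(substr):
    # substr is still serialized and sent to the worker process,
    # but a constant string is counted instead of the document, so
    # the remaining runtime is almost entirely inter-process communication
    result = Counter("test")
    return result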
With the following code I brought the runtime of the parallel version down to 0.08 s. First, I partition the file into a number of chunks of (almost) equal length, making sure that the boundaries between chunks coincide with newlines. Then I simply pass the offset and length of each chunk to the worker processes; each worker opens the file, reads its chunk, processes it, and returns the result. This seems to cause considerably less overhead than distributing the strings directly through the map_async function. For larger files you should be able to see a runtime improvement with this code. Furthermore, if you can tolerate small counting errors, you can omit the step that determines correct chunk boundaries and just split the file into equally sized chunks (see the sketch after the full listing below). In my example this brings the runtime down to 0.04 s, making the multiprocessing code faster than the single-process version.
#coding=utf-8
import time
import multiprocessing
import string
from collections import Counter
import os

for_split = [',', '\n', '\t', '\'', '.', '\"', '!', '?', '-', '~']
ignored = ['the', 'and', 'i', 'to', 'of', 'a', 'in', 'was', 'that', 'had',
           'he', 'you', 'his', 'my', 'it', 'as', 'with', 'her', 'for', 'on']
result_list = []

def worker(offset, length, filename):
    # each worker opens the file itself and reads only its own chunk,
    # so no large strings have to be sent through the pipe
    origin = open(filename, 'r')
    origin.seek(offset)
    content = origin.read(length).lower()
    for ch in for_split:
        content = content.replace(ch, ' ')
    words = string.split(content)
    result = Counter(words)
    origin.close()
    return result

def log_result(result):
    result_list.append(result)

def main():
    processes = 5
    pool = multiprocessing.Pool(processes=processes)
    filename = "document.txt"
    file_size = os.stat(filename)[6]
    chunks = []
    origin = open(filename, 'r')
    while True:
        # readlines with a size hint returns complete lines totalling
        # roughly file_size/processes bytes, so chunk boundaries
        # always coincide with newlines
        lines = origin.readlines(file_size / processes)
        if not lines:
            break
        # the lines keep their trailing '\n', so plain concatenation
        # reproduces the file contents exactly and the computed
        # offsets and lengths stay correct
        chunks.append("".join(lines))
    origin.close()
    lengths = [len(chunk) for chunk in chunks]
    offset = 0
    for length in lengths:
        pool.apply_async(worker, args=(offset, length, filename,), callback=log_result)
        offset += length
    pool.close()
    pool.join()
    result = Counter()
    for item in result_list:
        result = result + item
    result = result.most_common(40)
    i = 0
    for word, frequency in result:
        if word not in ignored and i < 10:
            print "%s : %d" % (word, frequency)
            i = i + 1

if __name__ == "__main__":
    starttime = time.clock()
    main()
    print time.clock() - starttime
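For reference, here is a minimal sketch of the equally-sized-chunks variant mentioned above (the helper name make_chunks is mine, not part of the code above; chunk boundaries may cut a word in two, so a few words at the seams can be miscounted):

def make_chunks(filename, processes):
    # split the file into `processes` chunks of equal size, ignoring
    # line boundaries; a word straddling a boundary is counted wrongly
    file_size = os.stat(filename)[6]
    step = file_size / processes + 1
    return [(offset, min(step, file_size - offset))
            for offset in range(0, file_size, step)]

The resulting (offset, length) pairs are passed to worker exactly as before, replacing the readlines-based partitioning in main.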