I need to count the word frequencies of a 3 GB gzipped plain-text file of English sentences, which is about 30 GB when decompressed.

I have a single-threaded script with collections.Counter and gzip.open, and it takes hours to finish.

Since reading the file line by line is much faster than splitting and counting, I am thinking about a producer-consumer flow: a file reader produces lines, several consumers do the splitting and counting, and at the end the Counters are merged to get the word occurrences.
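By "merging" I just mean summing the per-worker Counters; a minimal sketch with made-up partial results:

from collections import Counter

# two hypothetical partial results returned by consumers
partial_counts = [Counter({'the': 3, 'cat': 1}), Counter({'the': 2, 'dog': 4})]

total = Counter()
for c in partial_counts:
    total.update(c)   # same result as total += c, without building intermediate Counters
print(total)          # Counter({'the': 5, 'dog': 4, 'cat': 1})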
However, I cannot find an example of passing a queue to a ProcessPoolExecutor; the examples just map single items from a list. There are only single-threaded examples for asyncio.Queue.

It is a huge file, so I cannot read the whole thing into a list before counting, and therefore I cannot use concurrent.futures.Executor.map. But all the examples I have read start from a fixed list.

The time to split and count one sentence is comparable to forking a process, so I have to make each consumer process live longer. I do not think map can merge Counters, so I cannot use chunksize > 1. Thus I have to give the consumers a queue and let them keep counting until the whole file is finished. But most examples only send one item to a consumer and use chunksize=1000 to reduce the forking overhead.

Would you write an example for me?

I hope the code is backward compatible with Python 3.5.3, because PyPy is faster.
My real case is for a more specific file format:
chr1 10011 141 0 157 4 41 50
chr1 10012 146 1 158 4 42 51
chr1 10013 150 0 163 4 43 53
chr1 10014 164 3 167 4 44 54
I need to count separate histograms for each single column from column 3 to column 8, so I took word frequency as an easier example.
My code is:
#!/usr/bin/env pypy3
import sys

SamplesList = ('D_Crick', 'D_Watson', 'Normal_Crick', 'Normal_Watson', 'D_WGS', 'Normal_WGS')

def main():
    import math

    if len(sys.argv) < 3 :
        print('Usage:',sys.argv[0],'<samtools.depth.gz> <out.tsv> [verbose=0]',file=sys.stderr,flush=True)
        exit(0)
    try:
        verbose = int(sys.argv[3])
    except: # `except IndexError:` and `except ValueError:`
        verbose = 0

    inDepthFile = sys.argv[1]
    outFile = sys.argv[2]
    print('From:[{}], To:[{}].\nVerbose: [{}].'.format(inDepthFile,outFile,verbose),file=sys.stderr,flush=True)
    RecordCnt,MaxDepth,cDepthCnt,cDepthStat = inStat(inDepthFile,verbose)
    for k in SamplesList:
        cDepthStat[k][2] = cDepthStat[k][0] / RecordCnt # E(X)
        cDepthStat[k][3] = cDepthStat[k][1] / RecordCnt # E(X^2)
        cDepthStat[k][4] = math.sqrt(cDepthStat[k][3] - cDepthStat[k][2]*cDepthStat[k][2]) # E(X^2)-E(X)^2
    tsvout = open(outFile, 'wt')
    print('#{}\t{}'.format('Depth','\t'.join(SamplesList)),file=tsvout)
    #RecordCntLength = len(str(RecordCnt))
    print( '#N={},SD:\t{}'.format(RecordCnt,'\t'.join(str(round(cDepthStat[col][4],1)) for col in SamplesList)),file=tsvout)
    for depth in range(0,MaxDepth+1):
        print( '{}\t{}'.format(depth,'\t'.join(str(cDepthCnt[col][depth]) for col in SamplesList)),file=tsvout)
    tsvout.close()
    pass

def inStat(inDepthFile,verbose):
    import gzip
    import csv
    from collections import Counter
    # Looking up things in global scope takes longer then looking up stuff in local scope. <https://stackoverflow.com/a/54645851/159695>
    cDepthCnt = {key:Counter() for key in SamplesList}
    cDepthStat = {key:[0,0,0,0,0] for key in SamplesList} # x and x^2
    RecordCnt = 0
    MaxDepth = 0
    with gzip.open(inDepthFile, 'rt') as tsvin:
        tsvin = csv.DictReader(tsvin, delimiter='\t', fieldnames=('ChrID','Pos')+SamplesList )
        try:
            for row in tsvin:
                RecordCnt += 1
                for k in SamplesList:
                    theValue = int(row[k])
                    if theValue > MaxDepth:
                        MaxDepth = theValue
                    cDepthCnt[k][theValue] += 1 # PyPy3:29.82 ns, Python3:30.61 ns
                    cDepthStat[k][0] += theValue
                    cDepthStat[k][1] += theValue * theValue
                #print(MaxDepth,DepthCnt)
        except KeyboardInterrupt:
            print('\n[!]Ctrl+C pressed.',file=sys.stderr,flush=True)
            pass
    print('[!]Lines Read:[{}], MaxDepth is [{}].'.format(RecordCnt,MaxDepth),file=sys.stderr,flush=True)
    return RecordCnt,MaxDepth,cDepthCnt,cDepthStat

if __name__ == "__main__":
    main()  # time python3 ./samdepthplot.py t.tsv.gz 1
csv.DictReader takes most of the time.

My problem is that, although the gzip reader is fast and the csv reader is fast, I need to count billions of lines, and the csv reader is surely slower than the gzip reader.

So I need to spread the lines over different worker processes running the csv reader and do the downstream counting separately. It is convenient to use a queue between one producer and many consumers.

Since I am using Python, not C, is there some abstraction wrapper for multiprocessing and queues? Is it possible to use a Queue with the ProcessPoolExecutor class?
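To make the question concrete, this is roughly the shape I am after (an untested sketch of my own; the file name, batch size, and worker count are made up):

from multiprocessing import Process, Queue
from collections import Counter

def consumer(line_q, result_q):
    cnt = Counter()
    for batch in iter(line_q.get, None):    # keep counting until the None sentinel arrives
        for line in batch:
            cnt.update(line.split())        # splitting + counting happens in the worker
    result_q.put(cnt)

if __name__ == '__main__':
    NWORKERS = 4
    line_q = Queue(maxsize=100)
    result_q = Queue()
    workers = [Process(target=consumer, args=(line_q, result_q)) for _ in range(NWORKERS)]
    for w in workers:
        w.start()
    with open('sentences.txt') as f:        # gzip.open(..., 'rt') in the real case
        while True:
            batch = f.readlines(64 * 1024)  # batch lines to keep queue traffic low
            if not batch:
                break
            line_q.put(batch)
    for _ in range(NWORKERS):
        line_q.put(None)                    # one sentinel per consumer
    total = Counter()
    for _ in range(NWORKERS):
        total += result_q.get()             # merge the partial Counters
    for w in workers:
        w.join()
    print(total.most_common(10))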
Answer 0 (score: 0)

I never tested this code, but it should work.

The first thing is to check the number of lines:
f = 'myfile.txt'

def file_len(f):
    with open(f) as f:
        for i, l in enumerate(f):
            pass
    return i + 1

num_lines = file_len(f)
Split the data into n partitions:
n = 8  # number of worker processes ("threads"), 8 for example
split_size = num_lines//n if num_lines//n > 0 else 1
parts = [x for x in range(0, num_lines, split_size)]
Now start the jobs:
from multiprocessing import Process
import linecache

jobs = []
for part in range(len(parts)):
    p = Process(target = function_here, args = ('myfile.txt', parts[part], split_size))
    jobs.append(p)
    p.start()

for p in jobs:
    p.join()
An example of the function:
def function_here(your_file_name, line_number, split_size):
    # linecache.getline is 1-based, so start one line after the 0-based offset
    for current_line in range(line_number + 1, line_number + split_size + 1):
        print(linecache.getline(your_file_name, current_line))
Still, before doing anything else, you need to check the number of lines.
Answer 1 (score: 0)

A 30 GB text file is big enough to put your problem into the realm of Big Data, so to tackle it I suggest using big data tools such as Hadoop and Spark. What you described as a "producer-consumer flow" is basically what the MapReduce algorithm was designed for, and word count frequency is a typical MapReduce problem. Look it up and you will find plenty of examples.
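For instance, a word count in PySpark takes only a few lines; a rough sketch, assuming a local Spark installation (Spark reads gzipped text files transparently, though a single .gz file is not splittable and lands in one partition):

from pyspark import SparkContext

sc = SparkContext('local[*]', 'wordcount')
counts = (sc.textFile('sentences.txt.gz')          # hypothetical input path
            .flatMap(lambda line: line.split())    # split each line into words
            .map(lambda word: (word, 1))
            .reduceByKey(lambda a, b: a + b))      # sum the counts per word
counts.saveAsTextFile('word_counts')               # one part file per partition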
Answer 2 (score: 0)

The idea is to break the huge file into smaller files, have many workers do the counting and return a Counter, and merge the counters at the end.
from itertools import islice
from multiprocessing import Pool
from collections import Counter
import os

NUM_OF_LINES = 3
INPUT_FILE = 'huge.txt'
POOL_SIZE = 10

def slice_huge_file():
    cnt = 0
    with open(INPUT_FILE) as f:
        while True:
            next_n_lines = list(islice(f, NUM_OF_LINES))
            cnt += 1
            if not next_n_lines:
                break
            with open('sub_huge_{}.txt'.format(cnt), 'w') as out:
                out.writelines(next_n_lines)

def count_file_words(input_file):
    with open(input_file, 'r') as f:
        return Counter(f.read().split())  # split into words so we count words, not whole lines

if __name__ == '__main__':
    slice_huge_file()
    pool = Pool(POOL_SIZE)
    sub_files = [os.path.join('.', f) for f in os.listdir('.') if f.startswith('sub_huge')]
    results = pool.map(count_file_words, sub_files)

    final_counter = Counter()
    for counter in results:
        final_counter += counter
    print(final_counter)
Answer 3 (score: 0)

Just some pseudocode:
from concurrent.futures import ProcessPoolExecutor
from multiprocessing import Manager
import traceback

WORKER_POOL_SIZE = 10  # you should set this to the number of your processes
QUEUE_SIZE = 100       # 10 times your pool size is good enough

def main():
    with Manager() as manager:
        q = manager.Queue(QUEUE_SIZE)

        # init worker pool
        executor = ProcessPoolExecutor(max_workers=WORKER_POOL_SIZE)
        workers_pool = [executor.submit(worker, i, q) for i in range(WORKER_POOL_SIZE)]

        # start producer
        run_producer(q)

        # wait until done
        for f in workers_pool:
            try:
                f.result()
            except Exception:
                traceback.print_exc()

def run_producer(q):
    try:
        with open("your file path") as fp:
            for line in fp:
                q.put(line)
    except Exception:
        traceback.print_exc()
    finally:
        q.put(None)

def worker(i, q):
    while 1:
        line = q.get()
        if line is None:
            print('worker {} is done'.format(i))  # str.format keeps it Python 3.5 compatible
            q.put(None)  # pass the sentinel on so the other workers stop too
            return

        # do something with this line
        # ...

if __name__ == '__main__':
    main()
Answer 4 (score: 0)

I learned the multiprocessing library over the weekend.

Stopping on Ctrl+C and writing out the current results still does not work.

The main function is fine now.
#!/usr/bin/env pypy3
import sys
from collections import Counter
from multiprocessing import Pool, Process, Manager, current_process, freeze_support

SamplesList = ('D_Crick', 'D_Watson', 'Normal_Crick', 'Normal_Watson', 'D_WGS', 'Normal_WGS')

ChunkSize = 1024 * 128
verbose = 0
Nworkers = 16

def main():
    import math

    if len(sys.argv) < 3 :
        print('Usage:',sys.argv[0],'<samtools.depth.gz> <out.tsv> [verbose=0]',file=sys.stderr,flush=True)
        exit(0)
    try:
        verbose = int(sys.argv[3])
    except: # `except IndexError:` and `except ValueError:`
        verbose = 0

    inDepthFile = sys.argv[1]
    outFile = sys.argv[2]
    print('From:[{}], To:[{}].\nVerbose: [{}].'.format(inDepthFile,outFile,verbose),file=sys.stderr,flush=True)
    RecordCnt,MaxDepth,cDepthCnt,cDepthStat = CallStat(inDepthFile)
    for k in SamplesList:
        cDepthStat[k][2] = cDepthStat[k][0] / RecordCnt # E(X)
        cDepthStat[k][3] = cDepthStat[k][1] / RecordCnt # E(X^2)
        cDepthStat[k][4] = math.sqrt(cDepthStat[k][3] - cDepthStat[k][2]*cDepthStat[k][2]) # E(X^2)-E(X)^2
    tsvout = open(outFile, 'wt')
    print('#{}\t{}'.format('Depth','\t'.join(SamplesList)),file=tsvout)
    #RecordCntLength = len(str(RecordCnt))
    print( '#N={},SD:\t{}'.format(RecordCnt,'\t'.join(str(round(cDepthStat[col][4],1)) for col in SamplesList)),file=tsvout)
    for depth in range(0,MaxDepth+1):
        #print( '{}\t{}'.format(depth,'\t'.join(str(DepthCnt[col][depth]) for col in SamplesList)) )
        #print( '{}\t{}'.format(depth,'\t'.join(str(yDepthCnt[depth][col]) for col in SamplesList)) )
        print( '{}\t{}'.format(depth,'\t'.join(str(cDepthCnt[col][depth]) for col in SamplesList)),file=tsvout)
        #pass
    #print('#MaxDepth={}'.format(MaxDepth),file=tsvout)
    tsvout.close()
    pass

def CallStat(inDepthFile):
    import gzip
    import itertools
    RecordCnt = 0
    MaxDepth = 0
    cDepthCnt = {key:Counter() for key in SamplesList}
    cDepthStat = {key:[0,0,0,0,0] for key in SamplesList} # x and x^2
    #lines_queue = Queue()
    manager = Manager()
    lines_queue = manager.Queue()
    stater_pool = Pool(Nworkers)
    TASKS = itertools.repeat((lines_queue,SamplesList),Nworkers)
    #ApplyResult = [stater_pool.apply_async(iStator,x) for x in TASKS]
    #MapResult = stater_pool.map_async(iStator,TASKS,1)
    AsyncResult = stater_pool.imap_unordered(iStator,TASKS,1)
    try:
        with gzip.open(inDepthFile, 'rt') as tsvfin:
            while True:
                lines = tsvfin.readlines(ChunkSize)
                lines_queue.put(lines)
                if not lines:
                    for i in range(Nworkers):
                        lines_queue.put(b'\n\n')
                    break
    except KeyboardInterrupt:
        print('\n[!]Ctrl+C pressed.',file=sys.stderr,flush=True)
        for i in range(Nworkers):
            lines_queue.put(b'\n\n')
        pass
    #for results in ApplyResult:
        #(iRecordCnt,iMaxDepth,icDepthCnt,icDepthStat) = results.get()
    #for (iRecordCnt,iMaxDepth,icDepthCnt,icDepthStat) in MapResult.get():
    for (iRecordCnt,iMaxDepth,icDepthCnt,icDepthStat) in AsyncResult:
        RecordCnt += iRecordCnt
        if iMaxDepth > MaxDepth:
            MaxDepth = iMaxDepth
        for k in SamplesList:
            cDepthCnt[k].update(icDepthCnt[k])
            cDepthStat[k][0] += icDepthStat[k][0]
            cDepthStat[k][1] += icDepthStat[k][1]
    return RecordCnt,MaxDepth,cDepthCnt,cDepthStat

#def iStator(inQueue,inSamplesList):
def iStator(args):
    (inQueue,inSamplesList) = args
    import csv
    # Looking up things in global scope takes longer then looking up stuff in local scope. <https://stackoverflow.com/a/54645851/159695>
    cDepthCnt = {key:Counter() for key in inSamplesList}
    cDepthStat = {key:[0,0] for key in inSamplesList} # x and x^2
    RecordCnt = 0
    MaxDepth = 0
    for lines in iter(inQueue.get, b'\n\n'):
        try:
            tsvin = csv.DictReader(lines, delimiter='\t', fieldnames=('ChrID','Pos')+inSamplesList )
            for row in tsvin:
                #print(', '.join(row[col] for col in inSamplesList))
                RecordCnt += 1
                for k in inSamplesList:
                    theValue = int(row[k])
                    if theValue > MaxDepth:
                        MaxDepth = theValue
                    #DepthCnt[k][theValue] += 1 # PyPy3:30.54 ns, Python3:22.23 ns
                    #yDepthCnt[theValue][k] += 1 # PyPy3:30.47 ns, Python3:21.50 ns
                    cDepthCnt[k][theValue] += 1 # PyPy3:29.82 ns, Python3:30.61 ns
                    cDepthStat[k][0] += theValue
                    cDepthStat[k][1] += theValue * theValue
                #print(MaxDepth,DepthCnt)
        except KeyboardInterrupt:
            print('\n[!]Ctrl+C pressed.',file=sys.stderr,flush=True)
            pass
    #print('[!]{} Lines Read:[{}], MaxDepth is [{}].'.format(current_process().name,RecordCnt,MaxDepth),file=sys.stderr,flush=True)
    return RecordCnt,MaxDepth,cDepthCnt,cDepthStat

if __name__ == "__main__":
    main()  # time python3 ./samdepthplot.py t.tsv.gz 1