这是在我的系统上运行的代码,代码工作正常,数据量很少,但是相同的代码不能处理大量的数据集或文本文件,是的@ennikiller是对的,有些文件是为空(意味着它们没有Esgn
值,但它们有其他值),我想要处理它,它应该为Esgn
指定零或只计算所有其他值的最小值并留下{{1 (在那个不包含Esgn
的特定文件中)
Esgn
这是我的代码,这段代码在10/20文本文件上工作正常,但是当我运行3000 txt文件时,它会给我以下错误
import os.path
import glob
import re
import itertools
from collections import namedtuple, deque
from operator import attrgetter
R_PREFIX_VALUE = re.compile(r'^(?P<prefix>[A-Z]+)(?P<suffix>\d+)\s+(?P<value>\d+)\s*$')
getvalue = attrgetter('value')
def interleave(seq, val):
return itertools.chain.from_iterable(itertools.izip(seq, itertools.repeat(val)))
class Fileline(namedtuple('Fileline', 'filename prefix suffix value')):
@classmethod
def _fromstr(cls, s, filename=None, rematch=R_PREFIX_VALUE.match):
m = rematch(s)
if not m:
raise ValueError('No valid line found in %r' % s)
d = m.groupdict()
d['value'] = int(d['value'])
d['filename'] = filename
return cls(**d)
def _asstr(self):
return '{}{} {}'.format(self.prefix, self.suffix, self.value)
def max_value_with_prefix(lineseq, prefix, getvalue=getvalue):
withprefix = (line for line in lineseq if line.prefix==prefix)
return max_value(withprefix)
def filter_lt_line(lineseq, maxline):
for line in lineseq:
if line.prefix != maxline.prefix or line.value >= maxline.value:
yield line
def extreme_value(fn, lineseq, getvalue=getvalue):
try:
return fn((l for l in lineseq if l is not None), key=getvalue)
except ValueError:
return None
def max_value(lineseq):
return extreme_value(max, lineseq)
def min_value(lineseq):
return extreme_value(min, lineseq)
def read_lines(fn, maker=Fileline._fromstr):
with open(fn, 'rb') as f:
return deque(maker(l, fn) for l in f)
def write_file(fn, lineseq):
lines = (l._asstr() for l in lineseq)
newlines = interleave(lines, '\n')
with open(fn, 'wb') as f:
f.writelines(newlines)
def write_output_file(fn, lineseq):
lines = ("{} {}".format(l.filename, l.value) for l in lineseq)
newlines = interleave(lines, "\n")
with open(fn, 'wb') as f:
f.writelines(newlines)
def filter_max_returning_min(fn, prefix):
lineseq = read_lines(fn)
maxvalue = max_value_with_prefix(lineseq, prefix)
filteredlineseq = deque(filter_lt_line(lineseq, maxvalue))
write_file(fn, filteredlineseq)
minline = min_value(filteredlineseq)
return minline
def main(fileglob, prefix, outputfile):
minline = None
for fn in glob.iglob(fileglob):
fileminline = filter_max_returning_min(fn, prefix)
minline = min_value([minline, fileminline])
write_output_file(outputfile, [minline])
def _worker(args):
return filter_max_returning_min(*args)
"""def multi_main(fileglob, prefix, outputfile, processes):
from multiprocessing import Pool
pool = Pool(processes=processes)
workerargs = ((fn, prefix) for fn in glob.iglob(fileglob))
minlines = pool.imap_unordered(_worker, workerargs, processes)
minline = min_value(minlines)
write_file(outputfile, [minline])"""
def main(fileglob, prefix, outputfile):
minlines = []
for fn in glob.iglob(fileglob):
minlines.append(filter_max_returning_min(fn, prefix))
write_output_file(outputfile, minlines)
main('C:\Python27\DataSet\*.txt', 'ENSG', 'output.txt')
答案 0 :(得分:0)
尝试打印出line和maxline的值
显然,其中一个或另一个被设置为无。最可能的是它是maxline,如果事实肯定是。因此,找出你为maxline传递None的原因。所以你应该真的看这里:
try:
lineseq = read_lines(fn)
maxvalue = max_value_with_prefix(lineseq, prefix)
filteredlineseq = deque(filter_lt_line(lineseq, maxvalue))
write_file(fn, filteredlineseq)
minline = min_value(filteredlineseq)
except:
print "%s is apparently empty" % fn
minline = None
finally:
return milline
fn是否可能是空文件?
正如Joran所说,你只需要添加一些异常处理(我在上面添加了它)