代码不能在庞大的数据集上运行

时间:2013-04-09 19:48:08

标签: python python-2.7

这是在我的系统上运行的代码。数据量很少时代码工作正常,但相同的代码无法处理大量的数据集或文本文件。是的,@ennikiller 是对的:有些文件是"空"的(意思是它们没有 Esgn 值,但有其他值)。我想处理这种情况:要么为 Esgn 指定零,要么只计算所有其他值的最小值并忽略 Esgn(在那个不包含 Esgn 的特定文件中)。

这是我的代码。这段代码在 10/20 个文本文件上工作正常,但是当我对 3000 个 txt 文件运行时,它会给出以下错误:

    import os.path
    import glob
    import re
    import itertools
    from collections import namedtuple, deque
    from operator import attrgetter

    # Matches one whole data line: a run of uppercase letters (prefix) immediately
    # followed by digits (suffix), then whitespace, then an unsigned run of digits
    # (value), with optional trailing whitespace before the end of the string.
    R_PREFIX_VALUE = re.compile(r'^(?P<prefix>[A-Z]+)(?P<suffix>\d+)\s+(?P<value>\d+)\s*$')

    # Key function that reads the ``value`` attribute of whatever it is given.
    getvalue  = attrgetter('value')

    def interleave(seq, val):
        """Yield each item of ``seq`` followed by ``val``.

        For example ``interleave('ab', '-')`` produces ``'a', '-', 'b', '-'``.
        An empty ``seq`` produces nothing.

        Written as a plain generator instead of going through
        ``itertools.izip``, which exists only on Python 2, so the helper
        behaves the same on Python 2 and Python 3.
        """
        for item in seq:
            yield item
            yield val

    class Fileline(namedtuple('Fileline', 'filename prefix suffix value')):
        """One parsed data line together with the file name it was read from.

        Fields:
            filename: the ``filename`` argument given to ``_fromstr`` (may be None).
            prefix: the uppercase letters that start the line.
            suffix: the digits that follow the prefix, kept as a string.
            value: the number at the end of the line, converted to ``int``.
        """

        # Without this every instance would also get its own ``__dict__``;
        # an empty ``__slots__`` keeps instances as small as a plain tuple.
        __slots__ = ()

        @classmethod
        def _fromstr(cls, s, filename=None, rematch=R_PREFIX_VALUE.match):
            """Build a ``Fileline`` from the text of one line.

            ``rematch`` is called on ``s`` and must return a match object
            exposing the named groups ``prefix``, ``suffix`` and ``value``,
            or a false value when the line does not match.

            Raises:
                ValueError: if ``rematch`` does not match ``s``.
            """
            m = rematch(s)
            if not m:
                raise ValueError('No valid line found in %r' % s)
            d = m.groupdict()
            # Only ``value`` is converted; prefix and suffix stay as matched text.
            d['value'] = int(d['value'])
            d['filename'] = filename
            return cls(**d)

        def _asstr(self):
            """Return the line as text: prefix and suffix joined, a space, the value."""
            return '{}{} {}'.format(self.prefix, self.suffix, self.value)

    def max_value_with_prefix(lineseq, prefix, getvalue=getvalue):
        """Return the highest-valued line among those whose prefix equals ``prefix``.

        The selection itself is delegated to ``max_value``.  The ``getvalue``
        parameter is accepted but not used by this function.
        """
        def has_wanted_prefix(candidate):
            return candidate.prefix == prefix

        return max_value(
            candidate for candidate in lineseq if has_wanted_prefix(candidate)
        )

    def filter_lt_line(lineseq, maxline):
        """Yield the lines of ``lineseq`` that survive filtering against ``maxline``.

        A line is dropped only when it has the same prefix as ``maxline`` and a
        strictly smaller value.  Lines with any other prefix, and lines whose
        value is equal to or greater than ``maxline.value``, are yielded in
        their original order.

        ``maxline`` may be ``None`` (no reference line was found, for example
        because no line carries the wanted prefix).  There is then nothing to
        compare against, so every line is yielded unchanged instead of failing
        with ``AttributeError`` on ``None``.
        """
        if maxline is None:
            for line in lineseq:
                yield line
            return
        # Read the reference fields once instead of on every iteration.
        ref_prefix, ref_value = maxline.prefix, maxline.value
        for line in lineseq:
            if line.prefix != ref_prefix or line.value >= ref_value:
                yield line

    def extreme_value(fn, lineseq, getvalue=getvalue):
        """Apply ``fn`` (such as ``min`` or ``max``) to the non-None items of ``lineseq``.

        Items are compared through ``getvalue``, passed to ``fn`` as ``key``.
        If the call raises ``ValueError`` the result is ``None``.
        """
        try:
            chosen = fn(
                (entry for entry in lineseq if entry is not None),
                key=getvalue,
            )
        except ValueError:
            chosen = None
        return chosen

    def max_value(lineseq):
        """Return what ``extreme_value`` selects from ``lineseq`` using ``max``."""
        return extreme_value(fn=max, lineseq=lineseq)

    def min_value(lineseq):
        """Return what ``extreme_value`` selects from ``lineseq`` using ``min``."""
        return extreme_value(fn=min, lineseq=lineseq)

    def read_lines(fn, maker=Fileline._fromstr):
        """Read file ``fn`` and return a ``deque`` with one parsed item per line.

        The file is opened in ``'rb'`` mode.  Each line is converted by calling
        ``maker(line, fn)``; any exception raised by ``maker`` propagates.
        """
        parsed = deque()
        with open(fn, 'rb') as handle:
            for raw in handle:
                parsed.append(maker(raw, fn))
        return parsed

    def write_file(fn, lineseq):
        """Write every item of ``lineseq`` to ``fn``, each followed by a newline.

        Each item is rendered with its ``_asstr()`` method.  ``fn`` is opened in
        ``'wb'`` mode, so existing content is replaced.
        """
        # Built lazily before the file is opened; consumed by writelines().
        rendered = interleave((entry._asstr() for entry in lineseq), '\n')
        with open(fn, 'wb') as out:
            out.writelines(rendered)

    def write_output_file(fn, lineseq):
        """Write one ``"<filename> <value>"`` row per item of ``lineseq`` to ``fn``.

        ``fn`` is opened in ``'wb'`` mode, so existing content is replaced.
        Each row is followed by a newline.

        ``None`` items are skipped: ``min_value`` yields ``None`` when it has
        nothing to pick from, and such placeholders have no ``filename`` or
        ``value`` to print.  Previously a single ``None`` aborted the whole
        report with ``AttributeError``.
        """
        lines = (
            "{} {}".format(l.filename, l.value)
            for l in lineseq
            if l is not None
        )
        newlines = interleave(lines, "\n")
        with open(fn, 'wb') as f:
            f.writelines(newlines)

    def filter_max_returning_min(fn, prefix):
        """Prune file ``fn`` in place and return its lowest-valued remaining line.

        Steps: parse the file with ``read_lines``; pick the highest-valued line
        carrying ``prefix``; keep what ``filter_lt_line`` lets through relative
        to that line; overwrite ``fn`` with the kept lines; return the result
        of ``min_value`` over the kept lines.
        """
        parsed = read_lines(fn)
        top_for_prefix = max_value_with_prefix(parsed, prefix)
        # Materialised because the kept lines are consumed twice below.
        kept = deque(filter_lt_line(parsed, top_for_prefix))
        write_file(fn, kept)
        return min_value(kept)

    def main(fileglob, prefix, outputfile):
        """Process every file matching ``fileglob`` and report the overall minimum.

        Each file goes through ``filter_max_returning_min``; the running
        minimum across files is kept with ``min_value`` and finally handed to
        ``write_output_file`` as a one-element list.

        NOTE: a later ``def main`` in this file rebinds the name, so this
        version is not the one that ends up being called.
        """
        overall = None
        for path in glob.iglob(fileglob):
            overall = min_value([overall, filter_max_returning_min(path, prefix)])
        write_output_file(outputfile, [overall])
    def _worker(args):
        """Call ``filter_max_returning_min`` with ``args`` unpacked positionally.

        Lets a caller that can only pass a single argument supply all of the
        function's arguments as one sequence; the function's result is returned.
        """
        outcome = filter_max_returning_min(*args)
        return outcome

    """def multi_main(fileglob, prefix, outputfile, processes):
        from multiprocessing import Pool
        pool = Pool(processes=processes)
        workerargs = ((fn, prefix) for fn in glob.iglob(fileglob))
        minlines = pool.imap_unordered(_worker, workerargs, processes)
        minline = min_value(minlines)
        write_file(outputfile, [minline])"""
    def main(fileglob, prefix, outputfile):
        """Process every file matching ``fileglob`` and write one result per file.

        The return value of ``filter_max_returning_min`` for each matched path
        is collected, in the order ``glob.iglob`` yields the paths, and the
        whole list is passed to ``write_output_file``.
        """
        per_file_minimums = [
            filter_max_returning_min(path, prefix)
            for path in glob.iglob(fileglob)
        ]
        write_output_file(outputfile, per_file_minimums)
    # Raw string: the backslashes of the Windows path are literal characters,
    # not the start of escape sequences.
    main(r'C:\Python27\DataSet\*.txt', 'ENSG', 'output.txt')

1 个答案:

答案 0 :(得分:0)

尝试打印出line和maxline的值

显然,其中之一被设置为 None。最可能的是 maxline,事实上几乎可以肯定就是它。因此,请找出为什么传给 maxline 的是 None。你应该重点看这里:

try:
    lineseq = read_lines(fn)
    maxvalue = max_value_with_prefix(lineseq, prefix)
    filteredlineseq = deque(filter_lt_line(lineseq, maxvalue))
    write_file(fn, filteredlineseq)
    minline = min_value(filteredlineseq)
except:
    print "%s is apparently empty" % fn
    minline = None
finally:
    return milline

fn是否可能是空文件?

正如Joran所说,你只需要添加一些异常处理(我在上面添加了它)