将数据写入磁盘时,python内存异常

时间:2018-07-23 06:24:30

标签: python list memory

我有一个大约8G的大文件需要排序。我将它分成若干块(每块3G),对每块分别排序,最后把排序后的数据写入磁盘。写入磁盘时,内存使用量立即飙升,甚至达到9G。我认为内存使用量不应该这么大,可能是用完的内存没有被释放。当我添加del语句和gc.collect()之后,内存使用量恢复正常,但耗时却变成了之前的2倍。有人知道为什么吗?

这是我的代码

```

class Sorter(object):
    """External merge sort driver: sorted block files on disk, then a k-way merge.

    NOTE(review): the class name is reconstructed from a machine-translated
    header line -- confirm it against the original module.  ``sort`` reads
    ``self.block_size`` and ``self.buffer_size``; they are assigned outside
    this excerpt.
    """

    def sort(self, filename='input', input_stream=None, out_filename=None, key=KEY_BY):
        """Sort the lines of ``filename`` (or ``input_stream``) by ``key``.

        The input is cut into sorted block files by ``FileSplitter.split``;
        the block files are then merged with ``heapq.merge`` and written to
        the output file.  The elapsed time of each phase is written to
        stderr, and the block files are removed at the end.

        Args:
            filename: Path of the file to sort.  Also the base name of the
                default output file, ``filename + '.out'``.
            input_stream: Optional already-open stream to read from instead
                of opening ``filename``.
            out_filename: Path of the output file; when falsy,
                ``filename + '.out'`` is used.
            key: Callable applied to each line to obtain its sort key.
        """
        sort_key = key

        def get_lines(fname):
            # Stream one block file lazily as (key, line) pairs so that
            # heapq.merge can order lines across blocks without loading them.
            # The with-block closes the file once the generator is exhausted.
            with open(fname, 'r') as block:
                for line in block:
                    yield (sort_key(line), line)

        st = time.time()
        splitter = FileSplitter(filename)
        if input_stream:
            splitter.split(self.block_size, input_stream=input_stream,
                           key=sort_key, buffer_size=self.buffer_size)
        else:
            splitter.split(self.block_size, key=sort_key,
                           buffer_size=self.buffer_size)
        # Same text as ``print >> sys.stderr, 'sort', elapsed`` but valid on
        # both Python 2 and Python 3.
        sys.stderr.write('sort %s\n' % (time.time() - st))

        st = time.time()
        streams = [get_lines(name) for name in splitter.get_block_filenames()]
        merged = heapq.merge(*streams)
        target = out_filename if out_filename else filename + '.out'
        with open(target, 'w') as out:
            # A plain loop, not map(): map() used for its side effects
            # collects one None per line into a throwaway list on Python 2,
            # and on Python 3 it is lazy and would write nothing at all.
            for _, line in merged:
                out.write(line)
        sys.stderr.write('merge %s\n' % (time.time() - st))
        splitter.cleanup()

class FileSplitter(object):
    """Cut a text file into blocks, sort each block, and store it on disk."""

    # Name template for block files; ``{0}`` receives the block number.
    BLOCK_FILENAME_FORMAT = 'block_{0}.dat'

    def __init__(self, filename):
        """Remember the input path and start with no block files written."""
        self.filename = filename
        self.block_filenames = []

    def write_block(self, data, block_number, buffer_size):
        """Write ``data`` to the block file numbered ``block_number``.

        Args:
            data: String holding the full contents of the block.
            block_number: Number substituted into ``BLOCK_FILENAME_FORMAT``.
            buffer_size: Passed to ``open`` as its buffering argument.

        The file name is appended to ``self.block_filenames`` only after the
        write has completed.
        """
        filename = self.BLOCK_FILENAME_FORMAT.format(block_number)
        # The with-block closes the handle even when write() raises.
        with open(filename, 'w', buffer_size) as block:
            block.write(data)
        self.block_filenames.append(filename)

    def get_block_filenames(self):
        """Return the list of block file names written so far."""
        return self.block_filenames

    def split(self, block_size, input_stream=None, key=None, buffer_size=0):
        """Read the input in blocks, sort each block, and write it out.

        Args:
            block_size: Size hint passed to ``readlines`` for each block.
            input_stream: Optional open stream to read from; when falsy,
                ``self.filename`` is opened (and closed again) here.
            key: Optional sort key applied to each line; ``None`` sorts the
                lines themselves.
            buffer_size: Buffering argument used when opening the input and
                forwarded to ``write_block``.
        """
        sort_key = key
        owns_stream = not input_stream
        if owns_stream:
            source = open(self.filename, 'r', buffer_size)
        else:
            source = input_stream
        try:
            i = 0
            while True:
                lines = source.readlines(block_size)
                if not lines:
                    break
                # key=None is equivalent to a plain sort().
                lines.sort(key=sort_key)
                data = ''.join(lines)
                # Drop the list before writing, and the joined string before
                # the next readlines(): otherwise the previous block stays
                # referenced while the next one is being allocated, which
                # inflates the peak memory to several blocks at once.
                del lines
                self.write_block(data, i, buffer_size=buffer_size)
                del data
                i += 1
        finally:
            # Only close what this method opened; a caller-supplied stream
            # stays under the caller's control.
            if owns_stream:
                source.close()

    def cleanup(self):
        """Delete every block file recorded in ``self.block_filenames``."""
        # Explicit loop: map() for side effects is lazy on Python 3 and
        # would remove nothing.
        for name in self.block_filenames:
            os.remove(name)

```

这是每行中内存的使用情况:

```

Line #    Mem usage    Increment   Line Contents

21 6817.344 MiB 18170.859 MiB       @profile
22                                 def write_block(self, data, block_number, buffer_size):
23 6817.344 MiB -1652.430 MiB           filename = self.BLOCK_FILENAME_FORMAT.format(block_number)
24 6817.344 MiB -1652.430 MiB           file = open(filename, 'w', buffer_size)
25 6817.344 MiB -1652.430 MiB           file.write(data)
26 6817.344 MiB -1652.430 MiB           file.close()
27 6817.344 MiB -1652.430 MiB           self.block_filenames.append(filename)

Filename: ../../isorter.py

Line #    Mem usage    Increment   Line Contents

32   21.504 MiB   21.504 MiB       @profile
33                                 def split(self, block_size, input_stream=None, key=None, buffer_size=0):
36   21.504 MiB    0.000 MiB           sort_key = key
37   21.504 MiB    0.000 MiB           if not input_stream:
38                                         file = open(self.filename, 'r', buffer_size)
39                                     else:
40   21.504 MiB    0.000 MiB               file = input_stream
41   21.504 MiB    0.000 MiB           i = 0

43 5164.914 MiB    0.000 MiB           while True:
44 5164.801 MiB 4979.527 MiB               lines = file.readlines(block_size)
45 5164.801 MiB -157.926 MiB               if lines == []:
46 5006.875 MiB -157.926 MiB                   break
47 5164.801 MiB    0.000 MiB               if sort_key is None:
48                                             lines.sort()
49                                         else:
50 5164.914 MiB    5.844 MiB                   lines.sort(key=sort_key)
51 5164.914 MiB 12020.230 MiB               self.write_block(''.join(lines), i, buffer_size=buffer_size)
52 5164.914 MiB    0.000 MiB               i += 1
53 5006.879 MiB    0.004 MiB           print

Filename: ../../isorter.py

Line #    Mem usage    Increment   Line Contents

56 5006.906 MiB 5006.906 MiB       @profile
57                                 def cleanup(self):
58 5006.906 MiB    0.000 MiB           map(lambda f: os.remove(f), self.block_filenames)

Filename: ../../isorter.py

Line #    Mem usage    Increment   Line Contents

61 5006.898 MiB 5006.898 MiB   @profile
62                             def merge(filelist):
63 5006.898 MiB    0.000 MiB       return heapq.merge(*filelist)

Filename: ../../isorter.py

Line #    Mem usage    Increment   Line Contents

71   21.340 MiB   21.340 MiB       @profile
72                                 def sort(self, filename='input', input_stream=None, out_filename=None, key=KEY_BY):
76   21.504 MiB    0.000 MiB           sort_key = key
77 5006.902 MiB    0.000 MiB           def getLines(fname):
78 5006.918 MiB -189.023 MiB               for _ in open(fname, 'r'):
79 5006.918 MiB -378.035 MiB                   yield (sort_key(_), _)
80   21.504 MiB    0.000 MiB           st = time.time()
83   21.504 MiB    0.000 MiB           splitter = FileSplitter(filename)
84   21.504 MiB    0.000 MiB           if input_stream:
85 5006.879 MiB 5006.879 MiB               splitter.split(self.block_size, input_stream=input_stream, key=sort_key, buffer_size=self.buffer_size)
86                                     else:
87                                         splitter.split(self.block_size, key=sort_key, buffer_size=self.buffer_size)
88 5006.898 MiB    0.020 MiB           print >> sys.stderr, 'sort', time.time() - st
90 5006.898 MiB    0.000 MiB           st = time.time()
91 5006.898 MiB    0.000 MiB           filelist = map(getLines, splitter.get_block_filenames())
94 5006.898 MiB 5006.898 MiB           r = merge(filelist)
96 5006.898 MiB    0.000 MiB           if not out_filename:
97                                         f = open(filename + '.out', 'w')
98                                     else:
99 5006.898 MiB    0.000 MiB               f = open(out_filename, 'w')

100 5006.918 MiB -378.051 MiB           map(lambda _: f.write(_[1]), r)
101 5006.906 MiB   -0.012 MiB           print >> sys.stderr, 'merge', time.time() - st
102 5006.906 MiB 5006.906 MiB           splitter.cleanup()

```

1 个答案:

答案 0 :(得分:0)

您可以使用numpy数组代替python的列表。 Numpy数组使用的内存少于列表。 例如,如果您的代码是:

x = [] 
for a in mylist: 
    x.append(a)

替换为:

import numpy as np
x = np.array([])
for a in mylist:
    x = np.append(x,a)