我有一个关于8G的大文件要排序。我将其分成块(每个3G),然后进行排序,最后将排序后的数据写入磁盘,当写入磁盘时,内存使用量立即增加,甚至达到9G。我认为内存使用量不应该太大,使用后内存可能没有释放,当我添加del行和gc.collect()时,内存使用量就变得正常了,但是时间成本却是之前的2倍,有人知道为什么吗?
这是我的代码
```
分类器(对象):
def sort(self, filename='input', inp..ut_stream=None, out_filename=None, key=KEY_BY):
sort_key = key
def getLines(fname):
for _ in open(fname, 'r'):
yield (sort_key(_), _)
st = time.time()
splitter = FileSplitter(filename)
if input_stream:
splitter.split(self.block_size, input_stream=input_stream, key=sort_key, buffer_size=self.buffer_size)
else:
splitter.split(self.block_size, key=sort_key, buffer_size=self.buffer_size)
# splitter.split(self.block_size, sort_key, self.buffer_size)
print >> sys.stderr, 'sort', time.time() - st
st = time.time()
filelist = map(getLines, splitter.get_block_filenames())
r = heapq.merge(*filelist)
if not out_filename:
f = open(filename + '.out', 'w')
else:
f = open(out_filename, 'w')
map(lambda _: f.write(_[1]), r)
print >> sys.stderr, 'merge', time.time() - st
splitter.cleanup()
FileSplitter类(对象):
BLOCK_FILENAME_FORMAT = 'block_{0}.dat'
def __init__(self, filename):
self.filename = filename
self.block_filenames = []
def write_block(self, data, block_number, buffer_size):
filename = self.BLOCK_FILENAME_FORMAT.format(block_number)
file = open(filename, 'w', buffer_size)
file.write(data)
file.close()
self.block_filenames.append(filename)
def get_block_filenames(self):
return self.block_filenames
def split(self, block_size, input_stream=None, key=None, buffer_size=0):
sort_key = key
if not input_stream:
file = open(self.filename, 'r', buffer_size)
else:
file = input_stream
i = 0
while True:
lines = file.readlines(block_size)
if lines == []:
break
if sort_key is None:
lines.sort()
else:
lines.sort(key=sort_key)
self.write_block(''.join(lines), i, buffer_size=buffer_size)
i += 1
def cleanup(self):
map(lambda f: os.remove(f), self.block_filenames)
```
这是每行中内存的使用情况:
```
21 6817.344 MiB 18170.859 MiB @profile
22 def write_block(self, data, block_number, buffer_size):
23 6817.344 MiB -1652.430 MiB filename = self.BLOCK_FILENAME_FORMAT.format(block_number)
24 6817.344 MiB -1652.430 MiB file = open(filename, 'w', buffer_size)
25 6817.344 MiB -1652.430 MiB file.write(data)
26 6817.344 MiB -1652.430 MiB file.close()
27 6817.344 MiB -1652.430 MiB self.block_filenames.append(filename)
文件名:../../ isorter.py
32 21.504 MiB 21.504 MiB @profile
33 def split(self, block_size, input_stream=None, key=None, buffer_size=0):
36 21.504 MiB 0.000 MiB sort_key = key
37 21.504 MiB 0.000 MiB if not input_stream:
38 file = open(self.filename, 'r', buffer_size)
39 else:
40 21.504 MiB 0.000 MiB file = input_stream
41 21.504 MiB 0.000 MiB i = 0
43 5164.914 MiB 0.000 MiB while True:
44 5164.801 MiB 4979.527 MiB lines = file.readlines(block_size)
45 5164.801 MiB -157.926 MiB if lines == []:
46 5006.875 MiB -157.926 MiB break
47 5164.801 MiB 0.000 MiB if sort_key is None:
48 lines.sort()
49 else:
50 5164.914 MiB 5.844 MiB lines.sort(key=sort_key)
51 5164.914 MiB 12020.230 MiB self.write_block(''.join(lines), i, buffer_size=buffer_size)
52 5164.914 MiB 0.000 MiB i += 1
53 5006.879 MiB 0.004 MiB print
文件名:../../ isorter.py
56 5006.906 MiB 5006.906 MiB @profile
57 def cleanup(self):
58 5006.906 MiB 0.000 MiB map(lambda f: os.remove(f), self.block_filenames)
文件名:../../ isorter.py
61 5006.898 MiB 5006.898 MiB @profile
62 def merge(filelist):
63 5006.898 MiB 0.000 MiB return heapq.merge(*filelist)
文件名:../../ isorter.py
71 21.340 MiB 21.340 MiB @profile
72 def sort(self, filename='input', input_stream=None, out_filename=None, key=KEY_BY):
76 21.504 MiB 0.000 MiB sort_key = key
77 5006.902 MiB 0.000 MiB def getLines(fname):
78 5006.918 MiB -189.023 MiB for _ in open(fname, 'r'):
79 5006.918 MiB -378.035 MiB yield (sort_key(_), _)
80 21.504 MiB 0.000 MiB st = time.time()
83 21.504 MiB 0.000 MiB splitter = FileSplitter(filename)
84 21.504 MiB 0.000 MiB if input_stream:
85 5006.879 MiB 5006.879 MiB splitter.split(self.block_size, input_stream=input_stream, key=sort_key, buffer_size=self.buffer_size)
86 else:
87 splitter.split(self.block_size, key=sort_key, buffer_size=self.buffer_size)
88 5006.898 MiB 0.020 MiB print >> sys.stderr, 'sort', time.time() - st
90 5006.898 MiB 0.000 MiB st = time.time()
91 5006.898 MiB 0.000 MiB filelist = map(getLines, splitter.get_block_filenames())
94 5006.898 MiB 5006.898 MiB r = merge(filelist)
96 5006.898 MiB 0.000 MiB if not out_filename:
97 f = open(filename + '.out', 'w')
98 else:
99 5006.898 MiB 0.000 MiB f = open(out_filename, 'w')
100 5006.918 MiB -378.051 MiB映射(lambda :f.write( [1]),r) 101 5006.906 MiB -0.012 MiB打印>> sys.stderr,'merge',time.time()-st 102 5006.906 MiB 5006.906 MiB splitter.cleanup()
```
答案 0 :(得分:0)
您可以使用numpy数组代替python的列表。 Numpy数组使用的内存少于列表。 例如,如果您的代码是:
x = []
for a in mylist:
x.append(a)
替换为:
import numpy as np
x = np.array([])
for a in mylist:
x = np.append(x,a)