I want to split a large text file of about 50 GB into multiple files. The data in the file looks like this - [x = any digit between 0 and 9]
xxx.xxx.xxx.xxx
xxx.xxx.xxx.xxx
xxx.xxx.xxx.xxx
xxx.xxx.xxx.xxx
...............
...............
The file may contain a few billion lines, and I want each output file to hold 30-40 million lines. I have a rough idea of the steps involved, but I'd like to know how to do all of this in a way that is memory-efficient and fast. I've seen some examples on Stack Overflow, but none of them fully addressed what I actually need. I'd really appreciate any help.
Answer 0 (score: 19)
This working solution uses the split command available in the shell. Since the author has already accepted the possibility of a non-Python solution, please don't downvote.
First, I created a test file with 1000M entries (15 GB) using:

awk 'BEGIN{for (i = 0; i < 1000000000; i++) {print "123.123.123.123"} }' > t.txt

Then I ran split:

split --lines=30000000 --numeric-suffixes --suffix-length=2 t.txt t
Generating the set of 34 smaller files named t00 through t33 took 5 minutes. 33 of the files are 458 MB each, and the last one, t33, is 153 MB.
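As a quick sanity check (a minimal Python sketch; the t00-t33 file names match the output above), you can confirm that no lines were lost in the split:

import glob

total = 0
for part in sorted(glob.glob('t[0-9][0-9]')):
    with open(part) as f:
        total += sum(1 for _ in f)
print(total)  # should equal 1000000000 for the test file above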
Answer 1 (score: 13)
from itertools import chain, islice

def chunks(iterable, n):
    "chunks(ABCDE,2) => AB CD E"
    iterable = iter(iterable)
    while True:
        # store one line in memory, then chain it to an
        # iterator over the rest of the chunk
        try:
            first = next(iterable)
        except StopIteration:
            # PEP 479 (Python 3.7+): return instead of leaking StopIteration
            return
        yield chain([first], islice(iterable, n - 1))

l = 30*10**6
file_large = 'large_file.txt'
with open(file_large) as bigfile:
    for i, lines in enumerate(chunks(bigfile, l)):
        file_split = '{}.{}'.format(file_large, i)
        with open(file_split, 'w') as f:
            f.writelines(lines)
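To see that the generator behaves as its docstring claims, here is a quick check on a small iterable (assuming the chunks function defined above is in scope):

# quick check of the docstring's example: chunks(ABCDE, 2) => AB CD E
for chunk in chunks("ABCDE", 2):
    print(''.join(chunk))
# prints AB, CD and E on separate lines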
Answer 2 (score: 4)
I would use the Unix utility split if it is available and your only task is to split the file. Here is, however, a pure Python solution:
import contextlib

file_large = 'large_file.txt'
l = 30*10**6  # lines per split file
with contextlib.ExitStack() as stack:
    fd_in = stack.enter_context(open(file_large))
    for i, line in enumerate(fd_in):
        if not i % l:
            file_split = '{}.{}'.format(file_large, i//l)
            # ExitStack closes every file it entered when the block exits
            fd_out = stack.enter_context(open(file_split, 'w'))
        fd_out.write(line)  # the line already ends with '\n'
If all your lines contain four 3-digit numbers and you have multiple cores available, you can exploit file seeking and run several processes in parallel.
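For illustration, here is a minimal sketch of that idea, assuming every line is exactly 16 bytes ('xxx.xxx.xxx.xxx' plus a newline) so that part boundaries can be computed without scanning; the write_part helper and the file name are my own, hypothetical choices:

import multiprocessing
import os

LINE_BYTES = 16                # 15 chars + '\n'; only valid for fixed-width lines
LINES_PER_FILE = 30 * 10**6
PART_BYTES = LINES_PER_FILE * LINE_BYTES

def write_part(src, part_no):
    # each worker seeks straight to its own byte offset, so no state is shared
    with open(src, 'rb') as f_in, open('{}.{}'.format(src, part_no), 'wb') as f_out:
        f_in.seek(part_no * PART_BYTES)
        f_out.write(f_in.read(PART_BYTES))  # reads up to ~480 MB per part

if __name__ == '__main__':
    src = 'large_file.txt'
    parts = -(-os.path.getsize(src) // PART_BYTES)  # ceiling division
    with multiprocessing.Pool() as pool:
        pool.starmap(write_part, [(src, p) for p in range(parts)])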
Answer 3 (score: 0)
This class can solve your problem. I've tested it on both Linux and Windows, and it works perfectly on both. I also tested it each time with binary and text files of different sizes, and it works great. Enjoy :)
import os
import math

class FileSpliter:
    # If the file type is text, CHUNK_SIZE is a count of chars.
    # If the file type is binary, CHUNK_SIZE is a count of bytes.
    def __init__(self, InputFile, FileType="b", CHUNK_SIZE=524288, OutFile="outFile"):
        self.CHUNK_SIZE = CHUNK_SIZE  # bytes or chars
        self.InputFile = InputFile
        self.FileType = FileType  # b: binary, t: text
        self.OutFile = OutFile
        self.FileSize = 0
        self.Parts = None
        self.CurrentPartNo = 0
        self.Progress = 0.0

    def Prepare(self):
        if not (os.path.isfile(self.InputFile) and os.path.getsize(self.InputFile) > 0):
            print("ERROR: The file does not exist or is empty!")
            return False
        self.FileSize = os.path.getsize(self.InputFile)
        if self.CHUNK_SIZE >= self.FileSize:
            self.Parts = 1
        else:
            self.Parts = math.ceil(self.FileSize / self.CHUNK_SIZE)
        return True

    def Split(self):
        if self.FileSize == 0 or self.Parts is None:
            print("ERROR: File is not prepared for split!")
            return False
        with open(self.InputFile, "r" + self.FileType) as f:
            while True:
                if self.FileType == "b":
                    buf = bytearray(f.read(self.CHUNK_SIZE))
                elif self.FileType == "t":
                    buf = f.read(self.CHUNK_SIZE)
                else:
                    print("ERROR: File type error!")
                    return False  # bail out instead of using an undefined buf
                if not buf:
                    # we've read the entire file, so we're done
                    break
                of = self.OutFile + str(self.CurrentPartNo)
                outFile = open(of, "w" + self.FileType)
                outFile.write(buf)
                outFile.close()
                self.CurrentPartNo += 1
                self.ProgressBar()
        return True

    def Rebuild(self):
        self.CurrentPartNo = 0
        if self.Parts is None:
            return False
        with open(self.OutFile, "w" + self.FileType) as f:
            while self.CurrentPartNo < self.Parts:
                If = self.OutFile + str(self.CurrentPartNo)
                if not (os.path.isfile(If) and os.path.getsize(If) > 0):
                    print("ERROR: The file [" + If + "] does not exist or is empty!")
                    return False
                InputFile = open(If, "r" + self.FileType)
                buf = InputFile.read()
                if not buf:
                    # we've read the entire file, so we're done
                    break
                f.write(buf)
                InputFile.close()
                os.remove(If)
                self.CurrentPartNo += 1
                self.ProgressBar()
        return True

    def ProgressBar(self, BarLength=20, ProgressIcon="#", BarIcon="-"):
        try:
            # You can't have a progress bar with zero or negative length.
            if BarLength < 1:
                BarLength = 20
            # The Status variable moves output to the next line once complete.
            Status = ""
            # Calculate progress between 0 and 1 for the percentage.
            self.Progress = float(self.CurrentPartNo) / float(self.Parts)
            # Clamp the progress at 100% when finished.
            if self.Progress >= 1.:
                self.Progress = 1
                Status = "\r\n"  # go to the next line
            # Calculate how many places should be filled.
            Block = int(round(BarLength * self.Progress))
            # Show the bar.
            Bar = "\r[{}] {:.0f}% {}".format(ProgressIcon * Block + BarIcon * (BarLength - Block), round(self.Progress * 100, 0), Status)
            print(Bar, end="")
        except Exception:
            print("\rERROR")

def main():
    fp = FileSpliter(InputFile="inFile", FileType="b")  # , CHUNK_SIZE=300000)
    if fp.Prepare():
        # Splitting ...
        print("Splitting ...")
        sr = fp.Split()
        if sr:
            print("The file was split successfully.")
        print()
        # Rebuilding ...
        print("Rebuilding ...")
        rr = fp.Rebuild()
        if rr:
            print("The file was rebuilt successfully.")

if __name__ == "__main__":
    main()
Answer 4 (score: 0)
I wrote a Python 3 solution that I usually use to split files that are MB in size. However, I haven't tried it on GB-sized files yet.
TextFileSplitter.py
import traceback

# get the name of the file to be read
fileToRead = input("Enter file name : ")
# max lines you want to write to a single file
fileLineCount = 2000
lineCount = 0
fileCount = 1
fileReader = None

try:
    print('Start splitting...')
    # open the input file
    fileReader = open(fileToRead)
    line = fileReader.readline()
    fileWriter = open(str(fileCount) + ".txt", "a")
    while line != '':  # an empty string means EOF
        if lineCount == 0:
            # create the next file in append mode
            fileWriter = open(str(fileCount) + ".txt", "a")
            # increment the file count; use it for the next file name
            fileCount += 1
        # write the line (readline() keeps the trailing '\n')
        fileWriter.write(line)
        lineCount += 1
        if lineCount == fileLineCount:
            lineCount = 0
            fileWriter.close()
        # read the next line
        line = fileReader.readline()
    fileWriter.close()
except Exception as e:
    # print the exception, if any
    print(e)
    traceback.print_exc()
finally:
    # close the file reader, if it was opened
    if fileReader is not None:
        fileReader.close()
The output looks like this: files of fileLineCount (i.e. 2000) lines each, created in the same directory as the script :p
1.txt
2.txt
3.txt
.
.
.
.
n.txt