快速csv文件拆分

Question

由于出现内存错误，我必须拆分我的 csv 文件。我做了一些研究，找到了 Stack Overflow 用户 Aziz Alto 给出的方法。这是他的代码。

# Load the whole CSV into memory as a list of lines.
with open('#', 'r') as src:
    csvfile = src.readlines()
filename = 1
for i in range(len(csvfile)):
    # Every 10,000,000th line index starts a new, sequentially numbered file.
    if i % 10000000 == 0:
        # Each file receives a raw slice of lines, so only the first file
        # (which starts at line 0) contains the header row.
        with open(str(filename) + '.csv', 'w+') as out:
            out.writelines(csvfile[i:i+10000000])
        filename += 1

它运行良好，但对于第二个文件，代码没有添加标题，这对我来说非常重要。我的问题是如何为第二个文件添加标题？

Answer 1

从第二个文件到最后一个文件，您都必须先写入原始文件的第一行（即包含标题的那一行）：

# Read every line of the source file into memory in one go.
with open('#', 'r') as source:
    csvfile = source.readlines()

linesPerFile = 1000000
# Visit only the first line index of each output file (one step per file,
# rather than one step per line), numbering the output files from 1.
chunk_starts = range(0, len(csvfile), linesPerFile)
for filename, start in enumerate(chunk_starts, start=1):
    with open(str(filename) + '.csv', 'w+') as out:
        # The first file already begins with the header (line 0 of the
        # source); every later file needs the header repeated at its top.
        if filename != 1:
            out.write(csvfile[0])
        out.writelines(csvfile[start:start + linesPerFile])

Answer 2

import pandas as pd

# Stream the source file in fixed-size row chunks instead of loading it whole.
rows = pd.read_csv("csvfile.csv", chunksize=5000000)
for i, chunk in enumerate(rows):
    # index=False stops pandas from writing its row index as an extra,
    # unnamed first column, so each part keeps the same columns as the
    # source file. The header row is still written by default.
    chunk.to_csv('out{}.csv'.format(i), index=False)  # i is the chunk number

chunksize 用于指定每个块包含多少行；请注意，在 Excel 中最多只能有 1,048,576 行。上面的代码会把每 5000000 行保存为一个文件，并且每个文件都带有标题。

希望这对您有所帮助！

Answer 3

快速csv文件拆分

如果文件很大，并且必须尝试不同的分区（例如，寻找最佳分割方法），则上述解决方案太慢了。

解决此问题的另一种方法（也是非常快速的一种方法）是通过记录号创建索引文件。创建一个 6867839行和9 Gb 的csv文件的索引文件大约需要六分钟，而joblib则需要另外2分钟才能将其存储在磁盘上。

如果要处理3 Gb或更大的大型文件，此方法尤其令人印象深刻。

以下是用于创建索引文件的代码：

# Usage:

# creaidx.py filename.csv

# indexes a csv file by record number. This can be used to
# access any record directly or to split a file without the 
# need of reading it all. The index file is joblib-stored as
# filename.index

# filename.csv is the file to create index for

import os,sys,joblib

# Block size in bytes: the csv is read BLKSIZE bytes at a time, and each index
# entry stores a block number plus a byte offset inside that block.
BLKSIZE=512

def checkopen(s,m='r',bz=None):
    """Open ``s`` if it exists, otherwise return ``None``.

    Args:
        s: Path of the file to open.
        m: Mode string passed to ``open``.
        bz: Optional buffering argument for ``open``; left out when ``None``.

    Returns:
        The open file object, or ``None`` when ``os.access(s, os.F_OK)``
        reports that the path does not exist. Closing the returned file is
        the caller's responsibility.
    """
    if not os.access(s,os.F_OK):
        return None
    if bz is None:
        return open(s,m)
    return open(s,m,bz)

def get_blk():
    """Index every record that starts inside the current block ``buff``.

    Appends one ``[record number, block number, offset in block]`` entry to
    the module-level ``idx`` for each record start found, and advances the
    record counter ``ix``. Records are assumed to end with a two-byte
    terminator that begins with a carriage return (CR LF), so a record
    starts two bytes after each CR.

    State is carried between calls in the globals ``ix`` and ``off``;
    ``off`` remembers where the previous block's last CR was found so that a
    terminator touching the block boundary is resolved on the next call.
    """
    global ix,off,blk,buff
    # Work out where the first record start of this block is.
    if ix==0:
        # Very first call: record 0 starts at offset 0 (off is still 0).
        n=0
    elif buff[:1]==b'\r':
        # The block begins with a terminator. A slice is used because
        # indexing bytes yields an int in Python 3 (never equal to b'\r') and
        # raises IndexError on the empty block read at end of file.
        n=2
        off=0
    elif off==BLKSIZE-2:
        # The previous block ended exactly with the two terminator bytes, so
        # a record starts at the first byte of this block.
        n=0
        off=0
    elif off==BLKSIZE-1:
        # The CR was the last byte of the previous block; its second
        # terminator byte is byte 0 here, so the record starts at offset 1.
        n=0
        off=1
    else:
        n=2
        off=buff.find(b'\r')
    # A CR found in the last two bytes of the block is left for the next
    # call, which handles it through the off==BLKSIZE-2/-1 cases above.
    while 0<=off<BLKSIZE-2:
        idx.append([ix,blk,off+n])
        print(ix,end='\r')      # progress display on a single line
        n=2
        ix+=1
        off=buff.find(b'\r',off+2)

def crea_idx():
    """Build the record index for the whole file ``f``.

    Reads the module-level file ``f`` one BLKSIZE-byte block at a time and
    runs get_blk() on each block, keeping the globals ``buff`` (current
    block) and ``blk`` (its number) up to date for it. The final, shorter
    block is processed as well. Afterwards the offset of the last index
    entry is overwritten with -1 to mark the end of the index.
    """
    global buff,blk
    buff=f.read(BLKSIZE)
    while True:
        if len(buff)!=BLKSIZE:
            break               # short (or empty) read: last block reached
        get_blk()
        buff=f.read(BLKSIZE)
        blk+=1
    get_blk()
    last_entry=idx[-1]
    last_entry[2]=-1

if len(sys.argv)==1:
    sys.exit("Need to provide a csv filename!")
# Module-level state shared with get_blk() and crea_idx() through `global`.
ix=0
blk=0
off=0
idx=[]
buff=b'0'
s=sys.argv[1]
f=checkopen(s,'rb')
if f is None:
    # checkopen() returns None for a missing file; stop with a clear message
    # instead of failing later on f.read().
    sys.exit(s+' does not exist.')
idxfile=s.replace('.csv','.index')
try:
    # (Re)build the index when there is none yet, or when the existing one is
    # older than the csv file. The index file is written only by joblib, after
    # indexing has finished, so a failed run leaves no empty index behind.
    if (not os.access(idxfile,os.F_OK)
            or os.path.getctime(idxfile)<os.path.getctime(s)):
        crea_idx()
        joblib.dump(idx,idxfile)
finally:
    f.close()

让我们使用一个玩具示例：

strings,numbers,colors
string1,1,blue
string2,2,red
string3,3,green
string4,4,yellow

索引文件将是：

   [[0, 0, 0], 
    [1, 0, 24], 
    [2, 0, 40], 
    [3, 0, 55], 
    [4, 0, 72], 
    [5, 0, -1]]

请注意最后一个索引元素上的-1，以指示在顺序访问的情况下索引文件的结尾。您可以使用这样的工具来访问csv文件的任何单独的行：

def get_rec(n=1,binary=False):
    """Return record ``n`` of the indexed csv file, without its terminator.

    Uses the module-level index ``idx``, the open binary file ``f`` and
    ``BLKSIZE`` to seek straight to the record and read up to the next
    carriage return.

    Args:
        n: Record number as stored in the index (0 is the header line).
            A negative value is treated as 0.
        binary: When true, return ``bytes`` with CR LF appended; otherwise
            return the decoded ``str`` without a line terminator.

    Returns:
        The record, or ``''`` (a ``str`` even when ``binary`` is true) when
        the index is empty or ``n`` refers to the end-of-index entry whose
        offset is -1.

    Raises:
        IndexError: If ``n`` is beyond the last index entry.
    """
    if len(idx)==0:
        return ''
    entry=idx[0] if n<0 else idx[n]
    if entry[2]==-1:
        return ''
    f.seek(entry[1]*BLKSIZE+entry[2])
    # Collect raw bytes and decode once at the end: decoding each chunk on
    # its own fails when a multi-byte character straddles two chunks.
    chunks=[]
    while True:
        buff=f.read(BLKSIZE)
        x=buff.find(b'\r')
        if x!=-1:
            chunks.append(buff[:x])
            break
        if not buff:
            # End of file reached without a terminator: return what was
            # read instead of looping forever on empty reads.
            break
        chunks.append(buff)
    rec=b''.join(chunks)
    return rec+b'\r\n' if binary else rec.decode()

索引记录的第一个字段显然是不必要的，保留它只是为了便于调试。附带说明一下，如果您把该字段替换为 csv 记录中的某个字段，并按该字段对索引文件进行排序，那么当您通过索引来访问 csv 文件时，读到的记录就是按该字段排好序的。

现在，一旦创建了索引文件，您只需调用以下程序，并在命令行参数中依次给出文件名（已创建索引的文件）和一个介于 1 和 100 之间的数字，该数字表示文件分割位置的百分比：

import sys
import time

import joblib
from common import Drv,checkopen

# NOTE: get_rec() must be defined in this same module, because it reads the
# module-level f, idx and BLKSIZE assigned below.

start_time = time.time()
BLKSIZE=512       # must match the block size the index was built with
WSIZE=1048576     # pow(2,20): copy in 1 MiB chunks for faster reading/writing
ix=0
blk=0
off=0
idx=[]
buff=b'0'
if len(sys.argv)<3:
    sys.exit('Argument missing!')
s=Drv+sys.argv[1]
if sys.argv[2].isnumeric():
    pct=int(sys.argv[2])/100
else:
    sys.exit('Bad percentage: '+sys.argv[2])
if pct>1:
    # More than 100% would index past the end of the index.
    sys.exit('Bad percentage: '+sys.argv[2])

f=checkopen(s,'rb')
if f is None:
    sys.exit(s+' does not exist.')
idxfile=s.replace('.csv','.index')
probe=checkopen(idxfile)
if probe:
    probe.close()   # only used as an existence test; joblib opens the file itself
    print('Loading index...')
    idx=joblib.load(idxfile)
    print('Done loading index.')
else:
    sys.exit(idxfile+' does not exist.')


def _rec_start(k):
    """Return the absolute byte offset at which record k starts.

    The end-of-index entry (offset -1) maps to the end of the file.
    """
    if idx[k][2]==-1:
        f.seek(0,2)
        return f.tell()
    return idx[k][1]*BLKSIZE+idx[k][2]


head=get_rec(0,True)
if not head:
    sys.exit(s+' has no header record.')
# n data records go to part 1; record n+1 is the first record of part 2.
n=int(pct*(len(idx)-2))
first=_rec_start(1)
split=_rec_start(n+1)
# Exact number of bytes between the first data record and the split point,
# so that part 1 ends with the complete terminator of its last record.
off=split-first
num=off//WSIZE
res=off%WSIZE
sout=s.replace('.csv','.part1.csv')
i=0
with open(sout,'wb') as g:
    g.write(head)
    f.seek(first)
    for x in range(num):
        print(i,end='\r')
        i+=1
        buff=f.read(WSIZE)
        g.write(buff)
    buff=f.read(res)
    g.write(buff)
print()
i=0
sout=s.replace('.csv','.part2.csv')
with open(sout,'wb') as g:
    g.write(head)
    f.seek(split)
    # Copy everything from the split point to the end of the file.
    buff=f.read(WSIZE)
    while len(buff)==WSIZE:
        g.write(buff)
        print(i,end='\r')
        i+=1
        buff=f.read(WSIZE)
    g.write(buff)
f.close()

end_time = time.time()

使用1048576字节的块创建文件。您可以使用该数字来加快文件创建速度，或将其调整为内存资源较少的计算机。

该程序只会把文件分割成两个分区，每个分区都带有原始文件的标题。修改代码以便把文件分成两个以上的分区并不太困难。

最后，给出一组数据作为参考：对一个 6867839 行、9 GB 的 csv 文件按 50％ 进行拆分时，创建索引文件大约花了 6 分钟，joblib 将索引存储到磁盘上又花了 2 分钟，拆分文件本身花了 3 分钟。

如何在python中拆分csv文件？

3 个答案:

快速csv文件拆分