我想将一个巨大的文件拆分成许多文件,并在所有拆分文件中包含标题。使用python

时间:2016-11-17 21:36:53

标签: python

"SURNAME","GIVENNAME","MIDDLENAME","UPIN","NAME","CODE"
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770"
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770"
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770"
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770"
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770"
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770"
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770"
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770"

假设这就是大文件的格式。我想把它拆分成多个指定大小的文件,并且每个拆分出的文件都要包含标题行("SURNAME","GIVENNAME","MIDDLENAME","UPIN","NAME","CODE")。谢谢。

import os
import sys

def getfilesize(filename):
    """Return the size of ``filename`` in bytes.

    The size is taken as the offset reported after seeking to the end of
    the file. It is also echoed to stdout as a progress message.
    """
    with open(filename, "rb") as stream:
        stream.seek(0, 2)  # whence=2: offset is relative to the end of file
        end_offset = stream.tell()
    print("getfilesize: size: %s" % end_offset)
    return end_offset

def splitfile(filename, splitsize, readlimit=1000000):
    """Split ``filename`` into consecutive parts of at most ``splitsize`` bytes.

    Parts are written next to the original and named ``<stem>_<n><ext>``,
    numbered from 1 (``data.csv`` -> ``data_1.csv``, ``data_2.csv``, ...).
    Every part except possibly the last holds exactly ``splitsize`` bytes,
    and concatenating the parts in order reproduces the input. Existing part
    files with the same names are overwritten.

    The split is purely byte-based: a part may begin or end in the middle of
    a line, and no header line is repeated in the parts.

    Args:
        filename: Path of the file to split.
        splitsize: Maximum size of each part, in bytes.
        readlimit: Largest number of bytes read in a single call; bounds the
            memory used while copying.

    Returns:
        None. If ``filename`` is not an existing file a message is printed
        and nothing is written. An empty input produces no parts.

    Raises:
        ValueError: If ``splitsize`` or ``readlimit`` is not positive.
    """
    if not os.path.isfile(filename):
        print("No such file as: \"%s\"" % filename)
        return

    splitsize = int(splitsize)
    readlimit = int(readlimit)
    if splitsize <= 0:
        raise ValueError("splitsize must be a positive number of bytes")
    if readlimit <= 0:
        raise ValueError("readlimit must be a positive number of bytes")

    # splitext only looks at the last component, so names without an
    # extension, with several dots, or with a "./" prefix are handled.
    stem, ext = os.path.splitext(filename)

    # Ceiling division: a trailing partial part counts, but an exact
    # multiple of splitsize must not produce an extra empty part.
    n_parts = -(-os.path.getsize(filename) // splitsize)
    print("splitfile: No of splits required: %s" % str(n_parts))

    with open(filename, "rb") as fr:
        for counter in range(1, n_parts + 1):
            part_name = "{stem}_{id}{ext}".format(stem=stem, id=counter, ext=ext)
            remaining = splitsize
            # "wb" truncates any stale part left over from a previous run.
            with open(part_name, "wb") as fw:
                while remaining > 0:
                    # Never read past the part boundary, so the next part
                    # starts exactly where this one ends.
                    data = fr.read(min(readlimit, remaining))
                    if not data:
                        break
                    fw.write(data)
                    remaining -= len(data)

if __name__ == "__main__":
    # Command-line entry point: expects a file name and a split size in kB.
    if len(sys.argv) >= 3:
        target = sys.argv[1]
        # The size is given in kB on the command line; splitfile wants bytes.
        size_in_bytes = 1000 * int(sys.argv[2])
        splitfile(target, size_in_bytes)
    else:
        print("Filename or splitsize not provided: Usage:     filesplit.py filename splitsizeinkb ")

这段代码可以正常拆分文件,但拆分出的文件里没有标题行(Headers)。抱歉,我是 Stack Overflow 的新手。

2 个答案:

答案 0 :(得分:2)

我使用pandas将大文件拆分成较小的文件

import pandas as pd

infile = 'path/to/file.csv'  # path to your file

# Read the input in pieces of `chunksize` rows so the whole file never has
# to fit in memory; each piece becomes one numbered output file.
for n, chunk in enumerate(pd.read_csv(infile, sep=',', chunksize=1000000)):
    oPath = 'chunk_' + str(n) + '.csv'
    # header=True writes the column names at the top of every output file;
    # the separator matches the input so the pieces stay valid CSV.
    chunk.to_csv(oPath, sep=',', index=False, header=True)

chunksize 表示每个输出文件中包含的数据行数。

答案 1 :(得分:0)

这应该这样做

import os

maxlines = 1000  # how many data lines each new file should have
infilepath = 'path/to/file'

# Output files are written next to the input as <name><n><ext>, n = 0, 1, ...
# splitext (unlike rsplit('.', 1)) also copes with names that have no
# extension; `ext` keeps its leading dot, or is empty.
dirpath = os.path.dirname(infilepath)
fname, ext = os.path.splitext(os.path.basename(infilepath))

with open(infilepath) as infile:
    # The first line is the header; it is repeated at the top of every part.
    header = infile.readline()
    outfile = None
    try:
        for i, line in enumerate(infile):
            if i % maxlines == 0:
                # Start of a new part: finish the previous one first.
                if outfile is not None:
                    outfile.close()
                outfile = open(os.path.join(dirpath, "{}{}{}".format(fname, i // maxlines, ext)), 'w')
                outfile.write(header)
            outfile.write(line)
    finally:
        # Close the last part even if reading or writing failed. An input
        # with no data lines never opens a part, so nothing is written.
        if outfile is not None:
            outfile.close()