I have a csv file of roughly 5000 rows in Python, and I want to split it into five files.
I wrote some code for it, but it does not work correctly.
import codecs
import csv

NO_OF_LINES_PER_FILE = 1000

def again(count_file_header, count):
    f3 = open('write_' + count_file_header + '.csv', 'at')
    with open('import_1458922827.csv', 'rb') as csvfile:
        candidate_info_reader = csv.reader(csvfile, delimiter=',', quoting=csv.QUOTE_ALL)
        co = 0
        for row in candidate_info_reader:
            co = co + 1
            count = count + 1
            if count <= count:
                pass
            elif count >= NO_OF_LINES_PER_FILE:
                count_file_header = count + NO_OF_LINES_PER_FILE
                again(count_file_header, count)
            else:
                writer = csv.writer(f3, delimiter=',', lineterminator='\n', quoting=csv.QUOTE_ALL)
                writer.writerow(row)

def read_write():
    f3 = open('write_' + NO_OF_LINES_PER_FILE + '.csv', 'at')
    with open('import_1458922827.csv', 'rb') as csvfile:
        candidate_info_reader = csv.reader(csvfile, delimiter=',', quoting=csv.QUOTE_ALL)
        count = 0
        for row in candidate_info_reader:
            count = count + 1
            if count >= NO_OF_LINES_PER_FILE:
                count_file_header = count + NO_OF_LINES_PER_FILE
                again(count_file_header, count)
            else:
                writer = csv.writer(f3, delimiter=',', lineterminator='\n', quoting=csv.QUOTE_ALL)
                writer.writerow(row)

read_write()
The code above creates many files with empty content.
How can I split one file into five csv files?
Answer 0 (score: 13)
In Python

Use readlines() and writelines() to do it. Here's an example:
>>> csvfile = open('import_1458922827.csv', 'r').readlines()
>>> filename = 1
>>> for i in range(len(csvfile)):
...     if i % 1000 == 0:
...         open(str(filename) + '.csv', 'w+').writelines(csvfile[i:i+1000])
...         filename += 1
The output file names will be numbered 1.csv, 2.csv, and so on.
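If you want exactly five output files rather than fixed 1000-line chunks, a minimal sketch in the same readlines/writelines style (assuming the question's input file name; the chunk size is simply derived from the line count) could be:

# Read all lines once; fine for a file of ~5000 lines.
lines = open('import_1458922827.csv', 'r').readlines()
chunk_size = (len(lines) + 4) // 5  # ceiling division: lines per output file

for part in range(5):
    chunk = lines[part * chunk_size:(part + 1) * chunk_size]
    if chunk:  # guard against an empty trailing chunk
        open(str(part + 1) + '.csv', 'w').writelines(chunk)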
From the terminal

FYI, you can do this from the command line using split, like so:

$ split -l 1000 import_1458922827.csv

(By default, split names its output files xaa, xab, and so on, without a .csv extension.)
Answer 1 (score: 10)
I suggest you don't reinvent the wheel. There is an existing solution. Source: here
import os

def split(filehandler, delimiter=',', row_limit=1000,
          output_name_template='output_%s.csv', output_path='.', keep_headers=True):
    import csv
    reader = csv.reader(filehandler, delimiter=delimiter)
    current_piece = 1
    current_out_path = os.path.join(
        output_path,
        output_name_template % current_piece
    )
    current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=delimiter)
    current_limit = row_limit
    if keep_headers:
        headers = reader.next()
        current_out_writer.writerow(headers)
    for i, row in enumerate(reader):
        if i + 1 > current_limit:
            current_piece += 1
            current_limit = row_limit * current_piece
            current_out_path = os.path.join(
                output_path,
                output_name_template % current_piece
            )
            current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=delimiter)
            if keep_headers:
                current_out_writer.writerow(headers)
        current_out_writer.writerow(row)
Use it like:

split(open('/your/path/input.csv', 'r'));
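Note that reader.next() is Python 2 syntax; if you run this snippet under Python 3, the equivalent call uses the built-in instead:

headers = next(reader)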
Answer 2 (score: 2)
A Python 3-friendly solution:
import csv
import os

def split_csv(source_filepath, dest_folder, split_file_prefix,
              records_per_file):
    """
    Split a source csv into multiple csvs of equal numbers of records,
    except the last file.

    Includes the initial header row in each split file.

    Split files follow a zero-index sequential naming convention like so:

        `{split_file_prefix}_0.csv`
    """
    if records_per_file <= 0:
        raise Exception('records_per_file must be > 0')

    with open(source_filepath, 'r') as source:
        reader = csv.reader(source)
        headers = next(reader)

        file_idx = 0
        records_exist = True

        while records_exist:
            i = 0
            target_filename = f'{split_file_prefix}_{file_idx}.csv'
            target_filepath = os.path.join(dest_folder, target_filename)

            with open(target_filepath, 'w') as target:
                writer = csv.writer(target)

                while i < records_per_file:
                    if i == 0:
                        writer.writerow(headers)

                    try:
                        writer.writerow(next(reader))
                        i += 1
                    except StopIteration:
                        records_exist = False
                        break

            if i == 0:
                # we only wrote the header, so delete that file
                os.remove(target_filepath)

            file_idx += 1
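A hypothetical invocation for the question's file (the output prefix is an assumption) might be:

split_csv('import_1458922827.csv', '.', 'output', 1000)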
Answer 3 (score: 2)
I made a few modifications to the accepted answer to make it simpler:
import csv
import os

def split_csv_into_chunks(file_location, out_dir, file_size=2):
    count = 0
    current_piece = 1

    # file_to_split_name.csv
    file_name = file_location.split("/")[-1].split(".")[0]
    split_file_name_template = file_name + "__%s.csv"
    splited_files_path = []

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    try:
        with open(file_location, "rb") as csv_file:
            rows = csv.reader(csv_file, delimiter=",")
            headers_row = rows.next()
            for row in rows:
                if count % file_size == 0:
                    current_out_path = os.path.join(out_dir,
                                                    split_file_name_template % str(current_piece))
                    current_out_writer = None
                    current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=",")
                    current_out_writer.writerow(headers_row)
                    splited_files_path.append(current_out_path)
                    current_piece += 1

                current_out_writer.writerow(row)
                count += 1
        return True, splited_files_path
    except Exception as e:
        print "Exception occurred as {}".format(e)
        return False, splited_files_path
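Assuming the question's file and a chunks output folder, a call might look like:

split_csv_into_chunks("import_1458922827.csv", "./chunks", file_size=1000)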
Answer 4 (score: 1)
Answer 5 (score: 1)
A simpler script works for me.
Answer 6 (score: 1)
A simple Python 3 solution using Pandas that doesn't cut off the last batch:
def to_csv_batch(src_csv, dst_dir, size=30000, index=False):

    import pandas as pd
    import math

    # Read source csv
    df = pd.read_csv(src_csv)

    # Initial values
    low = 0
    high = size

    # Loop through batches
    for i in range(math.ceil(len(df) / size)):

        fname = dst_dir + '/Batch_' + str(i+1) + '.csv'
        df[low:high].to_csv(fname, index=index)

        # Update selection
        low = high
        if (high + size < len(df)):
            high = high + size
        else:
            high = len(df)
Usage example:

to_csv_batch('Batch_All.csv', 'Batches')
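For the question's 5000-row file, a call like the following (assuming the Batches directory already exists, since to_csv does not create it) should yield five files, Batch_1.csv through Batch_5.csv:

to_csv_batch('import_1458922827.csv', 'Batches', size=1000)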
Answer 7 (score: 1)
Another pandas solution (every 1000 rows), similar to Aziz Alto's solution:
suffix = 1
for i in range(len(df)):
    if i % 1000 == 0:
        df[i:i+1000].to_csv(f"processed/{filename}_{suffix}.csv", sep='|', index=False, index_label=False)
        suffix += 1
where df is the csv loaded as a pandas.DataFrame; filename is the original file name; the pipe is the separator; and index and index_label set to False skip the auto-incremented index column.
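For completeness, a minimal setup for the snippet above (names and paths are assumptions) might look like:

import os
import pandas as pd

filename = 'import_1458922827'
df = pd.read_csv(filename + '.csv')
os.makedirs('processed', exist_ok=True)  # the target folder must exist before to_csv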
Answer 8 (score: 0)
@Ryan, the Python 3 code worked for me. I used newline='' to avoid the blank-line problem:

with open(target_filepath, 'w', newline='') as target:
Answer 9 (score: 0)
I suggest you take advantage of what pandas offers. Here are functions you can use to do this:
import logging
import math
import pandas as pd

def csv_count_rows(file):
    """
    Counts the number of rows in a file.

    :param file: path to the file.
    :return: number of lines in the designated file.
    """
    with open(file) as f:
        nb_lines = sum(1 for line in f)
    return nb_lines


def split_csv(file, sep=",", output_path=".", nrows=None, chunksize=None, low_memory=True, usecols=None):
    """
    Split a csv into several files.

    :param file: path to the original csv.
    :param sep: View pandas.read_csv doc.
    :param output_path: path in which to output the resulting parts of the splitting.
    :param nrows: Number of rows to split the original csv by, also view pandas.read_csv doc.
    :param chunksize: View pandas.read_csv doc.
    :param low_memory: View pandas.read_csv doc.
    :param usecols: View pandas.read_csv doc.
    """
    nb_of_rows = csv_count_rows(file)

    # Parsing file elements : Path, name, extension, etc...
    # file_path = "/".join(file.split("/")[0:-1])
    file_name = file.split("/")[-1]
    # file_ext = file_name.split(".")[-1]
    file_name_trunk = file_name.split(".")[0]
    split_files_name_trunk = file_name_trunk + "_part_"

    # Number of chunks to partition the original file into
    nb_of_chunks = math.ceil(nb_of_rows / nrows)
    if nrows:
        log_debug_process_start = f"The file '{file_name}' contains {nb_of_rows} ROWS. " \
                                  f"\nIt will be split into {nb_of_chunks} chunks of a max number of rows : {nrows}." \
                                  f"\nThe resulting files will be output in '{output_path}' as '{split_files_name_trunk}0 to {nb_of_chunks - 1}'"
        logging.debug(log_debug_process_start)

    for i in range(nb_of_chunks):
        # Number of rows to skip is determined by (the number of the chunk being processed) multiplied by (the nrows parameter).
        rows_to_skip = range(1, i * nrows) if i else None
        output_file = f"{output_path}/{split_files_name_trunk}{i}.csv"

        log_debug_chunk_processing = f"Processing chunk {i} of the file '{file_name}'"
        logging.debug(log_debug_chunk_processing)

        # Fetching the original csv file and handling it with skiprows and nrows to process its data
        df_chunk = pd.read_csv(filepath_or_buffer=file, sep=sep, nrows=nrows, skiprows=rows_to_skip,
                               chunksize=chunksize, low_memory=low_memory, usecols=usecols)
        df_chunk.to_csv(path_or_buf=output_file, sep=sep)

        log_info_file_output = f"Chunk {i} of file '{file_name}' created in '{output_file}'"
        logging.info(log_info_file_output)
Then, in your main script or Jupyter notebook, put:

# This is how you initiate logging in the most basic way.
logging.basicConfig(level=logging.DEBUG)
file = {#Path to your file}
split_csv(file, sep=";", output_path={#Path where you'd like to output it}, nrows=4000000, low_memory=False)
P.S.1: I set nrows = 4000000 because it's a personal preference. You can change that number as needed.

P.S.2: I use the logging library to display messages. When applying such a function to large files that live on a remote server, you really want to avoid simple prints and to incorporate logging capabilities. You can replace logging.info or logging.debug with print.

P.S.3: Of course, you need to replace the {# Blablabla} parts of the code with your own parameters.
Answer 10 (score: 0)
import pandas as pd

df = pd.read_csv('input.csv')
file_len = len(df)
filename = 'output'
n = 1

for i in range(file_len):
    if i % 10 == 0:
        sf = df[i:i+10]
        sf.to_csv(f'{filename}_{n}.csv', index=False)
        n += 1
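This writes one file per 10 rows; for the question's 5000-row file split into exactly five parts, a chunk size derived from the frame length would do the same job (a sketch, reusing the names above):

chunk = -(-file_len // 5)  # ceiling division, so no rows are dropped
for n, i in enumerate(range(0, file_len, chunk), start=1):
    df[i:i+chunk].to_csv(f'{filename}_{n}.csv', index=False)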