我正在使用下面的代码拆分大型CSV文件。它运行得很好,但我想知道应该如何修改,才能从输出文件中去掉某些列?我是根据第2列中的值来拆分csv的,并且希望输出文件中只保留第1列。
#!/usr/bin/env python3
import binascii
import csv
import os.path
import sys
from tkinter.filedialog import askopenfilename, askdirectory
from tkinter.simpledialog import askinteger
def split_csv_file(f, dst_dir, keyfunc):
    """Split the CSV rows read from *f* into one output file per key.

    Args:
        f: An open text file (or any iterable of lines) accepted by
            ``csv.reader``.
        dst_dir: Directory in which the output files are created.
        keyfunc: Callable that receives a row (a list of strings) and
            returns the output file name for that row; it is joined onto
            ``dst_dir``.

    Each distinct key gets its own file, opened in ``'w'`` mode the first
    time the key is seen (an existing file of that name is truncated).
    Rows are written unchanged, in input order.  All output files stay
    open until the input is exhausted and are then closed, even if
    reading, ``keyfunc`` or writing raises.
    """
    csv_reader = csv.reader(f)
    csv_writers = {}
    # Keep the file objects so they can be closed deterministically; the
    # csv writer objects do not expose the file they wrap.
    out_files = []
    try:
        for row in csv_reader:
            k = keyfunc(row)
            if k not in csv_writers:
                out = open(os.path.join(dst_dir, k), mode='w', newline='')
                out_files.append(out)
                csv_writers[k] = csv.writer(out)
            csv_writers[k].writerow(row)
    finally:
        for out in out_files:
            out.close()
def get_args_from_cli():
    """Read the script arguments from ``sys.argv``.

    Returns:
        A tuple ``(input_filename, column, dst_dir)`` taken from
        ``sys.argv[1]``, ``int(sys.argv[2])`` and ``sys.argv[3]``.

    Raises:
        IndexError: If fewer arguments are present than are indexed.
        ValueError: If the second argument is not an integer literal.
    """
    argv = sys.argv
    source_path = argv[1]
    key_column = int(argv[2])
    return source_path, key_column, argv[3]
def get_args_from_gui():
    """Ask for the script arguments through three tkinter dialogs.

    The dialogs appear in this order: input file chooser (filtered to
    ``.csv``), integer prompt for the table column, destination directory
    chooser.

    Returns:
        A tuple ``(input_filename, column, dst_dir)`` holding whatever the
        three dialogs returned, unvalidated.
    """
    chosen_file = askopenfilename(
        title='Select CSV Input File',
        filetypes=(('CSV', '.csv'),))
    chosen_column = askinteger('Choose Table Column', 'Table column')
    chosen_dir = askdirectory(title='Select Destination Directory')
    return chosen_file, chosen_column, chosen_dir
if __name__ == '__main__':
    # With no arguments the values are requested through dialogs; with
    # exactly three arguments they are read from the command line.
    arg_count = len(sys.argv)
    if arg_count == 1:
        input_filename, column, dst_dir = get_args_from_gui()
    elif arg_count == 4:
        input_filename, column, dst_dir = get_args_from_cli()
    else:
        raise Exception("Invalid number of arguments")

    def output_name_for(r):
        # `column` is 1-based; the cell value becomes the output file name.
        return r[column-1]+'.csv'

    with open(input_filename, mode='r', newline='') as f:
        split_csv_file(f, dst_dir, output_name_for)
        # If the column has funky values resulting in invalid filenames,
        # pass a hex-encoding key function instead of output_name_for:
        # split_csv_file(f, dst_dir, lambda r: binascii.b2a_hex(r[column-1].encode('utf-8')).decode('utf-8')+'.csv')
以下是开始CSV的示例
"<option value="""">Choose Year</option>",ParentID
"<option value=""Civic1990"">1990</option>",Civic
"<option value=""CRX1990"">1990</option>",CRX
"<option value=""Prelude1990"">1990</option>",Prelude
"<option value=""Accord1990"">1990</option>",Accord
"<option value=""Prelude1991"">1991</option>",Prelude
"<option value=""Civic1991"">1991</option>",Civic
"<option value=""CRX1991"">1991</option>",CRX
"<option value=""Accord1991"">1991</option>",Accord
"<option value=""Prelude1992"">1992</option>",Prelude
"<option value=""Civic1992"">1992</option>",Civic
"<option value=""Accord1992"">1992</option>",Accord
"<option value=""Prelude1993"">1993</option>",Prelude
"<option value=""Civic1993"">1993</option>",Civic
"<option value=""CivicdelSol1993"">1993</option>",CivicdelSol
"<option value=""Accord1993"">1993</option>",Accord
"<option value=""Passport1994"">1994</option>",Passport
完成后我希望结果如下:
<option value="">Choose Year</option>
<option value="Civic1990">1990</option>
<option value="Civic1991">1991</option>
<option value="Civic1992">1992</option>
<option value="Civic1993">1993</option>
<option value="Civic1994">1994</option>
<option value="Civic1995">1995</option>
<option value="Civic1996">1996</option>
<option value="Civic1997">1997</option>
<option value="Civic1998">1998</option>
<option value="">Choose Year</option>
<option value="Accord1990">1990</option>
<option value="Accord1991">1991</option>
<option value="Accord1992">1992</option>
<option value="Accord1993">1993</option>
<option value="Accord1994">1994</option>
<option value="Accord1995">1995</option>
<option value="Accord1996">1996</option>
<option value="Accord1997">1997</option>
<option value="Accord1998">1998</option>
……依此类推。也就是说,每个车型各个年份的选项值都保存在各自单独的csv或txt文件中。
答案 0 :(得分:0)
答案 1 :(得分:0)
行只是Python字符串的列表,所以请尝试:
csv_writers[k].writerow(row[0:1])
这只会写第一列。
第二个问题:
在Python中,您可以使用str.replace(substr, new_substr)来替换字符串中的子串。
这里我们处理的是一个字符串列表(我知道在这个场景中列表里只有一个字符串),所以列表推导式(list comprehension)就派上用场了。
csv_writers[k].writerow([v.replace('""', '"') for v in row[0:1]])
这会生成一个新列表,其中所有字符串里的""都已被替换为"。
希望它有所帮助!