我有一个文件,其中每条记录包含多个字段,每个字段都以“|”(管道)字符结尾。我想读取此文件,并按某个特定字段的取值把记录拆分到不同的文件中(该字段有多少种取值,就生成多少个文件)。下面是一个例子:
L219| |791|P|PIPPO|PLUTO|1|18081926|I262|XYZXCV12D35F345S||
L219| |1241|P|PAPERINO|TOPOLINO|2|21041937|F335|FVGHWU54G56S456U||
L219| |437793|G|TOPOLANDIA SAS|L219|12345678910|
L219| |437794|G|PAPERANDIA|L219|10987654321|
如果第四个字段等于“G”,则记录进入“file_pg.txt”,否则如果等于“P”则进入“file_pf.txt”。
我编写了下面的代码(我是 Python 新手),但处理大文件(300 MB)时需要很长时间。您有什么改进建议吗?
# Script from the question: copies each record of soggetti.txt to
# file_pf.txt or file_pg.txt depending on the character that precedes the
# fourth "|" on the line ("P" -> file_pf.txt, "G" -> file_pg.txt).
# The logic is unchanged; only the block indentation has been restored.
file = open('D:\\mydirectory\\soggetti.txt','r')
# Opening in "w" mode and closing right away leaves both output files empty
# before the loop starts appending to them.
file_pf = open("D:\\mydirectory\\file_pf.txt","w")
file_pg = open("D:\\mydirectory\\file_pg.txt","w")
file_pf.close()
file_pg.close()
i = 0
with file:
    for line in file:
        i = 0  # index of the character being examined
        c = 0  # number of "|" characters seen so far on this line
        while i < len(line):
            carattere = line[i]
            if carattere == "|":
                c = c + 1
                if c == 4:
                    # line[i-1] is the last character of the fourth field.
                    if line[i-1] == "P":
                        # The output file is re-opened and closed for every
                        # matching record.
                        file_pf = open("D:\\mydirectory\\file_pf.txt","a")
                        file_pf.write(line)
                        file_pf.close()
                        break
                    elif line[i-1] == "G":
                        file_pg = open("D:\\mydirectory\\file_pg.txt","a")
                        file_pg.write(line)
                        file_pg.close()
                        break
            i = i + 1
# The "with" block above has already closed the file; this call is a no-op.
file.close()
谢谢!
阿尔贝托
答案 0 :(得分:1)
我会选择:
# Stream soggetti.txt once and copy each record to file_pf.txt ("P") or
# file_pg.txt ("G") according to its fourth "|"-terminated field.
# Every file is opened exactly once and closed by its "with" block.
with open('D:\\mydirectory\\soggetti.txt','r') as source_file:
    with open("D:\\mydirectory\\file_pf.txt","w") as file_pf:
        with open("D:\\mydirectory\\file_pg.txt","w") as file_pg:
            for line in source_file:
                # Only the first four fields matter, so stop splitting there.
                fields = line.split("|", 4)
                if len(fields) < 5:
                    # Fewer than four "|" (e.g. a blank line): there is no
                    # complete fourth field, so skip instead of raising
                    # IndexError.
                    continue
                record_type = fields[3]
                if record_type == "P":
                    file_pf.write(line)
                elif record_type == "G":
                    file_pg.write(line)
如果您关心速度,最好这样做:
# Read soggetti.txt once, collect the "P" and "G" records in two lists, then
# write each list to its output file in a single call.
# Note: both lists are held in memory until the input has been fully read.
listP = []
listG = []
with open('D:\\mydirectory\\soggetti.txt','r') as source_file:
    for line in source_file:
        # Only the first four fields matter, so stop splitting there.
        fields = line.split("|", 4)
        if len(fields) < 5:
            # Fewer than four "|" (e.g. a blank line): no complete fourth
            # field, so skip instead of raising IndexError.
            continue
        char = fields[3]
        if char == "P":
            listP.append(line)
        elif char == "G":
            listG.append(line)
# The output files are opened only here, after the read loop; nothing is
# written while the input is being scanned.
with open("D:\\mydirectory\\file_pf.txt","w") as file_pf:
    file_pf.writelines(listP)
with open("D:\\mydirectory\\file_pg.txt","w") as file_pg:
    file_pg.writelines(listG)
答案 1 :(得分:0)
打开和关闭文件的操作相对较慢。如果可能,您应该只打开和关闭一次文件。在您的情况下,您可以将p和g行存储在列表中,然后在循环结束后立即写入所有行。
# Same character-by-character scan as the question's script, but matching
# lines are collected in two lists and each output file is written once,
# after the loop. Only the block indentation has been restored; the
# statements themselves are unchanged.
file = open('D:\\mydirectory\\soggetti.txt','r')
# Mirrors the question's script: open in "w" mode and close immediately.
file_pf = open("D:\\mydirectory\\file_pf.txt","w")
file_pg = open("D:\\mydirectory\\file_pg.txt","w")
file_pf.close()
file_pg.close()
p_lines = []  # lines whose fourth field ends in "P"
g_lines = []  # lines whose fourth field ends in "G"
i = 0
with file:
    for line in file:
        i = 0  # index of the character being examined
        c = 0  # number of "|" characters seen so far on this line
        while i < len(line):
            carattere = line[i]
            if carattere == "|":
                c = c + 1
                if c == 4:
                    # line[i-1] is the last character of the fourth field.
                    if line[i-1] == "P":
                        p_lines.append(line)
                        break
                    elif line[i-1] == "G":
                        g_lines.append(line)
                        break
            i = i + 1
# The "with" block above has already closed the file; this call is a no-op.
file.close()
file_pf = open("D:\\mydirectory\\file_pf.txt","w")
file_pf.writelines(p_lines)
file_pf.close()
file_pg = open("D:\\mydirectory\\file_pg.txt","w")
file_pg.writelines(g_lines)
file_pg.close()
您还可以使用 split 更轻松地取出每行中各个字段的内容。
# Same collect-then-write approach, with the manual character scan replaced
# by str.split. Only the block indentation has been restored; the statements
# themselves are unchanged.
file = open('D:\\mydirectory\\soggetti.txt','r')
# Mirrors the question's script: open in "w" mode and close immediately.
file_pf = open("D:\\mydirectory\\file_pf.txt","w")
file_pg = open("D:\\mydirectory\\file_pg.txt","w")
file_pf.close()
file_pg.close()
p_lines = []  # lines whose fourth field is "P"
g_lines = []  # lines whose fourth field is "G"
with file:
    for line in file:
        fields = line.split("|")
        # fields[3] is the fourth field; a line with fewer than three "|"
        # would raise IndexError here.
        if fields[3] == "P":
            p_lines.append(line)
        elif fields[3] == "G":
            g_lines.append(line)
# The "with" block above has already closed the file; this call is a no-op.
file.close()
file_pf = open("D:\\mydirectory\\file_pf.txt","w")
file_pf.writelines(p_lines)
file_pf.close()
file_pg = open("D:\\mydirectory\\file_pg.txt","w")
file_pg.writelines(g_lines)
file_pg.close()
顺便说一下,严格来说,您不需要同时使用 with 和显式的 close() 来关闭文件,二者选其一即可。另外,也没有必要在脚本开头打开 file_pf 和 file_pg 之后又立即关闭它们。
# Read soggetti.txt once, bucket the records by their fourth field, then
# write each bucket to its output file in one call.
p_lines = []  # records whose fourth field is "P"
g_lines = []  # records whose fourth field is "G"
with open('D:\\mydirectory\\soggetti.txt','r') as file:
    for line in file:
        # Only the first four fields matter, so stop splitting there.
        fields = line.split("|", 4)
        if len(fields) < 5:
            # Fewer than four "|" (e.g. a blank line): no complete fourth
            # field, so skip instead of raising IndexError.
            continue
        if fields[3] == "P":
            p_lines.append(line)
        elif fields[3] == "G":
            g_lines.append(line)
# "with" closes each output file even if writelines raises.
with open("D:\\mydirectory\\file_pf.txt","w") as file_pf:
    file_pf.writelines(p_lines)
with open("D:\\mydirectory\\file_pg.txt","w") as file_pg:
    file_pg.writelines(g_lines)
如果您预计将来除了“P”和“G”之外还会出现更多的行类型,把各类行存储在一个字典中可以省去不少工作:
from collections import defaultdict

# Group the records of soggetti.txt by their (lower-cased) fourth field and
# write one output file per distinct value, so new record types need no
# code change.
lines_to_write = defaultdict(list)
with open('D:\\mydirectory\\soggetti.txt','r') as file:
    for line in file:
        # Only the first four fields matter, so stop splitting there.
        fields = line.split("|", 4)
        if len(fields) < 5:
            # Fewer than four "|" (e.g. a blank line): no complete fourth
            # field, so skip instead of raising IndexError.
            continue
        lineType = fields[3].lower()
        lines_to_write[lineType].append(line)
# .items() works on both Python 2 and Python 3 (iteritems() is Python 2 only).
for lineType, lines in lines_to_write.items():
    # NOTE(review): this pattern yields file_pf.txt for "P" but file_gf.txt
    # for "G", not the file_pg.txt name used in the question - confirm the
    # intended naming scheme.
    filename = "D:\\mydirectory\\file_{}f.txt".format(lineType)
    with open(filename,"w") as file:
        file.writelines(lines)
您可以通过跟踪您所在的行号并定期打印消息来向用户报告已处理的行数。
# Progress reporting: print a message every `how_often_to_report` lines while
# reading the input file.
how_often_to_report = 100  # prints message every one hundred lines
with open('D:\\mydirectory\\soggetti.txt','r') as file:
    for line_number, line in enumerate(file):
        # enumerate() counts from 0, so the first message is printed before
        # any line has been processed.
        if line_number % how_often_to_report == 0:
            # .format() fills in the "{}" placeholder; the parenthesised
            # single-argument form prints the same text on Python 2 and 3.
            print("{} lines processed".format(line_number))
        # do rest of processing work here
答案 2 :(得分:0)
Read line from file
split on |
P = empty list
G = empty list
if splitted_line[index] is equal to P
add line to P
elif splitted_line[index] is equal to G
add line to G
open file for P
write all lines in P
close file for P
open file for G
write all lines in G
close file for G
答案 3 :(得分:0)
我没有对此进行测试,但下面的内容应该更快
# Stream soggetti.txt once and copy each record to file_pf.txt ("P") or
# file_pg.txt ("G") according to its fourth "|"-terminated field.
# The outputs are opened in "w" mode so that a re-run starts from empty
# files (as the question's script does) instead of appending duplicates.
with open('D:\\mydirectory\\soggetti.txt','r') as file:
    with open("D:\\mydirectory\\file_pf.txt","w") as file_pf:
        with open("D:\\mydirectory\\file_pg.txt","w") as file_pg:
            for line in file:
                # Only the first four fields matter, so stop splitting there.
                bits = line.split("|", 4)
                if len(bits) < 5:
                    # Fewer than four "|" (e.g. a blank line): no complete
                    # fourth field, so skip instead of raising IndexError.
                    continue
                if bits[3] == "P":
                    file_pf.write(line)
                elif bits[3] == "G":
                    file_pg.write(line)
# All three files are closed by their "with" blocks, even if an error occurs.
答案 4 :(得分:0)
下面的代码应该比您现在的做法更快,因为它不会为每一行重新打开和关闭输出文件。
# Stream soggetti.txt once and dispatch each record to the writer that
# matches its fourth "|"-terminated field ("P" -> file_pf.txt,
# "G" -> file_pg.txt). Records of any other type are not written.
with open('D:\\mydirectory\\soggetti.txt','r') as file:
    with open("D:\\mydirectory\\file_pf.txt","w") as file_pf:
        with open("D:\\mydirectory\\file_pg.txt","w") as file_pg:
            # Map each record type to the bound write method of its file so
            # the loop body is a single dictionary lookup.
            writers = {'P': file_pf.write, 'G': file_pg.write}
            for line in file:
                # Only the first four fields matter, so stop splitting there.
                fields = line.split('|', 4)
                if len(fields) < 5:
                    # Fewer than four "|" (e.g. a blank line): no complete
                    # fourth field, so skip instead of raising IndexError.
                    continue
                switch = fields[3]
                # Exact match on the field value; unknown types yield None.
                write = writers.get(switch)
                if write is not None:
                    write(line)
# All three files are closed by their "with" blocks, even if an error occurs.