I'm not sure if this is enough information to go on, but I'm currently trying to extract a subset of the data from a very large file containing many JSON objects, format it as CSV, and dump it all into a single CSV file. I have the implementation below. The speed isn't too bad, but I'm wondering whether there is a more efficient way to do this. I feel like the pandas part in particular, where I build the DataFrames, could be better:
    for files in zip_files:
        with zipfile.ZipFile(files, 'r') as myzip:
            for logfile in myzip.namelist():
                list1 = []
                list2 = []
                f = myzip.open(logfile)
                contents = f.readlines()
                for line in contents[:]:
                    try:
                        parsed = json.loads(line[:-2])
                        if "key1" in parsed.keys():
                            if "val1" in parsed['key1']['key2']:
                                if "val2" in parsed['key3']:
                                    list1.append(parsed['key1'])
                                    list2.append(parsed['key3'])
                    except ValueError as e:
                        pass
                    else:
                        pass
                df1 = pd.DataFrame(list1)
                df2 = pd.DataFrame(list2)
                df3 = df2.join(df1)
                df3['col1'] = df3['col1'].apply(lambda x: ','.join([str(i) for i in x]))
                df3 = df3.drop_duplicates()
                with open(csvout, 'a') as f2:
                    df.to_csv(f2, header=None, index=False)
                    f2.close()
            f.close()
Answer (score: 1)
I went through it and marked the original version with STRANGE (= I'm not sure what you are doing here), EFFICIENT (= this can be made more efficient) and SIMPLIFY (= this can be made simpler). To check that these suggestions actually help, consider timing the script with IPython's %timeit. At the IPython prompt, enter:

    In [0]: %timeit -n<N> %run script.py

where <N> is the number of runs to average over (the default is 1000, which will probably take too long).
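For example, with small, purely illustrative counts so the measurement stays quick (-n10 runs the script 10 times per loop, -r3 repeats the measurement 3 times):

    In [1]: %timeit -n10 -r3 %run script.py

Here is the annotated version: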
    for files in zip_files:
        with zipfile.ZipFile(files, 'r') as myzip:
            for logfile in myzip.namelist():
                list1 = []
                list2 = []
                f = myzip.open(logfile)
                # contents = f.readlines()
                # for line in contents[:]:
                for line in f:  # EFFICIENT: does the same without making a copy
                    try:
                        parsed = json.loads(line[:-2])
                        # if "key1" in parsed.keys():
                        if "key1" in parsed:  # EFFICIENT: no copy
                            # STRANGE: 'val' in dict checks for key existence by
                            # default, are you sure this is what you want?
                            if "val1" in parsed['key1']['key2']:
                                if "val2" in parsed['key3']:
                                    list1.append(parsed['key1'])
                                    list2.append(parsed['key3'])
                    except ValueError as e:
                        pass
                    # STRANGE: Why is this here?
                    # else:
                    #     pass
                df1 = pd.DataFrame(list1)
                df2 = pd.DataFrame(list2)
                df3 = df2.join(df1)
                # EFFICIENT: prefer generator over list comprehension
                # df3['col1'] = df3['col1'].apply(lambda x: ','.join([str(i) for i in x]))
                df3['col1'] = df3['col1'].apply(lambda x: ','.join(str(i) for i in x))
                df3.drop_duplicates(inplace=True)
                # SIMPLIFY:
                # with open(csvout, 'a') as f2:
                #     df.to_csv(f2, header=None, index=False)
                #     f2.close()
                # STRANGE: where does `df` come from? Shouldn't this be df3?
                df.to_csv(csvout, mode='a', header=None, index=False)
            # STRANGE: you open f in a loop, but close it outside of the loop?
            f.close()
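To make the STRANGE note about `in` concrete: on a dict, `in` checks whether a key exists; on a string it checks for a substring, and on a list it checks for membership. A minimal sketch (the nested structure here is made up purely for illustration):

    parsed = {'key1': {'key2': {'val1': 1}}, 'key3': 'xx val2 xx'}

    # Key lookup: True because 'val1' is a key of the inner dict
    print('val1' in parsed['key1']['key2'])   # True

    # Substring check: True because 'val2' occurs inside the string
    print('val2' in parsed['key3'])           # True

    # Membership check: if the value were a list instead
    print('val2' in ['val1', 'val2'])         # True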
If you have enough memory, the following might be faster: instead of appending to the CSV once per log file, you first collect everything for a zip archive in memory. This also changes the behaviour slightly, since duplicates are now filtered per zip archive rather than per log file. There are also a few stylistic changes:
    for files in zip_files:
        with zipfile.ZipFile(files, 'r') as myzip:
            list1, list2 = [], []  # Notice these are outside the loop
            for logfile in myzip.namelist():
                with myzip.open(logfile) as f:
                    for line in f:
                        try:
                            parsed = json.loads(line[:-2])
                        except ValueError as e:  # Presumably we only wish to catch json value errors
                            pass
                        else:
                            if ("key1" in parsed
                                    and "val1" in parsed['key1']['key2']
                                    and "val2" in parsed['key3']):
                                list1.append(parsed['key1'])
                                list2.append(parsed['key3'])
            # Write only once
            df = pd.DataFrame(list2).join(pd.DataFrame(list1))
            df['col1'] = df['col1'].apply(lambda x: ','.join(str(i) for i in x))
            df.drop_duplicates(inplace=True)
            # mode='a' so a second zip archive does not overwrite the first
            df.to_csv(csvout, mode='a', header=None, index=False)
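A side note on the try/except/else used above: the else branch runs only when the try block raised no exception, so the JSON parsing stays separate from the filtering logic. A minimal standalone illustration (the input lines are made up):

    import json

    lines = [b'{"key1": 1},\n', b'not json,\n']
    for line in lines:
        try:
            parsed = json.loads(line[:-2])
        except ValueError:
            print('skipping a line that is not valid JSON')
        else:
            # Reached only when json.loads succeeded
            print('parsed:', parsed)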
To keep the duplicate filtering local to each log file instead:
    for files in zip_files:
        with zipfile.ZipFile(files, 'r') as myzip:
            dfs = []
            for logfile in myzip.namelist():
                list1, list2 = [], []
                with myzip.open(logfile) as f:
                    for line in f:
                        try:
                            parsed = json.loads(line[:-2])
                        except ValueError as e:  # Presumably we only wish to catch json value errors
                            pass
                        else:
                            if ("key1" in parsed
                                    and "val1" in parsed['key1']['key2']
                                    and "val2" in parsed['key3']):
                                list1.append(parsed['key1'])
                                list2.append(parsed['key3'])
                # Build a temporary dataframe to filter the duplicates:
                tmp = pd.DataFrame(list2).join(pd.DataFrame(list1))
                tmp['col1'] = tmp['col1'].apply(lambda x: ','.join(str(i) for i in x))
                tmp.drop_duplicates(inplace=True)
                dfs.append(tmp)
            # Write only once; mode='a' so a second zip archive does not overwrite the first
            pd.concat(dfs, ignore_index=True).to_csv(csvout, mode='a', header=None, index=False)