I have written a log parser in Python that parses log data, structures it, and stores the result in a CSV file. I have about 5 GB of data spread across multiple files. The parsing itself works, but it takes a long time to finish. I tried multiprocessing as an option, but it did not improve the efficiency much. What can I do to parse this large amount of data efficiently and reduce the parsing time?
The code is attached below.
import glob
import errno
import re
import pandas as pd
import concurrent.futures
log_pattern = re.compile(r"...")  # actual regex omitted; it captures date, timestamp, type and text
log_dict = []
def parse_file(name):
    try:
        with open(name, "r", encoding="utf-8") as f:
            for line in f:
                match = log_pattern.match(line)
                if not match:
                    continue
                grps = match.groups()
                # One dict per matched line, appended to the module-level list
                log_dict.append({"date": grps[0], "timestamp": grps[1], "type": grps[2], "text": grps[3]})
        # Dump everything collected so far into a single CSV
        df = pd.DataFrame(log_dict)
        df.to_csv("temp.csv")
        #return df
        #f.close()
    except IOError as exc:
        if exc.errno != errno.EISDIR:
            raise
    return 0
# Create a pool of worker processes; by default one is created per CPU core
def main():
    with concurrent.futures.ProcessPoolExecutor() as executor:
        files = glob.glob("*.log")
        for file, result in zip(files, executor.map(parse_file, files)):
            print(file, result)

if __name__ == '__main__':
    main()
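To make my intent clearer, below is a minimal sketch of the per-file variant I had in mind: each worker parses one file and writes its own output CSV, so nothing is shared between processes and the workers cannot overwrite each other's results. The name parse_one and the "<input>.csv" output naming are just placeholders for this sketch, not part of my actual code.

import csv
import glob
import re
import concurrent.futures

log_pattern = re.compile(r"...")  # same pattern as above, omitted here

def parse_one(path):
    # Parse a single log file and write its rows to a sibling .csv file.
    out_path = path + ".csv"
    rows = 0
    with open(path, "r", encoding="utf-8") as src, \
         open(out_path, "w", newline="", encoding="utf-8") as dst:
        writer = csv.writer(dst)
        writer.writerow(["date", "timestamp", "type", "text"])
        for line in src:
            match = log_pattern.match(line)
            if match:
                writer.writerow(match.groups())
                rows += 1
    return path, rows

def main():
    files = glob.glob("*.log")
    with concurrent.futures.ProcessPoolExecutor() as executor:
        for path, rows in executor.map(parse_one, files):
            print(path, rows)

if __name__ == "__main__":
    main()

Even with this layout I am not sure the workers are being used effectively, so any advice on where the time is actually going would be appreciated.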