下面的代码导入一个包含100万行记录的日志文件需要2.5秒。
有没有更好的写法可以缩短执行时间?
"""此代码用于将日志文件读入内存并转换为数据框。加载日志文件后,检查IPQuery文件中的每个项目是否存在于日志中,并将结果打印到控制台。"""
#importing python modules required for this script to perform operations
import pandas as pd
import time
import sys
def find_queried_ips(log_file_name, query_file_name):
    """Report, for each queried IP, whether it occurs in the log file.

    Args:
        log_file_name: Path to a space-separated log file whose columns are
            DATE, TIME, IPADDR, URL and STATUS (no header row).
        query_file_name: Path to a file with one IP address per line
            (no header row).

    Returns:
        A list of ints, one per query line in file order: 1 if that IP
        appears anywhere in the log's IPADDR column, otherwise 0.
    """
    # Only the IPADDR column is needed, so skip parsing the other four.
    # `sep` is passed by keyword: newer pandas releases reject it positionally.
    log_df = pd.read_csv(
        log_file_name,
        sep=" ",
        header=None,
        names=['DATE', 'TIME', 'IPADDR', 'URL', 'STATUS'],
        usecols=['IPADDR'],
        skip_blank_lines=True,
    )
    query_df = pd.read_csv(
        query_file_name,
        sep=" ",
        header=None,
        skip_blank_lines=True,
    )
    # Series.isin tests membership in the given values. DataFrame.isin with a
    # Series argument instead aligns on the index, which would compare query
    # row i only with log row i.
    found = query_df[0].isin(log_df['IPADDR'])
    return found.astype(int).tolist()


def main():
    """Command-line entry point: ``python program.py log_file query_file``.

    Prints one 0/1 line per queried IP, followed by the elapsed time.

    Raises:
        ValueError: If the script was not given exactly two arguments.
    """
    if len(sys.argv) != 3:
        raise ValueError(
            " PLEASE PASS THE BOTH LOG FILE AND IPQUERY FILE AS INPUT TO SCRIPT\n"
            "ex: python program.py log_file query_file "
        )
    log_file_name = sys.argv[1]
    query_file_name = sys.argv[2]

    start = time.time()
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    for flag in find_queried_ips(log_file_name, query_file_name):
        print(flag)
    print("Execution Time of this script is %f" % (time.time() - start))


if __name__ == "__main__":
    main()
答案 0 :(得分:0)
#importing python modules required for this script to perform operations
import time
import sys
# Record the wall-clock time at script start.
# NOTE(review): `start` is not read again in the visible code -- confirm
# whether an elapsed-time report was intended.
start = time.time()
class IpQuery:
    """Check which IP addresses from a query file occur in a log file.

    Typical use: call ``Inputfiles`` (or assign the two file-name attributes
    directly), then ``read_logfile``, ``read_Queryfile`` and ``CheckIpAdress``.

    Attributes are documented next to their initialisation in ``__init__``.
    """

    def __init__(self):
        # Paths of the log file and the query file.
        self.log_file_name = ""
        self.query_file_name = ""
        # Unique IP addresses found in the log file. Must be a set: `{}` would
        # create a dict.
        self.logset = set()
        # Queried IP addresses, in the order they appear in the query file.
        self.IPlist = []

    def Inputfiles(self):
        """Read the two file names from the command line.

        Raises:
            ValueError: If the script was not given exactly two arguments.
        """
        if len(sys.argv) != 3:
            raise ValueError(
                " PLEASE PASS THE BOTH LOG FILE AND IPQUERY FILE AS INPUT TO SCRIPT\n"
                "ex: python program.py log_file query_file "
            )
        self.log_file_name = sys.argv[1]
        self.query_file_name = sys.argv[2]

    def read_logfile(self):
        """Load the set of IP addresses (third space-separated field) from the log.

        Blank lines and lines with fewer than three fields are skipped.

        Raises:
            IOError: If the log file cannot be opened or read.
        """
        addresses = set()
        with open(self.log_file_name, 'r') as f:
            for line in f:
                # Drop the line terminator first, otherwise an IP that is the
                # last field on its line would be stored as "ip\n".
                fields = line.rstrip().split(' ')
                if len(fields) > 2:
                    addresses.add(fields[2])
        self.logset = addresses

    def read_Queryfile(self):
        """Load the queried IP addresses, one per non-blank line, in file order.

        Surrounding whitespace (including a stray ``\\r`` from CRLF files) is
        removed so that it cannot prevent a match against the log.

        Raises:
            IOError: If the query file cannot be opened or read.
        """
        with open(self.query_file_name, 'r') as f:
            stripped_lines = (line.strip() for line in f)
            self.IPlist = [entry for entry in stripped_lines if entry]

    def CheckIpAdress(self):
        """Print "1" or "0" per queried IP: whether it occurs in the log."""
        known = self.logset
        for address in self.IPlist:
            # Direct set membership is enough; building an intersection first
            # adds work without changing the result.
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("1" if address in known else "0")
# Run the whole lookup: read the arguments, load both files, print the results.
# Failures are reported on stderr with a non-zero exit status so that the
# 1/0 result stream on stdout stays clean and callers can detect the error.
try:
    # Create an instance of the IpQuery class
    msd = IpQuery()
    # Extract the input file names from the command line
    msd.Inputfiles()
    # Load the IP addresses from the log file
    msd.read_logfile()
    # Load the queried IP addresses from the query file
    msd.read_Queryfile()
    # Report, per queried IP, whether it occurs in the log file
    msd.CheckIpAdress()
except IOError:
    sys.stderr.write("Error: can\'t find file or read data" + "\n")
    sys.exit(1)
except ValueError:
    sys.stderr.write("PLEASE PASS THE BOTH LOG FILE AND IPQUERY FILE AS INPUT TO SCRIPT " + "\n")
    sys.exit(1)