如何缩短在 Python 中导入数据的执行时间

时间:2017-04-24 15:10:52

标签: python

下面的代码需要 2.5 秒才能导入一个包含 100 万行记录的日志文件。

是否有更好的写法可以缩短执行时间?

"""此代码用于将日志文件读入内存并转换为数据框(DataFrame)。
加载日志文件后,检查 IPQuery 文件中的每一项是否存在于日志中,并将结果打印到控制台。"""

#importing  python modules required for this script to perform operations
import pandas as pd
import time
import sys

# Validate the command line: exactly two file paths are expected.
if len(sys.argv) != 3:
    raise ValueError(""" PLEASE PASS THE BOTH LOG FILE AND IPQUERY FILE AS INPUT TO SCRIPT
                      ex: python program.py log_file query_file """)

# The log file and the query file are taken from the command line.
log_file_name = sys.argv[1]
query_file_name = sys.argv[2]

start = time.time()  # start of the timed section

# Load the space-separated log file. `sep` is passed by keyword because
# newer pandas releases no longer accept it positionally.
log_df = pd.read_csv(
    log_file_name,
    sep=" ",
    header=None,
    names=['DATE', 'TIME', 'IPADDR', 'URL', 'STATUS'],
    skip_blank_lines=True
)

# Load the query file; its first column (label 0) holds the values to look up.
query_df = pd.read_csv(
    query_file_name,
    sep=" ",
    header=None,
    skip_blank_lines=True
)

# Flag every query value that occurs anywhere in the log's IPADDR column.
# A plain array is passed on purpose: when DataFrame.isin() receives a Series
# it aligns on the index and only compares values that share a row label,
# which is not a membership test. unique() also keeps the lookup table small.
Ipfound = query_df.isin(log_df.IPADDR.unique()).astype(int)

# Print one result per query line: 1 if found in the log, 0 otherwise.
for items in Ipfound[0]:
    print(items)

# The single-argument print(...) form behaves the same on Python 2 and 3.
print("Execution Time of this script is  %f" % (time.time() - start))

1 个答案:

答案 0 :(得分:0)

#importing  python modules required for this script to perform operations

import time
import sys



# Record the wall-clock time at script start.
# NOTE(review): `start` is not read by any later line of this script as shown,
# so no elapsed time is reported - confirm whether that was intended.
start = time.time()


class IpQuery:
    """Report which entries of a query file occur in a log file.

    Intended call order: Inputfiles(), read_logfile(), read_Queryfile(),
    CheckIpAdress(). The last step prints one line per query entry:
    "1" if the entry was seen in the log, "0" otherwise.
    """

    def __init__(self):
        # Paths of the two input files; set by Inputfiles().
        self.log_file_name = ""
        self.query_file_name = ""
        # Distinct third-field values of the log file. This must be a set:
        # "{}" would create a dict, which has no set operations.
        self.logset = set()
        # Query entries in file order, duplicates preserved.
        self.IPlist = []

    def Inputfiles(self):
        """Read the log and query file paths from ``sys.argv``.

        Raises:
            ValueError: if the script did not receive exactly two arguments.
        """
        if len(sys.argv) != 3:
            raise ValueError(""" PLEASE PASS THE BOTH LOG FILE AND IPQUERY FILE AS INPUT TO SCRIPT
                              ex: python program.py log_file query_file """)
        # Extract the file names from the command line.
        self.log_file_name = sys.argv[1]
        self.query_file_name = sys.argv[2]

    def read_logfile(self):
        """Collect the distinct third space-separated field of each log line.

        Blank lines and lines with fewer than three fields are skipped
        instead of aborting the whole run with an IndexError.

        Raises:
            IOError: if the log file cannot be opened or read.
        """
        addresses = set()
        with open(self.log_file_name, 'r') as f:
            for line in f:
                if line.isspace():
                    continue
                # Drop the line ending first so that a third field which is
                # also the last one does not keep a trailing newline.
                # maxsplit=3 stops splitting once the third field is isolated.
                fields = line.rstrip('\r\n').split(' ', 3)
                if len(fields) > 2:
                    addresses.add(fields[2])
        self.logset = addresses

    def read_Queryfile(self):
        """Load the non-blank lines of the query file, in order.

        Raises:
            IOError: if the query file cannot be opened or read.
        """
        with open(self.query_file_name, 'r') as f:
            self.IPlist = [line.rstrip('\r\n') for line in f if not line.isspace()]

    def CheckIpAdress(self):
        """Print "1" or "0" for each query entry, depending on log membership."""
        # Test against the log set directly; building an intermediate
        # intersection set gives the same answers with extra work.
        known = self.logset
        for element in self.IPlist:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("1" if element in known else "0")
# Script driver: run the four IpQuery steps in order and turn the two
# expected failure modes into a short message instead of a traceback.
try:
    # Create the query helper.
    msd = IpQuery()
    # Take the input file paths from the command line.
    msd.Inputfiles()
    # Load the addresses seen in the log file.
    msd.read_logfile()
    # Load the addresses to look up.
    msd.read_Queryfile()
    # Print 1/0 for each looked-up address.
    msd.CheckIpAdress()
except IOError:
    # Raised when one of the input files cannot be opened or read.
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("Error: can't find file or read data")
except ValueError:
    # Raised by Inputfiles() when the argument count is wrong.
    print("PLEASE PASS THE BOTH LOG FILE AND IPQUERY FILE AS INPUT TO SCRIPT ")