Python - 批量合并多个大型CSV,过滤数据,跳过标题,垂直追加到单个CSV

时间:2017-02-22 04:24:54

1)CSV 文件按 12 小时一班生成,我需要找出一周内的所有 CSV 并将其合并;2)按 2 列的信息过滤每个 CSV(否则行数太多);3)将结果垂直追加到单个 CSV“主表”中,其命名约定为“最后一班的日期”。


** 供参考 - 数据集附在代码之后(为了说明问题,我只切出了 16 列数据)


import os, csv                                                                 
import pandas as pd
import io
import glob
from datetime import date                                                      
import time
import collections

# Process data and filter #

def ProcessData( data ):
    """Keep only the rows whose column 15 reads 'OPERATING'.

    Args:
        data: iterable of CSV rows, each a list of column values.

    Returns:
        A list with one entry per kept row, built from columns 0-2 followed
        by columns 15-16 (only column 15 if the row stops there).
    """
    processedData = []

    for row in data:
        # csv.reader yields [] for blank lines and short lists for truncated
        # ones. Such rows cannot match the filter, so skip them rather than
        # letting row[ 15 ] raise IndexError and abort the whole merge.
        if len( row ) > 15 and row[ 15 ] == 'OPERATING':
            processedData.append( row[ 0:3 ] + row[ 15:17 ] )

    return processedData

# Process and write #

def ProcessAndWrite( data, filename ):
    """Filter ``data`` with ProcessData and write the result to a CSV file.

    The output path is ``filename`` with its extension replaced by
    ``_week_combined.csv``; an existing file at that path is overwritten.

    Args:
        data: iterable of CSV rows (lists of column values).
        filename: path of a source CSV; only used to derive the output name.
    """
    import sys  # local import: only needed to pick the file mode below

    processedData = ProcessData( data )

    name = os.path.splitext( filename )[ 0 ]
    outputfilename = name + '_week_combined.csv'

    # Parenthesised single-argument form prints the same text on Python 2 and 3.
    print( "writing data to " + str( outputfilename ) )

    # The csv module expects a binary file on Python 2, but a text file
    # opened with newline='' on Python 3.
    if sys.version_info[ 0 ] < 3:
        csvfile = open( outputfilename, 'wb' )
    else:
        csvfile = open( outputfilename, 'w', newline='' )

    with csvfile:
        writer = csv.writer( csvfile )
        # The original loop over processedData had no body, so no row was
        # ever written (and the file did not even parse).
        writer.writerows( processedData )

# select the correct weeks worth of files #   

def filedate( data, datetime ):                                                    
    """Unfinished helper meant to group file names by a date string.

    NOTE(review): as written this function cannot do useful work:
      * neither ``data`` nor ``datetime`` is read anywhere in the body;
      * the first loop iterates a list that was just created empty;
      * ``date_name_list`` is never assigned in this function, so the second
        loop raises NameError unless that name is defined elsewhere;
      * nothing is returned (the result is always None).
    Presumably the code that fills the (date, path) list was lost - confirm
    against the original script before relying on this function.
    """
    # Assigned but never used below.
    root = 'E:\Rs\\'                                                           

    # Empty at this point, so the loop body below never executes.
    date_outputfilename_list = []
    for file in date_outputfilename_list:

        # Each entry is indexed as file[0] (passed to time.strftime) and
        # file[1] (passed to os.path.split).
        folder, file_name = os.path.split(file[1])

        file_date = time.strftime("%y-%m-%d", file[0])
        date_name_list.append((file_date, file_name))

    date_count_dict = {}

    date_name_dict = {}

    for date, name in date_name_list:

        # Rebinds date_count_dict to a fresh defaultdict on every iteration;
        # nothing is ever counted in it.
        date_count_dict=collections.defaultdict( int )
        # Collect the names seen for each date string.
        date_name_dict.setdefault(date, []).append(name)

    # pprint is imported but never called; only the two headings are printed,
    # the dictionaries built above are not.
    import pprint
    print("Files with the same date:")
    print("Same dates count:")

# Function #

# Script entry point: read every CSV in the folder, drop its header rows,
# filter the remaining rows and write them out via ProcessAndWrite.
if __name__ == "__main__":

    import sys
    path = r'E:\Rs'
    filenames = glob.glob(os.path.join(path, '*.csv'))

    data = []

    for filename in filenames:

        with open(filename, 'r') as csvfile:
            reader = csv.reader(csvfile, delimiter = ',')
            # Skip the two header rows. The original loop had no body, which
            # is a syntax error; next() with a default also tolerates files
            # shorter than the header.
            for headerCount in range( 2 ):
                next( reader, None )
            data.extend( reader )

        # NOTE(review): `filedate` is a function object, so this test is
        # always true and every input file is written out on its own. The
        # intended "same week" condition was never implemented - confirm
        # what it should be before changing this.
        if filedate:
            ProcessAndWrite( data, filename )
            # Start the next batch empty. The original stored the
            # ProcessData function itself in the list, which would crash
            # the next filtering pass.
            data = []

    # Flush whatever is still pending (only reachable once the condition
    # above can actually be false).
    if len( data ) > 0:
        ProcessAndWrite( data, filename )

数据集: position_x,position_y,position_z,start_time,opreason,stage, 标题2,标题2,标题2,标题2,标题2,标题2 649794,4764274,1147,2 / 11/2016 00:00,操作,声音,


    import os, csv                                                                           # Import csv library
    import io
    import glob
    import datetime                                                       
    import time
    import collections

    def ProcessData( data ):
        """Keep the 'OPERATING' rows for trucks and append each row's duration.

        A row is kept when column 15 equals 'OPERATING' and column 6 equals
        'truck'. Columns 3 and 4 are parsed as '%Y-%m-%d %H:%M:%S' timestamps.

        Args:
            data: iterable of CSV rows, each a list of column values.

        Returns:
            A list with one entry per kept row: columns 0-4 followed by the
            number of seconds (float) between column 3 and column 4.

        Raises:
            ValueError: if a kept row's column 3 or 4 does not match the
                timestamp format.
        """
        timeFormat = '%Y-%m-%d %H:%M:%S'
        processedData = []

        for row in data:
            # csv.reader yields [] for blank lines and short lists for
            # truncated ones; skip them instead of failing on row[ 15 ].
            if len( row ) <= 15:
                continue
            if row[ 15 ] == 'OPERATING' and row[ 6 ] == 'truck':
                startTime = datetime.datetime.strptime( row[ 3 ], timeFormat )
                endTime = datetime.datetime.strptime( row[ 4 ], timeFormat )
                duration = endTime - startTime
                # row[ 0:5 ] is the same slice as row[ 0:3 ] + row[ 3:5 ].
                processedData.append( row[ 0:5 ] + [ duration.total_seconds() ] )

        return processedData

    def ProcessAndWrite( data, filename ):
        """Filter ``data`` with ProcessData and write it, with a header, to CSV.

        The output path is ``filename`` with its extension replaced by
        ``_week_combined.csv``; an existing file at that path is overwritten.

        Args:
            data: iterable of CSV rows (lists of column values).
            filename: path of a source CSV; only used to derive the output name.
        """
        import sys  # local import: only needed to pick the file mode below

        processedData = ProcessData( data )

        name = os.path.splitext( filename )[ 0 ]
        outputfilename = name + '_week_combined.csv'

        # Parenthesised single-argument form prints the same text on Python 2 and 3.
        print( "writing data to " + str( outputfilename ) )

        # The csv module expects a binary file on Python 2, but a text file
        # opened with newline='' on Python 3.
        if sys.version_info[ 0 ] < 3:
            csvfile = open( outputfilename, 'wb' )
        else:
            csvfile = open( outputfilename, 'w', newline='' )

        with csvfile:
            writer = csv.writer( csvfile )
            # Header row for the combined sheet.
            # NOTE(review): this lists 12 column names, while the ProcessData
            # defined alongside it emits 6 values per row - confirm which
            # one reflects the intended layout.
            writer.writerow(['position_x','position_y','position_z','start_time','end_time','model','number','speed','o','stage','duration', 'cummulative_duration'])
            # The original loop over processedData had no body, so no data
            # row was ever written (and the file did not even parse).
            writer.writerows( processedData )

    # Script entry point: merge the CSV files in `path` into batches and hand
    # each batch to ProcessAndWrite. A batch is closed whenever the date in
    # the next file's name is at least one day later than the previous file's.
    if __name__ == "__main__":

        import sys
        path = r'E:\\'
        filenames = glob.glob(os.path.join(path, '*.csv'))
        # File names start with YYYY-MM-DD (see dateFromFilename), so a plain
        # sort puts them in date order, which the batching below relies on.
        filenames.sort()

        data = []

        def dateFromFilename( name ):
            """Return the datetime parsed from the text before the first '_'
            of the file's base name, which must look like YYYY-MM-DD."""
            folder, baseName = os.path.split( name )
            dateStr = baseName.split( '_' )[ 0 ]
            return datetime.datetime.strptime( dateStr, '%Y-%m-%d' )

        # NOTE(review): despite its name, firstFileDate is updated on every
        # iteration, so it always holds the previous file's date rather than
        # the first date of the current batch.
        firstFileDate = None
        lastFilename = None

        for filename in filenames:
            currentFileDate = dateFromFilename( filename )

            if firstFileDate is not None:
                diff = currentFileDate - firstFileDate
                if diff.days >= 1:
                    # The date moved on: write what has been collected so
                    # far, named after the last file of that batch.
                    ProcessAndWrite( data, lastFilename )
                    data = []
            firstFileDate = currentFileDate
            lastFilename = filename

            with open(filename, 'r') as csvfile:
                reader = csv.reader(csvfile, delimiter = ',')
                # Skip the first three rows (headers). The original loop had
                # no body, which is a syntax error; next() with a default
                # also tolerates files shorter than the header.
                for headerCount in range( 3 ):
                    next( reader, None )
                # Stack the remaining rows onto the current batch.
                data.extend( reader )

        # Write the final batch, if any rows are still pending.
        if len( data ) > 0:
            ProcessAndWrite( data, lastFilename )

