Python - Batch merge multiple large CSVs, filter the data, skip headers, and append vertically into a single CSV

Date: 2017-02-22 04:24:54

Tags: python csv append batch-processing

**Note: I have edited the code below my original post to show the version that works for what I need.

Good afternoon,

There are a lot of questions about combining CSV data, but so far I haven't found anything that helps with what my code needs to do.

I have large CSVs with fixed headers that:

1) are produced every 12 hours, so I need to look at a week's worth of CSVs to merge them
2) need each individual CSV filtered on 2 columns of information (otherwise there are far too many rows)
3) need to be appended vertically into a single CSV 'mastersheet', named with the date of the last shift (a rough sketch of this workflow follows the notes below)

**The files arrive as separate CSVs. I need to join them into one.

**FYI - a data sample follows the code (I only cut out 16 columns of data for this purpose).
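
For reference, a rough pandas sketch of the workflow described above is shown below (pandas is imported in the script but never used). The folder, the number of header rows, and the column names `opreason` and `model` are assumptions taken from the data sample further down and the header row in the working script, not the real file layout:

    import glob
    import os
    import pandas as pd

    path = r'E:\Rs'                                              # folder with the 12-hour CSVs (assumed)
    filenames = sorted(glob.glob(os.path.join(path, '*.csv')))

    frames = []
    for filename in filenames:
        # skiprows=[1, 2] drops the extra header lines - adjust to match the real files
        df = pd.read_csv(filename, skiprows=[1, 2])
        # filter on two columns; these column names are assumptions, not the real headers
        df = df[(df['opreason'] == 'OPERATING') & (df['model'] == 'truck')]
        frames.append(df)

    master = pd.concat(frames, ignore_index=True)                # vertical append
    master.to_csv(os.path.join(path, 'week_mastersheet.csv'), index=False)

The csv-module scripts below do the same filtering row by row instead.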

Here is what I have so far. Apologies for the mess!

import os, csv                                                                 
import pandas as pd
import io
import glob
from datetime import date                                                      
import time
import collections

# Process data and filter #

def ProcessData( data ):                                                        
    processedData = []                                                          

    for row in data:
        if row[ 15 ] == ( 'OPERATING' ):                                        
            outputRow = row[ 0:3 ] + row[ 15:17 ]                               
            processedData.append( outputRow )                           

    return processedData                                                             

# Process and write #

def ProcessAndWrite( data, filename ):                                               
    processedData = ProcessData( data ) 

    name, ext = os.path.splitext( filename )                                         
    outputfilename = name + '_week_combined.csv'                                

    print "writing data to " + str( outputfilename )                                

    with open(outputfilename, 'wb') as csvfile:                                      
        writer = csv.writer(csvfile)
        for row in processedData:
            writer.writerow(row)

# select the correct weeks worth of files #   

def filedate( data, datetime ):
    root = 'E:\Rs\\'

    date_outputfilename_list = []
    for file in date_outputfilename_list:
        folder, file_name = os.path.split(file[1])
        file_date = time.strftime("%y-%m-%d", file[0])
        date_name_list.append((file_date, file_name))

    date_count_dict = {}
    date_name_dict = {}

    for date, name in date_name_list:
        date_count_dict=collections.defaultdict( int )
        date_name_dict.setdefault(date, []).append(name)

    import pprint
    print("Files with the same date:")
    pprint.pprint(date_name_dict)
    print('-'*60)
    print("Same dates count:")
    pprint.pprint(date_count_dict)

# Function #

if __name__ == "__main__":                                                          

    import sys
    path = r'E:\Rs'                                                             
    filenames = glob.glob(os.path.join(path, '*.csv'))                              
    filenames.sort()                                                               

    data = []                                                                       

    for filename in filenames:                                                      


        with open(filename, 'r') as csvfile:
            reader = csv.reader(csvfile, delimiter = ',')                       
            header = []                                                         
            for headerCount in range( 2 ):                                      
                header.append(next(reader))
            data.extend( [ row for row in reader ] )                          


        if( filedate ):                                                         
            ProcessAndWrite( data, filename )
            data = [ProcessData]


    if ( len( data ) > 0 ):
        ProcessAndWrite( data, filename )        
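
As written, filedate() never fills date_outputfilename_list, so its loops never run and date_name_list is never defined. A minimal sketch of what it seems to be aiming for - grouping the CSV file names by date so one week's worth can be selected - could look like the following; the helper name files_by_date and the use of the file modification time are assumptions, and the working script further down parses the date out of the filename instead:

    import collections
    import glob
    import os
    import time

    def files_by_date(folder):
        # Group CSV file names by the date of their last modification
        # (assumed to correspond to the shift date - an assumption only).
        date_name_dict = collections.defaultdict(list)
        for path in glob.glob(os.path.join(folder, '*.csv')):
            file_date = time.strftime('%y-%m-%d', time.localtime(os.path.getmtime(path)))
            date_name_dict[file_date].append(os.path.basename(path))
        return date_name_dict

Calling files_by_date(r'E:\Rs') would return a dict mapping each date string to the list of CSVs produced that day.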

Dataset:

    position_x,position_y,position_z,start_time,opreason,stage,header2,header2,header2,header2,header2,header2
    649794,4764274,1147,2/11/2016 00:00,OPERATING,Sound,

The modified script below works for my purpose:

    import os, csv                                                                           # Import csv library
    import io
    import glob
    import datetime                                                       
    import time
    import collections

    def ProcessData( data ):                                                                #   Function definition: filter data
        processedData = []                                                                  #   empty process data list

        for row in data:
            if (row[ 15 ] == 'OPERATING' and row[ 6 ] == 'truck'):                          #   Filter on the two columns of interest
                n1=datetime.datetime.strptime(row[3], '%Y-%m-%d %H:%M:%S')                  #   Parse start time for duration calc
                n2=datetime.datetime.strptime(row[4], '%Y-%m-%d %H:%M:%S')                  #   Parse end time for duration calc
                diff = n2 - n1                                                              #   duration calc
                outputRow = row[ 0:3 ] + row[ 3:5 ] + [diff.total_seconds()]
                processedData.append( outputRow )                                           #   Append the filtered row (with duration) to the output list

        return processedData                                                                #   Final Processed data


    def ProcessAndWrite( data, filename ):                                                  #   Function Definition: Write data
        processedData = ProcessData( data ) 

        name, ext = os.path.splitext( filename )                                            #   Split the file name from the original to define the output as weeks mastersheet
        outputfilename = name + '_week_combined.csv'                                

        print "writing data to " + str( outputfilename )                                    #   Screen output describing file to look for

        with open(outputfilename, 'wb') as csvfile:                                         #   'wb' is write binary file

            writer = csv.writer(csvfile)                                                    #   Next line is a hack to put headers in the csv
            writer.writerow(['position_x','position_y','position_z','start_time','end_time','model','number','speed','o','stage','duration', 'cumulative_duration'])
            for row in processedData:
                writer.writerow(row)


    if __name__ == "__main__":                                                              #   Run script directly through python (not imported)

        import sys
        path = r'E:\\'                                                                  #   Set correct folder location for file merge
        filenames = glob.glob(os.path.join(path, '*.csv'))                                  #   Select correct files for merge    
        filenames.sort()                                                                    #   Sort the folder so that the files are in date order to make sure you don't crash the script later

        data = []                                                                           #   Blank data list

        def dateFromFilename( name ):                                                       #   Function to select the correct files from truck speed folder
            path,filename = os.path.split(name)                                             
            splitName = filename.split('_')
            dateStr = splitName[0]
            date = datetime.datetime.strptime(dateStr,'%Y-%m-%d')                           #   Split file name date and words
            return date                                                                     #   Need to put this in so it returns an actual value!

        firstFileDate = None
        lastFilename = None

        for filename in filenames:                                                          #   Select file
            currentFileDate = dateFromFilename( filename )

            if firstFileDate:
                diff = currentFileDate - firstFileDate
                # somehow convert this to days
                if ( diff.days >= 1 ):                                                      #   Select the previous 24 hrs' worth of data
                    ProcessAndWrite( data, lastFilename )                                   #   Call function to write data
                    data = []
            firstFileDate = currentFileDate
            lastFilename = filename

            with open(filename, 'r') as csvfile:                                            #   For new CSV files
                reader = csv.reader(csvfile, delimiter = ',')                               #   read the csv
                header = []                                                                 #   Blank header list (do this to skip the header rows for merge)
                for headerCount in range( 3 ):                                              #   Skip the first 3 header rows so only data is merged
                    header.append(next(reader))
                data.extend( [ row for row in reader ] )                                    #   extend is to continue the data stacking with the next csv data

        if ( len( data ) > 0 ):                                                             #   If the list of data has data then continue to process and write
            ProcessAndWrite( data, filename )
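
As a quick sanity check of the duration calculation used in ProcessData, the snippet below parses two made-up timestamps with the same format string and prints the difference in seconds:

    import datetime

    n1 = datetime.datetime.strptime('2016-02-11 00:00:00', '%Y-%m-%d %H:%M:%S')
    n2 = datetime.datetime.strptime('2016-02-11 00:12:30', '%Y-%m-%d %H:%M:%S')
    diff = n2 - n1
    print(diff.total_seconds())   # 750.0 seconds, i.e. 12.5 minutes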

0 Answers:

There are no answers yet.