将多个csv文件连接成一个

时间:2016-12-20 10:16:49

标签: csv pandas

我有多个 .csv 文件,我想将它们连接成一个文件。基本上,我想从每个文件中选取某些列,并将它们并排放在一起。

我这里的代码不起作用。完全没有错误消息。它什么也没做。

有人知道如何解决它吗?

import pandas as pd
import datetime
import numpy as np
import glob
import csv
import os



def concatenate(indir='/My Documents/Python/Test/in',
                outfile='/My Documents/Python/Test/out/Forecast.csv'):
    """Stack every ``*.csv`` file found in *indir* and write one CSV.

    The process working directory is changed to *indir* (and left there).
    Each file is read into a DataFrame, the frames are concatenated row-wise
    (``axis=0``), the columns are relabelled with a fixed list of eleven
    names, and the result is written to *outfile* without an index column.
    """
    header = [
        "DateTime", "WindSpeed", "Capacity",
        "p0.025", "p0.05", "p0.1", "p0.5", "p0.9", "p0.95", "p0.975",
        "suffix",
    ]

    os.chdir(indir)
    csv_names = glob.glob('*.csv')
    print(csv_names)

    frames = []
    for csv_name in csv_names:
        print(csv_name)
        frames.append(
            pd.read_csv(
                csv_name,
                delimiter=',',
                engine='python',
                encoding='latin-1',
                index_col=False,
            )
        )

    # axis=0 appends the files one below the other.
    stacked = pd.concat(frames, axis=0)
    stacked.columns = header
    stacked.to_csv(outfile, index=None)

1 个答案:

答案 0 :(得分:0)

我运行此代码以在我的文件系统上设置文件

**设置**

import pandas as pd
import numpy as np

def setup_test_files(indir='in'):
    """Write three small random CSV files into *indir* for experimenting.

    The files are named ``fn_0.csv``, ``fn_1.csv`` and ``fn_2.csv``. Each one
    has a ``DateTime`` index column holding three consecutive month-end dates
    starting at 2016-03-31, followed by ten columns of random values rounded
    to two decimals. The path of every file is printed after it is written.

    Parameters
    ----------
    indir : str
        Directory that receives the files. It is created if it is missing.
    """
    import os  # local import: this snippet's header only imports pandas/numpy

    colnames = [
        "WindSpeed", "Capacity",
        "p0.025", "p0.05", "p0.1", "p0.5",
        "p0.9", "p0.95", "p0.975", "suffix"
    ]
    # An explicit MonthEnd offset behaves the same on old and new pandas,
    # whereas the 'M' string alias is deprecated in recent releases.
    tidx = pd.date_range(
        '2016-03-31', periods=3, freq=pd.offsets.MonthEnd(), name='DateTime')

    # to_csv does not create missing directories, so make sure it exists.
    if not os.path.isdir(indir):
        os.makedirs(indir)

    # Build the paths from `indir` so the parameter is actually honoured.
    for filename in ['{}/fn_{}.csv'.format(indir, i) for i in range(3)]:
        pd.DataFrame(
            np.random.rand(3, len(colnames)),
            index=tidx, columns=colnames
        ).round(2).to_csv(filename)
        print(filename)

# Generate the sample input files using the default directory name 'in'.
setup_test_files()

这会创建 3 个文件,名为 ['fn_0.csv', 'fn_1.csv', 'fn_2.csv']。它们的内容如下所示:

# Dump the raw text of in/fn_0.csv to show what a generated file looks like.
with open('in/fn_0.csv', 'r') as fo:
    print(''.join(fo.readlines()))

DateTime,WindSpeed,Capacity,p0.025,p0.05,p0.1,p0.5,p0.9,p0.95,p0.975,suffix
2016-03-31,0.03,0.76,0.62,0.21,0.76,0.36,0.44,0.61,0.23,0.04
2016-04-30,0.39,0.12,0.31,0.99,0.86,0.35,0.15,0.61,0.55,0.03
2016-05-31,0.72,1.0,0.71,0.86,0.41,0.79,0.22,0.76,0.92,0.79

我将定义一个解析器函数和一个单独执行串联的函数。为什么?因为我认为这样更容易。

import pandas as pd
import glob
import os


def read_csv(fn):
    """Load one forecast CSV and give it the standard column labels.

    The file *fn* is read as latin-1 text and its header is overwritten
    with the eleven fixed names below, ``DateTime`` included as an
    ordinary column. The relabelled DataFrame is returned.
    """
    frame = pd.read_csv(fn, encoding='latin-1')
    frame.columns = [
        "DateTime",
        "WindSpeed",
        "Capacity",
        "p0.025",
        "p0.05",
        "p0.1",
        "p0.5",
        "p0.9",
        "p0.95",
        "p0.975",
        "suffix",
    ]
    return frame


def concatenate(indir='in', outfile='out/Forecast.csv'):
    """Place every CSV in *indir* side by side and write them to *outfile*.

    Each ``*.csv`` file in *indir* is loaded with ``read_csv`` and the frames
    are joined column-wise (``axis=1``). The file name without its extension
    becomes the outer level of a two-level column header, so the origin of
    every column stays visible. The result is written without an index.

    Parameters
    ----------
    indir : str
        Directory that is searched for ``*.csv`` files.
    outfile : str
        Destination path; a relative path is resolved against the working
        directory the caller was in, not against *indir*.

    Raises
    ------
    ValueError
        If *indir* contains no ``.csv`` files.

    Errors from changing directory, reading or writing are no longer
    swallowed; they propagate to the caller after the working directory has
    been restored.
    """
    curdir = os.getcwd()

    try:
        os.chdir(indir)
        # Sort so the column order does not depend on directory listing order.
        file_list = sorted(glob.glob('*.csv'))
        if not file_list:
            raise ValueError(
                'no .csv files found in {!r}'.format(indir))

        # splitext removes only the extension; str.replace would also strip
        # a '.csv' that happens to appear in the middle of a name.
        df_names = [os.path.splitext(fn)[0] for fn in file_list]

        concat_df = pd.concat(
            [read_csv(fn) for fn in file_list],
            axis=1, keys=df_names)
    finally:
        # Always change directory back, whether or not reading succeeded.
        os.chdir(curdir)

    concat_df.to_csv(outfile, index=None)

然后运行连接

# Run with the defaults: read from 'in' and write to 'out/Forecast.csv'.
concatenate()

您可以像这样阅读结果

# header=[0, 1] reads the first two rows as a two-level column header
# (source file name on top, original column name below).
print(pd.read_csv('out/Forecast.csv', header=[0, 1]))

         fn_0                                                                 \
     DateTime WindSpeed Capacity p0.025 p0.05  p0.1  p0.5  p0.9 p0.95 p0.975   
0  2016-03-31      0.03     0.76   0.62  0.21  0.76  0.36  0.44  0.61   0.23   
1  2016-04-30      0.39     0.12   0.31  0.99  0.86  0.35  0.15  0.61   0.55   
2  2016-05-31      0.72     1.00   0.71  0.86  0.41  0.79  0.22  0.76   0.92   

   ...        fn_2                                                              
   ...   WindSpeed Capacity p0.025 p0.05  p0.1  p0.5  p0.9 p0.95 p0.975 suffix  
0  ...        0.80     0.79   0.38  0.94  0.91  0.18  0.27  0.14   0.39   0.91  
1  ...        0.60     0.97   0.04  0.69  0.04  0.65  0.94  0.81   0.37   0.22  
2  ...        0.78     0.53   0.83  0.93  0.92  0.12  0.15  0.65   0.06   0.11  

[3 rows x 33 columns]

注意:

您没有将 DateTime 设为索引,而我猜这可能才是您想要的。如果是这样,请将 read_csv 和 concatenate 这两个函数改为如下形式:

import pandas as pd
import glob
import os


def read_csv(fn):
    """Load one forecast CSV with its first column as a date index.

    The first column of *fn* is parsed as dates and used as the index,
    which is named ``DateTime``. The remaining columns are relabelled with
    the ten fixed names below. The file is read as latin-1 text.
    """
    # index_col / parse_dates turn the first column into a date index
    frame = pd.read_csv(
        fn,
        encoding='latin-1',
        parse_dates=[0],
        index_col=0,
    )
    frame.columns = [
        "WindSpeed",
        "Capacity",
        "p0.025",
        "p0.05",
        "p0.1",
        "p0.5",
        "p0.9",
        "p0.95",
        "p0.975",
        "suffix",
    ]
    frame.index.name = "DateTime"
    return frame


def concatenate(indir='in', outfile='out/Forecast.csv'):
    """Place every CSV in *indir* side by side, aligned on their index.

    Each ``*.csv`` file in *indir* is loaded with ``read_csv`` and the frames
    are joined column-wise (``axis=1``). The file name without its extension
    becomes the outer level of a two-level column header. The result,
    including its index, is written to *outfile*.

    Parameters
    ----------
    indir : str
        Directory that is searched for ``*.csv`` files.
    outfile : str
        Destination path; a relative path is resolved against the working
        directory the caller was in, not against *indir*.

    Raises
    ------
    ValueError
        If *indir* contains no ``.csv`` files.

    Errors from changing directory, reading or writing are no longer
    swallowed; they propagate to the caller after the working directory has
    been restored.
    """
    curdir = os.getcwd()
    try:
        os.chdir(indir)
        # Sort so the column order does not depend on directory listing order.
        file_list = sorted(glob.glob('*.csv'))
        if not file_list:
            raise ValueError(
                'no .csv files found in {!r}'.format(indir))

        # splitext removes only the extension; str.replace would also strip
        # a '.csv' that happens to appear in the middle of a name.
        df_names = [os.path.splitext(fn)[0] for fn in file_list]

        concat_df = pd.concat(
            [read_csv(fn) for fn in file_list],
            axis=1, keys=df_names)
    finally:
        # Always change directory back, whether or not reading succeeded.
        os.chdir(curdir)

    concat_df.to_csv(outfile)

这是此更改的最终结果,请注意日期将以这种方式对齐

                fn_0                                                       \
           WindSpeed Capacity p0.025 p0.05  p0.1  p0.5  p0.9 p0.95 p0.975   
DateTime                                                                    
2016-03-31      0.03     0.76   0.62  0.21  0.76  0.36  0.44  0.61   0.23   
2016-04-30      0.39     0.12   0.31  0.99  0.86  0.35  0.15  0.61   0.55   
2016-05-31      0.72     1.00   0.71  0.86  0.41  0.79  0.22  0.76   0.92   

                   ...        fn_2                                          \
           suffix  ...   WindSpeed Capacity p0.025 p0.05  p0.1  p0.5  p0.9   
DateTime           ...                                                       
2016-03-31   0.04  ...        0.80     0.79   0.38  0.94  0.91  0.18  0.27   
2016-04-30   0.03  ...        0.60     0.97   0.04  0.69  0.04  0.65  0.94   
2016-05-31   0.79  ...        0.78     0.53   0.83  0.93  0.92  0.12  0.15   


           p0.95 p0.975 suffix  
DateTime                        
2016-03-31  0.14   0.39   0.91  
2016-04-30  0.81   0.37   0.22  
2016-05-31  0.65   0.06   0.11  

[3 rows x 30 columns]