如何将使用 Pandas DataFrame 的 Python 程序并行化，以清理由 2000 个 CSV 文件组成的数据集？

时间:2018-03-07 04:28:47

标签: python pandas csv multiprocessing

这是处理一个文件的代码。

import pandas as pd
import numpy as np

# Read the CSV as a header-less file and name its four columns explicitly.
df = pd.read_csv("cleaning/semicleaned1.csv", sep=",",
                 names=["press", "gph", "temp", "wspd"])
# Discard the 'gph' column, then save the remaining columns without the index.
df = df.drop(['gph'], axis=1)
df.to_csv("cleaning/FINAL.csv", sep=',', index=False)

1 个答案:

答案 0 :(得分:1)

要处理 cleaning 文件夹中的所有 CSV 文件，您可以使用 glob 来获取文件名列表。我建议根据输入文件名生成输出文件名，例如在文件名前加上 cleaned_ 前缀：

import pandas as pd
import glob
import os


for csv_filename in glob.glob('cleaning/*.csv'):
    folder, basename = os.path.split(csv_filename)

    # Skip files written by an earlier run: they match the same glob pattern,
    # and re-reading them would misparse them (header row read as data,
    # columns shifted) and pile up 'cleaned_cleaned_*' files.
    if basename.startswith('cleaned_'):
        continue

    # Create a suitable output filename based on the input filename:
    # same folder, with 'cleaned_' prepended to the file name.
    cleaned_filename = os.path.join(folder, 'cleaned_' + basename)
    print('{} -> {}'.format(csv_filename, cleaned_filename))

    # Process CSV: read it as header-less with four named columns,
    # drop 'gph', and write the rest without the index column.
    df = pd.read_csv(csv_filename, names=["press", "gph", "temp", "wspd"])
    df = df.drop(['gph'], axis=1)
    df.to_csv(cleaned_filename, sep=',', index=False)

请注意，如果您的文件带有标题行（表头），则需要小心：传入 names= 而不指定 header 时，pandas 会把第一行当作数据读取；此时应同时传入 header=0，用给定的列名替换原有表头。

您可以使用multiprocessing.Pool()并行运行:

from multiprocessing import Pool
import pandas as pd
import glob
import os


def clean_csv(csv_filename):
    """Drop the ``gph`` column from one CSV file and save the result next to it.

    The input is read as a header-less file whose four columns are named
    ``press``, ``gph``, ``temp`` and ``wspd``.  The output goes to the same
    folder as *csv_filename*, with ``cleaned_`` prepended to the file name,
    and is written with a header row and without the index column.
    """
    # Derive the output path: same folder, 'cleaned_' + original file name.
    folder, basename = os.path.split(csv_filename)
    stem, suffix = os.path.splitext(basename)
    cleaned_filename = os.path.join(folder, 'cleaned_{}{}'.format(stem, suffix))
    print('{} -> {}'.format(csv_filename, cleaned_filename))

    # Load, remove the unwanted column, and write the remainder.
    frame = pd.read_csv(csv_filename, names=["press", "gph", "temp", "wspd"])
    frame = frame.drop(columns=['gph'])
    frame.to_csv(cleaned_filename, sep=',', index=False)

if __name__ == '__main__':
    # The guard keeps worker processes from re-running this section when
    # they import the module.
    with Pool(10) as pool:
        # Leave out files written by an earlier run: they match the same
        # glob pattern and would otherwise be cleaned a second time.
        csv_filenames = [
            path for path in glob.glob('cleaning/*.csv')
            if not os.path.basename(path).startswith('cleaned_')
        ]
        # clean_csv has no return statement, so this prints a list of None
        # values, one per processed file.
        print(pool.map(clean_csv, csv_filenames))