Python memory error when looping over files

Asked: 2017-12-14 22:26:32

Tags: python parallel-processing out-of-memory

I am looping over 60 files of between 0.3 and 3 GB each, performing some basic calculations, and writing the results out. I cannot figure out why I run out of memory around file 48. One or more objects must be stored somewhere and keep growing, but I cannot work out what it is. At the last memory error I was using 120 of 125 GB of RAM... I also run in parallel at two points.
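
One way to look for what is accumulating is the standard-library tracemalloc module: snapshot allocations once per file and print the call sites that grew the most. This is only a diagnostic sketch, not part of the original code; perf_files and process_one_file are placeholders for the loop and the per-file work shown below.

    import tracemalloc

    tracemalloc.start()
    baseline = tracemalloc.take_snapshot()

    for file_ in perf_files:          # placeholder: the loop from the code below
        process_one_file(file_)       # placeholder: the per-file work
        snapshot = tracemalloc.take_snapshot()
        # print the ten call sites whose allocations grew the most since the
        # previous file -- candidates for the object that keeps growing
        for stat in snapshot.compare_to(baseline, 'lineno')[:10]:
            print(stat)
        baseline = snapshot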

Main function:

import glob
import multiprocessing as mp
import os
import timeit

import numpy as np
import pandas as pd

def write_dynamic_columns():
    perf_path = BASE_DIR + RAW_DIR
    perf_files = glob.glob(perf_path + "/*" + INPUT_FILE_TYPE)
    loanmap = BASE_DIR + APPL_DIR + LOAN_MAP_DIR + LOAN_MAP_FILE_NAME
    loanmapdata = pd.read_table(loanmap, sep='|')
    for file_ in perf_files:
        if 'time' in file_:
            print("Working on file: " + file_)
            start = timeit.default_timer()
            data = pd.read_table(file_, delimiter='|', header=None)
            final_data = pd.merge(data, loanmapdata, left_on=0,
                                  right_on='loan_id', how='left')
            final_data = final_data.iloc[:, :-1]

            with mp.Pool(processes=14) as pool:
                chunks = pool.map(make_chunks, np.array_split(final_data, 14))
                # the with block terminates the pool on exit, so close()
                # and join() here only make the shutdown explicit
                pool.close()
                pool.join()

            with mp.Pool(processes=14) as pool:
                pool.map(write_in_parallel, np.ravel(chunks))
            stop = timeit.default_timer()
            total_time = stop - start
            mins, secs = divmod(total_time, 60)
            hours, mins = divmod(mins, 60)
            print("Finished writing quarter: " + file_ + "\n" +
                  "Time: %d:%d:%d." % (hours, mins, secs))

The other functions it calls:

def make_chunks(data):
    # fill missing values with the missing-data character
    data.fillna(MISSING_DATA_CHAR, inplace=True)
    # convert the month column to string
    data.iloc[:, 1] = data.iloc[:, 1].astype(str)
    # create the list of months to output
    file_list = list(data.iloc[:, 1].unique())
    # create a dictionary of key=month, value=array of rows
    dictinput = make_slices(file_list, data)

    return dictinput

def make_slices(files, data):
    outlist = dict()
    for mon in files:
        data_slice = np.array(data[data.iloc[:, 1] == mon])
        outlist[mon] = data_slice
    return outlist
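
As an aside, the per-month slicing can be expressed as a single pandas groupby pass instead of one boolean scan of the frame per month. This is a sketch of an equivalent (same dict of month -> array output), not the code from the post:

    def make_slices_groupby(data):
        # one pass over the frame; each group holds the rows for one month
        return {mon: np.asarray(grp)
                for mon, grp in data.groupby(data.iloc[:, 1])}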

def write_in_parallel(dictinput):
    for key, value in dictinput.items():
        data = pd.DataFrame(value)
        file_name = (BASE_DIR + OUT_DIR + DYNAMIC_FILE_NAME + key +
                     OUTPUT_FILE_TYPE)
        file_exists = os.path.isfile(file_name)
        with open(file_name, 'a') as open_file:
            # write the header row only when the file is first created;
            # the with block closes the file, so no explicit close() is needed
            if not file_exists:
                data.to_csv(open_file, sep='|', index=False, header=HEADERS)
            else:
                data.to_csv(open_file, sep='|', index=False, header=False)
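
For the append logic, to_csv can also take a path and mode='a' directly, which collapses the header decision to one call. A sketch under the same assumed constants (BASE_DIR, OUT_DIR, HEADERS, etc. as in the post):

    def write_in_parallel(dictinput):
        for key, value in dictinput.items():
            data = pd.DataFrame(value)
            file_name = (BASE_DIR + OUT_DIR + DYNAMIC_FILE_NAME + key +
                         OUTPUT_FILE_TYPE)
            file_exists = os.path.isfile(file_name)
            # append mode; emit the header row only on first creation
            data.to_csv(file_name, sep='|', mode='a', index=False,
                        header=HEADERS if not file_exists else False)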

0 Answers:

No answers yet.