我循环遍历 60 个大小在 0.3GB 到 3GB 之间的文件,执行一些基本计算,然后把结果写出来。我无法弄清楚为什么处理到第 48 个文件左右时会耗尽内存。一定是某个(或多个)对象被保存在某处并且不断增长,但我找不到是什么。最后一次出现内存错误时,我已经用掉了 125GB 内存中的 120GB……另外,我在两处使用了并行处理。
主函数:
def write_dynamic_columns():
    """Merge each raw performance file with the loan map, split it by month,
    and append the per-month slices to the output files in parallel.

    Memory notes (this is where the original ran out of RAM): each iteration
    used to hold the raw frame, the merged frame, the 14 array_split copies
    and the returned chunk dicts all at once, and never released them before
    the next (up to 3 GB) file was loaded.  Large intermediates are now
    deleted as soon as they are no longer needed, and gc.collect() runs once
    per file so the parent's peak RSS stays roughly one file's working set.
    """
    import gc  # local import: only needed for the per-file cleanup below

    perf_path = BASE_DIR + RAW_DIR
    perf_files = glob.glob(perf_path + "/*" + INPUT_FILE_TYPE)
    loanmap = BASE_DIR + APPL_DIR + LOAN_MAP_DIR + LOAN_MAP_FILE_NAME
    loanmapdata = pd.read_table(loanmap, sep='|')

    for file_ in perf_files:
        if 'time' not in file_:
            continue
        print("Working on file: " + file_)
        start = timeit.default_timer()

        data = pd.read_table(file_, delimiter='|', header=None)
        final_data = pd.merge(data, loanmapdata, left_on=0,
                              right_on='loan_id', how='left')
        # Drop the trailing merge-key column brought in by the join.
        final_data = final_data.iloc[:, :-1]
        del data  # release the raw frame before fan-out copies are made

        # maxtasksperchild=1 replaces each worker after one task, so worker
        # memory is returned to the OS instead of accumulating across files.
        # The `with` block already terminates the pool on exit; the old
        # explicit close()/join() calls were redundant.
        with mp.Pool(processes=14, maxtasksperchild=1) as pool:
            chunks = pool.map(make_chunks, np.array_split(final_data, 14))
        del final_data  # workers hold their own copies; parent copy not needed

        with mp.Pool(processes=14, maxtasksperchild=1) as pool:
            pool.map(write_in_parallel, np.ravel(chunks))
        del chunks
        gc.collect()  # reclaim the per-file working set before the next load

        stop = timeit.default_timer()
        total_time = stop - start
        mins, secs = divmod(total_time, 60)
        hours, mins = divmod(mins, 60)
        # "Time:" — the original string had lost its leading 'T' ("ime:").
        print("Finished writing quarter: " + file_ + "\n" +
              "Time: %d:%d:%d." % (hours, mins, secs))
被调用的其他函数:
def make_chunks(data):
    """Split one DataFrame chunk into per-month row arrays.

    Runs inside a worker process, so the in-place mutations below touch a
    process-local copy of the frame.  Returns a dict mapping month string
    (column 1) -> ndarray of that month's rows.
    """
    # Fill missing values with the sentinel character.
    data.fillna(MISSING_DATA_CHAR, inplace=True)
    # Column 1 holds the month; normalise it to string once.
    data.iloc[:, 1] = data.iloc[:, 1].astype(str)
    # Unique months present in this chunk.  (The original re-ran
    # .astype('str') here although the column was already converted above.)
    file_list = list(data.iloc[:, 1].unique())
    # key = month, value = ndarray of rows for that month
    return make_slices(file_list, data)
def make_slices(files, data):
    """Return {month: ndarray-of-rows} for each month listed in *files*.

    Rows are selected by matching column 1 of *data* against each month
    string, then materialised as a plain ndarray.
    """
    # Hoist the month column once instead of re-slicing per iteration.
    month_col = data.iloc[:, 1]
    return {mon: np.array(data[month_col == mon]) for mon in files}
def write_in_parallel(dictinput):
    """Append each month's rows to that month's output file.

    *dictinput* maps month string -> 2-D ndarray of row data (as produced by
    make_slices).  Writes HEADERS only when the file is being created; later
    calls append header-less rows.

    NOTE(review): concurrent workers appending to the same month's file can
    interleave writes — presumably each worker's months are disjoint; verify
    against the caller's chunking.
    """
    for key, value in dictinput.items():
        data = pd.DataFrame(value)
        # Build the path once; the original rebuilt the identical path inline
        # in open(), a duplication that could silently diverge.
        file_name = (BASE_DIR + OUT_DIR + DYNAMIC_FILE_NAME + key +
                     OUTPUT_FILE_TYPE)
        write_header = not os.path.isfile(file_name)
        # `with` closes the file; the old explicit close() calls inside the
        # block were redundant.
        with open(file_name, 'a') as open_file:
            data.to_csv(open_file, sep='|', index=False,
                        header=HEADERS if write_header else False)