我没有找到与我的问题相关的已有提问,因此一直无法解决。我刚开始在数据分析中使用并行计算,目前正在用随机森林做预测。用 for 循环运行时结果正常,但耗时较长;改用进程池(Pool)进行多进程处理后速度更快,可是每次得到的结果都相同,而它们本应各不相同。我哪里做错了?
import data_process as imb
import datetime
import os
import pandas as pd
import numpy as np
from multiprocessing import Pool
from functools import partial
from myModels import RandomForest
def Average(lst):
    """Return the arithmetic mean of the values in *lst*.

    Args:
        lst: Values supporting ``sum``. Any iterable is accepted; an
            iterable without a length (e.g. a generator) is materialized
            first so it can be both summed and counted.

    Returns:
        ``sum(lst) / len(lst)``.

    Raises:
        ZeroDivisionError: If *lst* is empty.
    """
    values = lst if hasattr(lst, '__len__') else list(lst)
    count = len(values)
    if count == 0:
        # Same exception type the bare division would raise, but with a
        # message that points at the real cause.
        raise ZeroDivisionError('Average() of an empty sequence is undefined')
    return sum(values) / count
def split_imbal_process(df, a):
    """Score the model over rolling windows in parallel and average the results.

    The frame is cut to start ``a`` days before 2018-06-01, then one task is
    built per 48-row step; each task is evaluated by ``process_model`` in a
    worker process.

    Args:
        df: DataFrame with a ``price`` column. Its index elements must
            support ``strftime`` and the frame must be sliceable by
            ``'%Y-%m-%d'`` strings.
        a: Number of days of history before the prediction start. The
            training window length is computed as ``48 * a`` rows, i.e. the
            code assumes 48 rows per day.

    Returns:
        Tuple ``(mape, error)``: the means of the first and second elements
        of every ``process_model`` result.

    Raises:
        ZeroDivisionError: From ``Average`` when no window fits, i.e. when
            ``48 * a`` is not smaller than the number of non-NaN rows.
    """
    pred_start = datetime.datetime(2018, 6, 1)
    start_data = pred_start - datetime.timedelta(days=a)
    df = df[start_data.strftime('%Y-%m-%d'):]

    # One date string per row that has no missing values.
    d = [i.strftime('%Y-%m-%d') for i in df.dropna().index]
    n_train = 48 * a
    n_records = len(d)
    feature_list = list(df.drop('price', axis=1).columns)

    # Each task is (train_end, train_start), both positions into ``d``; the
    # window slides forward 48 rows per task.
    data = [(i, j * 48) for j, i in enumerate(range(n_train, n_records, 48))]

    # Leave two cores free when possible, but never ask for fewer than one
    # worker: os.cpu_count() may return None or a value <= 2, and Pool
    # rejects a process count below 1.
    n_workers = max(1, (os.cpu_count() or 1) - 2)
    func = partial(process_model, d, df, feature_list)

    # The context manager tears the workers down even if a task raises.
    with Pool(n_workers) as pool:
        result = pool.starmap(func, data)
        pool.close()
        pool.join()

    mape = Average([i[0] for i in result])
    error = Average([i[1] for i in result])
    print(mape, error)
    return mape, error
def process_model(d, df, feature_list, i, j):
    """Train and score the random forest on a single train/test window.

    Args:
        d: Sequence of index labels; ``d[j:i]`` selects the training labels
            and ``d[i:i + 48 * 5]`` the test labels.
        df: DataFrame with a ``price`` column, sliceable by the labels in
            ``d``.
        feature_list: Feature names, forwarded unchanged to ``RandomForest``.
        i: Position in ``d`` where training ends and testing begins.
        j: Position in ``d`` where training begins.

    Returns:
        Tuple ``(mape, errors)`` as produced by ``RandomForest``.
    """
    train_labels = d[j: i]
    # 48 * 5 labels of test horizon -- presumably five days at 48 rows per
    # day; confirm against the data's sampling rate.
    test_labels = d[i: i + 48 * 5]

    # Label-based slices spanning the first to the last label of each window.
    train_frame = df[train_labels[0]: train_labels[-1]]
    test_frame = df[test_labels[0]: test_labels[-1]]

    x_train = np.array(train_frame.drop('price', axis=1))
    y_train = np.array(train_frame['price'])
    x_test = np.array(test_frame.drop('price', axis=1))
    y_test = np.array(test_frame['price'])

    score, residuals = RandomForest(
        x_train, y_train, x_test, y_test, feature_list, test_frame.index)
    return score, residuals
if __name__ == '__main__':
    # Script entry point: evaluate the model for several history lengths.
    df = imb.imbalance_feature()[:'2018-06-30'].drop('date', axis=1)

    # Whole days of data available up to 2018-05-31, counted at 48 rows
    # per day.
    a = len(df[:'2018-05-31']) // 48

    # NOTE(review): never read in the visible code; kept in case later
    # code collects results into it.
    d = pd.DataFrame()

    # Shrink the history length by 40 days per run while it stays above 7.
    for i in range(a, 7, -40):
        # Pass the loop variable, not the constant ``a``: with ``a`` every
        # iteration evaluated the same window and returned the same result.
        d_mape, d_error = split_imbal_process(df, i)
        print(d_mape, d_error)