I am trying to parallelize this code with dask, but it runs slower than my joblib multi-core version, and I think the bottleneck is in how the functions are structured. I need to run this procedure about 10,000 times, so any improvement in execution time would help a lot. In the grid search the configurations can be scored independently, but each individual fit still runs sequentially. My full dask code is below, after a simplified sketch of the joblib baseline I am comparing against; at the end of the post is a sketch of the pattern I think I should be aiming for.
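Roughly, the joblib baseline I am comparing against looks like this. It is only a simplified, self-contained sketch: score_config and the synthetic data are placeholders standing in for my real score_model and series, not the actual code.

from joblib import Parallel, delayed
import numpy as np

def score_config(data, n_test, cfg):
    # placeholder scoring: the real function fits an ExponentialSmoothing
    # model and returns (key, rmse-or-None)
    return (str(cfg), float(np.mean(data) * cfg))

if __name__ == '__main__':
    data = np.random.rand(100)
    n_test = 4
    cfg_list = [1, 2, 3, 4]
    # each config is scored in its own worker
    scores = Parallel(n_jobs=-1)(
        delayed(score_config)(data, n_test, cfg) for cfg in cfg_list
    )
    scores = [r for r in scores if r[1] is not None]
    scores.sort(key=lambda tup: tup[1])
    print(scores)

And this is my full dask attempt: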
from dask.distributed import Client, progress
client = Client('tcp://xx')
import dask
from math import sqrt
dask.config.set(scheduler="threads")
from warnings import catch_warnings
from warnings import filterwarnings
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from sklearn.metrics import mean_squared_error
from pandas import read_csv
from numpy import array
import pandas as pd
import numpy as np
import time
start_time = time.time()
# one-step Holt Winter’s Exponential Smoothing forecast
def exp_smoothing_forecast(history, config):
    t, d, s, p, b, r = config
    # define model
    history = array(history)
    model = dask.delayed(ExponentialSmoothing)(history, trend=t, damped=d, seasonal=s, seasonal_periods=p)
    model = dask.compute(model)
    # fit model
    model_fit = model[0].fit(optimized=True, use_boxcox=b, remove_bias=r)
    # model_fit = dask.compute(model_fit)
    # make one step forecast
    yhat = model_fit.predict(len(history), len(history))
    return yhat[0]
# root mean squared error or rmse
def measure_rmse(actual, predicted):
    return sqrt(mean_squared_error(actual, predicted))
# split a univariate dataset into train/test sets
def train_test_split(data, n_test):
    return data[:-n_test], data[-n_test:]
# walk-forward validation for univariate data
def walk_forward_validation(data, n_test, cfg):
    predictions = list()
    # split dataset
    train, test = train_test_split(data, n_test)
    # seed history with training dataset
    history = [x for x in train]
    # step over each time-step in the test set
    for i in range(len(test)):
        # fit model and make forecast for history
        yhat = dask.delayed(exp_smoothing_forecast)(history, cfg)
        yhat = dask.compute(yhat)
        # store forecast in list of predictions
        predictions.append(yhat[0])
        # add actual observation to history for the next loop
        history.append(test[i])
    # estimate prediction error
    error = measure_rmse(test, predictions)
    return error
# score a model, return None on failure
def score_model(data, n_test, cfg, debug=False):
    result = None
    # convert config to a key
    key = str(cfg)
    # show all warnings and fail on exception if debugging
    if debug:
        result = walk_forward_validation(data, n_test, cfg)
    else:
        # one failure during model validation suggests an unstable config
        try:
            # never show warnings when grid searching, too noisy
            with catch_warnings():
                filterwarnings("ignore")
                task = dask.delayed(walk_forward_validation)(data, n_test, cfg)
                result = dask.compute(task)
        except:
            error = None
    return (key, result)
# grid search configs
def grid_search(data, cfg_list, n_test):
    scores = None
    task = (dask.delayed(score_model)(data, n_test, cfg) for cfg in cfg_list)
    scores = dask.compute(task)
    # remove empty results
    scores = [r for r in scores if r[1] != None]
    # sort configs by error, asc
    scores.sort(key=lambda tup: tup[1])
    return scores
# create a set of exponential smoothing configs to try
def exp_smoothing_configs(seasonal=[None]):
    models = list()
    # define config lists
    t_params = ['add', 'mul', None]
    d_params = [True, False]
    s_params = ['add', 'mul', None]
    p_params = seasonal
    b_params = [True, False]
    r_params = [True, False]
    # create config instances
    for t in t_params:
        for d in d_params:
            for s in s_params:
                for p in p_params:
                    for b in b_params:
                        for r in r_params:
                            cfg = [t, d, s, p, b, r]
                            models.append(cfg)
    return models
if __name__ == '__main__':
    # load dataset
    df = read_csv(r"DB SI SO.csv", parse_dates=True)
    df['date'] = pd.to_datetime(df['date'])
    nombre_columnas = df.columns.values.tolist()
    df = df.set_index(['date'])
    save = pd.DataFrame()
    rango = 4
    # grid search each of the first `rango` columns separately
    for i in range(rango):
        y = df.iloc[:, i]
        y.dropna()
        data = y.values
        # drop any remaining NaNs from the series
        data = data[np.logical_not(np.isnan(data))]
        # data split
        n_test = 4
        # model configs
        cfg_list = exp_smoothing_configs(seasonal=[12])
        # grid search
        scores = grid_search(data, cfg_list, n_test)
        print(scores)
    print("--- %.2f seconds ---" % (time.time() - start_time))