Dask delayed functions: poor CPU usage and slower than joblib

Date: 2019-12-27 05:35:01

Tags: python cpu dask joblib dask-delayed

I am trying to parallelize this code with Dask, but it runs slower than the multi-core joblib version; I think this is because it bottlenecks inside the functions.

I need to run this process roughly 10,000 times, so any improvement in execution time would help a lot.
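For reference, the joblib version I am comparing against looks roughly like this (a reconstruction from memory, since the exact joblib code is not shown here; it used dask-free versions of the same scoring functions):

from joblib import Parallel, delayed as joblib_delayed

# one independent task per config, spread across all cores
scores = Parallel(n_jobs=-1)(
    joblib_delayed(score_model)(data, n_test, cfg) for cfg in cfg_list
)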

In the grid-search part, each configuration could be scored independently, but as written every fit still runs sequentially (a sketch of the batched structure I am aiming for is at the end of this post).

from dask.distributed import Client, progress
client = Client('tcp://xx')
import dask
from math import sqrt
# NOTE: this forces the local threaded scheduler for every dask.compute()
# call, overriding the distributed client created above
dask.config.set(scheduler="threads")
from warnings import catch_warnings
from warnings import filterwarnings
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from sklearn.metrics import mean_squared_error
from pandas import read_csv
from numpy import array
import pandas as pd
import numpy as np
import time

start_time = time.time()
# one-step Holt Winter’s Exponential Smoothing forecast
def exp_smoothing_forecast(history, config):
    t, d, s, p, b, r = config
    # define model
    history = array(history)
    # wrapping the constructor in delayed() and computing it immediately
    # adds scheduler overhead without gaining any parallelism
    model = dask.delayed(ExponentialSmoothing)(history, trend=t, damped=d, seasonal=s, seasonal_periods=p)
    model = dask.compute(model)
    # fit model (dask.compute returns a tuple, hence model[0])
    model_fit = model[0].fit(optimized=True, use_boxcox=b, remove_bias=r)
    #model_fit=dask.compute(model_fit)
    # make one step forecast
    yhat = model_fit.predict(len(history), len(history))
    return yhat[0]

# root mean squared error or rmse
def measure_rmse(actual, predicted):
    return sqrt(mean_squared_error(actual, predicted))

# split a univariate dataset into train/test sets
def train_test_split(data, n_test):
    return data[:-n_test], data[-n_test:]

# walk-forward validation for univariate data
def walk_forward_validation(data, n_test, cfg):
    predictions = list()
    # split dataset
    train, test = train_test_split(data, n_test)
    # seed history with training dataset
    history = [x for x in train]
    # step over each time-step in the test set; each step depends on the
    # history extended in the previous step, so this loop is inherently
    # sequential, and delayed() plus an immediate compute() per step only
    # adds overhead
    for i in range(len(test)):
        # fit model and make forecast for history
        yhat = dask.delayed(exp_smoothing_forecast)(history, cfg)
        yhat = dask.compute(yhat)
        # store forecast in list of predictions (compute returns a tuple)
        predictions.append(yhat[0])
        # add actual observation to history for the next loop
        history.append(test[i])
    # estimate prediction error
    error = measure_rmse(test, predictions)
    return error

# score a model, return None on failure
def score_model(data, n_test, cfg, debug=False):
    result = None
    # convert config to a key
    key = str(cfg)
    # show all warnings and fail on exception if debugging
    if debug:
        result = walk_forward_validation(data, n_test, cfg)   
    else:
        # one failure during model validation suggests an unstable config
        try:
            # never show warnings when grid searching, too noisy
            with catch_warnings():
                filterwarnings("ignore")
                task = dask.delayed(walk_forward_validation)(data, n_test, cfg)
                # dask.compute returns a tuple; unwrap the single result
                result = dask.compute(task)[0]
        except:
            result = None
    return (key, result)

# grid search configs
def grid_search(data, cfg_list, n_test):
    # build one delayed task per config, then hand them all to the scheduler
    # in a single compute call (dask.compute does not traverse a generator,
    # so use a list and unpack it)
    tasks = [dask.delayed(score_model)(data, n_test, cfg) for cfg in cfg_list]
    scores = dask.compute(*tasks)
    # remove failed results
    scores = [r for r in scores if r[1] is not None]
    # sort configs by error, asc
    scores.sort(key=lambda tup: tup[1])
    return scores

# create a set of exponential smoothing configs to try
def exp_smoothing_configs(seasonal=[None]):
    models = list()
    # define config lists
    t_params = ['add', 'mul', None]
    d_params = [True, False]
    s_params = ['add', 'mul', None]
    p_params = seasonal
    b_params = [True, False]
    r_params = [True, False]
    # create config instances
    for t in t_params:
        for d in d_params:
            for s in s_params:
                for p in p_params:
                    for b in b_params:
                        for r in r_params:
                            cfg = [t,d,s,p,b,r]
                            models.append(cfg)
    return models

if __name__ == '__main__':
    # load dataset
    df = read_csv(r"DB SI SO.csv", parse_dates=True)
    df['date'] = pd.to_datetime(df['date'])
    nombre_columnas = df.columns.values.tolist()
    df = df.set_index(['date'])
    save = pd.DataFrame()
    rango = 4
    for i in range(rango):
        y = df.iloc[:, i]
        # dropna() is not in-place; keep the returned series
        y = y.dropna()
        data = y.values
        data = data[np.logical_not(np.isnan(data))]
        # data split
        n_test = 4
        # model configs
        cfg_list = exp_smoothing_configs(seasonal=[12])
        # grid search
        scores = grid_search(data, cfg_list, n_test)
        print(scores)
    print("---     %.2f Segundos     ---" % (time.time() - start_time))

0 Answers:

No answers yet.