从使用Dask延迟的简单示例中我期望,我已经阅读到,我基本上可以使用以下几个函数调用从scikit-learn复制gridsearchcv。看来该模型永远都不适合(model.fit(...)),因为循环的其余部分仍在继续(pred(...))?
我如何嵌套函数有问题吗?我知道有dask的gridsearchcv,但是问题是我的真实模型是多输入Keras LSTM,您不能将3d数组传递为“ X”。该代码在没有Dask的情况下可以正常工作。
这是一个可复制的小例子:
import dask
import pandas as pd
import numpy as np
from sklearn.datasets import load_boston
from sklearn.model_selection import KFold,ParameterGrid
from sklearn.metrics import mean_squared_error
from keras import Sequential
from keras.layers import Dense
boston = load_boston()
y=boston.target
X=boston.data
@dask.delayed
def create_model(dense_nodes):
model = Sequential()
model.add(Dense(dense_nodes, input_dim=13, kernel_initializer='normal', activation='relu'))
model.add(Dense(1, kernel_initializer='normal'))
# Compile model
model.compile(loss='mean_squared_error', optimizer='adam')
return model
@dask.delayed
def cv_model(X,y,kf,params_dct):
dense_nodes = params_dct['dense']
hold_actual=np.zeros((X.shape[0],1))
hold_preds=np.zeros((X.shape[0],1))
for train_index, test_index in kf.split(X):
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
model=create_model(dense_nodes)
model.fit(X_train,y_train,batch_size=64, epochs=5)
pred=model.predict(X_test)
hold_actual[test_index,0]=y_test.ravel()
hold_preds[test_index,0]=pred.ravel()
return(mean_squared_error(hold_actual,hold_preds))
kfold=KFold(n_splits=3,random_state=4521)
grid=ParameterGrid({'dense':[2,3,4,5,6,7,8,9,10]})
output=[]
for i in grid:
output.append(cv_model(X,y,kfold,grid[0]))
total=dask.delayed(output)
total.compute()
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-53-2116b76de18c> in <module>()
52
53 total=dask.delayed(output)
---> 54 total.compute()
~/anaconda3/lib/python3.6/site-packages/dask/base.py in compute(self, **kwargs)
153 dask.base.compute
154 """
--> 155 (result,) = compute(self, traverse=False, **kwargs)
156 return result
157
~/anaconda3/lib/python3.6/site-packages/dask/base.py in compute(*args, **kwargs)
402 postcomputes = [a.__dask_postcompute__() if is_dask_collection(a)
403 else (None, a) for a in args]
--> 404 results = get(dsk, keys, **kwargs)
405 results_iter = iter(results)
406 return tuple(a if f is None else f(next(results_iter), *a)
~/anaconda3/lib/python3.6/site-packages/dask/threaded.py in get(dsk, result, cache, num_workers, **kwargs)
73 results = get_async(pool.apply_async, len(pool._pool), dsk, result,
74 cache=cache, get_id=_thread_get_id,
---> 75 pack_exception=pack_exception, **kwargs)
76
77 # Cleanup pools associated to dead threads
~/anaconda3/lib/python3.6/site-packages/dask/local.py in get_async(apply_async, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, **kwargs)
519 _execute_task(task, data) # Re-execute locally
520 else:
--> 521 raise_exception(exc, tb)
522 res, worker_id = loads(res_info)
523 state['cache'][key] = res
~/anaconda3/lib/python3.6/site-packages/dask/compatibility.py in reraise(exc, tb)
65 if exc.__traceback__ is not tb:
66 raise exc.with_traceback(tb)
---> 67 raise exc
68
69 else:
~/anaconda3/lib/python3.6/site-packages/dask/local.py in execute_task(key, task_info, dumps, loads, get_id, pack_exception)
288 try:
289 task, data = loads(task_info)
--> 290 result = _execute_task(task, data)
291 id = get_id()
292 result = dumps((result, id))
~/anaconda3/lib/python3.6/site-packages/dask/local.py in _execute_task(arg, cache, dsk)
269 func, args = arg[0], arg[1:]
270 args2 = [_execute_task(a, cache) for a in args]
--> 271 return func(*args2)
272 elif not ishashable(arg):
273 return arg
<ipython-input-53-2116b76de18c> in cv_model(X, y, kf, params_dct)
38 pred=model.predict(X_test)
39 hold_actual[test_index,0]=y_test.ravel()
---> 40 hold_preds[test_index,0]=pred.ravel()
41
42 return(mean_squared_error(hold_actual,hold_preds))
ValueError: setting an array element with a sequence.
添加#1
这是第二次尝试,错误仍然存在。
import dask
import pandas as pd
import numpy as np
from sklearn.datasets import load_boston
from sklearn.model_selection import KFold,ParameterGrid
from sklearn.metrics import mean_squared_error
from keras import Sequential
from keras.layers import Dense
import tensorflow as tf
boston = load_boston()
y=boston.target
X=boston.data
import tensorflow as tf
#You never want to call delayed functions from within other delayed functions
#https://stackoverflow.com/questions/51219354/cant-train-keras-model-with-dask
@dask.delayed
def create_model(dense_nodes):
model = Sequential()
model.add(Dense(dense_nodes, input_dim=13, kernel_initializer='normal', activation='relu'))
model.add(Dense(1, kernel_initializer='normal'))
# Compile model
model.compile(loss='mean_squared_error', optimizer='adam')
return model
def cv_model(X,y,kf,params_dct):
dense_nodes = params_dct['dense']
hold_actual=np.zeros((X.shape[0],1))
hold_preds=np.zeros((X.shape[0],1))
for train_index, test_index in kf.split(X):
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
model=create_model(dense_nodes)
model.fit(X_train,y_train,batch_size=64, epochs=5)
pred=model.predict(X_test)
hold_actual[test_index,0]=y_test.ravel()
hold_preds[test_index,0]=pred.ravel()
return(dask.delayed(mean_squared_error(hold_actual,hold_preds)))
kfold=KFold(n_splits=3,random_state=4521)
grid=ParameterGrid({'dense':[2,3,4,5,6,7,8,9,10]})
output=[]
for i in grid:
delayed_value=cv_model(X,y,kfold,grid[0])
result=delayed_value.compute()
ADD#2
事实证明,Keras / TF出现了一个导致Dask以外的错误的问题。我将在另一个问题中解决这个问题。因此,我将Keras模型换成Xgboost模型,以便为此目的适当设置Dask。
这是代码。我确实发现我需要注释掉在mean_squared_error位中延迟发送给Dask的电话。
import dask
import pandas as pd
import numpy as np
from sklearn.datasets import load_boston
from sklearn.model_selection import KFold,ParameterGrid
from sklearn.metrics import mean_squared_error
import xgboost as xgb
boston = load_boston()
y=boston.target
X=boston.data
@dask.delayed
def cv_model(X,y,kf,params_dct):
hold_actual=np.zeros((X.shape[0],1))
hold_preds=np.zeros((X.shape[0],1))
for train_index, test_index in kf.split(X):
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
dtrain=xgb.DMatrix(data=X_train, label=y_train)
dtest=xgb.DMatrix(data=X_test, label=y_test)
regmod = xgb.train(params_dct, dtrain, 10)
pred=regmod.predict(dtest)
hold_actual[test_index,0]=y_test.ravel()
hold_preds[test_index,0]=pred.ravel()
#return(dask.delayed(mean_squared_error)(np.array(hold_actual),np.array(hold_preds)))
return({'result':mean_squared_error(np.array(hold_actual),np.array(hold_preds)),'param':params_dct})
kfold=KFold(n_splits=3,random_state=4521)
grid=ParameterGrid({'max_depth':[2,3,4,5,6,7,8,9,10], 'eta':[0.01,0.05], 'min_child_weight': [1,2,3,4,5]})
output=[]
for i in grid:
output.append(cv_model(X,y,kfold,i))
total=dask.delayed(output)
result=total.compute()
答案 0 :(得分:1)
您不想在cv_model
函数上调用dask.delayed。您永远都不想从其他延迟函数中调用延迟函数。相反,调用延迟函数的函数通常非常快(它们不做任何工作),因此您希望立即而不是懒惰地调用它们。
您的for循环似乎懒惰地创建了许多模型,调用了那些模型的方法(也将是惰性的),然后对结果调用mean_squared_error
。此功能可能还必须标记为延迟,例如
return dask.delayed(mean_squared_error)(hold_actual, hold_preds))
然后,如果您从cv_model中删除延迟的装饰器,则应该可以执行以下操作:
delayed_value = cv_model(...)
result = delayed_value.compute()
在第二个示例中,您不使用返回值调用model.fit
:
model=create_model(dense_nodes)
model.fit(X_train,y_train,batch_size=64, epochs=5)
pred=model.predict(X_test)
延迟未在适当位置运行,因此仅调用model.fit
不会执行任何操作。你可能想要
model = model.fit(...)
在这里,您是在结果而不是mean_squared_error
函数上调用dask.delay
return(dask.delayed(mean_squared_error(hold_actual,hold_preds)))