我正在尝试使用python dask创建一个更新的随机森林分类示例,如最初描述的here。
当我尝试将训练集传递给Client.map函数时,它会抛出一个KeyError,我不确定基于错误消息我做错了什么。
这就是我所拥有的:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from distributed import Client, progress, wait
c = Client('127.0.0.1:8786')
c
columns = ['trip_distance', 'pickup_longitude', 'pickup_latitude',
'dropoff_longitude', 'dropoff_latitude', 'payment_type',
'fare_amount', 'mta_tax', 'tip_amount', 'tolls_amount']
import dask.dataframe as dd
dfs = dd.read_csv('s3://dask-data/nyc-taxi/2015/*.csv',
parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'],
storage_options={'anon': True})
dfs = c.persist(dfs)
progress(dfs)
def fit(df):
est = RandomForestClassifier(n_estimators=4)
est.fit(df[columns], df.passenger_count)
return est
train, test = dfs.random_split([0.7, 0.3])
estimators = c.map(fit, train)
progress(estimators, complete=False)
抛出错误:
KeyError Traceback (most recent call last)
/opt/anaconda/lib/python3.5/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2524 try:
-> 2525 return self._engine.get_loc(key)
2526 except KeyError:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 0
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-61-9846f819ffca> in <module>()
8 train, test = dfs.random_split([0.7, 0.3])
9
---> 10 estimators = c.map(fit, train)
11 progress(estimators, complete=False)
/opt/anaconda/lib/python3.5/site-packages/distributed/client.py in map(self, func, *iterables, **kwargs)
1243 raise ValueError("Only use allow_other_workers= if using workers=")
1244
-> 1245 iterables = list(zip(*zip(*iterables)))
1246 if isinstance(key, list):
1247 keys = key
/opt/anaconda/lib/python3.5/site-packages/dask/dataframe/core.py in __getitem__(self, key)
2284
2285 # error is raised from pandas
-> 2286 meta = self._meta[_extract_meta(key)]
2287 dsk = dict(((name, i), (operator.getitem, (self._name, i), key))
2288 for i in range(self.npartitions))
/opt/anaconda/lib/python3.5/site-packages/pandas/core/frame.py in __getitem__(self, key)
2137 return self._getitem_multilevel(key)
2138 else:
-> 2139 return self._getitem_column(key)
2140
2141 def _getitem_column(self, key):
/opt/anaconda/lib/python3.5/site-packages/pandas/core/frame.py in _getitem_column(self, key)
2144 # get column
2145 if self.columns.is_unique:
-> 2146 return self._get_item_cache(key)
2147
2148 # duplicate columns & possible reduce dimensionality
/opt/anaconda/lib/python3.5/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
1840 res = cache.get(item)
1841 if res is None:
-> 1842 values = self._data.get(item)
1843 res = self._box_item_values(item, values)
1844 cache[item] = res
/opt/anaconda/lib/python3.5/site-packages/pandas/core/internals.py in get(self, item, fastpath)
3841
3842 if not isna(item):
-> 3843 loc = self.items.get_loc(item)
3844 else:
3845 indexer = np.arange(len(self.items))[isna(self.items)]
/opt/anaconda/lib/python3.5/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2525 return self._engine.get_loc(key)
2526 except KeyError:
-> 2527 return self._engine.get_loc(self._maybe_cast_indexer(key))
2528
2529 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 0
根据错误输出,错误似乎在estimators = c.map(fit, train)
语句处被触发,表明可能需要修改def fit(df):
,以便可以将dask数据帧正确传递给{{1但是我不确定如何。