我使用 dataframe.from_delayed 方法创建了一个数据框,其中包含以下列:
_id hour_timestamp http_method total_hits username hour weekday
源数据框的一些细节:
hits_rate_stats._meta.dtypes
_id object
hour_timestamp datetime64[ns]
http_method object
total_hits object
username object
hour int64
weekday int64
dtype: object
meta index:
RangeIndex(start=0, stop=0, step=1)
当我执行以下代码时
my_df_grouped = my_df.groupby(['username', 'http_method', 'weekday', 'hour'])
my_df_grouped.total_hits.sum().reset_index().compute()
我得到以下异常:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-27-b24b24fc86db> in <module>()
----> 1 hits_rate_stats_grouped.total_hits.sum().reset_index().compute()
/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/dask/base.pyc in compute(self, **kwargs)
141 dask.base.compute
142 """
--> 143 (result,) = compute(self, traverse=False, **kwargs)
144 return result
145
/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/dask/base.pyc in compute(*args, **kwargs)
390 postcomputes = [a.__dask_postcompute__() if is_dask_collection(a)
391 else (None, a) for a in args]
--> 392 results = get(dsk, keys, **kwargs)
393 results_iter = iter(results)
394 return tuple(a if f is None else f(next(results_iter), *a)
/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/distributed/client.pyc in get(self, dsk, keys, restrictions, loose_restrictions, resources, sync, asynchronous, **kwargs)
2039 secede()
2040 try:
-> 2041 results = self.gather(packed, asynchronous=asynchronous)
2042 finally:
2043 for f in futures.values():
/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/distributed/client.pyc in gather(self, futures, errors, maxsize, direct, asynchronous)
1476 return self.sync(self._gather, futures, errors=errors,
1477 direct=direct, local_worker=local_worker,
-> 1478 asynchronous=asynchronous)
1479
1480 @gen.coroutine
/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/distributed/client.pyc in sync(self, func, *args, **kwargs)
601 return future
602 else:
--> 603 return sync(self.loop, func, *args, **kwargs)
604
605 def __repr__(self):
/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/distributed/utils.pyc in sync(loop, func, *args, **kwargs)
251 e.wait(10)
252 if error[0]:
--> 253 six.reraise(*error[0])
254 else:
255 return result[0]
/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/distributed/utils.pyc in f()
235 yield gen.moment
236 thread_state.asynchronous = True
--> 237 result[0] = yield make_coro()
238 except Exception as exc:
239 logger.exception(exc)
/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/tornado/gen.pyc in run(self)
1053
1054 try:
-> 1055 value = future.result()
1056 except Exception:
1057 self.had_exception = True
/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/tornado/concurrent.pyc in result(self, timeout)
236 if self._exc_info is not None:
237 try:
--> 238 raise_exc_info(self._exc_info)
239 finally:
240 self = None
/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/tornado/gen.pyc in run(self)
1061 if exc_info is not None:
1062 try:
-> 1063 yielded = self.gen.throw(*exc_info)
1064 finally:
1065 # Break up a reference to itself
/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/distributed/client.pyc in _gather(self, futures, errors, direct, local_worker)
1354 six.reraise(type(exception),
1355 exception,
-> 1356 traceback)
1357 if errors == 'skip':
1358 bad_keys.add(key)
/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/dask/dataframe/core.pyc in apply_and_enforce()
3354 return meta
3355 c = meta.columns if isinstance(df, pd.DataFrame) else meta.name
-> 3356 return _rename(c, df)
3357 return df
3358
/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/dask/dataframe/core.pyc in _rename()
3391 # deep=False doesn't doesn't copy any data/indices, so this is cheap
3392 df = df.copy(deep=False)
-> 3393 df.columns = columns
3394 return df
3395 elif isinstance(df, (pd.Series, pd.Index)):
/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/pandas/core/generic.pyc in __setattr__()
3625 try:
3626 object.__getattribute__(self, name)
-> 3627 return object.__setattr__(self, name, value)
3628 except AttributeError:
3629 pass
pandas/_libs/properties.pyx in pandas._libs.properties.AxisProperty.__set__()
/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/pandas/core/generic.pyc in _set_axis()
557
558 def _set_axis(self, axis, labels):
--> 559 self._data.set_axis(axis, labels)
560 self._clear_item_cache()
561
/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/pandas/core/internals.pyc in set_axis()
3072 raise ValueError('Length mismatch: Expected axis has %d elements, '
3073 'new values have %d elements' %
-> 3074 (old_len, new_len))
3075
3076 self.axes[axis] = new_labels
ValueError: Length mismatch: Expected axis has 5 elements, new values have 2 elements
当我执行 my_df_grouped.count().reset_index().compute() 时,一切正常;
而当我执行 my_df_grouped.sum().reset_index().compute() 时,
会得到以下错误:
/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/pandas/core/groupby.pyc in _get_grouper()
2830 raise ValueError('No group keys passed!')
2831 else:
-> 2832 raise ValueError('multiple levels only valid with '
2833 'MultiIndex')
2834
ValueError: multiple levels only valid with MultiIndex
在本地使用虚拟数据无法复现这些错误。可能是什么原因导致的?
编辑:看起来它丢失了 MultiIndex(多级索引)。如果我这样做:
total_hits = my_df_grouped.total_hits.sum()
total_hits._meta.index = pd.MultiIndex(levels=[[],[],[],[],], labels=[[],[],[],[]], names=['username', 'http_method', 'weekday', 'hour'])