将.diff()与dask数据帧一起使用时的ValueError

时间:2018-02-04 18:21:27

标签: python pandas time-series dask

我有一个大型时间序列数据集,我想用Dask处理。

除了其他一些列之外,还有一个名为“id”的列用于标识个人,列transc_date标识交易日期,列transc_time标识个人进行交易的时间。

使用以下方式对数据进行排序:

# Sort rows by id, then transaction date, then transaction time (all ascending).
# NOTE(review): map_partitions applies the lambda to each partition separately,
# so this orders rows within each partition only — confirm that rows of one id
# do not end up spread across partitions in an unexpected order.
df = df.map_partitions(lambda x: x.sort_values(['id', 'transc_date', 'transc_time'], ascending=[True, True, True]))

transc_time的类型为int,transc_date的类型为datetime64

我想创建一个新列,为每个人提供自上次交易以来的天数。为此,我创建了以下函数:

def get_diff_since_last_trans(df, plot=True):
    """Return, per id, the maximum gap in days between successive transaction dates.

    Args:
        df: Frame with 'id' and 'transc_date' columns that provides
            ``map_overlap`` (presumably a dask DataFrame — confirm against
            the caller). A 'diff_last' column is assigned onto it.
        plot: When true, the resulting values are drawn with ``sns.distplot``
            (KDE and rug disabled).

    Returns:
        The computed per-id maximum of 'diff_last', expressed in whole days.
    """
    # Row-to-row difference of 'transc_date' within each id. map_overlap is asked
    # for 10 rows before and 10 rows after each partition, so the lambda runs on
    # the partition combined with those neighbouring rows.
    # NOTE(review): assumes rows are already ordered by id and date — verify.
    df['diff_last'] = df.map_overlap(lambda x: x.groupby('id')['transc_date'].diff(), before=10, after=10)
    # Largest difference per id, converted to days; .compute() triggers evaluation.
    diffs = df[['id', 'diff_last']].groupby(['id']).agg('max')['diff_last'].dt.days.compute()
    if plot:
        sns.distplot(diffs.values, kde = False, rug = False)      
    return diffs

当我在一小部分数据(200k行)上尝试此函数时,它按预期工作。但是当我在完整数据集上使用它时,我得到一个ValueError。

我丢弃了所有出现次数少于10次的ID。transc_date不包含NaN,只包含datetime64类型的条目。

知道出了什么问题吗?

 ValueError                                Traceback (most recent call last)
<ipython-input-12-551d7256f328> in <module>()
      1 a = get_diff_first_last_trans(df, plot=False)
----> 2 b = get_diff_since_last_trans(df, plot=False)
      3 plot_trans_diff(a,b)

<ipython-input-10-8f83d4571659> in get_diff_since_last_trans(df, plot)
     12 def get_diff_since_last_trans(df, plot=True):
     13     df['diff_last'] = df.map_overlap(lambda x: x.groupby('id')['transc_date'].diff(), before=10, after=10)
---> 14     diffs = df[['id', 'diff_last']].groupby(['id']).agg('max')['diff_last'].dt.days.compute()
     15     if plot:
     16         sns.distplot(diffs.values, kde = False, rug = False)

~/venv/lib/python3.6/site-packages/dask/base.py in compute(self, **kwargs)
    133         dask.base.compute
    134         """
--> 135         (result,) = compute(self, traverse=False, **kwargs)
    136         return result
    137


~/venv/lib/python3.6/site-packages/dask/base.py in compute(*args, **kwargs)
    331     postcomputes = [a.__dask_postcompute__() if is_dask_collection(a)
    332                     else (None, a) for a in args]
--> 333     results = get(dsk, keys, **kwargs)
    334     results_iter = iter(results)
    335     return tuple(a if f is None else f(next(results_iter), *a)

~/venv/lib/python3.6/site-packages/distributed/client.py in get(self, dsk, keys, restrictions, loose_restrictions, resources, sync, asynchronous, **kwargs)
   1997                 secede()
   1998             try:
-> 1999                 results = self.gather(packed, asynchronous=asynchronous)
   2000             finally:
   2001                 for f in futures.values():

~/venv/lib/python3.6/site-packages/distributed/client.py in gather(self, futures, errors, maxsize, direct, asynchronous)
   1435             return self.sync(self._gather, futures, errors=errors,
   1436                              direct=direct, local_worker=local_worker,
-> 1437                              asynchronous=asynchronous)
   1438 
   1439     @gen.coroutine

~/venv/lib/python3.6/site-packages/distributed/client.py in sync(self, func, *args, **kwargs)
    590             return future
    591         else:
--> 592             return sync(self.loop, func, *args, **kwargs)
    593 
    594     def __repr__(self):



~/venv/lib/python3.6/site-packages/distributed/utils.py in sync(loop, func, *args, **kwargs)
    252             e.wait(1000000)
    253     if error[0]:
--> 254         six.reraise(*error[0])
    255     else:
    256         return result[0]

~/venv/lib/python3.6/site-packages/six.py in reraise(tp, value, tb)
    691             if value.__traceback__ is not tb:
    692                 raise value.with_traceback(tb)
--> 693             raise value
    694         finally:
    695             value = None


~/venv/lib/python3.6/site-packages/distributed/utils.py in f()
    236             yield gen.moment
    237             thread_state.asynchronous = True
--> 238             result[0] = yield make_coro()
    239         except Exception as exc:
    240             logger.exception(exc)

~/venv/lib/python3.6/site-packages/tornado/gen.py in run(self)
   1053 
   1054                     try:
-> 1055                         value = future.result()
   1056                     except Exception:
   1057                         self.had_exception = True

~/venv/lib/python3.6/site-packages/tornado/concurrent.py in result(self, timeout)
    236         if self._exc_info is not None:
    237             try:
--> 238                 raise_exc_info(self._exc_info)
    239             finally:
    240                 self = None

~/venv/lib/python3.6/site-packages/tornado/util.py in raise_exc_info(exc_info)

~/venv/lib/python3.6/site-packages/tornado/gen.py in run(self)
   1061                     if exc_info is not None:
   1062                         try:
-> 1063                             yielded = self.gen.throw(*exc_info)
   1064                         finally:
   1065                             # Break up a reference to itself

~/venv/lib/python3.6/site-packages/distributed/client.py in _gather(self, futures, errors, direct, local_worker)
   1313                             six.reraise(type(exception),
   1314                                         exception,
-> 1315                                         traceback)
   1316                     if errors == 'skip':
   1317                         bad_keys.add(key)

~/venv/lib/python3.6/site-packages/six.py in reraise(tp, value, tb)
    690                 value = tp()
    691             if value.__traceback__ is not tb:
--> 692                 raise value.with_traceback(tb)
    693             raise value
    694         finally:


~/venv/lib/python3.6/site-packages/dask/dataframe/rolling.py in overlap_chunk()
     30     parts = [p for p in (prev_part, current_part, next_part) if p is not None]
     31     combined = pd.concat(parts)
---> 32     out = func(combined, *args, **kwargs)
     33     if prev_part is None:
     34         before = None

<ipython-input-10-8f83d4571659> in <lambda>()
     11 
     12 def get_diff_since_last_trans(df, plot=True):
---> 13     df['diff_last'] = df.map_overlap(lambda x: x.groupby('id')['transc_date'].diff(), before=10, after=10)
     14     diffs = df[['id', 'diff_last']].groupby(['id']).agg('max')['diff_last'].dt.days.compute()
     15     if plot:

~/venv/lib/python3.6/site-packages/pandas/core/groupby.py in wrapper()
    737                                                             *args, **kwargs)
    738                     except (AttributeError):
--> 739                         raise ValueError
    740 
    741         return wrapper


ValueError: 

0 个答案:

没有答案