Question

我想使用Dask将一些数据从单列扩展到多列。我这样做是通过使用Dask的DataFrame apply方法和一个自定义函数返回一个新的Pandas DataFrame来实现的。但是，当我尝试这样做时，我得到ValueError: If using all scalar values, you must pass an index。这是一个最小的复制案例：

import dask.dataframe as dd
import pandas as pd

pd_df = pd.DataFrame({'a': [1, 2, 3, 4, 5]}, dtype=float)
df = dd.from_pandas(pd_df, npartitions=2)

def custom_fn(row):
    num = row['a']
    frame = pd.DataFrame({
        'squared': [num * num],
        'x2': [num * 2],
    }, dtype=float, index=[0])
    return frame

new_frame = df.apply(custom_fn, axis=1, meta={
    'squared': float,
    'x2': float,
}, result_type='expand')

new_frame.head()

和堆栈跟踪：

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-38-6aaf3a5d32b2> in <module>()
     12 }, result_type='expand')
     13 
---> 14 new_frame.head()

/nail/home/shawn/pg/research_ipython/virtualenv_run/local/lib/python2.7/site-packages/dask/dataframe/core.pyc in head(self, n, npartitions, compute)
    896 
    897         if compute:
--> 898             result = result.compute()
    899         return result
    900 

/nail/home/shawn/pg/research_ipython/virtualenv_run/local/lib/python2.7/site-packages/dask/base.pyc in compute(self, **kwargs)
    154         dask.base.compute
    155         """
--> 156         (result,) = compute(self, traverse=False, **kwargs)
    157         return result
    158 

/nail/home/shawn/pg/research_ipython/virtualenv_run/local/lib/python2.7/site-packages/dask/base.pyc in compute(*args, **kwargs)
    396     keys = [x.__dask_keys__() for x in collections]
    397     postcomputes = [x.__dask_postcompute__() for x in collections]
--> 398     results = schedule(dsk, keys, **kwargs)
    399     return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
    400 

/nail/home/shawn/pg/research_ipython/virtualenv_run/local/lib/python2.7/site-packages/dask/threaded.pyc in get(dsk, result, cache, num_workers, pool, **kwargs)
     74     results = get_async(pool.apply_async, len(pool._pool), dsk, result,
     75                         cache=cache, get_id=_thread_get_id,
---> 76                         pack_exception=pack_exception, **kwargs)
     77 
     78     # Cleanup pools associated to dead threads

/nail/home/shawn/pg/research_ipython/virtualenv_run/local/lib/python2.7/site-packages/dask/local.pyc in get_async(apply_async, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, **kwargs)
    458                         _execute_task(task, data)  # Re-execute locally
    459                     else:
--> 460                         raise_exception(exc, tb)
    461                 res, worker_id = loads(res_info)
    462                 state['cache'][key] = res

/nail/home/shawn/pg/research_ipython/virtualenv_run/local/lib/python2.7/site-packages/dask/local.pyc in execute_task(key, task_info, dumps, loads, get_id, pack_exception)
    228     try:
    229         task, data = loads(task_info)
--> 230         result = _execute_task(task, data)
    231         id = get_id()
    232         result = dumps((result, id))

/nail/home/shawn/pg/research_ipython/virtualenv_run/local/lib/python2.7/site-packages/dask/core.pyc in _execute_task(arg, cache, dsk)
    116     elif istask(arg):
    117         func, args = arg[0], arg[1:]
--> 118         args2 = [_execute_task(a, cache) for a in args]
    119         return func(*args2)
    120     elif not ishashable(arg):

/nail/home/shawn/pg/research_ipython/virtualenv_run/local/lib/python2.7/site-packages/dask/core.pyc in _execute_task(arg, cache, dsk)
    117         func, args = arg[0], arg[1:]
    118         args2 = [_execute_task(a, cache) for a in args]
--> 119         return func(*args2)
    120     elif not ishashable(arg):
    121         return arg

/nail/home/shawn/pg/research_ipython/virtualenv_run/local/lib/python2.7/site-packages/dask/optimization.pyc in __call__(self, *args)
    940                              % (len(self.inkeys), len(args)))
    941         return core.get(self.dsk, self.outkey,
--> 942                         dict(zip(self.inkeys, args)))
    943 
    944     def __reduce__(self):

/nail/home/shawn/pg/research_ipython/virtualenv_run/local/lib/python2.7/site-packages/dask/core.pyc in get(dsk, out, cache)
    147     for key in toposort(dsk):
    148         task = dsk[key]
--> 149         result = _execute_task(task, cache)
    150         cache[key] = result
    151     result = _execute_task(out, cache)

/nail/home/shawn/pg/research_ipython/virtualenv_run/local/lib/python2.7/site-packages/dask/core.pyc in _execute_task(arg, cache, dsk)
    117         func, args = arg[0], arg[1:]
    118         args2 = [_execute_task(a, cache) for a in args]
--> 119         return func(*args2)
    120     elif not ishashable(arg):
    121         return arg

/nail/home/shawn/pg/research_ipython/virtualenv_run/local/lib/python2.7/site-packages/dask/dataframe/core.pyc in apply_and_enforce(*args, **kwargs)
   3792     func = kwargs.pop('_func')
   3793     meta = kwargs.pop('_meta')
-> 3794     df = func(*args, **kwargs)
   3795     if is_dataframe_like(df) or is_series_like(df) or is_index_like(df):
   3796         if not len(df):

/nail/home/shawn/pg/research_ipython/virtualenv_run/local/lib/python2.7/site-packages/dask/utils.pyc in __call__(self, obj, *args, **kwargs)
    714 
    715     def __call__(self, obj, *args, **kwargs):
--> 716         return getattr(obj, self.method)(*args, **kwargs)
    717 
    718     def __reduce__(self):

/nail/home/shawn/pg/research_ipython/virtualenv_run/local/lib/python2.7/site-packages/pandas/core/frame.pyc in apply(self, func, axis, broadcast, raw, reduce, result_type, args, **kwds)
   6485                          args=args,
   6486                          kwds=kwds)
-> 6487         return op.get_result()
   6488 
   6489     def applymap(self, func):

/nail/home/shawn/pg/research_ipython/virtualenv_run/local/lib/python2.7/site-packages/pandas/core/apply.pyc in get_result(self)
    149             return self.apply_raw()
    150 
--> 151         return self.apply_standard()
    152 
    153     def apply_empty_result(self):

/nail/home/shawn/pg/research_ipython/virtualenv_run/local/lib/python2.7/site-packages/pandas/core/apply.pyc in apply_standard(self)
    258 
    259         # wrap results
--> 260         return self.wrap_results()
    261 
    262     def apply_series_generator(self):

/nail/home/shawn/pg/research_ipython/virtualenv_run/local/lib/python2.7/site-packages/pandas/core/apply.pyc in wrap_results(self)
    306         if len(results) > 0 and is_sequence(results[0]):
    307 
--> 308             return self.wrap_results_for_axis()
    309 
    310         # dict of scalars

/nail/home/shawn/pg/research_ipython/virtualenv_run/local/lib/python2.7/site-packages/pandas/core/apply.pyc in wrap_results_for_axis(self)
    382         # we have requested to expand
    383         if self.result_type == 'expand':
--> 384             result = self.infer_to_same_shape()
    385 
    386         # we have a non-series and don't want inference

/nail/home/shawn/pg/research_ipython/virtualenv_run/local/lib/python2.7/site-packages/pandas/core/apply.pyc in infer_to_same_shape(self)
    400         results = self.results
    401 
--> 402         result = self.obj._constructor(data=results)
    403         result = result.T
    404 

/nail/home/shawn/pg/research_ipython/virtualenv_run/local/lib/python2.7/site-packages/pandas/core/frame.pyc in __init__(self, data, index, columns, dtype, copy)
    390                                  dtype=dtype, copy=copy)
    391         elif isinstance(data, dict):
--> 392             mgr = init_dict(data, index, columns, dtype=dtype)
    393         elif isinstance(data, ma.MaskedArray):
    394             import numpy.ma.mrecords as mrecords

/nail/home/shawn/pg/research_ipython/virtualenv_run/local/lib/python2.7/site-packages/pandas/core/internals/construction.pyc in init_dict(data, index, columns, dtype)
    210         arrays = [data[k] for k in keys]
    211 
--> 212     return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
    213 
    214 

/nail/home/shawn/pg/research_ipython/virtualenv_run/local/lib/python2.7/site-packages/pandas/core/internals/construction.pyc in arrays_to_mgr(arrays, arr_names, index, columns, dtype)
     49     # figure out the index, if necessary
     50     if index is None:
---> 51         index = extract_index(arrays)
     52     else:
     53         index = ensure_index(index)

/nail/home/shawn/pg/research_ipython/virtualenv_run/local/lib/python2.7/site-packages/pandas/core/internals/construction.pyc in extract_index(data)
    306 
    307         if not indexes and not raw_lengths:
--> 308             raise ValueError('If using all scalar values, you must pass'
    309                              ' an index')
    310 

ValueError: If using all scalar values, you must pass an index

我可以省略result_type='expand'和meta kwarg来获得一个我在方法中返回的DataFrame完整的DataFrame，但是我确实希望它可以内联扩展。我在Python 2.7.6上使用Dask 1.1.4和Pandas 0.24.1。

编辑：我发现以后可以像这样扩展行：

new_frame = df.apply(custom_fn, axis=1)

dd.concat([
    data for _, data in new_frame.iteritems()
], interleave_partitions=True).head()

这有点混乱，但至少目前看来似乎可行。

使用Dask DataFrame.apply（）时出现“ ValueError：如果使用所有标量值，则必须传递索引”

0 个答案: