Question

在对 pandas DataFrame 应用（apply）一个返回元组的函数时，我遇到了一些无法理解的行为。我希望 df.apply() 返回一个新的 Series，但似乎只有在先筛选列、把 datetime 类型的那一列排除之后，才能得到这样的结果。

这个虚拟示例演示了我所看到的行为：

df = pd.DataFrame(np.random.randn(5, 4), columns=list('ABCD'))

def random(row):
    """Return a constant 8-element tuple, ignoring ``row``.

    The tuple is deliberately longer than the number of columns in the
    example DataFrame.
    """
    oversized = (1, 2, 3, 4, 5, 6, 7, 8)
    return oversized

df.apply(random,axis=1)

# Output, returns new series as expected:
0    (1, 2, 3, 4, 5, 6, 7, 8)
1    (1, 2, 3, 4, 5, 6, 7, 8)
2    (1, 2, 3, 4, 5, 6, 7, 8)
3    (1, 2, 3, 4, 5, 6, 7, 8)
4    (1, 2, 3, 4, 5, 6, 7, 8)

按预期工作，但是当我向数据帧添加datetime列时......

df['E'] = datetime.now()

df.apply(random,axis=1)

我收到此错误：

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/Users/jguillette/anaconda/lib/python3.5/site-packages/pandas/core/internals.py in create_block_manager_from_arrays(arrays, names, axes)
   4262         blocks = form_blocks(arrays, names, axes)
-> 4263         mgr = BlockManager(blocks, axes)
   4264         mgr._consolidate_inplace()

/Users/jguillette/anaconda/lib/python3.5/site-packages/pandas/core/internals.py in __init__(self, blocks, axes, do_integrity_check, fastpath)
   2760         if do_integrity_check:
-> 2761             self._verify_integrity()
   2762 

/Users/jguillette/anaconda/lib/python3.5/site-packages/pandas/core/internals.py in _verify_integrity(self)
   2970             if block._verify_integrity and block.shape[1:] != mgr_shape[1:]:
-> 2971                 construction_error(tot_items, block.shape[1:], self.axes)
   2972         if len(self.items) != tot_items:

/Users/jguillette/anaconda/lib/python3.5/site-packages/pandas/core/internals.py in construction_error(tot_items, block_shape, axes, e)
   4232     raise ValueError("Shape of passed values is {0}, indices imply {1}".format(
-> 4233         passed, implied))
   4234 

ValueError: Shape of passed values is (5, 8), indices imply (5, 5)

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
<ipython-input-29-b57dd4b93995> in <module>()
----> 1 df.apply(random,axis=1)

/Users/jguillette/anaconda/lib/python3.5/site-packages/pandas/core/frame.py in apply(self, func, axis, broadcast, raw, reduce, args, **kwds)
   4150                     if reduce is None:
   4151                         reduce = True
-> 4152                     return self._apply_standard(f, axis, reduce=reduce)
   4153             else:
   4154                 return self._apply_broadcast(f, axis)

/Users/jguillette/anaconda/lib/python3.5/site-packages/pandas/core/frame.py in _apply_standard(self, func, axis, ignore_failures, reduce)
   4263                 index = None
   4264 
-> 4265             result = self._constructor(data=results, index=index)
   4266             result.columns = res_index
   4267 

/Users/jguillette/anaconda/lib/python3.5/site-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
    264                                  dtype=dtype, copy=copy)
    265         elif isinstance(data, dict):
--> 266             mgr = self._init_dict(data, index, columns, dtype=dtype)
    267         elif isinstance(data, ma.MaskedArray):
    268             import numpy.ma.mrecords as mrecords

/Users/jguillette/anaconda/lib/python3.5/site-packages/pandas/core/frame.py in _init_dict(self, data, index, columns, dtype)
    400             arrays = [data[k] for k in keys]
    401 
--> 402         return _arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
    403 
    404     def _init_ndarray(self, values, index, columns, dtype=None, copy=False):

/Users/jguillette/anaconda/lib/python3.5/site-packages/pandas/core/frame.py in _arrays_to_mgr(arrays, arr_names, index, columns, dtype)
   5406     axes = [_ensure_index(columns), _ensure_index(index)]
   5407 
-> 5408     return create_block_manager_from_arrays(arrays, arr_names, axes)
   5409 
   5410 

/Users/jguillette/anaconda/lib/python3.5/site-packages/pandas/core/internals.py in create_block_manager_from_arrays(arrays, names, axes)
   4265         return mgr
   4266     except ValueError as e:
-> 4267         construction_error(len(arrays), arrays[0].shape, axes, e)
   4268 
   4269 

/Users/jguillette/anaconda/lib/python3.5/site-packages/pandas/core/internals.py in construction_error(tot_items, block_shape, axes, e)
   4231         raise ValueError("Empty data passed with indices specified.")
   4232     raise ValueError("Shape of passed values is {0}, indices imply {1}".format(
-> 4233         passed, implied))
   4234 
   4235 

ValueError: Shape of passed values is (5, 8), indices imply (5, 5)

唯一不报错的情况是：函数返回的元组的元素个数与 DataFrame 的列数相同，但此时返回的是一个 DataFrame，而不是 Series。

有没有办法改变这种行为？在我的场景中，函数并不需要用到日期时间信息，但我仍然不明白，为什么把这一列排除之后 apply 的行为就会改变。

任何见解都将受到赞赏。

Answer 1

pandas 似乎会根据 df 的数据类型，以不同方式处理 apply 的返回值。在第一个示例中，所有列的数据类型都是浮点数；而在添加列 E 之后，数据类型变成了混合类型，这导致 pandas 尝试用返回的值重新构建一个 DataFrame。我不清楚这种行为背后的原因，但下面的做法应该可以解决你的问题：

df.astype(object).apply(random,axis=1)
Out[64]: 
0    (1, 2, 3, 4, 5, 6, 7, 8)
1    (1, 2, 3, 4, 5, 6, 7, 8)
2    (1, 2, 3, 4, 5, 6, 7, 8)
3    (1, 2, 3, 4, 5, 6, 7, 8)
4    (1, 2, 3, 4, 5, 6, 7, 8)

Answer 2

https://github.com/pandas-dev/pandas/blob/v0.22.0/pandas/core/frame.py#L236-L6142

class DataFrame(NDFrame):
def _apply_standard(self, func, axis, ignore_failures=False, reduce=True):
    """Apply ``func`` along ``axis`` and assemble the per-item results.

    Parameters
    ----------
    func : callable
        Called once per generated Series: one per column when
        ``axis == 0``, one per row of ``self.values`` when ``axis == 1``.
    axis : int
        Must be 0 or 1; any other value raises ``AssertionError``.
    ignore_failures : bool, default False
        When True, items for which ``func`` raises are skipped and
        removed from the result index instead of propagating the error.
    reduce : bool, default True
        When True, the ``lib.reduce`` fast path is tried first. It is
        forced to False when ``axis == 1`` and both ``_is_mixed_type``
        and ``_is_datelike_mixed_type`` are set on the frame.

    Returns
    -------
    Either a ``Series`` (fast path, or when the first result is not a
    sequence per ``is_sequence``) or the object built by
    ``self._constructor`` from the collected results.
    """
    # skip if we are mixed datelike and trying reduce across axes
    # GH6125
    # Row-wise apply on a frame flagged as mixed-type and datelike-mixed
    # disables the reduce fast path, so execution falls to the generic
    # path further down.
    if (reduce and axis == 1 and self._is_mixed_type and
            self._is_datelike_mixed_type):
        reduce = False
    # try to reduce first (by default)
    # this only matters if the reduction in values is of different dtype
    # e.g. if we want to apply to a SparseFrame, then can't directly reduce
    if reduce:
        values = self.values

        # we cannot reduce using non-numpy dtypes,
        # as demonstrated in gh-12244
        if not is_extension_type(values):
            # Create a dummy Series from an empty array
            index = self._get_axis(axis)
            empty_arr = np.empty(len(index), dtype=values.dtype)
            dummy = Series(empty_arr, index=self._get_axis(axis),
                           dtype=values.dtype)

            try:
                labels = self._get_agg_axis(axis)
                result = lib.reduce(values, func, axis=axis, dummy=dummy,
                                    labels=labels)
                return Series(result, index=labels)
            except Exception:
                # Any failure in the fast path is swallowed; the generic
                # per-Series path below is used instead.
                pass

    # Mixed-type frames build their row Series with object dtype.
    dtype = object if self._is_mixed_type else None
    if axis == 0:
        # One Series per column.
        series_gen = (self._ixs(i, axis=1)
                      for i in range(len(self.columns)))
        res_index = self.columns
        res_columns = self.index
    elif axis == 1:
        # One Series per row of self.values, named after the row label.
        res_index = self.index
        res_columns = self.columns
        values = self.values
        series_gen = (Series.from_array(arr, index=res_columns, name=name,
                                        dtype=dtype)
                      for i, (arr, name) in enumerate(zip(values,
                                                          res_index)))
    else:  # pragma : no cover
        raise AssertionError('Axis must be 0 or 1, got %s' % str(axis))

    # i stays None if series_gen yields nothing; the error handler below
    # checks for that before indexing res_index.
    i = None
    keys = []
    results = {}
    if ignore_failures:
        # Record the positions that succeeded so the result index can be
        # trimmed to match.
        successes = []
        for i, v in enumerate(series_gen):
            try:
                results[i] = func(v)
                keys.append(v.name)
                successes.append(i)
            except Exception:
                pass
        # so will work with MultiIndex
        if len(successes) < len(res_index):
            res_index = res_index.take(successes)
    else:
        try:
            for i, v in enumerate(series_gen):
                results[i] = func(v)
                keys.append(v.name)
        except Exception as e:
            if hasattr(e, 'args'):
                # make sure i is defined
                if i is not None:
                    # Append the failing label to the exception message
                    # before re-raising.
                    k = res_index[i]
                    e.args = e.args + ('occurred at index %s' %
                                       pprint_thing(k), )
            raise

    # When the first result is a sequence (per is_sequence), the results are
    # assembled through self._constructor rather than into a Series.
    # For non-Series sequences such as tuples, res_columns is passed as the
    # index. NOTE(review): this presumably requires each result's length to
    # match res_columns, and looks like the source of the "Shape of passed
    # values ..., indices imply ..." error for tuples of another length --
    # confirm against the constructor.
    if len(results) > 0 and is_sequence(results[0]):
        if not isinstance(results[0], Series):
            index = res_columns
        else:
            index = None

        result = self._constructor(data=results, index=index)
        result.columns = res_index

        if axis == 1:
            # Results were stored one per column; transpose so each
            # applied row becomes a row again.
            result = result.T
        result = result._convert(datetime=True, timedelta=True, copy=False)

    else:

        # Non-sequence results: one value per applied item.
        result = Series(results)
        result.index = res_index

    return result

我找到了与你的问题相关的源代码；详细原因可以参考代码注释中提到的 GH6125。下面是我的解决办法，虽然有点笨：

df = pd.DataFrame({'a': [1, 2, 3]})
class TMP:
    """Opaque container used to carry several values through ``df.apply``.

    Each instance ends up as a single object-dtype element of the
    resulting Series instead of being expanded into columns.
    """

    def __init__(self, a, **kws):
        """Store ``a`` and expose every extra keyword argument as an attribute."""
        self.a = a
        self.__dict__.update(kws)


def func(line):
    """Wrap the row's ``'a'`` value plus two extra fields in a ``TMP``.

    Parameters
    ----------
    line : mapping-like
        A row supporting ``line['a']``.

    Returns
    -------
    TMP
        Instance with attributes ``a``, ``x`` and ``y``.
    """
    # The extras must be passed by keyword: TMP.__init__ accepts only one
    # positional argument besides self, so TMP(line['a'], 'x', 'y') raises
    # TypeError.
    return TMP(line['a'], x='x', y='y')
se = df.apply(func, axis=1)
se
> 0    <__main__.TMP object at 0x11122f390>
  1    <__main__.TMP object at 0x11122f400>
  2    <__main__.TMP object at 0x11122f438>
  dtype: object
se.apply(lambda tmp: tmp.a)
> 0    1
  1    2
  2    3 
  dtype: int64

第二个解决方案是确保func返回一个系列（看起来更慢）

def func(line):
    """Return a three-element Series holding 'a', 'x' and 'y'.

    ``line`` is not read; the returned values are fixed.
    """
    fields = ['a', 'x', 'y']
    return pd.Series(fields)
df.apply(func, axis=1)

希望有所帮助。

当初始 df 包含 datetime 列时，pandas 的 apply 返回的是 DataFrame

2 个答案: