在对 pandas DataFrame 应用(apply)一个返回元组的函数时,我遇到了一些无法理解的行为。我希望 df.apply()
返回一个新的 Series,但似乎只有在我对 DataFrame 取列子集、排除 datetime 类型的列之后,才能得到这样的结果。
下面这个虚拟示例演示了我所看到的行为:
df = pd.DataFrame(np.random.randn(5, 4), columns=['A', 'B', 'C', 'D'])


def random(row):
    """Ignore ``row`` and return a fixed 8-element tuple.

    The tuple is deliberately longer than the number of columns in ``df``.
    """
    return tuple(range(1, 9))


df.apply(random, axis=1)
# Output, returns new series as expected:
0 (1, 2, 3, 4, 5, 6, 7, 8)
1 (1, 2, 3, 4, 5, 6, 7, 8)
2 (1, 2, 3, 4, 5, 6, 7, 8)
3 (1, 2, 3, 4, 5, 6, 7, 8)
4 (1, 2, 3, 4, 5, 6, 7, 8)
按预期工作,但是当我向数据帧添加datetime列时......
df['E'] = datetime.now()
df.apply(random,axis=1)
我收到此错误:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/Users/jguillette/anaconda/lib/python3.5/site-packages/pandas/core/internals.py in create_block_manager_from_arrays(arrays, names, axes)
4262 blocks = form_blocks(arrays, names, axes)
-> 4263 mgr = BlockManager(blocks, axes)
4264 mgr._consolidate_inplace()
/Users/jguillette/anaconda/lib/python3.5/site-packages/pandas/core/internals.py in __init__(self, blocks, axes, do_integrity_check, fastpath)
2760 if do_integrity_check:
-> 2761 self._verify_integrity()
2762
/Users/jguillette/anaconda/lib/python3.5/site-packages/pandas/core/internals.py in _verify_integrity(self)
2970 if block._verify_integrity and block.shape[1:] != mgr_shape[1:]:
-> 2971 construction_error(tot_items, block.shape[1:], self.axes)
2972 if len(self.items) != tot_items:
/Users/jguillette/anaconda/lib/python3.5/site-packages/pandas/core/internals.py in construction_error(tot_items, block_shape, axes, e)
4232 raise ValueError("Shape of passed values is {0}, indices imply {1}".format(
-> 4233 passed, implied))
4234
ValueError: Shape of passed values is (5, 8), indices imply (5, 5)
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-29-b57dd4b93995> in <module>()
----> 1 df.apply(random,axis=1)
/Users/jguillette/anaconda/lib/python3.5/site-packages/pandas/core/frame.py in apply(self, func, axis, broadcast, raw, reduce, args, **kwds)
4150 if reduce is None:
4151 reduce = True
-> 4152 return self._apply_standard(f, axis, reduce=reduce)
4153 else:
4154 return self._apply_broadcast(f, axis)
/Users/jguillette/anaconda/lib/python3.5/site-packages/pandas/core/frame.py in _apply_standard(self, func, axis, ignore_failures, reduce)
4263 index = None
4264
-> 4265 result = self._constructor(data=results, index=index)
4266 result.columns = res_index
4267
/Users/jguillette/anaconda/lib/python3.5/site-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
264 dtype=dtype, copy=copy)
265 elif isinstance(data, dict):
--> 266 mgr = self._init_dict(data, index, columns, dtype=dtype)
267 elif isinstance(data, ma.MaskedArray):
268 import numpy.ma.mrecords as mrecords
/Users/jguillette/anaconda/lib/python3.5/site-packages/pandas/core/frame.py in _init_dict(self, data, index, columns, dtype)
400 arrays = [data[k] for k in keys]
401
--> 402 return _arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
403
404 def _init_ndarray(self, values, index, columns, dtype=None, copy=False):
/Users/jguillette/anaconda/lib/python3.5/site-packages/pandas/core/frame.py in _arrays_to_mgr(arrays, arr_names, index, columns, dtype)
5406 axes = [_ensure_index(columns), _ensure_index(index)]
5407
-> 5408 return create_block_manager_from_arrays(arrays, arr_names, axes)
5409
5410
/Users/jguillette/anaconda/lib/python3.5/site-packages/pandas/core/internals.py in create_block_manager_from_arrays(arrays, names, axes)
4265 return mgr
4266 except ValueError as e:
-> 4267 construction_error(len(arrays), arrays[0].shape, axes, e)
4268
4269
/Users/jguillette/anaconda/lib/python3.5/site-packages/pandas/core/internals.py in construction_error(tot_items, block_shape, axes, e)
4231 raise ValueError("Empty data passed with indices specified.")
4232 raise ValueError("Shape of passed values is {0}, indices imply {1}".format(
-> 4233 passed, implied))
4234
4235
ValueError: Shape of passed values is (5, 8), indices imply (5, 5)
唯一不报错的情况,是函数返回的元组的元素个数恰好与 DataFrame 的列数相同;但这时返回的是一个 DataFrame,而不是一个 Series。
有没有办法改变这种行为?就我的情况而言,我并不需要在函数中使用日期时间信息,但我仍然不明白为什么排除该列会改变 apply 的行为。
任何见解都将受到赞赏。
答案 0 :(得分:0)
似乎 pandas 会根据 df 的数据类型,以不同的方式处理 apply 的返回值。在第一个示例中,所有列的数据类型都是浮点数;而在添加列 E 之后,数据类型是混合的,这导致 pandas 尝试用返回的值重新构建一个 DataFrame。我不清楚这种行为背后的原因,但下面的做法应该可以解决你的问题:
df.astype(object).apply(random,axis=1)
Out[64]:
0 (1, 2, 3, 4, 5, 6, 7, 8)
1 (1, 2, 3, 4, 5, 6, 7, 8)
2 (1, 2, 3, 4, 5, 6, 7, 8)
3 (1, 2, 3, 4, 5, 6, 7, 8)
4 (1, 2, 3, 4, 5, 6, 7, 8)
答案 1 :(得分:0)
https://github.com/pandas-dev/pandas/blob/v0.22.0/pandas/core/frame.py#L236-L6142
class DataFrame(NDFrame):
    # Excerpt: only ``_apply_standard`` is reproduced here.

    def _apply_standard(self, func, axis, ignore_failures=False, reduce=True):
        """Apply ``func`` to every Series along ``axis`` and assemble the result.

        Parameters
        ----------
        func : callable
            Called once per generated Series as ``func(v)``.
        axis : {0, 1}
            0 feeds ``func`` one column at a time, 1 one row at a time.
        ignore_failures : bool, default False
            If True, exceptions raised by ``func`` are swallowed and the
            failing positions are dropped from the result index.
        reduce : bool, default True
            If True, ``lib.reduce`` is attempted first; when it succeeds
            its result is returned directly as a Series.

        Returns
        -------
        Series or DataFrame
            A frame is built (via ``self._constructor``) when the first
            result is a sequence according to ``is_sequence``; otherwise
            a Series of the collected results is returned.

        Raises
        ------
        AssertionError
            If ``axis`` is neither 0 nor 1.
        """

        # skip if we are mixed datelike and trying reduce across axes
        # GH6125
        # for a row-wise apply on a mixed-type frame that contains datelike
        # data, the reduce attempt below is disabled
        if (reduce and axis == 1 and self._is_mixed_type and
                self._is_datelike_mixed_type):
            reduce = False

        # try to reduce first (by default)
        # this only matters if the reduction in values is of different dtype
        # e.g. if we want to apply to a SparseFrame, then can't directly reduce
        if reduce:
            values = self.values

            # we cannot reduce using non-numpy dtypes,
            # as demonstrated in gh-12244
            if not is_extension_type(values):
                # Create a dummy Series from an empty array
                index = self._get_axis(axis)
                empty_arr = np.empty(len(index), dtype=values.dtype)
                dummy = Series(empty_arr, index=self._get_axis(axis),
                               dtype=values.dtype)

                try:
                    labels = self._get_agg_axis(axis)
                    result = lib.reduce(values, func, axis=axis, dummy=dummy,
                                        labels=labels)
                    return Series(result, index=labels)
                except Exception:
                    # any failure here falls through to the generic
                    # per-Series loop below
                    pass

        # mixed-type frames hand object-dtype row Series to ``func``
        dtype = object if self._is_mixed_type else None
        if axis == 0:
            # one Series per column
            series_gen = (self._ixs(i, axis=1)
                          for i in range(len(self.columns)))
            res_index = self.columns
            res_columns = self.index
        elif axis == 1:
            # one Series per row of ``self.values``, named by the row label
            res_index = self.index
            res_columns = self.columns
            values = self.values
            series_gen = (Series.from_array(arr, index=res_columns, name=name,
                                            dtype=dtype)
                          for i, (arr, name) in enumerate(zip(values,
                                                              res_index)))
        else:  # pragma : no cover
            raise AssertionError('Axis must be 0 or 1, got %s' % str(axis))

        # ``i`` is pre-set so the error handler below can tell whether the
        # loop ever started
        i = None
        keys = []
        results = {}
        if ignore_failures:
            successes = []
            for i, v in enumerate(series_gen):
                try:
                    results[i] = func(v)
                    keys.append(v.name)
                    successes.append(i)
                except Exception:
                    pass
            # so will work with MultiIndex
            if len(successes) < len(res_index):
                res_index = res_index.take(successes)
        else:
            try:
                for i, v in enumerate(series_gen):
                    results[i] = func(v)
                    keys.append(v.name)
            except Exception as e:
                if hasattr(e, 'args'):
                    # make sure i is defined
                    if i is not None:
                        # append the failing label to the exception message
                        k = res_index[i]
                        e.args = e.args + ('occurred at index %s' %
                                           pprint_thing(k), )
                raise

        if len(results) > 0 and is_sequence(results[0]):
            # sequence results are assembled into a frame; results that are
            # not Series are labelled with the original labels of the other
            # axis (``res_columns``)
            if not isinstance(results[0], Series):
                index = res_columns
            else:
                index = None

            result = self._constructor(data=results, index=index)
            result.columns = res_index

            if axis == 1:
                # results were stored one per column; transpose back to rows
                result = result.T
            result = result._convert(datetime=True, timedelta=True, copy=False)

        else:

            # non-sequence results: one value per label
            result = Series(results)
            result.index = res_index

        return result
我找到了与你的问题相关的源代码;至于详细原因,可以参考代码注释中提到的 GH6125。我的解决办法有点笨,如下所示。
df = pd.DataFrame({'a': [1, 2, 3]})
class TMP:
    """Opaque container used to carry several values out of ``df.apply``.

    Because an instance is not a sequence, apply keeps one object per row
    in the resulting Series (as the output shown after this snippet does).
    """

    def __init__(self, a, **kws):
        """Store ``a`` and expose every extra keyword argument as an attribute."""
        self.a = a
        self.__dict__.update(kws)


def func(line):
    """Wrap the row's ``'a'`` value and two extra values in a ``TMP`` object."""
    # Extras must be passed by keyword: ``__init__`` accepts only ``**kws``,
    # so passing them positionally raises TypeError.
    return TMP(line['a'], x='x', y='y')
se = df.apply(func, axis=1)
se
> 0 <__main__.TMP object at 0x11122f390>
1 <__main__.TMP object at 0x11122f400>
2 <__main__.TMP object at 0x11122f438>
dtype: object
se.apply(lambda tmp: tmp.a)
> 0 1
1 2
2 3
dtype: int64
第二种解决方案是确保 func 返回一个 Series(不过看起来会更慢):
def func(line):
    """Return a new Series holding 'a', 'x' and 'y'; ``line`` is not used."""
    values = ['a', 'x', 'y']
    return pd.Series(values)
df.apply(func, axis=1)
希望有所帮助。