我在 Python 上遇到了一个看似简单的问题(我是新手),已经被困了好几天。下面我用一个非常小的数据帧(df)来说明该问题的简化版本。在简化的例子中,代码可以正常工作;而在实际使用的大 df 上,即使是按列切片这样的常规操作也不再工作了。
1)考虑一个(5x2)df:
df = pd.DataFrame({'a': [1432, 1432, 1433, 1432, 1434],
'b': ['ab152', 'ab153', 'ab154', np.nan, 'ab156']})
df2 = pd.get_dummies(df.b, sparse=True)
type(df2)
[out] pandas.sparse.frame.SparseDataFrame
df2['a'] = df.a
df2 = df2.groupby('a').apply(max)[df2.columns[:-1]].to_sparse()
这里一切正常。用通俗的话说,我想根据某一特定列创建一个虚拟变量(dummy)矩阵,然后在这个例子中使用 max 函数(根据目的也可以使用其他函数)去除索引中的重复项。出于内存效率的考虑,必须使用“稀疏”格式(零的占比非常高)。
此外,如果我想提取列'b',我只需要写
df['b']
它有效。
2)在我更复杂的情况下,我有大约500万行和3千列。我使用相同的代码集。
dummy_matrix = pd.get_dummies(big_df.b, sparse=True)
type(dummy_matrix)
[out] pandas.sparse.frame.SparseDataFrame
dummy_matrix['a'] = big_df.a
dummy_matrix = dummy_matrix.groupby('a').apply(max)[dummy_matrix.columns[:-1]].to_sparse()
但最后一行代码永远不会结束,也不会提供任何错误消息。
此外,如果我想在这种情况下选择列'b',我会收到如下错误:
In [81]: dummy_matrix['b']
Out[81]: ---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
/usr/local/lib/python2.7/dist-packages/IPython/core/formatters.pyc in __call__(self, obj)
688 type_pprinters=self.type_printers,
689 deferred_pprinters=self.deferred_printers)
--> 690 printer.pretty(obj)
691 printer.flush()
692 return stream.getvalue()
/usr/local/lib/python2.7/dist-packages/IPython/lib/pretty.pyc in pretty(self, obj)
407 if callable(meth):
408 return meth(obj, self, cycle)
--> 409 return _default_pprint(obj, self, cycle)
410 finally:
411 self.end_group()
/usr/local/lib/python2.7/dist-packages/IPython/lib/pretty.pyc in _default_pprint(obj, p, cycle)
527 if _safe_getattr(klass, '__repr__', None) not in _baseclass_reprs:
528 # A user-provided repr. Find newlines and replace them with p.break_()
--> 529 _repr_pprint(obj, p, cycle)
530 return
531 p.begin_group(1, '<')
/usr/local/lib/python2.7/dist-packages/IPython/lib/pretty.pyc in _repr_pprint(obj, p, cycle)
709 """A pprint that just redirects to the normal repr function."""
710 # Find newlines and replace them with p.break_()
--> 711 output = repr(obj)
712 for idx,output_line in enumerate(output.splitlines()):
713 if idx:
/usr/local/lib/python2.7/dist-packages/pandas/core/base.pyc in __repr__(self)
62 Yields Bytestring in Py2, Unicode String in py3.
63 """
---> 64 return str(self)
65
66
/usr/local/lib/python2.7/dist-packages/pandas/core/base.pyc in __str__(self)
42 if compat.PY3:
43 return self.__unicode__()
---> 44 return self.__bytes__()
45
46 def __bytes__(self):
/usr/local/lib/python2.7/dist-packages/pandas/core/base.pyc in __bytes__(self)
54
55 encoding = get_option("display.encoding")
---> 56 return self.__unicode__().encode(encoding, 'replace')
57
58 def __repr__(self):
/usr/local/lib/python2.7/dist-packages/pandas/sparse/series.pyc in __unicode__(self)
290 def __unicode__(self):
291 # currently, unicode is same as repr...fixes infinite loop
--> 292 series_rep = Series.__unicode__(self)
293 rep = '%s\n%s' % (series_rep, repr(self.sp_index))
294 return rep
/usr/local/lib/python2.7/dist-packages/pandas/core/series.pyc in __unicode__(self)
897
898 self.to_string(buf=buf, name=self.name, dtype=self.dtype,
--> 899 max_rows=max_rows)
900 result = buf.getvalue()
901
/usr/local/lib/python2.7/dist-packages/pandas/core/series.pyc in to_string(self, buf, na_rep, float_format, header, length, dtype, name, max_rows)
962 the_repr = self._get_repr(float_format=float_format, na_rep=na_rep,
963 header=header, length=length, dtype=dtype,
--> 964 name=name, max_rows=max_rows)
965
966 # catch contract violations
/usr/local/lib/python2.7/dist-packages/pandas/core/series.pyc in _get_repr(self, name, header, length, dtype, na_rep, float_format, max_rows)
991 na_rep=na_rep,
992 float_format=float_format,
--> 993 max_rows=max_rows)
994 result = formatter.to_string()
995
/usr/local/lib/python2.7/dist-packages/pandas/core/format.pyc in __init__(self, series, buf, length, header, na_rep, name, float_format, dtype, max_rows)
146 self.dtype = dtype
147
--> 148 self._chk_truncate()
149
150 def _chk_truncate(self):
/usr/local/lib/python2.7/dist-packages/pandas/core/format.pyc in _chk_truncate(self)
159 else:
160 row_num = max_rows // 2
--> 161 series = concat((series.iloc[:row_num], series.iloc[-row_num:]))
162 self.tr_row_num = row_num
163 self.tr_series = series
/usr/local/lib/python2.7/dist-packages/pandas/tools/merge.pyc in concat(objs, axis, join, join_axes, ignore_index, keys, levels, names, verify_integrity, copy)
752 keys=keys, levels=levels, names=names,
753 verify_integrity=verify_integrity,
--> 754 copy=copy)
755 return op.get_result()
756
/usr/local/lib/python2.7/dist-packages/pandas/tools/merge.pyc in __init__(self, objs, axis, join, join_axes, keys, levels, names, ignore_index, verify_integrity, copy)
803 for obj in objs:
804 if not isinstance(obj, NDFrame):
--> 805 raise TypeError("cannot concatenate a non-NDFrame object")
806
807 # consolidate
TypeError: cannot concatenate a non-NDFrame object
简单案例和复杂案例之间有什么区别?为什么代码在一种情况下可以工作,而在另一种情况下却不行?这可能与 dtypes 有关吗?我检查过,两个案例中每一列的 dtypes 都相同,所以我认为问题不在这里。另外,你认为这两个问题,即选择列时的报错(列表推导问题)和代码永远运行不完的问题,是否相关?我希望是相关的 -> 这样一个方案就能解决两个问题。
非常感谢您的帮助,如果有必要,我愿意提供更多详细信息。非常感谢。