我有一个大数据框架,9000人的26M行数据。 索引没有订购。
我需要分别对每个人的数据进行一些计算, 并将其保存到新数据框,每人一行。
我写了一个关于唯一人id的循环,用提取的小数据框 这个人的数据,对其进行计算,并将结果保存到 预定义的数据框架。计算主要是求和除法运算 在特定条件下的柱子上。
这在亚马逊Linux服务器上花了大约一个小时。不实用。
为了提高效率,我试图删除当前的人 从数据帧开始,数据帧的大小减小会提高效率。 经过2-4步后我得到了一个内存错误。
我设法在我的笔记本电脑窗口上重建问题。 仅当数据帧大小足够大时才会出现此问题。在我的笔记本电脑上从4000000。 重置索引,解决了这个大小的问题,但是在更大的40000000中重复了内存问题。
我是大型数据集的新手,还有熊猫的新手, 任何想法都会受到欢迎
import numpy as np
import pandas as pd
import time
import random
random.seed(50)
np.random.seed(50)
size = 4000000
dtype = [('view_day', 'int32'), ('account', 'int32'),('category', 'int32'),
('Col1I','int32'),
('Col2I','int32'),('Col3I','int32'),
('Col4F','float32'), ('Col5F','float32'), ('Col6F','float32'),
('isFull','int32'), ('islong','int32')]
values = np.ones(size, dtype=dtype)
index = np.arange(size)
np.random.shuffle(index)
df = pd.DataFrame(values, index=index)
df['view_day'] = np.random.randint(7605, 7605 + 180, df.shape[0])
df['account'] = np.random.randint(1548051, 1548051 + 100, df.shape[0])
df['category'] = np.random.randint(1, 5, df.shape[0])
df['Col1I'] = np.random.randint(600, 1200, df.shape[0])
df['Col2I'] = np.random.randint(1, 600, df.shape[0])
accounts= df.account.unique()
for w in accounts:
dfs = df[df.account == w]#.copy() - both versions causing memory error
print dfs.shape
print df.shape
df.drop(dfs.index, inplace=True)
---------------------------------------------------------------------------
MemoryError Traceback (most recent call last)
<ipython-input-32-7c390ede93df> in <module>()
4 print dfs.shape
5 print df.shape
----> 6 df.drop(dfs.index, inplace=True)
C:\Users\naomi\Anaconda2\lib\site-packages\pandas\core\generic.pyc in drop(self, labels, axis, level, inplace, errors)
1876 else:
1877 new_axis = axis.drop(labels, errors=errors)
-> 1878 dropped = self.reindex(**{axis_name: new_axis})
1879 try:
1880 dropped.axes[axis_].set_names(axis.names, inplace=True)
C:\Users\naomi\Anaconda2\lib\site-packages\pandas\core\frame.pyc in reindex(self, index, columns, **kwargs)
2739 def reindex(self, index=None, columns=None, **kwargs):
2740 return super(DataFrame, self).reindex(index=index, columns=columns,
-> 2741 **kwargs)
2742
2743 @Appender(_shared_docs['reindex_axis'] % _shared_doc_kwargs)
C:\Users\naomi\Anaconda2\lib\site-packages\pandas\core\generic.pyc in reindex(self, *args, **kwargs)
2227 # perform the reindex on the axes
2228 return self._reindex_axes(axes, level, limit, tolerance, method,
-> 2229 fill_value, copy).__finalize__(self)
2230
2231 def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value,
C:\Users\naomi\Anaconda2\lib\site-packages\pandas\core\frame.pyc in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
2685 if index is not None:
2686 frame = frame._reindex_index(index, method, copy, level,
-> 2687 fill_value, limit, tolerance)
2688
2689 return frame
C:\Users\naomi\Anaconda2\lib\site-packages\pandas\core\frame.pyc in _reindex_index(self, new_index, method, copy, level, fill_value, limit, tolerance)
2696 return self._reindex_with_indexers({0: [new_index, indexer]},
2697 copy=copy, fill_value=fill_value,
-> 2698 allow_dups=False)
2699
2700 def _reindex_columns(self, new_columns, copy, level, fill_value=NA,
C:\Users\naomi\Anaconda2\lib\site-packages\pandas\core\generic.pyc in _reindex_with_indexers(self, reindexers, fill_value, copy, allow_dups)
2339 fill_value=fill_value,
2340 allow_dups=allow_dups,
-> 2341 copy=copy)
2342
2343 if copy and new_data is self._data:
C:\Users\naomi\Anaconda2\lib\site-packages\pandas\core\internals.pyc in reindex_indexer(self, new_axis, indexer, axis, fill_value, allow_dups, copy)
3595 new_blocks = [blk.take_nd(indexer, axis=axis, fill_tuple=(
3596 fill_value if fill_value is not None else blk.fill_value,))
-> 3597 for blk in self.blocks]
3598
3599 new_axes = list(self.axes)
C:\Users\naomi\Anaconda2\lib\site-packages\pandas\core\internals.pyc in take_nd(self, indexer, axis, new_mgr_locs, fill_tuple)
994 fill_value = fill_tuple[0]
995 new_values = algos.take_nd(values, indexer, axis=axis,
--> 996 allow_fill=True, fill_value=fill_value)
997
998 if new_mgr_locs is None:
C:\Users\naomi\Anaconda2\lib\site-packages\pandas\core\algorithms.pyc in take_nd(arr, indexer, axis, out, fill_value, mask_info, allow_fill)
928 out = np.empty(out_shape, dtype=dtype, order='F')
929 else:
--> 930 out = np.empty(out_shape, dtype=dtype)
931
932 func = _get_take_nd_function(arr.ndim, arr.dtype, out.dtype, axis=axis,
MemoryError: