从 pandas DataFrame 中删除行时出现内存错误

时间:2017-10-31 22:49:38

标签: python performance pandas large-data

我有一个很大的数据框(DataFrame),包含 9000 人的 2600 万行数据。索引是无序的。

我需要分别对每个人的数据进行一些计算, 并将其保存到新数据框,每人一行。

我写了一个遍历唯一人员 id 的循环:每次提取出只包含该人数据的小数据框,对其进行计算,并将结果保存到预先定义好的数据框中。计算主要是在特定条件下对某些列进行求和与除法运算。

这在亚马逊Linux服务器上花了大约一个小时。不实用。

为了提高效率,我尝试在处理完当前这个人之后,把他的行从数据框中删除,这样数据框变小后应该会更快。但循环执行 2-4 步之后,我就遇到了内存错误。

我设法在我的 Windows 笔记本电脑上重现了这个问题。只有当数据框足够大时才会出现此问题,在我的笔记本电脑上是从 4000000 行开始。重置索引可以解决这个规模下的问题,但在更大的 40000000 行时内存问题再次出现。

我刚接触大型数据集,也是 pandas 的新手,欢迎任何想法。

import numpy as np
import pandas as pd
import time
import random
# Minimal reproduction of the MemoryError raised by DataFrame.drop (Python 2).
# Seed both random generators so every run builds the same frame.
random.seed(50)
np.random.seed(50)
# Number of rows in the synthetic frame.
size = 4000000
# Structured dtype: 8 int32 fields and 3 float32 fields per row.
dtype = [('view_day', 'int32'), ('account', 'int32'),('category', 'int32'),
         ('Col1I','int32'), 
         ('Col2I','int32'),('Col3I','int32'),
         ('Col4F','float32'), ('Col5F','float32'), ('Col6F','float32'),
        ('isFull','int32'), ('islong','int32')]
# Every field starts out as 1; some columns are overwritten below.
values = np.ones(size, dtype=dtype)

# Index labels 0..size-1 ...
index = np.arange(size)

# ... shuffled in place, so the frame's index is unique but unordered.
np.random.shuffle(index)

df = pd.DataFrame(values, index=index)
# Overwrite five columns with random integers (upper bound is exclusive).
df['view_day'] = np.random.randint(7605, 7605 + 180, df.shape[0])
# At most 100 distinct account ids.
df['account'] = np.random.randint(1548051, 1548051 + 100, df.shape[0])
df['category'] = np.random.randint(1, 5, df.shape[0])
df['Col1I'] = np.random.randint(600, 1200, df.shape[0])
df['Col2I'] = np.random.randint(1, 600, df.shape[0])

# Distinct account ids present in the frame.
accounts= df.account.unique()

# For each account: select its rows, then remove them from df in place.
for w in accounts:
    # Boolean-mask selection of the current account's rows.
    dfs = df[df.account == w]#.copy() - both versions causing memory error

    # Python 2 print statements: sizes of the subset and of the remaining frame.
    print dfs.shape
    print df.shape
    # Drop the selected rows by index label; this is the call that raises
    # MemoryError in the traceback below.
    df.drop(dfs.index, inplace=True)


---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
<ipython-input-32-7c390ede93df> in <module>()
      4     print dfs.shape
      5     print df.shape
----> 6     df.drop(dfs.index, inplace=True)

C:\Users\naomi\Anaconda2\lib\site-packages\pandas\core\generic.pyc in drop(self, labels, axis, level, inplace, errors)
   1876             else:
   1877                 new_axis = axis.drop(labels, errors=errors)
-> 1878             dropped = self.reindex(**{axis_name: new_axis})
   1879             try:
   1880                 dropped.axes[axis_].set_names(axis.names, inplace=True)

C:\Users\naomi\Anaconda2\lib\site-packages\pandas\core\frame.pyc in reindex(self, index, columns, **kwargs)
   2739     def reindex(self, index=None, columns=None, **kwargs):
   2740         return super(DataFrame, self).reindex(index=index, columns=columns,
-> 2741                                               **kwargs)
   2742 
   2743     @Appender(_shared_docs['reindex_axis'] % _shared_doc_kwargs)

C:\Users\naomi\Anaconda2\lib\site-packages\pandas\core\generic.pyc in reindex(self, *args, **kwargs)
   2227         # perform the reindex on the axes
   2228         return self._reindex_axes(axes, level, limit, tolerance, method,
-> 2229                                   fill_value, copy).__finalize__(self)
   2230 
   2231     def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value,

C:\Users\naomi\Anaconda2\lib\site-packages\pandas\core\frame.pyc in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
   2685         if index is not None:
   2686             frame = frame._reindex_index(index, method, copy, level,
-> 2687                                          fill_value, limit, tolerance)
   2688 
   2689         return frame

C:\Users\naomi\Anaconda2\lib\site-packages\pandas\core\frame.pyc in _reindex_index(self, new_index, method, copy, level, fill_value, limit, tolerance)
   2696         return self._reindex_with_indexers({0: [new_index, indexer]},
   2697                                            copy=copy, fill_value=fill_value,
-> 2698                                            allow_dups=False)
   2699 
   2700     def _reindex_columns(self, new_columns, copy, level, fill_value=NA,

C:\Users\naomi\Anaconda2\lib\site-packages\pandas\core\generic.pyc in _reindex_with_indexers(self, reindexers, fill_value, copy, allow_dups)
   2339                                                 fill_value=fill_value,
   2340                                                 allow_dups=allow_dups,
-> 2341                                                 copy=copy)
   2342 
   2343         if copy and new_data is self._data:

C:\Users\naomi\Anaconda2\lib\site-packages\pandas\core\internals.pyc in reindex_indexer(self, new_axis, indexer, axis, fill_value, allow_dups, copy)
   3595             new_blocks = [blk.take_nd(indexer, axis=axis, fill_tuple=(
   3596                 fill_value if fill_value is not None else blk.fill_value,))
-> 3597                 for blk in self.blocks]
   3598 
   3599         new_axes = list(self.axes)

C:\Users\naomi\Anaconda2\lib\site-packages\pandas\core\internals.pyc in take_nd(self, indexer, axis, new_mgr_locs, fill_tuple)
    994             fill_value = fill_tuple[0]
    995             new_values = algos.take_nd(values, indexer, axis=axis,
--> 996                                        allow_fill=True, fill_value=fill_value)
    997 
    998         if new_mgr_locs is None:

C:\Users\naomi\Anaconda2\lib\site-packages\pandas\core\algorithms.pyc in take_nd(arr, indexer, axis, out, fill_value, mask_info, allow_fill)
    928             out = np.empty(out_shape, dtype=dtype, order='F')
    929         else:
--> 930             out = np.empty(out_shape, dtype=dtype)
    931 
    932     func = _get_take_nd_function(arr.ndim, arr.dtype, out.dtype, axis=axis,

MemoryError: 

0 个答案:

没有答案