MemoryError:在65GB RAM

时间:2019-05-23 12:58:26

标签: python pandas csv dataframe join

代码

import pandas as pd
df_trans_items = pd.read_csv('../data/raw/df_transaction_item_201905091747.csv', dtype={'stock_code': str, 'system_reference': str}) #1.5gb
df_trans = pd.read_csv('../data/raw/df_transaction_201905091745.csv', dtype={'system_reference': str, 'trans_reference': str, 'pos_user_code': str  })                  # 1.0 gb

df_stock = pd.read_csv('../data/raw/df_stock_201905091744.csv')                        # 6.2 mb
df_trans_flight = pd.read_csv('../data/raw/df_transaction_flight_201905091747.csv')     # 5.7 mb
df_stock_category = pd.read_csv('../data/raw/df_stock_category_201905091744.csv')                # 12.7 kb
df_pos = pd.read_csv('../data/raw/df_pos_201905091657.csv')                             # 3.8 kb
df_shops = pd.read_csv('../data/raw/df_shop_201905091744.csv')                          # 2.1 kb
df_concessionaire = pd.read_csv('../data/raw/df_concessionaire_201905091656.csv')
df_ikdec = pd.read_csv('../data/raw/df_ikdec_201905091657.csv')
df_trans_staff = pd.read_csv('../data/raw/df_transaction_staff_201905091749.csv')       # 872 bytes

stock_category_merged = pd.merge(df_stock, df_stock_category, how='inner', 
         on=['company_code', 'shop_code', 'category_code'], suffixes=('_stock', '_stck_category'))

stock_concessionaire_merged = pd.merge(stock_category_merged, df_concessionaire, how='inner', 
         on=['company_code'], suffixes=('_stock', '_concessionaire'))

stock_shop_merged = pd.merge(stock_concessionaire_merged, df_shops, how='inner', 
         on=['company_code', 'shop_code'], suffixes=('_stock', '_shops'))

trans_items_merged = pd.merge(df_trans, df_trans_items, how='inner', 
         on=['system_reference'], suffixes=('_trans', '_trans_items'))

trans_pos_merged = pd.merge(trans_items_merged, df_pos, how='inner', 
         on=['pos_code', 'shop_code'], suffixes=('_trans', '_pos'))

# And here fails 
df_final = pd.merge(trans_pos_merged, stock_shop_merged, how='inner', 
         on=['shop_code'], suffixes=('_trans', '_shop'))

问题描述

所以我有一些从PostgreSQL导出的表,我想将它们全部加入,在我的笔记本电脑(archlinux x64)中看到ram消耗达到14gb,然后我切换到公司拥有的linux集群。并保持相同的状态,直到最后一个始终输出MemoryError的连接都可以正常工作,我尝试了 del 变量并使用Dask却没有运气。

所以不确定我如何才能连接所有这些表,较大的文件是

  • df_trans.shape =(7807645,11)
  • df_trans_items.shape =(11618559,14)

但是用于df_final合并的是

  • trans_pos_merged.shape =(11618500,25)
  • stock_shop_merged.shape =(101441,16)

所以我不确定MemoryError是从哪里来的,因为我认为它可以放入65gb的ram中。

---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
<ipython-input-7-0e1d858584c1> in <module>
----> 1 df_final = pd.merge(trans_pos_merged, stock_shop_merged, how='inner', 
         on=['shop_code'], suffixes=('_trans', '_shop'))

/usr/lib64/python3.6/site-packages/pandas/core/reshape/merge.py in merge(left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator, validate)
     46                          copy=copy, indicator=indicator,
     47                          validate=validate)
---> 48     return op.get_result()
     49 
     50 

/usr/lib64/python3.6/site-packages/pandas/core/reshape/merge.py in get_result(self)
    544                 self.left, self.right)
    545 
--> 546         join_index, left_indexer, right_indexer = self._get_join_info()
    547 
    548         ldata, rdata = self.left._data, self.right._data

/usr/lib64/python3.6/site-packages/pandas/core/reshape/merge.py in _get_join_info(self)
    742             join_index, left_indexer, right_indexer = \
    743                 left_ax.join(right_ax, how=self.how, return_indexers=True,
--> 744                              sort=self.sort)
    745         elif self.right_index and self.how == 'left':
    746             join_index, left_indexer, right_indexer = \

/usr/lib64/python3.6/site-packages/pandas/core/indexes/base.py in join(self, other, how, level, return_indexers, sort)
   3287         if not self.is_unique and not other.is_unique:
   3288             return self._join_non_unique(other, how=how,
-> 3289                                          return_indexers=return_indexers)
   3290         elif not self.is_unique or not other.is_unique:
   3291             if self.is_monotonic and other.is_monotonic:

/usr/lib64/python3.6/site-packages/pandas/core/indexes/base.py in _join_non_unique(self, other, how, return_indexers)
   3406                                                  [other._ndarray_values],
   3407                                                  how=how,
-> 3408                                                  sort=True)
   3409 
   3410         left_idx = ensure_platform_int(left_idx)

/usr/lib64/python3.6/site-packages/pandas/core/reshape/merge.py in _get_join_indexers(left_keys, right_keys, sort, how, **kwargs)
   1144     join_func = _join_functions[how]
   1145 
-> 1146     return join_func(lkey, rkey, count, **kwargs)
   1147 
   1148 

pandas/_libs/join.pyx in pandas._libs.join.inner_join()

MemoryError: 

0 个答案:

没有答案