Code
import pandas as pd
df_trans_items = pd.read_csv('../data/raw/df_transaction_item_201905091747.csv', dtype={'stock_code': str, 'system_reference': str})  # 1.5 GB
df_trans = pd.read_csv('../data/raw/df_transaction_201905091745.cscsv'.replace('.cscsv', '.csv') if False else '../data/raw/df_transaction_201905091745.csv', dtype={'system_reference': str, 'trans_reference': str, 'pos_user_code': str})  # 1.0 GB
df_stock = pd.read_csv('../data/raw/df_stock_201905091744.csv')  # 6.2 MB
df_trans_flight = pd.read_csv('../data/raw/df_transaction_flight_201905091747.csv')  # 5.7 MB
df_stock_category = pd.read_csv('../data/raw/df_stock_category_201905091744.csv')  # 12.7 KB
df_pos = pd.read_csv('../data/raw/df_pos_201905091657.csv')  # 3.8 KB
df_shops = pd.read_csv('../data/raw/df_shop_201905091744.csv')  # 2.1 KB
df_concessionaire = pd.read_csv('../data/raw/df_concessionaire_201905091656.csv')
df_ikdec = pd.read_csv('../data/raw/df_ikdec_201905091657.csv')
df_trans_staff = pd.read_csv('../data/raw/df_transaction_staff_201905091749.csv')  # 872 bytes
# Stock side: stock -> category -> concessionaire -> shop
stock_category_merged = pd.merge(df_stock, df_stock_category, how='inner',
                                 on=['company_code', 'shop_code', 'category_code'],
                                 suffixes=('_stock', '_stock_category'))
stock_concessionaire_merged = pd.merge(stock_category_merged, df_concessionaire, how='inner',
                                       on=['company_code'], suffixes=('_stock', '_concessionaire'))
stock_shop_merged = pd.merge(stock_concessionaire_merged, df_shops, how='inner',
                             on=['company_code', 'shop_code'], suffixes=('_stock', '_shops'))

# Transaction side: transactions -> items -> POS
trans_items_merged = pd.merge(df_trans, df_trans_items, how='inner',
                              on=['system_reference'], suffixes=('_trans', '_trans_items'))
trans_pos_merged = pd.merge(trans_items_merged, df_pos, how='inner',
                            on=['pos_code', 'shop_code'], suffixes=('_trans', '_pos'))

# And here it fails
df_final = pd.merge(trans_pos_merged, stock_shop_merged, how='inner',
                    on=['shop_code'], suffixes=('_trans', '_shop'))
Problem description
I have a set of tables exported from PostgreSQL that I want to join together. On my laptop (Arch Linux x64) RAM consumption climbed to about 14 GB, so I switched to a Linux cluster my company owns, and the behaviour is the same: every join works fine until the last one, which always raises a MemoryError. I tried `del`-ing the intermediate frames and also going through Dask (roughly along the lines sketched just below), with no luck.
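By "using Dask" I mean something along these lines (a sketch only; the blocksize value and starting with the two transaction tables are illustrative, not the exact code I ran):

import dask.dataframe as dd

# Read the two largest CSVs lazily so Dask can work partition by partition
# instead of holding everything in RAM at once.
ddf_trans = dd.read_csv('../data/raw/df_transaction_201905091745.csv',
                        dtype={'system_reference': str, 'trans_reference': str, 'pos_user_code': str},
                        blocksize='256MB')
ddf_items = dd.read_csv('../data/raw/df_transaction_item_201905091747.csv',
                        dtype={'stock_code': str, 'system_reference': str},
                        blocksize='256MB')

# Lazy merge: nothing is materialised until .compute() or .to_parquet() is called.
ddf_trans_items = dd.merge(ddf_trans, ddf_items, how='inner',
                           on='system_reference', suffixes=('_trans', '_trans_items'))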
So I'm not sure how to join all of these tables. The largest source files are `df_transaction_item` (~1.5 GB) and `df_transaction` (~1.0 GB), but the frames that actually go into the `df_final` merge are `trans_pos_merged` and `stock_shop_merged`.
I don't see where the MemoryError comes from, because I assumed the result would fit in the 65 GB of RAM. The full traceback is:
---------------------------------------------------------------------------
MemoryError Traceback (most recent call last)
<ipython-input-7-0e1d858584c1> in <module>
----> 1 df_final = pd.merge(trans_pos_merged, stock_shop_merged, how='inner',
on=['shop_code'], suffixes=('_trans', '_shop'))
/usr/lib64/python3.6/site-packages/pandas/core/reshape/merge.py in merge(left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator, validate)
46 copy=copy, indicator=indicator,
47 validate=validate)
---> 48 return op.get_result()
49
50
/usr/lib64/python3.6/site-packages/pandas/core/reshape/merge.py in get_result(self)
544 self.left, self.right)
545
--> 546 join_index, left_indexer, right_indexer = self._get_join_info()
547
548 ldata, rdata = self.left._data, self.right._data
/usr/lib64/python3.6/site-packages/pandas/core/reshape/merge.py in _get_join_info(self)
742 join_index, left_indexer, right_indexer = \
743 left_ax.join(right_ax, how=self.how, return_indexers=True,
--> 744 sort=self.sort)
745 elif self.right_index and self.how == 'left':
746 join_index, left_indexer, right_indexer = \
/usr/lib64/python3.6/site-packages/pandas/core/indexes/base.py in join(self, other, how, level, return_indexers, sort)
3287 if not self.is_unique and not other.is_unique:
3288 return self._join_non_unique(other, how=how,
-> 3289 return_indexers=return_indexers)
3290 elif not self.is_unique or not other.is_unique:
3291 if self.is_monotonic and other.is_monotonic:
/usr/lib64/python3.6/site-packages/pandas/core/indexes/base.py in _join_non_unique(self, other, how, return_indexers)
3406 [other._ndarray_values],
3407 how=how,
-> 3408 sort=True)
3409
3410 left_idx = ensure_platform_int(left_idx)
/usr/lib64/python3.6/site-packages/pandas/core/reshape/merge.py in _get_join_indexers(left_keys, right_keys, sort, how, **kwargs)
1144 join_func = _join_functions[how]
1145
-> 1146 return join_func(lkey, rkey, count, **kwargs)
1147
1148
pandas/_libs/join.pyx in pandas._libs.join.inner_join()
MemoryError:
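To put a number on that assumption, the size of the `df_final` join could be estimated before running it, something along these lines (a sketch; the bytes-per-row scaling is only a rough approximation): merging on `shop_code` alone is many-to-many, so the result gets one row for every matching left/right pair per shop code.

# Sketch: estimate the size of the df_final inner join before running it.
# For each shop code k, an inner join on 'shop_code' alone produces
# left_count[k] * right_count[k] rows, so the totals below show whether the
# result can plausibly fit in 65 GB of RAM.
left_counts = trans_pos_merged['shop_code'].value_counts()
right_counts = stock_shop_merged['shop_code'].value_counts()

# Per-key products, summed over the shop codes present on both sides.
expected_rows = int((left_counts * right_counts).dropna().sum())

# Rough bytes per row of each input, used to scale up to the output size.
left_bytes_per_row = trans_pos_merged.memory_usage(deep=True).sum() / len(trans_pos_merged)
right_bytes_per_row = stock_shop_merged.memory_usage(deep=True).sum() / len(stock_shop_merged)

print(f"expected rows in df_final: {expected_rows:,}")
print(f"rough df_final size: {expected_rows * (left_bytes_per_row + right_bytes_per_row) / 1e9:.1f} GB")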