I have the following function, which does a good job of processing my data. I want to run it on chunks of the data to make it more memory-efficient:
def feat(df):
    cols=['CompanyName','FirstName','MiddleName',
          'LastName','AddressConcat','State',
          'EmailConcat','County','ZipCode']
    features=[hamming_distance,jaro_winkler,damerau_levenshtein_distance,ratio,partial_ratio,partial_token_set_ratio,partial_token_sort_ratio]
    features_names=['hamming_distance','jaro_winkler','damerau_levenshtein_distance','ratio','partial_ratio','partial_token_set_ratio','partial_token_sort_ratio']
    for j in features:
        for i in features_names:
            for col in cols:
                df[col+'_'+i]=df[[col+'_x',col+'_y']].dropna().apply(lambda row: j(row[col+'_x'],row[col+'_y']),axis=1)
    df['Mean']=df.mean(axis=1)
    dft=df[df['Mean']>=38]
    columns=['CompanyName_y','CompanyName_x','ZipCode_y','State_y',
             'City_y','County_y','ZipCode_x','State_x','City_x',
             'County_x','FirstName_x','FirstName_y','MiddleName_x','MiddleName_y',
             'AddressConcat_x','AddressConcat_y',
             'EmailConcat_x','EmailConcat_y']
    for c in columns:
        dft[c+'_soundex']=dft[c].dropna().apply(lambda row: soundex(row))
    return dft
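For reference, the string-metric functions used above aren't imported in the snippet; assuming they come from jellyfish and fuzzywuzzy (both of which export functions with these names), the imports behind the function would look roughly like this:

import pandas as pd
# Assumed imports -- the original snippet does not show them.
from jellyfish import (hamming_distance, jaro_winkler,
                       damerau_levenshtein_distance, soundex)
from fuzzywuzzy.fuzz import (ratio, partial_ratio,
                             partial_token_set_ratio, partial_token_sort_ratio)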
I split the dataframe into 10 chunks, cross-join each chunk with the original dataframe, and then apply the function above:
appended_data = []
chunk_size = int(df.shape[0] / 10)
for start in list(range(0, df.shape[0], chunk_size)):
    df_subset = df.iloc[start:start + chunk_size]
    dfCart=cartesian_product(df_subset, df)
    dfCartResult=feat(dfCart)
    appended_data.append(dfCartResult)
dff = pd.concat(appended_data, axis=1)
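cartesian_product isn't defined above; a minimal sketch of the kind of cross join it presumably performs (merging on a constant temporary key, since this pandas version has no how='cross') would be:

def cartesian_product(left, right):
    # Hypothetical helper: pair every row of `left` with every row of `right`
    # by merging on a constant temporary key, then drop the key again.
    out = left.assign(_key=1).merge(right.assign(_key=1), on='_key',
                                    suffixes=('_x', '_y')).drop(columns='_key')
    return out

Note that even with chunking, each cross-joined chunk still has len(df_subset) * len(df) rows, so the intermediate frames passed to feat stay large. Running this raises the error below: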
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~/.conda/envs/test_py3/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
3077 try:
-> 3078 return self._engine.get_loc(key)
3079 except KeyError:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'MiddleName_hamming_distance'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
~/.conda/envs/test_py3/lib/python3.6/site-packages/pandas/core/internals.py in set(self, item, value, check)
4242 try:
-> 4243 loc = self.items.get_loc(item)
4244 except KeyError:
~/.conda/envs/test_py3/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
3079 except KeyError:
-> 3080 return self._engine.get_loc(self._maybe_cast_indexer(key))
3081
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'MiddleName_hamming_distance'
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-110-b43ec93e6e45> in <module>
4 df_subset = df.iloc[start:start + chunk_size]
5 dfCart=cartesian_product(df_subset, df)
----> 6 dfCartResult=feat(dfCart)
7 appended_data.append(dfCartResult)
8 dff = pd.concat(appended_data, axis=1)
<ipython-input-72-cf2eb45ab3c6> in feat(df)
8 for i in features_names:
9 for col in cols:
---> 10 df[col+'_'+i]=df[[col+'_x',col+'_y']].dropna().apply(lambda row: j(row[col+'_x'],row[col+'_y']),axis=1)
11 df['Mean']=df.mean(axis=1)
12 dft=df[df['Mean']>=38]
~/.conda/envs/test_py3/lib/python3.6/site-packages/pandas/core/frame.py in __setitem__(self, key, value)
3117 else:
3118 # set column
-> 3119 self._set_item(key, value)
3120
3121 def _setitem_slice(self, key, value):
~/.conda/envs/test_py3/lib/python3.6/site-packages/pandas/core/frame.py in _set_item(self, key, value)
3193 self._ensure_valid_index(value)
3194 value = self._sanitize_column(key, value)
-> 3195 NDFrame._set_item(self, key, value)
3196
3197 # check if we are modifying a copy
~/.conda/envs/test_py3/lib/python3.6/site-packages/pandas/core/generic.py in _set_item(self, key, value)
2598
2599 def _set_item(self, key, value):
-> 2600 self._data.set(key, value)
2601 self._clear_item_cache()
2602
~/.conda/envs/test_py3/lib/python3.6/site-packages/pandas/core/internals.py in set(self, item, value, check)
4244 except KeyError:
4245 # This item wasn't present, just insert at end
-> 4246 self.insert(len(self.items), item, value)
4247 return
4248
~/.conda/envs/test_py3/lib/python3.6/site-packages/pandas/core/internals.py in insert(self, loc, item, value, allow_duplicates)
4345
4346 block = make_block(values=value, ndim=self.ndim,
-> 4347 placement=slice(loc, loc + 1))
4348
4349 for blkno, count in _fast_count_smallints(self._blknos[loc:]):
~/.conda/envs/test_py3/lib/python3.6/site-packages/pandas/core/internals.py in make_block(values, placement, klass, ndim, dtype, fastpath)
3203 placement=placement, dtype=dtype)
3204
-> 3205 return klass(values, ndim=ndim, placement=placement)
3206
3207 # TODO: flexible with index=None and/or items=None
~/.conda/envs/test_py3/lib/python3.6/site-packages/pandas/core/internals.py in __init__(self, values, placement, ndim)
2301
2302 super(ObjectBlock, self).__init__(values, ndim=ndim,
-> 2303 placement=placement)
2304
2305 @property
~/.conda/envs/test_py3/lib/python3.6/site-packages/pandas/core/internals.py in __init__(self, values, placement, ndim)
123 raise ValueError(
124 'Wrong number of items passed {val}, placement implies '
--> 125 '{mgr}'.format(val=len(self.values), mgr=len(self.mgr_locs)))
126
127 def _check_ndim(self, values, ndim):
ValueError: Wrong number of items passed 2, placement implies 1
How can I fix this?