数据帧块上的运行功能

时间:2019-03-08 21:26:19

标签: python dataframe

问题:

我有以下功能可以很好地处理数据。我想在数据块上运行它以提高内存效率:

脚本:

def feat(df):
cols=['CompanyName','FirstName','MiddleName',
      'LastName','AddressConcat','State',
      'EmailConcat','State','County','ZipCode','EmailConcat']
features=[hamming_distance,jaro_winkler,damerau_levenshtein_distance,ratio,partial_ratio,partial_token_set_ratio,partial_token_sort_ratio]
features_names=['hamming_distance','jaro_winkler','damerau_levenshtein_distance','ratio,partial_ratio','partial_token_set_ratio','partial_token_sort_ratio']
for j in features:
    for i in features_names:
        for col in cols:
            df[col+'_'+i]=df[[col+'_x',col+'_y']].dropna().apply(lambda row: j(row[col+'_x'],row[col+'_y']),axis=1)
df['Mean']=df.mean(axis=1)
dft=df[df['Mean']>=38]
columns=['CompanyName_y','CompanyName_x','ZipCode_y','State_y',
         'City_y','County_y','ZipCode_x','State_x','City_x',
         'County_x','FirstName_x','FirstName_y','MiddleName_x','MiddleName_y',
        'AddressConcat_x','AddressConcat_y',
         'EmailConcat_x','EmailConcat_y']
for c in columns:
    dft[c+'_soundex']=dft[c].dropna().apply(lambda row: soundex(row))
return dft

我将数据帧分为10个块,将每个块与原始数据帧进行交叉连接,然后应用上面提供的功能:

appended_data = []
chunk_size = int(df.shape[0] / 10)
for start in list(range(0, df.shape[0], chunk_size)):
    df_subset = df.iloc[start:start + chunk_size]
    dfCart=cartesian_product(df_subset, df)
    dfCartResult=feat(dfCart)
    appended_data.append(dfCartResult)
dff = pd.concat(appended_data, axis=1)

错误:

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
~/.conda/envs/test_py3/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   3077             try:
-> 3078                 return self._engine.get_loc(key)
   3079             except KeyError:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'MiddleName_hamming_distance'

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
~/.conda/envs/test_py3/lib/python3.6/site-packages/pandas/core/internals.py in set(self, item, value, check)
   4242         try:
-> 4243             loc = self.items.get_loc(item)
   4244         except KeyError:

~/.conda/envs/test_py3/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   3079             except KeyError:
-> 3080                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   3081 

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'MiddleName_hamming_distance'

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
<ipython-input-110-b43ec93e6e45> in <module>
      4     df_subset = df.iloc[start:start + chunk_size]
      5     dfCart=cartesian_product(df_subset, df)
----> 6     dfCartResult=feat(dfCart)
      7     appended_data.append(dfCartResult)
      8 dff = pd.concat(appended_data, axis=1)

<ipython-input-72-cf2eb45ab3c6> in feat(df)
      8         for i in features_names:
      9             for col in cols:
---> 10                 df[col+'_'+i]=df[[col+'_x',col+'_y']].dropna().apply(lambda row: j(row[col+'_x'],row[col+'_y']),axis=1)
     11     df['Mean']=df.mean(axis=1)
     12     dft=df[df['Mean']>=38]

~/.conda/envs/test_py3/lib/python3.6/site-packages/pandas/core/frame.py in __setitem__(self, key, value)
   3117         else:
   3118             # set column
-> 3119             self._set_item(key, value)
   3120 
   3121     def _setitem_slice(self, key, value):

~/.conda/envs/test_py3/lib/python3.6/site-packages/pandas/core/frame.py in _set_item(self, key, value)
   3193         self._ensure_valid_index(value)
   3194         value = self._sanitize_column(key, value)
-> 3195         NDFrame._set_item(self, key, value)
   3196 
   3197         # check if we are modifying a copy

~/.conda/envs/test_py3/lib/python3.6/site-packages/pandas/core/generic.py in _set_item(self, key, value)
   2598 
   2599     def _set_item(self, key, value):
-> 2600         self._data.set(key, value)
   2601         self._clear_item_cache()
   2602 

~/.conda/envs/test_py3/lib/python3.6/site-packages/pandas/core/internals.py in set(self, item, value, check)
   4244         except KeyError:
   4245             # This item wasn't present, just insert at end
-> 4246             self.insert(len(self.items), item, value)
   4247             return
   4248 

~/.conda/envs/test_py3/lib/python3.6/site-packages/pandas/core/internals.py in insert(self, loc, item, value, allow_duplicates)
   4345 
   4346         block = make_block(values=value, ndim=self.ndim,
-> 4347                            placement=slice(loc, loc + 1))
   4348 
   4349         for blkno, count in _fast_count_smallints(self._blknos[loc:]):

~/.conda/envs/test_py3/lib/python3.6/site-packages/pandas/core/internals.py in make_block(values, placement, klass, ndim, dtype, fastpath)
   3203                      placement=placement, dtype=dtype)
   3204 
-> 3205     return klass(values, ndim=ndim, placement=placement)
   3206 
   3207 # TODO: flexible with index=None and/or items=None

~/.conda/envs/test_py3/lib/python3.6/site-packages/pandas/core/internals.py in __init__(self, values, placement, ndim)
   2301 
   2302         super(ObjectBlock, self).__init__(values, ndim=ndim,
-> 2303                                           placement=placement)
   2304 
   2305     @property

~/.conda/envs/test_py3/lib/python3.6/site-packages/pandas/core/internals.py in __init__(self, values, placement, ndim)
    123             raise ValueError(
    124                 'Wrong number of items passed {val}, placement implies '
--> 125                 '{mgr}'.format(val=len(self.values), mgr=len(self.mgr_locs)))
    126 
    127     def _check_ndim(self, values, ndim):

ValueError: Wrong number of items passed 2, placement implies 1

我该如何解决这个问题?

0 个答案:

没有答案