
时间:2017-02-15 06:51:45

标签: python regex python-3.x pandas datetime



import pandas as pd
df = pd.DataFrame({'blobs':['6-Feb- 1 4 Startup ZestFinance says it has built a machine-learning system that’s smart enough to find new borrowers and keep bias out of its credit analysis. 17-Feb-2014',
                         'Credit ratings have long been the key measure of how likely a U.S. 29—Oct-2012 consumer is to repay any loan, from mortgages to 18-0ct-12 credit cards. But the factors that FICO and other companies that create credit scores rely on—things like credit history and credit card balances—often depend on having credit already.  ',
                         'November 22, 2012     In recent years, a crop of startup companies have launched on the premise 6—Feb- ] 4 that borrowers without such histories might still be quite likely to repay, and that their likelihood of doing so could be determined by analyzing large amounts of data, especially  data that has traditionally not been part of the credit evaluation. These companies use algorithms and machine learning to find meaningful patterns in the data, alternative signs that a borrower is a good or bad credit risk.',
                         'March 1“, 2012     Los Angeles-based ZestFinance, founded by former Google CIO Douglas Merrill, claims to have solved this problem with a new credit-scoring platform, called ZAML. 06—Fcb—2012 The company sells the machine-learning software to lenders and also offers consulting 19—Jan— ] 2 services. Zest does not lend money itself. January 2, 1990']})


(*) 出:

    dates1                          dates2
0   6-Feb-14, 17-Feb-2014           NaN
1   29—Oct-2012, 18-0ct-12          NaN
2   6—Feb- ]4                       November 22, 2012
3   06—Fcb—2012, 19—Jan— ] 2        March 1“, 2012 | January 2, 1990



df['col1'] = df['blobs'].str.extract(r"^(?:(?:31(\/|-|\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\1|(?:(?:29|30)(\/|-|\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:29(\/|-|\.)(?:0?2|(?:Feb))\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:0?[1-9]|1\d|2[0-8])(\/|-|\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$")




KeyError                                  Traceback (most recent call last)
/usr/local/lib/python3.5/site-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
   2133             try:
-> 2134                 return self._engine.get_loc(key)
   2135             except KeyError:

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4433)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4279)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13742)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13696)()

KeyError: 'date_format_3'

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
/usr/local/lib/python3.5/site-packages/pandas/core/internals.py in set(self, item, value, check)
   3667         try:
-> 3668             loc = self.items.get_loc(item)
   3669         except KeyError:

/usr/local/lib/python3.5/site-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
   2135             except KeyError:
-> 2136                 return self._engine.get_loc(self._maybe_cast_indexer(key))

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4433)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4279)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13742)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13696)()

KeyError: 'date_format_3'

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
<ipython-input-3-ec929aa5341c> in <module>()
----> 1 df['date_format_3'] = df['text'].str.extract(r"^(?:(?:31(\/|-|\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\1|(?:(?:29|30)(\/|-|\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:29(\/|-|\.)(?:0?2|(?:Feb))\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:0?[1-9]|1\d|2[0-8])(\/|-|\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$", expand = True)
      2 df

/usr/local/lib/python3.5/site-packages/pandas/core/frame.py in __setitem__(self, key, value)
   2417         else:
   2418             # set column
-> 2419             self._set_item(key, value)
   2421     def _setitem_slice(self, key, value):

/usr/local/lib/python3.5/site-packages/pandas/core/frame.py in _set_item(self, key, value)
   2484         self._ensure_valid_index(value)
   2485         value = self._sanitize_column(key, value)
-> 2486         NDFrame._set_item(self, key, value)
   2488         # check if we are modifying a copy

/usr/local/lib/python3.5/site-packages/pandas/core/generic.py in _set_item(self, key, value)
   1499     def _set_item(self, key, value):
-> 1500         self._data.set(key, value)
   1501         self._clear_item_cache()

/usr/local/lib/python3.5/site-packages/pandas/core/internals.py in set(self, item, value, check)
   3669         except KeyError:
   3670             # This item wasn't present, just insert at end
-> 3671             self.insert(len(self.items), item, value)
   3672             return

/usr/local/lib/python3.5/site-packages/pandas/core/internals.py in insert(self, loc, item, value, allow_duplicates)
   3771         block = make_block(values=value, ndim=self.ndim,
-> 3772                            placement=slice(loc, loc + 1))
   3774         for blkno, count in _fast_count_smallints(self._blknos[loc:]):

/usr/local/lib/python3.5/site-packages/pandas/core/internals.py in make_block(values, placement, klass, ndim, dtype, fastpath)
   2683                      placement=placement, dtype=dtype)
-> 2685     return klass(values, ndim=ndim, fastpath=fastpath, placement=placement)
   2687 # TODO: flexible with index=None and/or items=None

/usr/local/lib/python3.5/site-packages/pandas/core/internals.py in __init__(self, values, ndim, fastpath, placement, **kwargs)
   1816         super(ObjectBlock, self).__init__(values, ndim=ndim, fastpath=fastpath,
-> 1817                                           placement=placement, **kwargs)
   1819     @property

/usr/local/lib/python3.5/site-packages/pandas/core/internals.py in __init__(self, values, placement, ndim, fastpath)
    107             raise ValueError('Wrong number of items passed %d, placement '
    108                              'implies %d' % (len(self.values),
--> 109                                              len(self.mgr_locs)))
    111     @property

ValueError: Wrong number of items passed 4, placement implies 1

1 个答案:

答案 0 :(得分:0)


我扩展/纠正了正则表达式模式,但需要更多工作。 (您可以查看并试验this regular expression on regex101

df['blobs'].str.extract("(\d{1,2}[\s-]\w+[\s-]\d{2,4}|\w+[\s-]\d{1,2}[\",\s-]+\d{2,4})", expand = True)


0        17-Feb-2014
1          18-0ct-12
2  November 22, 2012
3    January 2, 1990