将Pandas Column转换为dataframe

时间:2017-10-13 06:05:34

标签: python pandas datetime dataframe python-datetime

我有一个名为' Date'的pandas数据帧列。格式条目:'%Y%m%d%H%M%H%M' (首先%H%M是本地时间&第二%%H%M是UTC)。

我想将其转换为以下格式:%Y-%m-%d_%H%M(保持UTC%H%M)。

obs_df = pd.read_csv(obs, names= ['WBAN','Date','Extinc Coeff', 'D/N', 'Dir 2min av wind', 
                              'Spd 2min av wind(kts)', 'Dir max 5min av wind','Spd_max_5min_av_wind(kts)', 
                              'Constant','Runway vis range'], usecols= ['WBAN', 'Date', 
                                                                        'Spd_max_5min_av_wind(kts)'], 
                 na_filter=False)

这是数据框的样子:

                Date    WBAN    Spd_max_5min_av_wind(kts)
0   2014100108481348    KACK    19
1   2014100108491349    KACK    18
2   2014100108501350    KACK    20
3   2014100108511351    KACK    19
4   2014100108521352    KACK    17

以下是我尝试的内容:

import datetime as dt

obs_df['Date'] = obs_df['Date'].apply(lambda x: dt.datetime.strptime(x, '%Y%m%d%H%M%H%M'))

obs_df['Date'] = pd.to_datetime(obs_df['Date'], format = '%Y%m%d%H%M%H%M')

以下是两次尝试的错误:

---------------------------------------------------------------------------
error                                     Traceback (most recent call last)
<ipython-input-200-27e83cc1348d> in <module>()
----> 1 obs_df['Date'] = obs_df['Date'].apply(lambda x: dt.datetime.strptime(x, '%Y%m%d%H%M%H%M'))

/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/core/series.py in apply(self, func, convert_dtype, args, **kwds)
   2353             else:
   2354                 values = self.asobject
-> 2355                 mapped = lib.map_infer(values, f, convert=convert_dtype)
   2356 
   2357         if len(mapped) and isinstance(mapped[0], Series):

pandas/_libs/src/inference.pyx in pandas._libs.lib.map_infer (pandas/_libs/lib.c:66645)()

<ipython-input-200-27e83cc1348d> in <lambda>(x)
----> 1 obs_df['Date'] = obs_df['Date'].apply(lambda x: dt.datetime.strptime(x, '%Y%m%d%H%M%H%M'))

/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/_strptime.py in _strptime_datetime(cls, data_string, format)
    498     """Return a class cls instance based on the input string and the
    499     format string."""
--> 500     tt, fraction = _strptime(data_string, format)
    501     tzname, gmtoff = tt[-2:]
    502     args = tt[:6] + (fraction,)

/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/_strptime.py in _strptime(data_string, format)
    318         if not format_regex:
    319             try:
--> 320                 format_regex = _TimeRE_cache.compile(format)
    321             # KeyError raised when a bad format is found; can be specified as
    322             # \\, in which case it was a stray % but with a space after it

/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/_strptime.py in compile(self, format)
    266     def compile(self, format):
    267         """Return a compiled re object for the format string."""
--> 268         return re_compile(self.pattern(format), IGNORECASE)
    269 
    270 _cache_lock = _thread_allocate_lock()

/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/re.py in compile(pattern, flags)
    222 def compile(pattern, flags=0):
    223     "Compile a regular expression pattern, returning a pattern object."
--> 224     return _compile(pattern, flags)
    225 
    226 def purge():

/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/re.py in _compile(pattern, flags)
    291     if not sre_compile.isstring(pattern):
    292         raise TypeError("first argument must be string or compiled pattern")
--> 293     p = sre_compile.compile(pattern, flags)
    294     if not (flags & DEBUG):
    295         if len(_cache) >= _MAXCACHE:

/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/sre_compile.py in compile(p, flags)
    534     if isstring(p):
    535         pattern = p
--> 536         p = sre_parse.parse(p, flags)
    537     else:
    538         pattern = None

/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/sre_parse.py in parse(str, flags, pattern)
    827     pattern.str = str
    828 
--> 829     p = _parse_sub(source, pattern, 0)
    830     p.pattern.flags = fix_flags(str, p.pattern.flags)
    831 

/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/sre_parse.py in _parse_sub(source, state, nested)
    435     start = source.tell()
    436     while True:
--> 437         itemsappend(_parse(source, state))
    438         if not sourcematch("|"):
    439             break

/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/sre_parse.py in _parse(source, state)
    772                     group = state.opengroup(name)
    773                 except error as err:
--> 774                     raise source.error(err.msg, len(name) + 1) from None
    775             if condgroup:
    776                 p = _parse_sub_cond(source, state, condgroup)

error: redefinition of group name 'H' as group 6; was group 4 at position 127

我也尝试了这个:

obs_df['Date'] = pd.to_datetime(obs_df['Date'], errors='raise', yearfirst=True, utc=True, box=False, format="%Y-%m-%d_%H%M", exact=False,
                            infer_datetime_format=True)

但得到了这个错误:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
pandas/_libs/tslib.pyx in pandas._libs.tslib.array_to_datetime (pandas/_libs/tslib.c:44294)()

pandas/_libs/src/datetime.pxd in datetime._string_to_dts (pandas/_libs/tslib.c:98425)()

ValueError: Error parsing datetime string "2014100108481348" at position 8

During handling of the above exception, another exception occurred:

OverflowError                             Traceback (most recent call last)
pandas/_libs/tslib.pyx in pandas._libs.tslib.array_to_datetime (pandas/_libs/tslib.c:44703)()

pandas/_libs/tslib.pyx in pandas._libs.tslib.parse_datetime_string (pandas/_libs/tslib.c:35351)()

/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/dateutil/parser.py in parse(timestr, parserinfo, **kwargs)
   1181     else:
-> 1182         return DEFAULTPARSER.parse(timestr, **kwargs)
   1183 

/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/dateutil/parser.py in parse(self, timestr, default, ignoretz, tzinfos, **kwargs)
    577 
--> 578             if cday > monthrange(cyear, cmonth)[1]:
    579                 repl['day'] = monthrange(cyear, cmonth)[1]

/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/calendar.py in monthrange(year, month)
    120         raise IllegalMonthError(month)
--> 121     day1 = weekday(year, month, 1)
    122     ndays = mdays[month] + (month == February and isleap(year))

/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/calendar.py in weekday(year, month, day)
    112        day (1-31)."""
--> 113     return datetime.date(year, month, day).weekday()
    114 

OverflowError: signed integer is greater than maximum

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
pandas/_libs/tslib.pyx in pandas._libs.tslib.array_to_datetime (pandas/_libs/tslib.c:44803)()

TypeError: invalid string coercion to datetime

During handling of the above exception, another exception occurred:

OverflowError                             Traceback (most recent call last)
<ipython-input-205-5e9ed01bf0eb> in <module>()
      1 obs_df['Date'] = pd.to_datetime(obs_df['Date'], errors='raise', yearfirst=True, utc=True, box=False, format="%Y-%m-%d_%H%M", exact=False,
----> 2                                 infer_datetime_format=True)

/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/core/tools/datetimes.py in to_datetime(arg, errors, dayfirst, yearfirst, utc, box, format, exact, unit, infer_datetime_format, origin)
    507     elif isinstance(arg, ABCSeries):
    508         from pandas import Series
--> 509         values = _convert_listlike(arg._values, False, format)
    510         result = Series(values, index=arg.index, name=arg.name)
    511     elif isinstance(arg, (ABCDataFrame, MutableMapping)):

/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/core/tools/datetimes.py in _convert_listlike(arg, box, format, name, tz)
    433                     dayfirst=dayfirst,
    434                     yearfirst=yearfirst,
--> 435                     require_iso8601=require_iso8601
    436                 )
    437 

pandas/_libs/tslib.pyx in pandas._libs.tslib.array_to_datetime (pandas/_libs/tslib.c:46617)()

pandas/_libs/tslib.pyx in pandas._libs.tslib.array_to_datetime (pandas/_libs/tslib.c:46233)()

pandas/_libs/tslib.pyx in pandas._libs.tslib.array_to_datetime (pandas/_libs/tslib.c:46122)()

pandas/_libs/tslib.pyx in pandas._libs.tslib.parse_datetime_string (pandas/_libs/tslib.c:35351)()

/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/dateutil/parser.py in parse(timestr, parserinfo, **kwargs)
   1180         return parser(parserinfo).parse(timestr, **kwargs)
   1181     else:
-> 1182         return DEFAULTPARSER.parse(timestr, **kwargs)
   1183 
   1184 

/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/dateutil/parser.py in parse(self, timestr, default, ignoretz, tzinfos, **kwargs)
    576             cday = default.day if res.day is None else res.day
    577 
--> 578             if cday > monthrange(cyear, cmonth)[1]:
    579                 repl['day'] = monthrange(cyear, cmonth)[1]
    580 

/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/calendar.py in monthrange(year, month)
    119     if not 1 <= month <= 12:
    120         raise IllegalMonthError(month)
--> 121     day1 = weekday(year, month, 1)
    122     ndays = mdays[month] + (month == February and isleap(year))
    123     return day1, ndays

/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/calendar.py in weekday(year, month, day)
    111     """Return weekday (0-6 ~ Mon-Sun) for year (1970-...), month (1-12),
    112        day (1-31)."""
--> 113     return datetime.date(year, month, day).weekday()
    114 
    115 

OverflowError: signed integer is greater than maximum

正如您所知,我没有尝试添加连字符和下划线,因为这可能需要另一步,但是对此的任何指导都将非常感激。

2 个答案:

答案 0 :(得分:1)

使用format='%Y%m%d%H%M%S%f'

In [1454]: pd.to_datetime(df.Date, format='%Y%m%d%H%M%S%f')
Out[1454]:
0   2014-10-01 08:48:13.480
1   2014-10-01 08:49:13.490
2   2014-10-01 08:50:13.500
3   2014-10-01 08:51:13.510
4   2014-10-01 08:52:13.520
Name: Date, dtype: datetime64[ns]

如果需要的话,将其四舍五入到分钟。

In [1462]: pd.to_datetime(df.Date, format='%Y%m%d%H%M%S%f').dt.round('60S')
Out[1462]:
0   2014-10-01 08:48:00
1   2014-10-01 08:49:00
2   2014-10-01 08:50:00
3   2014-10-01 08:51:00
4   2014-10-01 08:52:00
Name: Date, dtype: datetime64[ns]

详细

In [1455]: df
Out[1455]:
               Date  WBAN  Spd_max_5min_av_wind(kts)
0  2014100108481348  KACK                         19
1  2014100108491349  KACK                         18
2  2014100108501350  KACK                         20
3  2014100108511351  KACK                         19
4  2014100108521352  KACK                         17

答案 1 :(得分:1)

我认为不可能再次使用UTC,因此请仅删除UTC日期时间的最后4个数字:

obs_df['Date'] = pd.to_datetime(obs_df['Date'].astype(str).str[:-4], format = '%Y%m%d%H%M')
print (obs_df)
                 Date  WBAN  Spd_max_5min_av_wind(kts)
0 2014-10-01 08:48:00  KACK                         19
1 2014-10-01 08:49:00  KACK                         18
2 2014-10-01 08:50:00  KACK                         20
3 2014-10-01 08:51:00  KACK                         19
4 2014-10-01 08:52:00  KACK                         17

或者如果只需要另一个日期时间为%H%M

d = obs_df['Date'].astype(str)
d = d.str[:8] + d.str[-4:]
print (d)
0    201410011348
1    201410011349
2    201410011350
3    201410011351
4    201410011352
Name: Date, dtype: object

obs_df['Date'] = pd.to_datetime(d, format = '%Y%m%d%H%M')
print (obs_df)
                 Date  WBAN  Spd_max_5min_av_wind(kts)
0 2014-10-01 13:48:00  KACK                         19
1 2014-10-01 13:49:00  KACK                         18
2 2014-10-01 13:50:00  KACK                         20
3 2014-10-01 13:51:00  KACK                         19
4 2014-10-01 13:52:00  KACK                         17

带时区的解决方案:

import pytz

#find offset as difference of hours form first date
a = str(obs_df.loc[0, 'Date'])
offset = (int(a[12:14]) - int(a[8:10])) * 60
print (offset)
300

obs_df['Date'] = pd.to_datetime(obs_df['Date'].astype(str).str[:-4], format = '%Y%m%d%H%M')
print (obs_df)
                 Date  WBAN  Spd_max_5min_av_wind(kts)
0 2014-10-01 08:48:00  KACK                         19
1 2014-10-01 08:49:00  KACK                         18
2 2014-10-01 08:50:00  KACK                         20
3 2014-10-01 08:51:00  KACK                         19
4 2014-10-01 08:52:00  KACK                         17

首先tz_localize然后tz_convert抵消:

obs_df['Date'] = obs_df['Date'].dt.tz_localize('UTC').dt.tz_convert(pytz.FixedOffset(offset))
print (obs_df)
                       Date  WBAN  Spd_max_5min_av_wind(kts)
0 2014-10-01 13:48:00+05:00  KACK                         19
1 2014-10-01 13:49:00+05:00  KACK                         18
2 2014-10-01 13:50:00+05:00  KACK                         20
3 2014-10-01 13:51:00+05:00  KACK                         19
4 2014-10-01 13:52:00+05:00  KACK                         17

更多信息是here