我想对按小时采样的数据使用 pandas 的 Holiday 功能,得到一个布尔型 numpy 数组,其中假期当天的全部 24 个小时都为 False。我已经用 df.apply() 实现了这一点,但效率不高。代码如下:
import pandas as pd
from pandas.tseries.holiday import Holiday, nearest_workday
from dateutil.relativedelta import MO
from dataclasses import dataclass
# Hourly DatetimeIndex built from 1/1/2019 to 12/31/2019.
dt = pd.date_range(start='1/1/2019', end='12/31/2019', freq='H')


@dataclass
class Custom_Holidays:
    # Holds the two Holiday rules as plain class attributes.
    # todo: rework; Holiday object has start_date and end_date
    labor_day = Holiday('Labor Day', month=9, day=1, offset=pd.DateOffset(weekday=MO(1)))
    independence_day = Holiday('Independence Day', month=7, day=4)


holidays = Custom_Holidays()

# Each mask is True where the hourly stamp is NOT one of the dates returned by
# the corresponding Holiday rule between the first and last stamp of dt.
# this only filters out 1 hour instead of 24 hours
# NOTE(review): presumably dates() yields a single timestamp per holiday, so
# isin() can only match one hourly stamp per holiday -- confirm.
independence_day_mask = ~dt.isin(holidays.independence_day.dates(dt[0], dt[-1]))
labor_day_mask = ~dt.isin(holidays.labor_day.dates(dt[0], dt[-1]))

# tests fail -- this should filter out
# NOTE(review): np is not imported anywhere in the visible snippet.
assert len(dt) - np.sum(independence_day_mask*1) == 24
# NOTE(review): identical to the assertion above; labor_day_mask was probably
# intended here.
assert len(dt) - np.sum(independence_day_mask*1) == 24
我认为这与对小时值而非每日值应用掩码有关,但我仍然认为这应该可行。
答案 0 :(得分:1)
可以看看这个(this),希望对你有帮助。基本思路是:把每个假期日期展开为按小时频率的日期范围。
import pandas as pd
from dateutil.relativedelta import MO
from pandas.tseries.holiday import AbstractHolidayCalendar, Holiday

# Hourly timestamps to be masked. The lowercase 'h' alias is used because the
# uppercase 'H' spelling is deprecated in recent pandas releases.
dt = pd.date_range(start='1/1/2019', end='12/31/2019', freq='h')


class Custom_Holidays(AbstractHolidayCalendar):
    """Holiday calendar containing Labor Day and Independence Day."""

    # todo: rework; Holiday object has start_date and end_date
    rules = [
        Holiday('Labor Day', month=9, day=1, offset=pd.DateOffset(weekday=MO(1))),
        Holiday('Independence Day', month=7, day=4),
    ]


# One timestamp per holiday that falls inside the span covered by dt.
holidays = Custom_Holidays().holidays(dt.min().date(), dt.max().date())

# For the holidays, collect every hourly stamp of the day. Normalizing dt to
# midnight lets each hour be compared against the holiday dates in a single
# vectorized isin() call, so no per-holiday loop is needed and no placeholder
# range has to seed the index (the previous seed, start=1/1/2019 without
# quotes, was a float division rather than a date).
holiday_df = dt[dt.normalize().isin(holidays)]

# True for regular hours, False for all 24 hours of each holiday.
holiday_mask = ~dt.isin(holiday_df)

print(len(dt) - holiday_mask.sum())  # this will give you 48 (24 + 24 for 2 days as holidays)