import pandas as pd
import numpy as np
import datetime as dt
# Create Column names
col_names = ['930', '931', '932', '933', '934', '935']
# Create Index datetimes
idx_names = pd.date_range(start = dt.datetime(2011, 1, 1), periods = 10, freq= 'D')
# Create dataframe with previously created column names and index datetimes
df1 = pd.DataFrame(np.random.randn(10, 6), columns=col_names, index=idx_names)
# Change the column names from strings to datetimes.time() object
df1.columns = [dt.datetime.strptime(x, '%H%M').time() for x in df1.columns]
# This step and the next step changes the dataframe into a chronological timeseries
df2 = df1.T.unstack()
df2.index = [dt.datetime.combine(x[0], x[1]) for x in df2.index.tolist()]
# Show the series
df2
问题:创建特定列表的最pythonic / pandas-thonic方法是什么?这个列表会说“每次9:32和9:34之间的差异在0到.50之间时,9:34和第二天9:34之间的区别是什么。
我用数据帧格式的数字(沿x轴的日期和沿y轴的时间)这样做,我想说的是(下面是伪代码,上面不是伪代码):
# Create a column with wrong answers and right answers
df['Today 934 minus yesterday 934'] = df[934] - df[934].shift(1)
# Boolean mask were condition 1 (diff > 0) and condition 2 (diff < .5) are true
mask = (df[934].shift(1) - df[932].shift(1) > 0) & (df[934].shift(1) - df[932].shift(1) < .5)
# Apply the boolean mask to the dataframe. This is will remove all the answers
# I dont want from the df['Today 934 minus yesterday 934'] column
df2 = df[mask]
# Only the answers I want:
answers = df['Today 934 minus yesterday 934']
答案 0 :(得分:1)
我的尝试,基本上是你的伪代码的填写版本。其他人可能会采取更清洁的方法。
mask1 = (df2.index.hour == 9) & (df2.index.minute == 34)
mask2 = (df2.index.hour == 9) & (df2.index.minute == 32)
diff_934 = df2[mask1] - df2[mask1].shift(-1)
diff_934 = diff_934[diff_934.index.minute == 34]
diff_932 = df2[mask1|mask2] - df2[mask1|mask2].shift(-1)
diff_932 = diff_932[diff_932.index.minute == 34]
diff_932 = diff_932[(diff_932 > 0) & (diff_932 < .5)]
answer = diff_934.reindex(diff_932.index)
In [116]: answer
Out[116]:
2011-01-02 09:34:00 -0.874153
2011-01-08 09:34:00 0.186254
dtype: float64