当我尝试按日期时间增量获取子数组时,如下所示:
dx = dx[(dx[['ts']].diff() > threshold).any(axis=1)]
在下面的示例中,这行代码本应删除彼此间隔太近的刻度标签,但实际上没有起作用。
完整代码:
#!/usr/bin/env python
import re
import pandas as pd
import numpy as np
from datetime import datetime,timedelta
import matplotlib.pyplot as plt
def str2dt(tstr):
    """Parse a ``'MM-DD HH:MM:SS.fff'`` timestamp string into a ``datetime``.

    The format carries no year, so ``strptime`` fills in its default
    year (1900).

    Raises:
        ValueError: if ``tstr`` does not match the expected format.
    """
    return datetime.strptime(tstr, '%m-%d %H:%M:%S.%f')
def plotme():
    """Plot the sample scores over time, with x ticks only where they fit.

    Parses the embedded "timestamp score" lines, plots score against time
    and places x-axis ticks at the first and last sample of every run of
    equal scores.  Candidate ticks that follow the previous candidate by
    less than ``threshold`` are dropped so that labels do not overlap.

    Note: each candidate is compared with the candidate immediately before
    it (not with the last tick that was kept), so a long run of closely
    spaced candidates is removed entirely.

    Returns:
        None.  The figure is displayed with ``plt.show()``.
    """
    lst = [
        "09-04 11:55:05.011 5",
        "09-04 11:55:15.011 2",
        "09-04 11:55:16.011 3",
        "09-04 11:55:20.011 4",
        "09-04 11:55:25.011 4",
        "09-04 11:55:30.011 4",
        "09-04 11:55:35.011 4",
        "09-04 11:55:40.011 4",
        "09-04 11:55:45.011 9",
        "09-04 11:55:50.011 9",
        "09-04 11:55:55.011 9",
        "09-04 11:56:00.011 9",
        "09-04 11:56:05.011 7",
        "09-04 11:56:15.011 8",
    ]

    # Raw strings keep the backslashes intact; the dot before the fractional
    # seconds is escaped so that it only matches a literal '.'.
    tsregex = r"\d+-\d+ \d+:\d+:\d+\.\d+"
    pattern = re.compile(r"(%s).* (\d+)" % tsregex)

    tslst = []
    scorelst = []
    for line in lst:
        m = pattern.search(line)
        if m:
            tslst.append(str2dt(m.group(1)))
            scorelst.append(int(m.group(2)))

    # Build the frame in one go from the collected lists.  This avoids the
    # removed ``DataFrame.ix`` accessor and gives 'ts' a real datetime dtype
    # (row-by-row enlargement of a NaN-filled frame presumably left it as a
    # generic object column), so ``diff()`` yields proper timedeltas.
    df = pd.DataFrame({"ts": tslst, "score": scorelst}, columns=["ts", "score"])

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(df['ts'], df['score'], label="score")

    # A row is a tick candidate when it starts or ends a run of equal scores.
    score = df['score']
    run_start = score.shift() != score
    run_end = score.shift(-1) != score
    dx = df[run_start | run_end].sort_values(['ts'])

    threshold = timedelta(seconds=5)
    gaps = dx['ts'].diff()
    # The first candidate has no predecessor (its gap is NaT), so it cannot
    # be "too close" to anything and is always kept.  ``>=`` keeps ticks that
    # are exactly ``threshold`` apart.
    keep = gaps.isna() | (gaps >= threshold)
    print(gaps)
    print(keep)
    dx = dx[keep]

    fig.autofmt_xdate()
    ax.xaxis.set_ticks(dx['ts'].tolist())
    ax.yaxis.grid(True)
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    plt.show()
# Run the demo: parse the sample data and show the plot.
plotme()
答案 0 :(得分:1)
如果我对您的理解正确,那么您想删除时间与上一行太近的行。试试这个:
lst = [
"09-04 11:55:05.011 5",
"09-04 11:55:15.011 2",
"09-04 11:55:16.011 3",
"09-04 11:55:20.011 4",
"09-04 11:55:25.011 4",
"09-04 11:55:30.011 4",
"09-04 11:55:35.011 4",
"09-04 11:55:40.011 4",
"09-04 11:55:45.011 9",
"09-04 11:55:50.011 9",
"09-04 11:55:55.011 9",
"09-04 11:56:00.011 9",
"09-04 11:56:05.011 7",
"09-04 11:56:15.011 8",
]

# Split every raw line into a 'Date' part and a trailing integer 'Value'.
raw_lines = pd.DataFrame(lst)[0]
df = raw_lines.str.extract(r'(?P<Date>.+) (?P<Value>\d+)', expand=True)

# Convert the extracted strings to real datetime / numeric columns.
df['Date'] = pd.to_datetime(df['Date'], format='%m-%d %H:%M:%S.%f')
df['Value'] = pd.to_numeric(df['Value'])

# Keep only rows that are at least `threshold` later than the row before.
# The first row has no predecessor (NaT gap), so the comparison is False
# for it and it is filtered out as well.
threshold = pd.Timedelta(seconds=5)
gap_to_previous = df['Date'].diff()
mask = gap_to_previous >= threshold

df.loc[mask].plot(x='Date', y='Value')
输出:
请注意,此代码只把每一行与它的上一行进行比较。如果有一长串相邻间隔都很短(例如 2 秒)的行,这些行会全部被过滤掉,最终丢失很大一段时间范围。可以考虑用 resample
将您的所有观测值标准化为统一的时间间隔。