我有一个包含日期和值的数据框,
Date Price
Jun 30 95.60
Jun 29 94.40
Jun 28 93.59
Jun 27 92.04
Jun 24 93.40
Jun 23 96.10
Jun 22 95.55
Jun 21 95.91
Jun 20 95.10
Jun 17 95.33
Jun 16 97.55
Jun 15 97.14
Jun 14 97.46
Jun 13 97.34
Jun 10 98.83
Jun 9 99.65
Jun 8 98.94
Jun 7 99.03
Jun 6 98.63
Jun 3 97.92
Jun 2 97.72
我有一个遍历该 DataFrame 的函数:
# Flag every row whose value is greater than at least one of the four
# preceding rows. The first four rows have no complete look-back window,
# so they are flagged False up front.
# NOTE: `.at[i, 'value']` is label-based; this assumes df has the default
# 0..n-1 integer index, as the original code did.
indic_up = [False, False, False, False]
i = 4
# Visit every remaining row. The previous bound (`i+4 <= df.index[-1]`)
# stopped four rows early although the comparison only looks backwards.
while i < len(df):
    # `.at` is the scalar accessor that replaced DataFrame.get_value,
    # which was removed in pandas 1.0.
    current = df.at[i, 'value']
    if (current > df.at[i-1, 'value'] or
            current > df.at[i-2, 'value'] or
            current > df.at[i-3, 'value'] or
            current > df.at[i-4, 'value']):
        indic_up.append(True)
    else:
        indic_up.append(False)
    i = i+1
此函数的逻辑是:如果今天的 value 大于昨天、前天或更早几天(共前 4 天)中任意一天的 value,则结果为 True,否则为 False。
这个函数对我来说似乎很慢,我能否改用类似下面的写法来加快速度:
# Candidate 1: row-wise iteration; iterrows() yields (index label, row) pairs.
for index, row in df.iterrows():
    # Placeholder body: shows how a cell value and the row label are reached.
    row['a'], index
或
# Candidate 2: iterate over the index labels and look each cell up by label.
for idx in df.index:
    # `.ix` was removed in pandas 1.0; `.loc` is the label-based replacement.
    df.loc[idx, 'a'], idx
或者我可以通过将数据帧转换为numpy数组来实现更快的速度吗?
答案 0 :(得分:2)
让我们把 SciPy 也请进来!
思路:要把当前元素与它之前的 4 个值逐一比较,可以改为计算该区间(当前元素及其之前的 4 个值)内的最小值,再与当前元素比较。如果当前元素等于这个最小值,说明它没有大于之前的任何一个值,即所有比较都不成立,结果为 False;否则为 True。因此在代码中,只需将当前元素与该区间的最小值进行比较即可,而这正是 SciPy 的 minimum_filter 派上用场的地方。
实施:
# Import from the public namespace; `scipy.ndimage.filters` is deprecated.
from scipy.ndimage import minimum_filter

# Extract values from the relevant column into a NumPy array for further processing
A = df['value'].values
# Rolling minimum over each element and its 4 predecessors: origin=2 shifts
# the centred 5-wide window to the left so that it ends at the current element.
# An element differs from that minimum exactly when it is greater than at
# least one of the 4 previous values, which is the wanted True case.
indic_up_out = A != minimum_filter(A, size=5, origin=2)
# The first four entries lack a full 5-element runway, so force them to False
indic_up_out[:4] = False
答案 1 :(得分:1)
你实际上可以计时。这是我的实验。它表明列表上的循环比您的方法快得多。 @Divakar的答案实际上非常好。
' Filters the sheet whenever cell F1 changes.
' Rows from row 3 down stay visible when any of their cells in columns A:D
' equals the text in F1 (case-insensitive); all other rows from row 3 down
' are hidden. When F1 is empty every row is shown again.
Private Sub Worksheet_Change(ByVal Target As Range)
    If Not Intersect(Target, Me.Range("F1")) Is Nothing Then
        Application.ScreenUpdating = False
        Dim lastRow As Long, varArr As Variant, val As String, rowArr(1) As Range, i As Long
        val = LCase(Me.Range("F1").Value2)
        ' Start from a fully visible sheet so an earlier filter does not linger.
        Me.UsedRange.EntireRow.Hidden = False
        If Len(val) Then
            ' Last used row across columns A:D.
            lastRow = Application.Max(Me.Cells(Me.Rows.Count, 1).End(xlUp).Row, _
                                      Me.Cells(Me.Rows.Count, 2).End(xlUp).Row, _
                                      Me.Cells(Me.Rows.Count, 3).End(xlUp).Row, _
                                      Me.Cells(Me.Rows.Count, 4).End(xlUp).Row)
            ' Read the block once into an array instead of touching cells in the loop.
            varArr = Me.Range("A1:D" & lastRow).Value2
            ' rowArr(0) collects matching rows, rowArr(1) the non-matching ones.
            For i = 3 To lastRow
                If val = LCase(varArr(i, 1)) Or val = LCase(varArr(i, 2)) Or val = LCase(varArr(i, 3)) Or val = LCase(varArr(i, 4)) Then
                    If rowArr(0) Is Nothing Then
                        Set rowArr(0) = Me.Rows(i)
                    Else
                        Set rowArr(0) = Union(rowArr(0), Me.Rows(i))
                    End If
                Else
                    If rowArr(1) Is Nothing Then
                        Set rowArr(1) = Me.Rows(i)
                    Else
                        Set rowArr(1) = Union(rowArr(1), Me.Rows(i))
                    End If
                End If
            Next
            ' A group is still Nothing when no row fell into it (no match at all,
            ' every row matched, or no rows from row 3 on); using it unguarded
            ' raises run-time error 91.
            If Not rowArr(0) Is Nothing Then rowArr(0).EntireRow.Hidden = False
            If Not rowArr(1) Is Nothing Then rowArr(1).EntireRow.Hidden = True
        End If
        Application.ScreenUpdating = True
    End If
End Sub
这是我机器上的输出:
import pandas as pd
import timeit
import numpy as np
# Sample data from the question: 21 dated rows, newest first.
# The values are stored as floats rather than strings so that `>` compares
# them numerically instead of lexicographically (e.g. '100.10' < '99.65'
# as strings) and so that numeric routines can consume the column directly.
df = pd.DataFrame({
    'Date': [
        'Jun 30', 'Jun 29', 'Jun 28', 'Jun 27', 'Jun 24', 'Jun 23', 'Jun 22',
        'Jun 21', 'Jun 20', 'Jun 17', 'Jun 16', 'Jun 15', 'Jun 14', 'Jun 13',
        'Jun 10', 'Jun 9', 'Jun 8', 'Jun 7', 'Jun 6', 'Jun 3', 'Jun 2',
    ],
    'value': [
        95.60, 94.40, 93.59, 92.04, 93.40, 96.10, 95.55,
        95.91, 95.10, 95.33, 97.55, 97.14, 97.46, 97.34,
        98.83, 99.65, 98.94, 99.03, 98.63, 97.92, 97.72,
    ],
})
def by_df_get_value(frame=None, lookback=4):
    """Flag rows whose 'value' exceeds any of the preceding ``lookback`` rows.

    This is the per-cell DataFrame access variant of the benchmark. The
    function name is kept for compatibility; the scalar lookups use ``.at``
    because ``DataFrame.get_value`` was removed in pandas 1.0.

    Args:
        frame: DataFrame with a 'value' column. Defaults to the module-level
            ``df``. The lookups are label-based, so the frame is assumed to
            carry the default 0..n-1 integer index.
        lookback: Number of preceding rows to compare against (>= 1).

    Returns:
        A list of bools, one per row. The first ``lookback`` rows are always
        False because they have no complete look-back window.

    Raises:
        ValueError: If ``lookback`` is smaller than 1.
    """
    if lookback < 1:
        raise ValueError('lookback must be at least 1')
    if frame is None:
        frame = df
    n_rows = len(frame)
    # Rows without a full window are False; min() keeps short frames correct.
    indic_up = [False] * min(lookback, n_rows)
    # Cover every remaining row; the earlier bound `i+4 <= df.index[-1]`
    # skipped the last four rows although the comparison only looks backwards.
    for i in range(lookback, n_rows):
        current = frame.at[i, 'value']
        indic_up.append(
            any(current > frame.at[i - k, 'value']
                for k in range(1, lookback + 1))
        )
    return indic_up
def by_list(values=None, lookback=4):
    """Flag items that are greater than any of the preceding ``lookback`` items.

    This is the plain-Python-list variant of the benchmark.

    Args:
        values: Iterable of comparable values. Defaults to the 'value' column
            of the module-level ``df`` converted to a list.
        lookback: Number of preceding items to compare against (>= 1).

    Returns:
        A list of bools, one per item. The first ``lookback`` items are always
        False because they have no complete look-back window.

    Raises:
        ValueError: If ``lookback`` is smaller than 1.
    """
    if lookback < 1:
        raise ValueError('lookback must be at least 1')
    if values is None:
        values = df['value'].tolist()
    else:
        values = list(values)
    # Items without a full window are False; min() keeps short inputs correct
    # (the result never has more entries than the input).
    indic_up = [False] * min(lookback, len(values))
    for i in range(lookback, len(values)):
        current = values[i]
        indic_up.append(any(current > prev for prev in values[i - lookback:i]))
    return indic_up
def _mean_runtime(func, runs=10):
    """Return the mean time in seconds of ``runs`` separate single calls of ``func``."""
    # Passing the callable itself avoids the `from __main__ import ...` setup
    # string, which only resolves when this file is executed as a script.
    return np.mean(timeit.repeat(func, repeat=runs, number=1))


print('by_df_get_value(): ', '{:.20f}'.format(_mean_runtime(by_df_get_value)))
print('by_list', '{:.20f}'.format(_mean_runtime(by_list)))