我正在使用以下信息创建数据框:
import numpy as np
import pandas as pd
from time import time
# Benchmark: time how long it takes to add `columns` derived columns to a
# DataFrame, one nested np.where chain per column.
start_time = time()
columns = 60

# 700000 rows; a, b, c hold random integers in [0, 10).
Data = pd.DataFrame(
    np.random.randint(low=0, high=10, size=(700000, 3)),
    columns=['a', 'b', 'c'],
)
Data['f'] = (Data.index % 60) + 1  # cycles 1..60 with the row index
Data['column_-1'] = 100  # constant seed column read by every generated column

for i in range(columns):
    Data['column_' + str(i)] = np.where(  # condition 1
        Data['f'] == 1,
        1000 + i,
        np.where(  # condition 2
            i < Data['f'],
            0,
            np.where(  # condition 3
                Data['a'] > Data['b'],
                Data['column_' + str(-1)] * Data['c'],
                Data['column_' + str(-1)],
            ),
        ),
    )

elapsed_time = time() - start_time
print("Elapsed time: %.10f seconds." % elapsed_time)
输出:Elapsed time: 1.0710000992 seconds.
我想知道是否有更好的方法,可以动态生成列并提高脚本的速度。
答案 0 :(得分:2)
使用性能分析器(profiler)可以看到,大部分时间都花在 np.where 上。不幸的是,目前我们对此无能为力。
其次最耗时的似乎是 pandas 的转换,这些转换很慢。因此,我们可以减少这类转换来节省一些时间,同时让代码更简洁(也更易读):
import numpy as np
import pandas as pd
def make_data():
    """Build a fresh input frame for the benchmark functions.

    Returns:
        A DataFrame with 700000 rows and the columns 'a', 'b', 'c' (random
        integers in [0, 10)), 'f' (cycles 1..60 with the row index) and
        'column_-1' (the constant 100).
    """
    data_raw = np.random.randint(low=0, high=10, size=(700000, 3))
    frame = pd.DataFrame(data_raw, columns=['a', 'b', 'c'])
    frame['f'] = (frame.index % 60) + 1
    frame['column_-1'] = 100
    return frame
def run1(Data, n_columns=None):
    """Original: add the generated columns with nested np.where on Series.

    For each i, 'column_<i>' is 1000 + i where f == 1, 0 where i < f, and
    otherwise 'column_-1' * c when a > b, else 'column_-1'.

    Args:
        Data: DataFrame with columns 'a', 'b', 'c', 'f' and 'column_-1'.
            It is modified in place.
        n_columns: Number of columns to generate. When None, the
            module-level COLUMNS value is used.
    """
    if n_columns is None:
        n_columns = COLUMNS
    for i in range(n_columns):
        Data['column_' + str(i)] = np.where(  # Condition 1
            Data['f'] == 1,
            1000 + i,
            np.where(  # Condition 2
                i < Data['f'],
                0,
                np.where(  # Condition 3
                    Data['a'] > Data['b'],
                    Data['column_' + str(-1)] * Data['c'],
                    Data['column_' + str(-1)],
                ),
            ),
        )
def run2(Data, n_columns=None):
    """Cleaned up: same np.where chain, but on plain NumPy arrays.

    The 'f', 'a', 'b' and 'c' columns are pulled out as arrays once, before
    the loop. For each i, 'column_<i>' is 1000 + i where f == 1, 0 where
    f > i, and otherwise prev * c when a > b, else prev, where prev is
    'column_<i-1>'.

    NOTE(review): this reads the previously generated column
    ('column_<i-1>'), whereas run1 always reads 'column_-1', so the two do
    not produce the same values for i >= 1 -- confirm which is intended.

    Args:
        Data: DataFrame with columns 'a', 'b', 'c', 'f' and 'column_-1'.
            It is modified in place.
        n_columns: Number of columns to generate. When None, the
            module-level COLUMNS value is used.
    """
    if n_columns is None:
        n_columns = COLUMNS
    f = Data['f'].values
    a = Data['a'].values
    b = Data['b'].values
    c = Data['c'].values
    for i in range(n_columns):
        prev = Data[f'column_{i-1}'].values
        Data[f'column_{i}'] = np.where(
            f == 1,
            1000 + i,
            np.where(f > i, 0, np.where(a > b, prev * c, prev)),
        )
%timeit run1(make_data())
# 1.31 s ± 101 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit run2(make_data())
# 1.22 s ± 26.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
但是我们仍然调用了 3 次 np.where。NumPy 函数是急切求值(eager)的,每次调用 np.where 都会完整地遍历一遍数据。
让我们做得更好!我们可以将整个过程“压平”,并在一个循环中完成:
def run3(Data, n_columns=None):
    """Add the generated columns with a single pure-Python pass per column.

    Each new column is computed from the previously generated one
    (starting from 'column_-1'), using the same rule as run2 but with one
    explicit loop over the rows instead of three np.where calls.

    Args:
        Data: DataFrame with columns 'a', 'b', 'c', 'f' and 'column_-1'.
            It is modified in place.
        n_columns: Number of columns to generate. When None, the
            module-level COLUMNS value is used.
    """
    def _run3(f, a, b, c, x, i):
        """Return column i given the row arrays and the previous column x."""
        results = np.zeros_like(x)
        for k, (fval, aval, bval, cval, xval) in enumerate(zip(f, a, b, c, x)):
            if fval == 1:
                results[k] = i + 1000
            elif fval > i:
                results[k] = 0
            elif aval > bval:
                results[k] = xval * cval
            else:
                results[k] = xval
        return results

    if n_columns is None:
        n_columns = COLUMNS
    fabc = Data[['f', 'a', 'b', 'c']].values.astype(np.dtype('int64'))
    f, a, b, c = [fabc[:, j] for j in range(4)]
    col = 'column_-1'
    for i in range(n_columns):
        # The column written on the previous iteration is this one's input.
        colm1 = col
        col = f'column_{i}'
        x = Data[colm1].values
        Data[col] = _run3(f, a, b, c, x, i)
%timeit run3(make_data())
# 34.3 s ± 1.4 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
哦...没关系。这就是人们说“ Python慢”时的意思。在C中循环3次比在Python中循环一次快25倍!
很好,让我们在C中循环:
import numba
@numba.jit(nopython=True)
def _run4(f, a, b, c, x, i):
    """Compute generated column i row by row in numba-compiled code.

    Args:
        f, a, b, c: Arrays holding the 'f', 'a', 'b' and 'c' values per row.
        x: Array holding the input column; the result has its shape and dtype.
        i: Number of the column being generated.

    Returns:
        An array where each row is i + 1000 if f == 1, 0 if f > i,
        x * c if a > b, and x otherwise.
    """
    results = np.zeros_like(x)
    for k in range(len(x)):
        # Read row k. Indexing with i (the column number) here would make
        # every row reuse the values of row i.
        fval = f[k]
        aval = a[k]
        bval = b[k]
        cval = c[k]
        xval = x[k]
        if fval == 1:
            results[k] = i + 1000
        elif fval > i:
            results[k] = 0
        elif aval > bval:
            results[k] = xval * cval
        else:
            results[k] = xval
    return results
def run4(Data, n_columns=None):
    """Add the generated columns using the compiled _run4 kernel.

    Each new column is computed by _run4 from the previously generated one,
    starting from 'column_-1'.

    Args:
        Data: DataFrame with columns 'a', 'b', 'c', 'f' and 'column_-1'.
            It is modified in place.
        n_columns: Number of columns to generate. When None, the
            module-level COLUMNS value is used.
    """
    if n_columns is None:
        n_columns = COLUMNS
    fabc = Data[['f', 'a', 'b', 'c']].values.astype(np.dtype('int64'))
    f, a, b, c = [fabc[:, j] for j in range(4)]
    col = 'column_-1'
    for i in range(n_columns):
        # The column written on the previous iteration is this one's input.
        colm1 = col
        col = f'column_{i}'
        x = Data[colm1].values
        Data[col] = _run4(f, a, b, c, x, i)
%timeit run4(make_data())
# 496 ms ± 70.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
就今天而言,这大概已经足够快了。更好的算法 + 去掉巨大的循环开销 = 快。
答案 1 :(得分:1)
请注意,您在循环中执行了冗余计算,这会影响性能。为了说明这一点,我将改用 numpy.select,因为它更容易说明正在发生的事情:
正在初始化:
# Smaller setup for the np.select comparison: 10 generated columns over
# 100000 rows of random integers in [0, 10).
columns = 10
Data = pd.DataFrame(
    np.random.randint(0, 10, size=(100000, 3)),
    columns=list('abc'),
)
Data['f'] = Data.index % 60 + 1  # cycles 1..60 with the row index
Data['column_-1'] = 100  # constant seed column
时间:
冗余计算
%%timeit
# Baseline: every condition and every choice is recomputed on each
# iteration, although only `i < Data['f']` and `1000 + i` depend on i.
for i in range(columns):
    Data['column_' + str(i)] = np.select(
        [Data['f'] == 1, i < Data['f'], Data['a'] > Data['b']],
        [1000 + i, 0, Data['column_-1'] * Data['c']],
        default=Data['column_-1'],
    )
结果:
28.6 ms ± 1.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
将计算提到循环之外
%%timeit
# Everything that does not depend on i is computed once, before the loop.
case_f = Data['f'] == 1
case_ab = Data['a'] > Data['b']
val_ab = Data['column_-1'] * Data['c']
f = Data['f']
default = Data['column_-1']
for i in range(columns):
    Data['column_' + str(i)] = np.select(
        [case_f, i < f, case_ab],
        [1000 + i, 0, val_ab],
        default=default,
    )
结果:
16.1 ms ± 282 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
这可以节省约40-45%的时间。