我有一个大数据框(约 100 万行、约 20 列),需要用另一个小数据框(10–12 行、2 列)对其进行更新。这个更新操作会执行数千次。我已经在两个数据框上都设置了索引,并使用 `update` 来执行更新,但总耗时仍然增长得很快。请问有没有更好的方法?
我的代码如下所示:
import random
import string
import pandas as pd
from datetime import datetime
import timeit
def update_df(nrows, no_of_updates):
    """Benchmark repeated ``DataFrame.update`` calls against a large frame.

    Builds a frame of ``nrows`` rows with a random 10-character string
    column ``a`` (also used as the index) and three random integer columns
    ``b``, ``c`` and ``d``. Then, ``no_of_updates`` times, it samples up to
    10 rows of that frame, keeps their ``a``/``b`` columns and writes them
    back with ``DataFrame.update``. Only the update loop is timed; the
    elapsed time is printed and returned.

    Args:
        nrows: Number of rows in the large frame.
        no_of_updates: Number of sample-and-update iterations to time.

    Returns:
        The elapsed time of the update loop as a ``datetime.timedelta``.

    Raises:
        ValueError: If updates are requested on a frame with no rows.
    """
    # Local import: the file only imports ``datetime`` from this module.
    from datetime import timedelta

    # Fail fast with a clear message instead of an opaque "empty range"
    # error raised from random.randint inside the loop.
    if no_of_updates > 0 and nrows < 1:
        raise ValueError("nrows must be at least 1 when no_of_updates > 0")

    alphabet = string.ascii_uppercase + string.digits
    col1 = ["".join(random.choices(alphabet, k=10)) for _ in range(nrows)]
    col2 = [random.randint(0, 100) for _ in range(nrows)]
    col3 = [random.randint(0, 100) for _ in range(nrows)]
    col4 = [random.randint(0, 100) for _ in range(nrows)]
    df1 = pd.DataFrame({"a": col1, "b": col2, "c": col3, "d": col4}, index=col1)

    # The frame never changes length, so compute the last position once.
    last_pos = len(df1.index) - 1

    # Monotonic high-resolution timer; wall-clock time can jump mid-run.
    start = timeit.default_timer()
    for _ in range(no_of_updates):
        positions = [random.randint(0, last_pos) for _ in range(10)]
        # The same position may be drawn twice; drop the repeated rows.
        df2 = df1.iloc[positions].drop_duplicates()[["a", "b"]]
        # NOTE(review): DataFrame.update presumably aligns df2 to the full
        # index of df1, so each call would cost time proportional to
        # len(df1). If profiling confirms this, label-based assignment
        # (df1.loc[df2.index, df2.columns] = df2) may be a faster option.
        df1.update(df2)
    # Keep a timedelta so the printed text has the same format as before.
    execution_time = timedelta(seconds=timeit.default_timer() - start)

    print("Time to update for {} rows is {}".format(nrows, execution_time))
    return execution_time
# Script entry: time 10 update rounds against a one-million-row frame.
update_df(nrows=1_000_000, no_of_updates=10)