我写了一段代码，并且确信它可以用更快的方式实现。有人能给一些建议吗？我知道嵌套 for 循环不太符合 Python 风格（not pythonic）。
这段代码基本上是使用不断变化的窗口大小，计算 doc_topic 矩阵中各文档之间的 KL 散度（KLD）。
def calculate(dataframe, max_window=499):
    """Compute windowed novelty, transience and resonance for each document.

    For every window size ``w`` in ``1..max_window`` and every row index
    ``i`` of the module-level ``doc_topic`` matrix that has at least ``w``
    rows on both sides:

    * novelty    = mean of ``KLD(doc_topic[i], doc_topic[i - d])``, d = 1..w
    * transience = mean of ``KLD(doc_topic[i], doc_topic[i + d])``, d = 1..w
    * resonance  = novelty - transience

    The divergence for a given pair ``(i, i +/- d)`` is evaluated once and
    accumulated in a per-row running sum, instead of being recomputed for
    every window size ``w >= d``.  ``KLD`` is called with the same argument
    pairs as a naive triple loop would use, just far fewer times.  Results
    agree with the naive computation up to floating-point rounding, because
    the order of summation differs.

    NOTE(review): ``KLD`` and ``doc_topic`` are looked up as module-level
    names; ``KLD`` is assumed to return a scalar -- confirm against its
    definition.

    Args:
        dataframe: pandas DataFrame with a ``'date'`` column, read
            positionally so that row ``i`` corresponds to ``doc_topic[i]``.
        max_window: largest window size to evaluate (inclusive).  The
            default of 499 reproduces the original ``range(1, 500)``.

    Returns:
        A DataFrame with columns ``transience``, ``novelty``, ``resonance``,
        ``time_frame`` (the window size) and ``dates``; rows are ordered by
        window size first, then by row index.

    Raises:
        KeyError: if ``dataframe`` has no ``'date'`` column.
        IndexError: if ``dataframe`` has too few rows for the indices used.

    Side effects:
        Prints each window size as a progress indicator and pickles the
        result to ``'df_kld_final.pkl'`` in the current working directory.
    """
    n_docs = doc_topic.shape[0]

    # Read the date column once; building a full row Series per lookup via
    # ``dataframe.iloc[i]`` is expensive inside a hot loop.
    date_values = dataframe['date'].tolist()

    # Running sums of divergences to the previous / next 1..w rows.
    past_sum = np.zeros(n_docs)
    future_sum = np.zeros(n_docs)

    dates = []
    time_frame = []
    novelty_parts = []
    transience_parts = []

    for w in range(1, max_window + 1):
        print(w)
        lo, hi = w, n_docs - w
        if hi <= lo:
            # Window does not fit on both sides of any row.
            continue
        if hi > len(date_values):
            raise IndexError(
                "dataframe has fewer rows than required by doc_topic")

        # Rows valid for window w are a subset of the rows valid for every
        # smaller window, so after this pass past_sum[i] and future_sum[i]
        # hold the sums over distances 1..w for all i in [lo, hi).
        for i in range(lo, hi):
            past_sum[i] += KLD(doc_topic[i], doc_topic[i - w])
            future_sum[i] += KLD(doc_topic[i], doc_topic[i + w])

        # Division creates new arrays, so later updates to the running sums
        # do not alter the values stored for this window.
        novelty_parts.append(past_sum[lo:hi] / w)
        transience_parts.append(future_sum[lo:hi] / w)
        time_frame.extend([w] * (hi - lo))
        dates.extend(date_values[lo:hi])

    if novelty_parts:
        novelty = np.concatenate(novelty_parts)
        transience = np.concatenate(transience_parts)
    else:
        novelty = np.array([], dtype=float)
        transience = np.array([], dtype=float)
    resonance = novelty - transience

    df_kld = pd.DataFrame(
        {
            'transience': transience,
            'novelty': novelty,
            'resonance': resonance,
        },
        columns=['transience', 'novelty', 'resonance'],
    )
    df_kld['time_frame'] = time_frame
    df_kld['dates'] = dates
    df_kld.to_pickle('df_kld_final.pkl')
    return df_kld