我建立了一个LDA模型和文档-词项矩阵(doc-term matrix)。现在,我想把每个文档的主题添加到 `df['topics_description']` 列中。我为此编写了一个函数,但代码似乎有些错误。我在Stack Overflow上看到idxmax对object类型的数据不起作用,因此尝试将dtype转换为数值类型,但仍然无法找出问题所在。感谢您的帮助。
向df添加主题的函数:
def add_topics_to_df(ldamodel, doc_term_matrix, df, new_col='topics_description', num_topics=3):
    """Return a copy of ``df`` with each document's dominant topic appended.

    Args:
        ldamodel: Object whose ``ldamodel[doc_term_matrix]`` yields, for each
            document, an iterable of ``(topic_id, probability)`` pairs
            (presumably a gensim LDA model -- confirm against the caller).
        doc_term_matrix: Corpus passed to ``ldamodel``; documents are assumed
            to be in the same order as the rows of ``df``.
        df: DataFrame with one row per document.
        new_col: Name of the column that receives the dominant topic id.
        num_topics: Number of topics; topic ids must lie in
            ``range(num_topics)``.

    Returns:
        A new DataFrame: ``df`` plus the column ``new_col`` holding, for each
        row, the id of the topic with the highest probability.

    Raises:
        IndexError: If there are more documents than rows in ``df`` or a
            topic id is outside ``range(num_topics)``.
    """
    # Per-document topic probability matrix. It is created as float and
    # pre-filled with 0.0 so that topics missing from a document's result do
    # not leave NaN/object cells, which idxmax cannot handle reliably.
    doc_topic_probs = pd.DataFrame(0.0, index=df.index, columns=range(num_topics))
    for row_pos, doc_topics in enumerate(ldamodel[doc_term_matrix]):
        for topic_id, prob in doc_topics:
            doc_topic_probs.iat[row_pos, topic_id] = prob

    # idxmax runs over the topic columns only; the label column is created
    # from its result (it must not be read before it exists). A row with no
    # reported topics is all zeros and therefore resolves to topic 0.
    dominant_topic = doc_topic_probs.idxmax(axis=1).rename(new_col)

    # Merge with df (aligned on the shared index).
    return pd.concat([df, dominant_topic], axis=1)
函数调用和进一步处理
# Append the dominant topic id of every document as 'topics_description'.
df = add_topics_to_df(
    ldamodel_description,
    doc_term_matrix_description,
    df,
    new_col='topics_description',
    num_topics=3,
)
# Rename based on understanding of topics.
# The result is assigned back to the column rather than using
# replace(inplace=True) on the selected column: that chained in-place form
# triggers a warning on recent pandas and does not update df when
# Copy-on-Write is enabled.
df['topics_description'] = df['topics_description'].replace(
    {0: 'Location', 1: 'Luxury', 2: 'Budget'}
)
# One-hot encode the topic labels, keeping one indicator column per label.
df = pd.get_dummies(df, columns=['topics_description'], drop_first=False)
print("Dataset has {} rows, {} columns.".format(*df.shape))