I have a large dataset, about 1.6 GB, of general-purpose text, and I want to split it up by topic, so I applied LDA. The problem is that it takes a very long time to produce any output: on smaller data the performance is fine, but on the full dataset it degrades badly.
Can anyone suggest how to speed this up?
A few sample lines from the dataset:
I love food.
I like cricket.
I like reading books.
Her samsung mobile is not great.
I don't like tennis.
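One option I was considering, since most of the time seems to go into holding and training on all the data at once, is to stream the file and use gensim's LdaMulticore instead. This is only a rough, untested sketch; the tokenization and the filter thresholds are guesses meant to mirror the min_df/max_df settings in my scikit-learn code further down.

from gensim.corpora import Dictionary
from gensim.models import LdaMulticore

class LineCorpus:
    # Re-iterable corpus: converts one line at a time to bag-of-words,
    # so the whole 1.6 GB file never has to sit in memory.
    def __init__(self, path, dictionary):
        self.path = path
        self.dictionary = dictionary

    def __iter__(self):
        with open(self.path, encoding='utf-8') as f:
            f.readline()  # skip the header line
            for line in f:
                yield self.dictionary.doc2bow(line.lower().split())

path = '/Users/abc/dataset.txt'
# First pass over the file just to build the vocabulary.
dictionary = Dictionary(line.lower().split() for line in open(path, encoding='utf-8'))
dictionary.filter_extremes(no_below=2, no_above=0.95)  # roughly min_df=2, max_df=0.95

lda = LdaMulticore(LineCorpus(path, dictionary), id2word=dictionary,
                   num_topics=2, workers=3, passes=1)
for topic_id, words in lda.print_topics(num_words=10):
    print(topic_id, words)

My current scikit-learn code, which works fine on small inputs, is below.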
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)
def load_data(filename):
    # One review per line; the first line is a header and is skipped.
    reviews = list()
    with open(filename, encoding='utf-8') as file:
        file.readline()  # skip the header line
        for line in file:
            reviews.append(line.strip())  # keep the whole review, not just its first token
    return reviews
data = load_data('/Users/abc/dataset.txt')
#print("Data:" , data)
def display_topics(model, feature_names, no_top_words):
    # Print the highest-weighted words for each learned topic.
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
no_features = 100
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(data)
tf_feature_names = tf_vectorizer.get_feature_names_out()  # get_feature_names() was removed in scikit-learn 1.2
no_topics = 2
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online',
                                learning_offset=50., random_state=0).fit(tf)  # n_topics was renamed to n_components
no_top_words = 10
display_topics(lda, tf_feature_names, no_top_words)
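Another idea I had is to keep scikit-learn but feed the data in chunks, since LatentDirichletAllocation supports incremental training via partial_fit. Again only a rough, untested sketch; the batch size and the 50,000-line sample used to fix the vocabulary are arbitrary guesses.

from itertools import islice

def batches(path, size=10000):
    # Yield the file as lists of `size` stripped lines, skipping the header.
    with open(path, encoding='utf-8') as f:
        f.readline()
        while True:
            chunk = [line.strip() for line in islice(f, size)]
            if not chunk:
                return
            yield chunk

path = '/Users/abc/dataset.txt'
# Fit the vocabulary once on an initial sample so every batch is transformed consistently.
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features,
                                stop_words='english').fit(next(batches(path, 50000)))

lda = LatentDirichletAllocation(n_components=no_topics, learning_method='online',
                                learning_offset=50., random_state=0)
for chunk in batches(path):
    lda.partial_fit(tf_vectorizer.transform(chunk))  # one online update per chunk

display_topics(lda, tf_vectorizer.get_feature_names_out(), no_top_words)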