我尝试用TF-IDF加权的word2vec向量进行DBSCAN聚类,并为DBSCAN尝试了不同的epsilon和minpts阈值。我也尝试了使用不同minpts的OPTICS聚类方法,但它根本没有产生任何输出。
#Import libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from unidecode import unidecode # $ pip install unidecode
import gensim
import csv
import nltk
from sklearn.feature_extraction import text
import pandas as pd
import numpy as np
from collections import defaultdict
from string import lower
# Read the first 500,000 rows of the CSV and keep the `Certi` column as a list.
dat = pd.read_csv('D:\\data_800k.csv', encoding='latin', nrows=500000).Certi.tolist()
# NOTE(review): the lemmatizer is instantiated but not used by the visible code.
wnl = WordNetLemmatizer()
# nltk.word_tokenize needs the 'punkt' tokenizer models; if they are missing,
# run nltk.download('punkt') once.
# Extend scikit-learn's built-in English stop-word set with domain-specific
# words that should be ignored as well.
my_stop_words = text.ENGLISH_STOP_WORDS.union(
    ['education', 'certification', 'certificate', 'certified'])
def tokenize_stop(row):
    """Tokenize one text row and drop stop words.

    The row is transliterated to ASCII with ``unidecode``, lower-cased,
    split into tokens with ``nltk.word_tokenize``, and filtered against the
    module-level ``my_stop_words`` collection.

    Args:
        row: The raw text of a single document.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    # Use the str.lower() method rather than string.lower(), which only
    # exists on Python 2.
    tokens = nltk.word_tokenize(unidecode(row).lower())
    return [token for token in tokens if token not in my_stop_words]
def preprocess(dat):
    """Tokenize every document in ``dat``.

    Args:
        dat: An iterable of raw text rows.

    Returns:
        A list with one token list per row, as produced by
        ``tokenize_stop``.
    """
    return [tokenize_stop(row) for row in dat]
# Tokenized corpus: one list of tokens per input row.
X = preprocess(dat)

# Train word2vec on the tokenized corpus (size=100) and expose the result as a
# plain word -> vector mapping.
# NOTE(review): `size`, `index2word` and `syn0` are the names used by older
# gensim releases; confirm they match the installed gensim version.
model = gensim.models.Word2Vec(X, size=100)
w2v = {
    word: vector
    for word, vector in zip(model.wv.index2word, model.wv.syn0)
}

# Fit TF-IDF on the already-tokenized documents; the identity analyzer makes
# the vectorizer use each token list as-is.
tfidf = TfidfVectorizer(analyzer=lambda tokens: tokens)
tfidf.fit(X)
# Largest IDF value among the known vocabulary.
max_idf = tfidf.idf_.max()
# Train model
def fit(X):
    """Fit TF-IDF on tokenized documents and return word -> IDF weights.

    Args:
        X: An iterable of token lists (one list per document).

    Returns:
        A ``defaultdict`` mapping each vocabulary word to its IDF value.
        Looking up a word that was not in the vocabulary yields the largest
        IDF value seen during fitting.
    """
    # The identity analyzer makes the vectorizer take each token list as-is.
    vectorizer = TfidfVectorizer(analyzer=lambda tokens: tokens)
    vectorizer.fit(X)
    # If a word was never seen, it must be at least as infrequent as any of
    # the known words, so the default IDF is the maximum of the known IDFs.
    max_idf = max(vectorizer.idf_)
    return defaultdict(
        lambda: max_idf,
        [(word, vectorizer.idf_[index])
         for word, index in vectorizer.vocabulary_.items()])
# Actual training: build the word -> IDF weight lookup from the tokenized corpus X.
word2weight = fit(X)
# Multiply word2vec vectors by their TF-IDF weights
def transform_word2vec_tfidf(X, word2vec, word2weight):
    """Turn each tokenized document into one weighted-average word vector.

    For every document, the vector of each token found in ``word2vec`` is
    multiplied by that token's weight from ``word2weight`` and the results
    are averaged. Tokens missing from ``word2vec`` are ignored.

    Args:
        X: An iterable of token lists (one list per document).
        word2vec: A mapping from word to its embedding vector.
        word2weight: A mapping from word to its scalar weight.

    Returns:
        A NumPy array with one row per document. A document with no token in
        ``word2vec`` gets an all-zero row of the embedding width.
    """
    # The zero fallback must have the same width as the embeddings, so take
    # it from any stored vector (0 if there are no embeddings at all).
    dim = len(next(iter(word2vec.values()))) if word2vec else 0

    doc_vectors = []
    for words in X:
        weighted = [word2vec[w] * word2weight[w]
                    for w in words if w in word2vec]
        if weighted:
            doc_vectors.append(np.mean(weighted, axis=0))
        else:
            doc_vectors.append(np.zeros(dim))
    return np.array(doc_vectors)
# Build one TF-IDF-weighted word2vec vector per document.
export_data_w2v_Tfidf = transform_word2vec_tfidf(X, w2v, word2weight)
# Save the matrix as CSV. Backslashes in the Windows path are escaped so the
# literal does not rely on invalid escape sequences such as "\A" or "\d".
np.savetxt('D:\\Azim\\data_500k_w2v_tfidf.csv', export_data_w2v_Tfidf,
           delimiter=',', fmt='%1.15e')
以下是ELKI的截图。有没有人能分享一下使用DBSCAN或其他算法对文本数据进行有意义聚类的经验?谢谢。
答案 0 :(得分:1)
我不认为DBSCAN是一种适合文本数据的方法。参数会很难选择(但正如评论中所指出的,您的分数可能太多太大),而且显然您也遇到了可扩展性问题。我也不确定你在这里使用"word2vec"的做法是否合适。word2vec可能会让事情变得更加困难。
我宁愿选择LDA。这通常是文本的最佳方法。
使用OPTICS时,请注意OPTICS不会直接生成分区,它生成的是OPTICS图。你需要使用例如Xi方法来提取分区,而这又引入了另一个在高维数据中可能难以选择的参数。