我正在使用 HDBSCAN 算法对我的文档进行聚类。目前我用 tf-idf 算法把单词转换成向量矩阵,但希望改用 GloVe。我搜索过相关帖子,但仍不明白如何使用该算法。我也读过 Gensim 的资料,但不清楚如何用它来实现 GloVe。这是我目前的做法:
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3
import csv
import string
import time
import sys
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.externals import joblib
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import hdbscan
# ---------------------------------------------------------------------------
# Script configuration and CSV pre-processing.
#
# Reads `filename`, cleans the text found in the third column of every data
# row, and collects the distinct cleaned texts in `synopses` together with the
# matching raw texts in `title`.
# ---------------------------------------------------------------------------
csvRows = []  # not referenced in the visible part of the script
nltk.download('stopwords')
title = []     # raw text (row[2]) of every row that was kept
synopses = []  # cleaned text of every row that was kept, parallel to `title`
filename = "twitter-test-dataset.csv"
num_clusters = 10              # not referenced in the visible part of the script
pkl_file = "doc_cluster.pkl"   # not referenced in the visible part of the script
generate_pkl = False           # not referenced in the visible part of the script

# pre-process data
duplicates = 0
# Set mirror of `synopses`: O(1) duplicate checks instead of scanning the list.
seen_synopses = set()
# Hoisted so the printable-character table is built once, not once per row.
printable_chars = set(string.printable)

with open(filename, 'r') as csvfile:
    # creating a csv reader object
    csvreader = csv.reader(csvfile)
    # The first row holds the field names. The built-in next() works on both
    # Python 2.6+ and Python 3, whereas the .next() method is Python 2 only.
    fields = next(csvreader)
    # extracting each data row one by one
    for row in csvreader:
        # removes the characters specified
        line = re.sub(r'[.,"!]+', '', row[2], flags=re.MULTILINE)
        # removes a leading "RT" marker
        line = re.sub(r'^RT[\s]+', '', line, flags=re.MULTILINE)
        # removes everything from "http://" / "https://" to the end of the line
        line = re.sub(r'https?:\/\/.*[\r\n]*', '', line, flags=re.MULTILINE)
        line = re.sub(r'[:]+', '', line, flags=re.MULTILINE)
        # Replaces @mentions, any character that is not an ASCII letter, digit,
        # space or tab, and scheme://... remnants with a single space. Written
        # as a raw string so the regex escapes are not treated as (invalid)
        # Python string escapes; the compiled pattern is unchanged.
        line = re.sub(
            r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", line,
            flags=re.MULTILINE)
        # Filter out non-printable / non-ASCII characters. Building a real
        # string matters on Python 3, where filter() returns a lazy iterator:
        # the duplicate check below would never match and `synopses` would be
        # filled with filter objects instead of text.
        line = "".join(ch for ch in line if ch in printable_chars)
        if line not in seen_synopses:
            seen_synopses.add(line)
            synopses.append(line)
            title.append(row[2])
        else:
            duplicates += 1

print("Removed " + str(duplicates) + " rows")

stopwords = nltk.corpus.stopwords.words('english')
stemmer = SnowballStemmer("english")
def tokenize_and_stem(text):
    """Split *text* into word tokens and return their stems.

    The text is tokenized sentence by sentence and then word by word, so that
    punctuation ends up as its own token. Tokens containing no ASCII letter
    (numbers, bare punctuation) are dropped; each remaining token is passed
    through the module-level ``stemmer``.
    """
    words = []
    for sentence in nltk.sent_tokenize(text):
        words.extend(nltk.word_tokenize(sentence))
    # keep only tokens that contain at least one letter
    alphabetic = [word for word in words if re.search('[a-zA-Z]', word)]
    return [stemmer.stem(word) for word in alphabetic]
def tokenize_only(text):
    """Split *text* into lower-cased word tokens without stemming.

    The text is tokenized sentence by sentence and then word by word, so that
    punctuation ends up as its own token. Tokens containing no ASCII letter
    (numbers, bare punctuation) are dropped.
    """
    lowered = []
    for sentence in nltk.sent_tokenize(text):
        lowered.extend(word.lower() for word in nltk.word_tokenize(sentence))
    # keep only tokens that contain at least one letter
    return [word for word in lowered if re.search('[a-zA-Z]', word)]
# Build two parallel vocabularies over all cleaned texts: the stemmed tokens
# and the plain lower-cased tokens.
totalvocab_stemmed = []
totalvocab_tokenized = []
for synopsis in synopses:
    totalvocab_stemmed += tokenize_and_stem(synopsis)
    totalvocab_tokenized += tokenize_only(synopsis)

# Data frame of tokenized words, indexed by the stemmed vocabulary.
vocab_frame = pd.DataFrame(
    data={'words': totalvocab_tokenized},
    index=totalvocab_stemmed,
)
# print "there are " + str(vocab_frame.shape[0]) + " items in vocab_frame"

# TF-IDF vectorizer: stemmed tokens, uni- to tri-grams, English stop words.
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=tokenize_and_stem,
    stop_words='english',
    ngram_range=(1, 3),
    max_features=200000,
    max_df=0.8,
    min_df=0.0,
    use_idf=True,
)

# Create the TF-IDF matrix from the cleaned texts.
tfidf_matrix = tfidf_vectorizer.fit_transform(synopses)
terms = tfidf_vectorizer.get_feature_names()

# Cluster the TF-IDF matrix with HDBSCAN and print the label of each text.
c = hdbscan.HDBSCAN(min_cluster_size=5)
c.fit(tfidf_matrix)
print(c.labels_)

sys.exit()
正如您在上面的实现中所看到的,我使用 HDBSCAN 和 tf-idf 进行文本聚类。如何用 GloVe 代替 tf-idf?
答案 0 :(得分:0)
常见的做法似乎是对文档中每个单词的 GloVe 向量取平均值。
我对此并不信服。认为可以直接对这些向量求和或取平均的理论依据似乎很薄弱,因为这样做会破坏向量之间的角度关系。