How can I generate a vector matrix using GloVe?

Date: 2018-06-16 12:15:57

Tags: python vectorization tf-idf hdbscan

I am using the HDBSCAN algorithm to create clusters from a set of documents I have. To build the vector matrix from the words I am currently using tf-idf, but I would like to use GloVe instead. I have searched through posts but could not figure out how to apply it. I have also read about Gensim, but I do not understand how to use it to work with GloVe. This is what I am doing:

import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3
import csv
import string
import time
import sys
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.externals import joblib
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import hdbscan

csvRows = []
nltk.download('stopwords')

title = []
synopses = []
filename = "twitter-test-dataset.csv"
num_clusters = 10
pkl_file = "doc_cluster.pkl"
generate_pkl = False

# pre-process data
with open(filename, 'r') as csvfile:
    # creating a csv reader object
    csvreader = csv.reader(csvfile)

    # extracting field names through first row
    fields = next(csvreader)

    # extracting each data row one by one
    duplicates = 0
    for row in csvreader:
        # removes the characters specified
        line = re.sub(r'[.,"!]+', '', row[2], flags=re.MULTILINE)
        line = re.sub(r'^RT[\s]+', '', line, flags=re.MULTILINE)  # removes RT
        line = re.sub(r'https?:\/\/.*[\r\n]*', '',
                    line, flags=re.MULTILINE)  # remove link
        line = re.sub(r'[:]+', '', line, flags=re.MULTILINE)
        line = re.sub(
            r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+://\S+)", " ", line, flags=re.MULTILINE)
        line = ''.join(ch for ch in line
                       if ch in string.printable)  # drop non-printable (non-ASCII) characters
        if line not in synopses:
            synopses.append(line)
            title.append(row[2])
        else:
            duplicates += 1

print("Removed " + str(duplicates) + " rows")


stopwords = nltk.corpus.stopwords.words('english')
stemmer = SnowballStemmer("english")


def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(
        text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems


def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text)
            for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens


totalvocab_stemmed = []
totalvocab_tokenized = []

for i in synopses:
    # for each item in 'synopses', tokenize/stem
    allwords_stemmed = tokenize_and_stem(i)
    # extend the 'totalvocab_stemmed' list
    totalvocab_stemmed.extend(allwords_stemmed)

    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)

vocab_frame = pd.DataFrame(
    {'words': totalvocab_tokenized}, index=totalvocab_stemmed)

# print "there are " + str(vocab_frame.shape[0]) + " items in vocab_frame"


# define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                min_df=0.0, stop_words='english',
                                use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1, 3))

#CREATE TFIDF MATRIX
tfidf_matrix = tfidf_vectorizer.fit_transform(synopses)
terms = tfidf_vectorizer.get_feature_names()


c = hdbscan.HDBSCAN(min_cluster_size=5)
#PASS TFIDF_MATRIX TO HDBSCAN
c.fit(tfidf_matrix)
print(c.labels_)
sys.exit()

As you can see in the implementation above, I am using tf-idf with HDBSCAN for text clustering. How can I use GloVe instead of tf-idf?

1 Answer:

Answer 0 (score: 0):

The usual approach seems to be to use the average of all the GloVe vectors of the words in a document.

I am not convinced by this. The theoretical support for actually adding or averaging these vectors seems flimsy, because doing so breaks the angles between them.
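
If you nevertheless want to try the averaging approach, here is a minimal sketch. It assumes the pretrained glove-twitter-25 vectors loaded through gensim's downloader API, and it reuses the synopses list and tokenize_only function from the question; document_vector and doc_matrix are just illustrative names, not part of any library:

import numpy as np
import gensim.downloader as api
import hdbscan

# load pretrained 25-dimensional Twitter GloVe vectors (returns a KeyedVectors object)
glove = api.load("glove-twitter-25")


def document_vector(tokens, embeddings):
    # average the GloVe vectors of the tokens that exist in the vocabulary
    vectors = [embeddings[t] for t in tokens if t in embeddings]
    if not vectors:
        # no token found in the GloVe vocabulary; fall back to a zero vector
        return np.zeros(embeddings.vector_size)
    return np.mean(vectors, axis=0)


# one dense row per document, replacing the sparse tf-idf matrix
doc_matrix = np.vstack([document_vector(tokenize_only(doc), glove)
                        for doc in synopses])

c = hdbscan.HDBSCAN(min_cluster_size=5)
c.fit(doc_matrix)
print(c.labels_)

Each document then becomes a single dense 25-dimensional vector, and HDBSCAN clusters those rows instead of the tf-idf matrix; keep in mind the caveat above about what averaging does to the geometry of the word vectors.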