我正在为出现频率最高的前 10 个单词创建簇,我的 filter_data 中保存着单词标记(token)列表。向量化之后,我已经能够绘制出这 10 个单词的簇;但我还想把最常用单词的词元(lemma)与 filter_data 进行比较,然后在同一张图中绘制整个单词标记列表,使所有单词都被绘制到各自相关的簇中。我该怎么做?
我曾尝试对最常用的单词以及整个标记列表的数据分别进行向量化处理,并且从 filter_data 标记列表中提取了前 10 个最常用的单词。简单来说,我正在尝试使用 matplotlib 绘制语义簇。
import string
import re
import nltk
import PyPDF4
import numpy
from collections import Counter
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from nltk.corpus import wordnet
import matplotlib.pyplot as plt
# Declaring all the variables
stopwords = nltk.corpus.stopwords.words('english')
# Additional stopwords to be removed manually: one entry per line of
# Corpus.txt. The file is closed as soon as it has been read.
with open('Corpus.txt', 'r') as file:
    moreStopwords = file.read().splitlines()
ps = nltk.PorterStemmer()
wn = nltk.WordNetLemmatizer()
# Extract the text of every page while the PDF is still open, then release
# the handle. NOTE: `data` stays bound to the reader, but its underlying
# stream is closed once this block exits, so pages must not be read later.
with open('ReadyPlayerOne.pdf', 'rb') as pdf_file:
    data = PyPDF4.PdfFileReader(pdf_file)
    # Join once instead of growing the string with += on every page.
    pageData = ''.join(page.extractText() for page in data.pages)
def clean_text(text):
    """Turn raw text into a list of lemmatized, stopword-free tokens.

    The steps are, in order: drop every character found in
    ``string.punctuation`` and lower-case the rest; split on runs of
    non-word characters; discard tokens listed in the module-level
    ``stopwords``; lemmatize the survivors with the module-level ``wn``;
    discard lemmas listed in the module-level ``moreStopwords``.

    Args:
        text: The string to clean.

    Returns:
        A list of lemmas in the order their tokens appeared in ``text``.
        Empty-string tokens are never included.
    """
    stripped = "".join(
        ch.lower() for ch in text if ch not in string.punctuation
    )
    # re.split yields '' when the text starts or ends with a separator (or
    # is empty); those are not words, so drop them here.
    tokens = [tok for tok in re.split(r"\W+", stripped) if tok]
    # Build sets once per call so each membership test is O(1) instead of
    # scanning the stopword lists for every token.
    base_stopwords = set(stopwords)
    extra_stopwords = set(moreStopwords)
    lemmas = (wn.lemmatize(tok) for tok in tokens if tok not in base_stopwords)
    return [lemma for lemma in lemmas if lemma not in extra_stopwords]
filter_data = clean_text(pageData)

# Get the most common words. The counter is built once and both the words
# and their frequencies (kept for a bar graph) are derived from it.
top_counts = Counter(filter_data).most_common(10)
most_common_words = [word for word, _ in top_counts]
word_freq = [word_count for _, word_count in top_counts]


def _first_lemma_names(tokens):
    """Return the first lemma name of the first WordNet synset per token.

    Tokens for which ``wordnet.synsets`` returns nothing are skipped, so the
    result can be shorter than ``tokens``. Lookups are cached per distinct
    token because the same word occurs many times in a long token list.
    """
    cache = {}
    names = []
    for token in tokens:
        if token not in cache:
            synsets = wordnet.synsets(token)
            cache[token] = synsets[0].lemmas()[0].name() if synsets else None
        name = cache[token]
        if name is not None:
            names.append(name)
    return names


mcw_lemma = _first_lemma_names(most_common_words)
fd_lemma = _first_lemma_names(filter_data)

# Vectorizing most common words & filter data.
# Both sets must live in the SAME feature space, otherwise the token list
# cannot be assigned to the clusters learned from the most common words.
# The vectorizer is therefore fitted once on the most common lemmas and
# only applied (transform, not fit_transform) to the full token list.
mcw_vec = TfidfVectorizer(analyzer=clean_text)
fd_vec = mcw_vec  # same fitted object; name kept for existing references
tfidf_mcw = mcw_vec.fit_transform(mcw_lemma)
tfidf_fd = fd_vec.transform(fd_lemma)

# Create cluster.
# `precompute_distances` and `n_jobs` are no longer accepted by KMeans in
# current scikit-learn releases, so only the still-supported options are
# passed.
cluster = KMeans(n_clusters=len(mcw_lemma), max_iter=300)
X = cluster.fit_transform(tfidf_mcw)  # distances to each cluster centre
pca = PCA(n_components=2).fit(X)
data2D = pca.transform(X)

# Colour every point by its cluster label so that a most-common word and
# the tokens assigned to its cluster share one colour.
colour_scale = dict(cmap='tab10', vmin=0, vmax=max(cluster.n_clusters - 1, 1))

# Tokens whose lemma is not in the fitted vocabulary have an all-zero row
# and carry no information about any cluster, so they are not plotted.
has_features = numpy.asarray(tfidf_fd.sum(axis=1)).ravel() > 0
if has_features.any():
    fd_X = cluster.transform(tfidf_fd)[has_features]
    fd_labels = cluster.predict(tfidf_fd)[has_features]
    fd2D = pca.transform(fd_X)
    plt.scatter(fd2D[:, 0], fd2D[:, 1], c=fd_labels, s=15, alpha=0.4,
                **colour_scale)

# Most common words drawn on top, larger and outlined, with their lemma
# as a label. x is the first principal component, y the second.
plt.scatter(data2D[:, 0], data2D[:, 1], c=cluster.labels_, s=80,
            edgecolors='black', **colour_scale)
for name, (x, y) in zip(mcw_lemma, data2D):
    plt.annotate(name, (x, y))
plt.show()
假设最常用的词是:[“一个”,“绿洲”,“节日”,“头像”,“时间”,“学校”,“年份”,“事物”,“旧”,“堆栈”]。它们将被绘制在图中,并且每个词都应有自己的簇,其他具有相同词元(lemma)的词也绘制在该簇中。