I've been working on this project for a while, but I've run into some trouble when analyzing the clusters. Basically, I read data from a CSV file with about 4,000 records using pandas read_csv() (an Excel file exported as CSV), then clean the extracted text by removing punctuation, tokenizing and stemming; after that I build the TF-IDF matrix and cluster it with k-means.
I used the following libraries:
word_tokenize, SnowballStemmer, TfidfVectorizer, cosine_similarity, KMeans, MDS. Python 3.
from __future__ import print_function
import os
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords  # needed by _RemoveStopwords below
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.manifold import MDS
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt
from string import punctuation
import numpy as np
# Build a corpus of documents from a path (PATH) where the data lives.
# Produces a list with the text of each document together with a generated document name.
def CrearCorpus(path):
    df = pd.read_csv('./llamadas.csv', usecols=['motivo', 'respuesta'], delimiter=';')
    corpus = []
    for i in range(1, 4050):
        problema = str(df['motivo'][i])
        solucion = str(df['respuesta'][i])
        problema_final = problema + ' ' + solucion
        corpus.append([problema_final, 'document ' + str(i + 1)])
    return (corpus)
# Remove Spanish stopwords from a text
def _RemoveStopwords(sentence):
    word_tokens = word_tokenize(sentence)
    stop_words = set(stopwords.words('spanish'))
    filtered_sentence = [w for w in word_tokens if not w in stop_words]
    filtered_sentence = ""
    for w in word_tokens:
        if w not in stop_words:
            filtered_sentence = filtered_sentence + " " + w
    return (filtered_sentence)
# Read each document from the corpus and produce the texts and their identifiers (titles)
def read_documents(path):
    corpus = CrearCorpus(path)
    documents = []
    titles = []
    for c in range(len(corpus)):
        (doc, fn) = corpus[c]
        titles.append(fn)
        documents.append(doc)
    return ((documents, titles))
# Remove punctuation from the documents
def removePuntuaction(documents):
    translator = str.maketrans('', '', punctuation)
    for i in range(len(documents)):
        documents[i] = documents[i].translate(translator)
    return (documents)
# Stem a Spanish word (Snowball stemmer, not true lemmatization)
def Stemmer(text):
    stemmer = SnowballStemmer('spanish')
    words_stem = stemmer.stem(text)
    return (words_stem)
# Tokenize and stem a text
def tokenize_and_stem(textdata):
    text = word_tokenize(textdata)
    lista = []
    for elem in text:
        word = elem.lower()
        nuevo = Stemmer(word)
        lista.append(nuevo)
    return (lista)
# Tokenize a text (no stemming)
def tokenize_only(text):
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = tokens
    return filtered_tokens
# Build the vocabulary of terms from the document corpus.
# Creates a table (DataFrame) that maps each stemmed word to the original token it came from.
def crear_vocabulario(documents):
    # build two lists, one stemmed and one with the raw tokens
    totalvocab_stemmed = []
    totalvocab_tokenized = []
    for i in documents:
        allwords_stemmed = tokenize_and_stem(i)
        totalvocab_stemmed.extend(allwords_stemmed)
        allwords_tokenized = tokenize_only(i)
        totalvocab_tokenized.extend(allwords_tokenized)
    vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index=totalvocab_stemmed)
    return (vocab_frame)
# Build a tf-idf matrix from the tokenized and stemmed texts (uses the global `documents`)
def crear_matriz_tfidf():
    tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                       min_df=0.2,
                                       use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1, 3))
    tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
    terms = tfidf_vectorizer.get_feature_names()
    return ((tfidf_matrix, terms))
# Run K-means clustering on the tf-idf matrix (returns the cluster labels and the model)
def clustering(tfidf_matrix, num_clusters):
    km = KMeans(n_clusters=num_clusters)  # create the KMeans estimator
    km.fit(tfidf_matrix)                  # run K-means itself
    clusters = km.labels_.tolist()
    return ((clusters, km))
# Show statistics for the clusters and the objects (documents and words) they contain
def cluster_stats(clusters, titles, km, tfidf_matrix):
    # open the csv file
    df = pd.read_csv('./llamadas.csv', usecols=['Plataforma'], delimiter=';')
    films = {'title': titles, 'documents': documents, 'cluster': clusters}
    frame = pd.DataFrame(films, index=[clusters], columns=['title', 'cluster'])
    frame['cluster'].value_counts()  # number of docs per cluster
    print("Top terminos por cluster:")
    print()
    # sort the cluster centres by proximity to the centroid
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    # go through the clusters one by one
    for i in range(num_clusters):
        print("Palabras de Cluster %d:" % i, end='')
        for ind in order_centroids[i, :5]:  # top 5 terms; change the slice for more or fewer words per cluster
            print(' %s' % vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'),
                  end=',')
        print()
        print()
        # list used to track the number of documents involved in this cluster
        list_row = []
        print("IDs de cluster %d:" % i, end='')
        for title in frame.loc[i]['title'].values.tolist():
            print(' %s,' % title, end='')
        # extract the row of each document to check which platform it comes from
        count_snd = 0
        count_mateonet = 0
        print()
        numbers_rows = frame.loc[i]['title'].values.tolist()
        for index in range(0, len(numbers_rows)):
            row = numbers_rows[index].strip('document ')
            list_row.append(int(row))
        # count how many elements belong to each platform
        for cols in range(0, len(list_row)):
            value_platform = df['Plataforma'][list_row[cols]]
            if value_platform == 'SND ':
                count_snd = count_snd + 1
            if value_platform == 'Mateonet ':
                count_mateonet = count_mateonet + 1
        # totals per platform
        print()
        print('Plataforma SND: ' + str(count_snd))
        print()
        print('Plataforma Mateonet: ' + str(count_mateonet))
        print()
        print('Cantidad de preguntas: ' + str(len(frame.loc[i]['title'].values.tolist())))
        print()
        print()
# MAIN
# Set PATH to the location of the CSV data file
PATH = "./llamadas.csv"
num_clusters = 100  # number of clusters to create
(documents, titles) = read_documents(PATH)
documents = removePuntuaction(documents)
vocab_frame = crear_vocabulario(documents)
print('Existen ' + str(vocab_frame.shape[0]) + ' itemes en vocab_frame')
(tfidf_matrix, terms) = crear_matriz_tfidf()
(clusters, km) = clustering(tfidf_matrix, num_clusters)
cluster_stats(clusters, titles, km, tfidf_matrix)
https://github.com/felipefuller/faq/blob/master/data_analysis.py
I can create up to 167 clusters, but when I change it to create 168 or more, the following error is raised:
Traceback (most recent call last):
File "/Users/felipefuller/.virtualenvs/faq/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 2657, in get_loc
return self._engine.get_loc(key)
File "pandas/_libs/index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 987, in pandas._libs.hashtable.Int64HashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 993, in pandas._libs.hashtable.Int64HashTable.get_item
KeyError: 167
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Users/felipefuller/dev/faq/cluster.py", line 324, in <module>
cluster_stats(clusters, titles, km, tfidf_matrix)
File "/Users/felipefuller/dev/faq/cluster.py", line 187, in cluster_stats
if (len(frame.loc[i]['title'].values.tolist()) >= 250):
File "/Users/felipefuller/.virtualenvs/faq/lib/python3.7/site-packages/pandas/core/indexing.py", line 1500, in __getitem__
return self._getitem_axis(maybe_callable, axis=axis)
File "/Users/felipefuller/.virtualenvs/faq/lib/python3.7/site-packages/pandas/core/indexing.py", line 1913, in _getitem_axis
return self._get_label(key, axis=axis)
File "/Users/felipefuller/.virtualenvs/faq/lib/python3.7/site-packages/pandas/core/indexing.py", line 141, in _get_label
return self.obj._xs(label, axis=axis)
File "/Users/felipefuller/.virtualenvs/faq/lib/python3.7/site-packages/pandas/core/generic.py", line 3583, in xs
drop_level=drop_level)
File "/Users/felipefuller/.virtualenvs/faq/lib/python3.7/site-packages/pandas/core/indexes/multi.py", line 2571, in get_loc_level
indexer = self._get_level_indexer(key, level=level)
File "/Users/felipefuller/.virtualenvs/faq/lib/python3.7/site-packages/pandas/core/indexes/multi.py", line 2652, in _get_level_indexer
code = level_index.get_loc(key)
File "/Users/felipefuller/.virtualenvs/faq/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 2659, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas/_libs/index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 987, in pandas._libs.hashtable.Int64HashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 993, in pandas._libs.hashtable.Int64HashTable.get_item
KeyError: 167
So it should be able to handle well over 167 clusters, since I'm working with more than 4,000 records.
Thanks!
Answer 0 (score: 0)
There is clearly a bug in your code.
You are probably trying to access a column that doesn't exist, rather than a row.
Fix your code - we can't, because we don't have it.
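For what it's worth, here is a generic illustration (with made-up data, not yours) of one way a KeyError like this can appear: your traceback fails on a frame.loc[i] lookup inside cluster_stats, and .loc raises KeyError whenever the label i never occurs in the index. Since the frame is built with index=[clusters], any cluster label between 0 and num_clusters - 1 that ends up with no documents is simply absent from that index. A minimal, hypothetical sketch (the three documents and num_clusters = 3 are invented for illustration):

import pandas as pd

# Toy data: three documents, but only cluster labels 0 and 2 occur (label 1 is "empty")
clusters = [0, 0, 2]
films = {'title': ['document 1', 'document 2', 'document 3'], 'cluster': clusters}
frame = pd.DataFrame(films, index=[clusters], columns=['title', 'cluster'])

print(frame.loc[0]['title'].values.tolist())   # fine: label 0 exists in the index
# frame.loc[1]                                 # would raise KeyError: 1 -- no row carries label 1

# Defensive version: only visit labels that actually occur in the frame
present = set(frame.index.get_level_values(0))
for i in range(3):                             # pretend num_clusters == 3
    if i not in present:
        print('cluster %d is empty' % i)
        continue
    print('cluster %d has %d documents' % (i, len(frame.loc[[i]])))

Checking whether every label you loop over actually appears in frame.index (or iterating over set(clusters) instead of range(num_clusters)) should at least show you where the lookup breaks.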