I've been working on this project for a while, but I've run into some trouble when analyzing the clusters. Basically, I read data from a CSV file with about 4,000 records using pandas read_csv() (an Excel file exported as CSV), then clean the extracted text by removing punctuation, tokenizing and stemming; after that I build the TF-IDF matrix and cluster it with k-means.
I used the following libraries:
word_tokenize, SnowballStemmer, TfidfVectorizer, cosine_similarity, KMeans, MDS. Python 3.
from __future__ import print_function
import os
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords  # needed by _RemoveStopwords below
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.manifold import MDS
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt
from string import punctuation
import numpy as np
# Build a corpus of documents from a path (PATH) where the data lives.
# Produces a list with the text of each document together with a generated document name.
def CrearCorpus(path):
    df = pd.read_csv('./llamadas.csv', usecols=['motivo', 'respuesta'], delimiter=';')
    corpus = []
    for i in range(1, 4050):
        problema = str(df['motivo'][i])
        solucion = str(df['respuesta'][i])
        problema_final = problema + ' ' + solucion
        corpus.append([problema_final, 'document ' + str(i + 1)])
    return (corpus)
# Remove Spanish stopwords from a text
def _RemoveStopwords(sentence):
    word_tokens = word_tokenize(sentence)
    stop_words = set(stopwords.words('spanish'))
    filtered_sentence = [w for w in word_tokens if not w in stop_words]
    filtered_sentence = ""
    for w in word_tokens:
        if w not in stop_words:
            filtered_sentence = filtered_sentence + " " + w
    return (filtered_sentence)
# Read each document from the corpus and produce the texts and their identifiers (titles)
def read_documents(path):
    corpus = CrearCorpus(path)
    documents = []
    titles = []
    for c in range(len(corpus)):
        (doc, fn) = corpus[c]
        titles.append(fn)
        documents.append(doc)
    return ((documents, titles))
# Remove punctuation from the documents
def removePuntuaction(documents):
    translator = str.maketrans('', '', punctuation)
    for i in range(len(documents)):
        documents[i] = documents[i].translate(translator)
    return (documents)
# Stem a Spanish word (Snowball stemmer, not true lemmatization)
def Stemmer(text):
    stemmer = SnowballStemmer('spanish')
    words_stem = stemmer.stem(text)
    return (words_stem)
# Tokenize and stem a text
def tokenize_and_stem(textdata):
    text = word_tokenize(textdata)
    lista = []
    for elem in text:
        word = elem.lower()
        nuevo = Stemmer(word)
        lista.append(nuevo)
    return (lista)
# Tokenize a text (no stemming)
def tokenize_only(text):
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = tokens
    return filtered_tokens
# Build the vocabulary of terms from the document corpus.
# Creates a table (DataFrame) that maps each stemmed word to the original token it came from.
def crear_vocabulario(documents):
    # build two lists, one stemmed and one with the raw tokens
    totalvocab_stemmed = []
    totalvocab_tokenized = []
    for i in documents:
        allwords_stemmed = tokenize_and_stem(i)
        totalvocab_stemmed.extend(allwords_stemmed)
        allwords_tokenized = tokenize_only(i)
        totalvocab_tokenized.extend(allwords_tokenized)
    vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index=totalvocab_stemmed)
    return (vocab_frame)
# Build a tf-idf matrix from the tokenized and stemmed texts (uses the global `documents`)
def crear_matriz_tfidf():
    tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                       min_df=0.2,
                                       use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1, 3))
    tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
    terms = tfidf_vectorizer.get_feature_names()
    return ((tfidf_matrix, terms))
# Run K-means clustering on the tf-idf matrix (returns the cluster labels and the model)
def clustering(tfidf_matrix, num_clusters):
    km = KMeans(n_clusters=num_clusters)  # create the KMeans estimator
    km.fit(tfidf_matrix)                  # run K-means itself
    clusters = km.labels_.tolist()
    return ((clusters, km))
# Show statistics for the clusters and the objects (documents and words) they contain
def cluster_stats(clusters, titles, km, tfidf_matrix):
    # open the csv file
    df = pd.read_csv('./llamadas.csv', usecols=['Plataforma'], delimiter=';')
    films = {'title': titles, 'documents': documents, 'cluster': clusters}
    frame = pd.DataFrame(films, index=[clusters], columns=['title', 'cluster'])
    frame['cluster'].value_counts()  # number of docs per cluster
    print("Top terminos por cluster:")
    print()
    # sort the cluster centres by proximity to the centroid
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    # go through the clusters one by one
    for i in range(num_clusters):
        print("Palabras de Cluster %d:" % i, end='')
        for ind in order_centroids[i, :5]:  # top 5 terms; change the slice for more or fewer words per cluster
            print(' %s' % vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'),
                  end=',')
        print()
        print()
        # list used to track the number of documents involved in this cluster
        list_row = []
        print("IDs de cluster %d:" % i, end='')
        for title in frame.loc[i]['title'].values.tolist():
            print(' %s,' % title, end='')
        # extract the row of each document to check which platform it comes from
        count_snd = 0
        count_mateonet = 0
        print()
        numbers_rows = frame.loc[i]['title'].values.tolist()
        for index in range(0, len(numbers_rows)):
            row = numbers_rows[index].strip('document ')
            list_row.append(int(row))
        # count how many elements belong to each platform
        for cols in range(0, len(list_row)):
            value_platform = df['Plataforma'][list_row[cols]]
            if value_platform == 'SND ':
                count_snd = count_snd + 1
            if value_platform == 'Mateonet ':
                count_mateonet = count_mateonet + 1
        # totals per platform
        print()
        print('Plataforma SND: ' + str(count_snd))
        print()
        print('Plataforma Mateonet: ' + str(count_mateonet))
        print()
        print('Cantidad de preguntas: ' + str(len(frame.loc[i]['title'].values.tolist())))
        print()
        print()
# MAIN
# Set PATH to the location of the CSV data file
PATH = "./llamadas.csv"
num_clusters = 100  # number of clusters to create
(documents, titles) = read_documents(PATH)
documents = removePuntuaction(documents)
vocab_frame = crear_vocabulario(documents)
print('Existen ' + str(vocab_frame.shape[0]) + ' itemes en vocab_frame')
(tfidf_matrix, terms) = crear_matriz_tfidf()
(clusters, km) = clustering(tfidf_matrix, num_clusters)
cluster_stats(clusters, titles, km, tfidf_matrix)
https://github.com/felipefuller/faq/blob/master/data_analysis.py
I can create up to 167 clusters, but when I change it to create 168 or more, the following error is raised:
Traceback (most recent call last):
File "/Users/felipefuller/.virtualenvs/faq/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 2657, in get_loc
return self._engine.get_loc(key)
File "pandas/_libs/index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 987, in pandas._libs.hashtable.Int64HashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 993, in pandas._libs.hashtable.Int64HashTable.get_item
KeyError: 167
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Users/felipefuller/dev/faq/cluster.py", line 324, in <module>
cluster_stats(clusters, titles, km, tfidf_matrix)
File "/Users/felipefuller/dev/faq/cluster.py", line 187, in cluster_stats
if (len(frame.loc[i]['title'].values.tolist()) >= 250):
File "/Users/felipefuller/.virtualenvs/faq/lib/python3.7/site-packages/pandas/core/indexing.py", line 1500, in __getitem__
return self._getitem_axis(maybe_callable, axis=axis)
File "/Users/felipefuller/.virtualenvs/faq/lib/python3.7/site-packages/pandas/core/indexing.py", line 1913, in _getitem_axis
return self._get_label(key, axis=axis)
File "/Users/felipefuller/.virtualenvs/faq/lib/python3.7/site-packages/pandas/core/indexing.py", line 141, in _get_label
return self.obj._xs(label, axis=axis)
File "/Users/felipefuller/.virtualenvs/faq/lib/python3.7/site-packages/pandas/core/generic.py", line 3583, in xs
drop_level=drop_level)
File "/Users/felipefuller/.virtualenvs/faq/lib/python3.7/site-packages/pandas/core/indexes/multi.py", line 2571, in get_loc_level
indexer = self._get_level_indexer(key, level=level)
File "/Users/felipefuller/.virtualenvs/faq/lib/python3.7/site-packages/pandas/core/indexes/multi.py", line 2652, in _get_level_indexer
code = level_index.get_loc(key)
File "/Users/felipefuller/.virtualenvs/faq/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 2659, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas/_libs/index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 987, in pandas._libs.hashtable.Int64HashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 993, in pandas._libs.hashtable.Int64HashTable.get_item
KeyError: 167
So it should be able to handle well over 167 clusters, since I'm working with more than 4,000 records.
Thanks!
Answer 0 (score: 0)
There is clearly a bug in your code.
You are probably trying to access a column that doesn't exist, rather than a row.
Fix your code - we can't, because we don't have it.
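For what it's worth, here is a generic illustration (with made-up data, not yours) of one way a KeyError like this can appear: your traceback fails on a frame.loc[i] lookup inside cluster_stats, and .loc raises KeyError whenever the label i never occurs in the index. Since the frame is built with index=[clusters], any cluster label between 0 and num_clusters - 1 that ends up with no documents is simply absent from that index. A minimal, hypothetical sketch (the three documents and num_clusters = 3 are invented for illustration):

import pandas as pd

# Toy data: three documents, but only cluster labels 0 and 2 occur (label 1 is "empty")
clusters = [0, 0, 2]
films = {'title': ['document 1', 'document 2', 'document 3'], 'cluster': clusters}
frame = pd.DataFrame(films, index=[clusters], columns=['title', 'cluster'])

print(frame.loc[0]['title'].values.tolist())   # fine: label 0 exists in the index
# frame.loc[1]                                 # would raise KeyError: 1 -- no row carries label 1

# Defensive version: only visit labels that actually occur in the frame
present = set(frame.index.get_level_values(0))
for i in range(3):                             # pretend num_clusters == 3
    if i not in present:
        print('cluster %d is empty' % i)
        continue
    print('cluster %d has %d documents' % (i, len(frame.loc[[i]])))

Checking whether every label you loop over actually appears in frame.index (or iterating over set(clusters) instead of range(num_clusters)) should at least show you where the lookup breaks.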