Error:

---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
<ipython-input> in <module>()
     72 #]
     73 
---> 74 clusters = get_clusters(sentences)
     75 #print(clusters)
     76 

<ipython-input> in get_clusters(sentences)
     18 def get_clusters(sentences):
     19     tf_idf_matrix = vectorizer.fit_transform(sentences)
---> 20     similarity_matrix = (tf_idf_matrix * tf_idf_matrix.T).A
     21     affinity_propagation = AffinityPropagation(affinity="precomputed", damping=0.5)
     22     affinity_propagation.fit(similarity_matrix)

~/.local/lib/python3.5/site-packages/scipy/sparse/base.py in __getattr__(self, attr)
    562     def __getattr__(self, attr):
    563         if attr == 'A':
--> 564             return self.toarray()
    565         elif attr == 'T':
    566             return self.transpose()

~/.local/lib/python3.5/site-packages/scipy/sparse/compressed.py in toarray(self, order, out)
    962     def toarray(self, order=None, out=None):
    963         """See the docstring for `spmatrix.toarray`."""
--> 964         return self.tocoo(copy=False).toarray(order=order, out=out)
    965 
    966 

~/.local/lib/python3.5/site-packages/scipy/sparse/coo.py in toarray(self, order, out)
    250     def toarray(self, order=None, out=None):
    251         """See the docstring for `spmatrix.toarray`."""
--> 252         B = self._process_toarray_args(order, out)
    253         fortran = int(B.flags.f_contiguous)
    254         if not fortran and not B.flags.c_contiguous:

~/.local/lib/python3.5/site-packages/scipy/sparse/base.py in _process_toarray_args(self, order, out)
   1037             return out
   1038         else:
-> 1039             return np.zeros(self.shape, dtype=self.dtype, order=order)
   1040 
   1041     def __numpy_ufunc__(self, func, method, pos, inputs, **kwargs):

MemoryError: 
Code:
import nltk, string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AffinityPropagation
import pandas as pd
from collections import Counter
punctuation_map = dict((ord(char), None) for char in string.punctuation)
stemmer = nltk.stem.snowball.SpanishStemmer()
def stem_tokens(tokens):
    return [stemmer.stem(item) for item in tokens]

def normalize(text):
    return stem_tokens(nltk.word_tokenize(text.lower().translate(punctuation_map)))
vectorizer = TfidfVectorizer(tokenizer=normalize)
def get_clusters(sentences):
    tf_idf_matrix = vectorizer.fit_transform(sentences)
    similarity_matrix = (tf_idf_matrix * tf_idf_matrix.T).A
    affinity_propagation = AffinityPropagation(affinity="precomputed", damping=0.5)
    affinity_propagation.fit(similarity_matrix)
    labels = affinity_propagation.labels_
    cluster_centers = affinity_propagation.cluster_centers_indices_
    tagged_sentences = zip(sentences, labels)
    clusters = {}
    for sentence, cluster_id in tagged_sentences:
        clusters.setdefault(sentences[cluster_centers[cluster_id]], []).append(sentence)
    return clusters
# load the data file
filename = "/home/ubuntu/VA_data/first_50K.csv"
df = pd.read_csv(filename, header=None)
sentences = df.iloc[:, 0].values.tolist()
clusters = get_clusters(sentences)
# print cluster labels in descending order of the number of sentences in each cluster
for k in sorted(clusters, key=lambda k: len(clusters[k]), reverse=True):
    print(k, "\n")
# print each cluster with the sentences in it
for cluster in clusters:
    print(cluster, ':')
    count = 0
    for element in clusters[cluster]:
        print(' - ', element)
        count += 1
    print('Cluster size: ', count)
    print('% of queries within the cluster', (count / len(sentences)) * 100)
# cluster_centers is local to get_clusters, so count the exemplar keys instead
print('Number of clusters: ', len(clusters))
How should I fix this? Please help.
Answer 0 (score: 1)
Affinity propagation needs two asymmetric n-by-n matrices, and your input matrix is yet another copy.
Do the math: how much memory do three double-precision matrices (and probably yet another copy) need when n = 1 million? Do you have terabytes of RAM for the distance data?
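To make the arithmetic concrete: a dense float64 n-by-n matrix costs n * n * 8 bytes. A quick back-of-the-envelope check (my illustration, using the question's 50K rows alongside the 1 million above):

# memory for one dense n-by-n float64 matrix is n * n * 8 bytes
for n in (50000, 10**6):
    gb = n * n * 8 / 1e9
    print(n, '->', gb, 'GB per matrix,', 3 * gb, 'GB for three copies')
# 50000   -> 20.0 GB per matrix, 60.0 GB for three copies
# 1000000 -> 8000.0 GB per matrix, 24000.0 GB for three copies
# i.e. 8 TB per matrix at n = 1 million -- terabytes, as the answer says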
If not, you should use an algorithm that does not need a distance matrix.
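For example (my suggestion of one such algorithm, not a prescription from the answer's author): scikit-learn's MiniBatchKMeans accepts the sparse tf-idf matrix directly, so no dense n-by-n matrix is ever allocated. A minimal sketch reusing the question's vectorizer; note that, unlike affinity propagation, k-means needs the number of clusters up front, and the n_clusters=50 here is an arbitrary placeholder:

from sklearn.cluster import MiniBatchKMeans

# fit on the sparse tf-idf matrix directly -- no dense similarity matrix is built
tf_idf_matrix = vectorizer.fit_transform(sentences)  # stays a scipy sparse matrix
kmeans = MiniBatchKMeans(n_clusters=50, batch_size=1000, random_state=0)
labels = kmeans.fit_predict(tf_idf_matrix)  # one cluster id per sentence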
Answer 1 (score: 0)
@Sonal, I recently ran into this problem as well (which is why I'm here). If you want to test Anony-Mousse's suggestion, consider scaling the data down with something like sentences = sentences.head(n=20) (see the sketch below). If everything else is written correctly, it will run successfully for you. However, as you increase the value of n, you will eventually hit the same MemoryError. As Anony-Mousse says, this is a legitimate system limit: Python is telling you up front that your machine cannot run that many computations. Most likely it can only be solved by changing hardware or changing your choice of clustering algorithm.
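One caveat (my note, not part of the original answer): by the time get_clusters is called in the question's code, sentences is a plain Python list, so the pandas .head() call won't work on it; a slice does the same job. A minimal sketch of the scaled-down test:

# try the pipeline on a small slice first; grow the slice until memory becomes the limit
small_sample = sentences[:20]
clusters = get_clusters(small_sample)
print(len(clusters), 'clusters from', len(small_sample), 'sentences')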
I'm a beginner too, so don't take this as gospel :) Hope this helps you keep iterating!