我有一组点 [s1, s2, ..., s27]。
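接着,我根据 CSV 文件'数据集'中的数据计算点与点之间的相似度,再由相似度得到点之间的距离:$d(s_i, s_j) = 1 - \mathrm{sim}(s_i, s_j)$,其中 $\mathrm{sim}(s_i, s_j) = \dfrac{|C_i \cap C_j|}{|C_i \cup C_j|}$,$C_i$ 表示受点 $s_i$ 影响的类的集合。最后,我得到如下形式的距离矩阵:
$$\mathrm{distancematrix} = \begin{bmatrix} d_{s_1 s_1} & d_{s_1 s_2} & \cdots & d_{s_1 s_{27}} \\ d_{s_2 s_1} & d_{s_2 s_2} & \cdots & d_{s_2 s_{27}} \\ \vdots & \vdots & \ddots & \vdots \\ d_{s_{27} s_1} & d_{s_{27} s_2} & \cdots & d_{s_{27} s_{27}} \end{bmatrix}$$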
我将距离矩阵输入到DBSCAN算法。我的代码如下:
import operator
from functools import reduce
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
# Build a pairwise Jaccard distance matrix between points, where each point is
# described by the set of classes it affects.
data = pd.read_csv("dataset.csv")

# Names of the point columns read from the CSV.
# Fixed: the twelfth entry used to be the typo '512' instead of 's12'.
points = ['s1', 's2', 's3', 's4', 's5', 's6', 's7', 's8', 's9', 's10',
          's11', 's12', 's13', 's14', 's15', 's16', 's17', 's18', 's19',
          's20', 's21', 's22', 's23', 's24', 's25', 's26', 's27']
Class = data['Class'].values.tolist()

# liste[n] lists the classes affected by points[n]: the 'Class' value of every
# row whose entry in that point's column equals 1.
liste = [
    [cls for flag, cls in zip(data[point].values.tolist(), Class) if flag == 1]
    for point in points
]


def _jaccard_distance(first, second):
    """Return the Jaccard distance 1 - |A ∩ B| / |A ∪ B| between two sets.

    When both sets are empty the ratio is undefined; the distance is then
    taken to be 1 (the points are treated as completely dissimilar).
    """
    union = first | second
    if not union:
        return 1
    return 1 - len(first & second) / len(union)


# Convert once to sets so the pairwise loop does not rebuild them every time.
affected = [set(classes) for classes in liste]

# Square matrix sized from the data rather than a hard-coded 27. The diagonal
# is fixed at 0: a point is at distance 0 from itself, even if it affects no
# class at all.
distancematrix = [
    [0 if i == j else _jaccard_distance(si, sj)
     for j, sj in enumerate(affected)]
    for i, si in enumerate(affected)
]

# Flat row-major copy of the matrix, kept under its original name.
dist = [value for row in distancematrix for value in row]
这段代码是要计算点si和sj之间的距离,然后给DBSCAN距离矩阵如下:
# Cluster directly on the pairwise distances: metric='precomputed' tells DBSCAN
# that the input is a distance matrix, not a feature matrix.
db = DBSCAN(metric='precomputed', eps=0.75, min_samples=2)
# Show the estimator configuration before fitting.
print(db)
db.fit(distancematrix)
# One label per row of the distance matrix; -1 is the noise label.
labels = db.labels_
print(labels)
# Number of distinct labels once the noise label is left out.
no_clusters = len(set(labels) - {-1})
print('No of clusters:', no_clusters)
# Row indices of the members of clusters 0 and 1.
print('Cluster 0 ', np.flatnonzero(labels == 0))
print('Cluster 1 : ', np.flatnonzero(labels == 1))
现在,我想把这些点连同它们的聚类结果绘制出来。我使用了这段代码:
import matplotlib.pyplot as plt
def _classical_mds(distances, n_components=2):
    """Embed a square distance matrix into ``n_components`` dimensions.

    Classical (Torgerson) multidimensional scaling: double-centre the squared
    distances and keep the eigenvectors with the largest eigenvalues, scaled
    by the square roots of those eigenvalues.

    Args:
        distances: square, symmetric matrix of pairwise distances.
        n_components: number of output coordinates per point.

    Returns:
        Array of shape (n_points, n_components).
    """
    d = np.asarray(distances, dtype=float)
    n = d.shape[0]
    centering = np.eye(n) - np.ones((n, n)) / n
    gram = -0.5 * centering.dot(d ** 2).dot(centering)
    eigvals, eigvecs = np.linalg.eigh(gram)
    # eigh returns eigenvalues in ascending order; keep the largest ones.
    order = np.argsort(eigvals)[::-1][:n_components]
    # Negative eigenvalues mean the distances are not exactly Euclidean;
    # clip them so the square root stays real.
    scale = np.sqrt(np.clip(eigvals[order], 0, None))
    return eigvecs[:, order] * scale


# Fixed: X was never defined. The plot coordinates are now derived from the
# same distance matrix that DBSCAN clustered, so the picture reflects the
# clustering.
# NOTE: points at distance 0 from each other (identical sets of affected
# classes) get identical coordinates and are drawn on top of one another, so
# fewer markers than points may be visible.
X = _classical_mds(distancematrix)

# Mark which samples DBSCAN identified as core samples.
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

# One colour per label; the noise label (-1) is drawn in black instead.
unique_labels = set(labels)
colors = [plt.cm.Spectral(each)
          for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]

    class_member_mask = (labels == k)

    # Core samples of this label: large markers.
    xy = X[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=14)

    # Non-core samples of this label: small markers.
    xy = X[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=6)

plt.title('Estimated number of clusters: %d' % no_clusters)
plt.show()
问题是:执行这段代码后,得到的图形并没有包含所有的点(共26个点),只显示了20个点;而且这些点的分组效果也不好。我不明白问题出在哪里。