Let me give an example:
X: 1 2 3 4 5
Y: .9 .91 .92 .93 .94
Z: 20 36 999 211
M: 4000 3456 1 0
Which clustering algorithm should I choose for a dataset like this, and how do I interpret the results after clustering? In other words: how do I feed a 4-D dataset into a clustering algorithm?
I found a DBSCAN implementation on the internet that works on 2-D data and can plot the result. Since my dataset is 4-D and its columns vary illogically (they sit on very different scales)... I am afraid to feed it to the algorithm. Here is the code I am using:
import matplotlib.pyplot as plt
from numpy.random import rand
from numpy import square, sqrt

def regionQuery(P, eps, D):
    # collect every point of D whose x/y coordinates (indices 1 and 2)
    # lie within distance eps of P
    neighbourPts = []
    for point in D:
        if sqrt(square(P[1] - point[1]) + square(P[2] - point[2])) < eps:
            neighbourPts.append(point)
    return neighbourPts

def DBSCAN(D, eps, MinPts):
    noise = []
    visited = []
    C = []
    c_n = -1
    for point in D:
        if point in visited:   # already handled while expanding another cluster
            continue
        visited.append(point)  # marking point as visited
        neighbourPts = regionQuery(point, eps, D)
        if len(neighbourPts) < MinPts:
            noise.append(point)
        else:
            C.append([])
            c_n += 1
            expandCluster(point, neighbourPts, C, c_n, eps, MinPts, D, visited)
    print("no. of clusters:", len(C))
    print("length of noise:", len(noise))
    for cluster in C:
        col = (rand(), rand(), rand())  # one random RGB colour per cluster
        plt.scatter([i[1] for i in cluster], [i[2] for i in cluster], color=col)
    plt.show()

def expandCluster(P, neighbourPts, C, c_n, eps, MinPts, D, visited):
    C[c_n].append(P)
    for point in neighbourPts:
        if point not in visited:
            visited.append(point)
            neighbourPts_2 = regionQuery(point, eps, D)
            if len(neighbourPts_2) >= MinPts:
                neighbourPts += neighbourPts_2
        # add the point to the current cluster only if no cluster contains it yet
        if not any(point in cluster for cluster in C):
            C[c_n].append(point)

eps = 20  # input("enter eps")
x = 200 * rand(10)
y = 200 * rand(10)
l = []
for i in range(10):
    l.append([i, x[i], y[i]])

DBSCAN(l, eps, 1)
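A minimal sketch of feeding the 4-D rows from the example above straight to scikit-learn's DBSCAN; the eps and min_samples values here are placeholders that would need tuning, and the columns are standardised first because they sit on very different scales:

import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

# one row per sample, columns X, Y, Z, M (first four rows of the example)
data = np.array([
    [1, 0.90,  20, 4000],
    [2, 0.91,  36, 3456],
    [3, 0.92, 999,    1],
    [4, 0.93, 211,    0],
])

# bring all four features onto a comparable scale before clustering
scaled = StandardScaler().fit_transform(data)

# eps / min_samples are placeholder values, not tuned for this data
labels = DBSCAN(eps=0.5, min_samples=2).fit_predict(scaled)
print(labels)  # -1 marks noise, other integers are cluster ids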
Answer 0 (score: 0):
If you are using Python:
Method 1:
from sklearn.metrics import confusion_matrix as cm
import pandas as pd

y_test = [1, 0, 0]
y_pred = [1, 0, 0]

# wrap the raw matrix in a DataFrame so the rows and columns are labelled
confusion_matrix = cm(y_test, y_pred)
list1 = ["Actual 0", "Actual 1"]
list2 = ["Predicted 0", "Predicted 1"]
pd.DataFrame(confusion_matrix, list1, list2)
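With the toy y_test / y_pred above (both class-0 samples and the single class-1 sample are predicted correctly), the labelled DataFrame comes out as:

          Predicted 0  Predicted 1
Actual 0            2            0
Actual 1            0            1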
Method 2:
While sklearn.metrics.confusion_matrix gives a numeric matrix, you can generate a "report" using the following:
import pandas as pd
y_true = pd.Series([2, 0, 2, 2, 0, 1, 1, 2, 2, 0, 1, 2])
y_pred = pd.Series([0, 0, 2, 1, 0, 2, 1, 0, 2, 0, 2, 2])
pd.crosstab(y_true, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)
Resulting in:
Predicted  0  1  2  All
True
0          3  0  0    3
1          0  1  2    3
2          2  1  3    6
All        5  2  5   12
This lets us see the total number of classifications for each class in y_true and y_pred. The method also works with text labels, and for large numbers of samples it can be extended to report percentages instead of raw counts.
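A minimal sketch of that percentage variant, reusing the same y_true / y_pred series and pandas crosstab's normalize option:

import pandas as pd

y_true = pd.Series([2, 0, 2, 2, 0, 1, 1, 2, 2, 0, 1, 2])
y_pred = pd.Series([0, 0, 2, 1, 0, 2, 1, 0, 2, 0, 2, 2])

# normalize='index' expresses each row as fractions of that true class,
# i.e. a per-class percentage report
pd.crosstab(y_true, y_pred,
            rownames=['True'], colnames=['Predicted'],
            normalize='index').round(2)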