我研究了如何在python中使用DBSCAN对NSL KDD数据进行聚类。当我尝试运行具有10.000个数据限制的程序时,它说MemoryError,而在运行所有数据时(NSL KDD具有125.973行,41列),它说的是达到最大维数。仅仅是因为计算机规格问题(我正在使用8GB RAM)还是代码问题?如何解决呢?最后,如何更新要保存在mySQL中的每行的集群结果?我是python新手,如果您认为我问了一个愚蠢的问题,我很抱歉
def set2List(NumpyArray):
list = []
for item in NumpyArray:
list.append(item.tolist())
return list
def GenerateData():
mydb = pymysql.connect(
host="localhost",user="root", password="", database="ta")
mycursor = mydb.cursor()
mycursor.execute("SELECT * FROM data_trans LIMIT 10000")
myresult = mycursor.fetchall()
final_result= numpy.array(myresult)
return final_result
def DBSCAN(Dataset, Epsilon,MinumumPoints,DistanceMethod = 'euclidean'):
m,n=Dataset.shape
Visited=numpy.zeros(m,'int')
Type=numpy.zeros(m)
ClustersList=[]
Cluster=[]
PointClusterNumber=numpy.zeros(m)
PointClusterNumberIndex=1
PointNeighbors=[]
DistanceMatrix = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(Dataset, DistanceMethod))
for i in xrange(m):
if Visited[i]==0:
Visited[i]=1
PointNeighbors=numpy.where(DistanceMatrix[i]<Epsilon)[0]
if len(PointNeighbors)<MinumumPoints:
Type[i]=-1
else:
for k in xrange(len(Cluster)):
Cluster.pop()
Cluster.append(i)
PointClusterNumber[i]=PointClusterNumberIndex
PointNeighbors=set2List(PointNeighbors)
ExpandClsuter(Dataset[i], PointNeighbors,Cluster,MinumumPoints,Epsilon,Visited,DistanceMatrix,PointClusterNumber,PointClusterNumberIndex )
Cluster.append(PointNeighbors[:])
ClustersList.append(Cluster[:])
PointClusterNumberIndex=PointClusterNumberIndex+1
return PointClusterNumber
def ExpandClsuter(PointToExapnd, PointNeighbors, Cluster, MinumumPoints, Epsilon, Visited, DistanceMatrix, PointClusterNumber, PointClusterNumberIndex ):
Neighbors=[]
for i in PointNeighbors:
if Visited[i]==0:
Visited[i]=1
Neighbors=numpy.where(DistanceMatrix[i]<Epsilon)[0]
if len(Neighbors)>=MinumumPoints:
for j in Neighbors:
try:
PointNeighbors.index(j)
except ValueError:
PointNeighbors.append(j)
if PointClusterNumber[i]==0:
Cluster.append(i)
PointClusterNumber[i]=PointClusterNumberIndex
return
Data=GenerateData()
fig = plt.figure()
ax1=fig.add_subplot(2,1,1) #row, column, figure number
ax1.scatter(Data[:,0],Data[:,1], alpha = 0.5 )
Epsilon=300
MinumumPoints=50
result =DBSCAN(Data,Epsilon,MinumumPoints)
print result
plt.show()
错误消息:
Traceback (most recent call last):
File "<ipython-input-8-20458e6efb7c>", line 1, in <module>
runfile('C:/Users/Ji Min/Downloads/oprek.py', wdir='C:/Users/Ji Min/Downloads')
File "C:\Users\Ji Min\Anaconda2\lib\site-packages\spyder\utils\site\sitecustomize.py", line 705, in runfile
execfile(filename, namespace)
File "C:\Users\Ji Min\Anaconda2\lib\site-packages\spyder\utils\site\sitecustomize.py", line 87, in execfile
exec(compile(scripttext, filename, 'exec'), glob, loc)
File "C:/Users/Ji Min/Downloads/oprek.py", line 95, in <module>
result =DBSCAN(Data,Epsilon,MinumumPoints)
File "C:/Users/Ji Min/Downloads/oprek.py", line 44, in DBSCAN
DistanceMatrix = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(Dataset, DistanceMethod))
File "C:\Users\Ji Min\Anaconda2\lib\site-packages\scipy\spatial\distance.py", line 1652, in pdist
dm = np.empty((m * (m - 1)) // 2, dtype=np.double)
MemoryError
答案 0 :(得分:0)
关键是要不计算距离矩阵。
距离矩阵需要太多内存。
但是该数据集还是没有用的。您计算出的距离是没有意义的,所以不要期望聚类会比这更好。