我正在尝试根据距离对空间数据执行聚类,但会限制群集大小。我在网上找到了这篇文章,(Spatial Clustering With Equal Sizes),它可以在3个集群中使用一小组日期。
但是,当我尝试运行更大的列表并将它们集群到30个集群中时,它无法按预期工作。它返回的集群再次不均匀,如下所示。
我尝试使用30个聚类和示例数据集的较小数据,它们都均匀地计算出来。所以我猜我的数据有问题。但我不确定如何解决它。
table(cl_constrain $ cluster)
Cluster 1 2 3 4 5 6 7 8 9 10
尺寸151 63 67 88 65 89 92 82 72 84Cluster 11 12 13 14 15 16 17 18 19 20
尺寸60 61 44 46 60 51 65 216 56 188Cluster 20 21 22 23 24 25 26 27 28 29 30
规格229 78 101 75 196 70 222 62 102 271
我的数据集看起来像这个
我是R的新手,不知道它出了什么问题,有人可以帮帮我吗?非常感谢!
这是文章的源代码。
# Convert to radian
as_radians = function(theta=0){
return(theta * pi / 180)
}
calc_dist = function(fr, to) {
lat1 = as_radians(fr$lat)
lon1 = as_radians(fr$lon)
lat2 = as_radians(to$lat)
lon2 = as_radians(to$lon)
a = 3963.191;
b = 3949.903;
numerator = ( a^2 * cos(lat2) )^2 + ( b^2 * sin(lat2) ) ^2
denominator = ( a * cos(lat2) )^2 + ( b * sin(lat2) )^2
radiusofearth = sqrt(numerator/denominator) #Accounts for the ellipticity of the earth.
d = radiusofearth * acos( sin(lat1) * sin(lat2) + cos(lat1)*cos(lat2)*cos(lon2 - lon1) )
d.return = list(distance_miles=d)
return(d.return)
}
raw.og = read.csv("http://statistical-research.com/wp-content/uploads/2013/11/sample_geo.txt", header=T, sep="\t")
orig.data = raw.og[,1:3]
dirichletClusters_constrained = function(orig.data, k=5, max.iter =50, tolerance = 1, plot.iter=TRUE) {
fr = to = NULL
r.k.start = sample(seq(1:k))
n = nrow( orig.data )
k.size = ceiling(n/k)
initial.clusters = rep(r.k.start, k.size)
if(n%%length(initial.clusters)!=0){
exclude.k = length(initial.clusters) - n%%length(initial.clusters)
} else {
exclude.k = 0
}
orig.data$cluster = initial.clusters[1:(length(initial.clusters)-exclude.k)]
orig.data$cluster_original = orig.data$cluster
## Calc centers and merge
mu = cbind( by(orig.data$Latitude, orig.data$cluster, mean), by(orig.data$Longitude, orig.data$cluster, mean), seq(1:k) )
tmp1 = matrix( match(orig.data$cluster, mu[,3]) )
orig.data.centers = cbind(as.matrix(orig.data), mu[tmp1,])[,c(1:2,4:6)]
## Calc initial distance from centers
fr$lat = orig.data.centers[,3]; fr$lon = orig.data.centers[,4]
to$lat = orig.data.centers[,1]; to$lon = orig.data.centers[,2]
orig.data$distance.from.center = calc_dist(fr, to)$distance_miles
orig.data$distance.from.center_original = orig.data$distance.from.center
## Set some initial configuration values
is.converged = FALSE
iteration = 0
error.old = Inf
error.curr = Inf
while ( !is.converged && iteration < max.iter ) { # Iterate until threshold or maximum iterations
if(plot.iter==TRUE){
plot(orig.data$Longitude, orig.data$Latitude, col=orig.data$cluster, pch=16, cex=.6,
xlab="Longitude",ylab="Latitude")
}
iteration = iteration + 1
start.time = as.numeric(Sys.time())
cat("Iteration ", iteration,sep="")
for( i in 1:n ) {
# Iterate over each observation and measure the distance each observation' from its mean center
# Produces an exchange. It takes the observation closest to it's mean and in return it gives the observation
# closest to the giver, k, mean
fr = to = distances = NULL
for( j in 1:k ){
# Determine the distance from each k group
fr$lat = orig.data$Latitude[i]; fr$lon = orig.data$Longitude[i]
to$lat = mu[j,1]; to$lon = mu[j,2]
distances[j] = as.numeric( calc_dist(fr, to) )
}
# Which k cluster is the observation closest.
which.min.distance = which(distances==min(distances), arr.ind=TRUE)
previous.cluster = orig.data$cluster[i]
orig.data$cluster[i] = which.min.distance # Replace cluster with closest cluster
# Trade an observation that is closest to the giving cluster
if(previous.cluster != which.min.distance){
new.cluster.group = orig.data[orig.data$cluster==which.min.distance,]
fr$lat = mu[previous.cluster,1]; fr$lon = mu[previous.cluster,2]
to$lat = new.cluster.group$Latitude; to$lon = new.cluster.group$Longitude
new.cluster.group$tmp.dist = calc_dist(fr, to)$distance_miles
take.out.new.cluster.group = which(new.cluster.group$tmp.dist==min(new.cluster.group$tmp.dist), arr.ind=TRUE)
LocationID = new.cluster.group$LocationID[take.out.new.cluster.group]
orig.data$cluster[orig.data$LocationID == LocationID] = previous.cluster
}
}
# Calculate new cluster means
mu = cbind( by(orig.data$Latitude, orig.data$cluster, mean), by(orig.data$Longitude, orig.data$cluster, mean), seq(1:k) )
tmp1 = matrix( match(orig.data$cluster, mu[,3]) )
orig.data.centers = cbind(as.matrix(orig.data), mu[tmp1,])[,c(1:2,4:6)]
mu = cbind( by(orig.data$Latitude, orig.data$cluster, mean), by(orig.data$Longitude, orig.data$cluster, mean), seq(1:k) )
## Calc initial distance from centers
fr$lat = orig.data.centers[,3]; fr$lon = orig.data.centers[,4]
to$lat = orig.data.centers[,1]; to$lon = orig.data.centers[,2]
orig.data$distance.from.center = calc_dist(fr, to)$distance_miles
# Test for convergence. Is the previous distance within the threshold of the current total distance from center
error.curr = sum(orig.data$distance.from.center)
error.diff = abs( error.old - error.curr )
error.old = error.curr
if( !is.nan( error.diff ) && error.diff < tolerance ) {
is.converged = TRUE
}
# Set a time to see how long the process will take is going through all iterations
stop.time = as.numeric(Sys.time())
hour.diff = (((stop.time - start.time) * (max.iter - iteration))/60)/60
cat("\n Error ",error.diff," Hours remain from iterations ",hour.diff,"\n")
# Write out iterations. Can later be used as a starting point if iterations need to pause
write.table(orig.data, paste("C:\\optimize_iteration_",iteration,"_instore_data.csv", sep=""), sep=",", row.names=F)
}
centers = data.frame(mu)
ret.val = list("centers" = centers, "cluster" = factor(orig.data$cluster), "LocationID" = orig.data$LocationID,
"Latitude" = orig.data$Latitude, "Longitude" = orig.data$Longitude,
"k" = k, "iterations" = iteration, "error.diff" = error.diff)
return(ret.val)
}
# Constrained clustering
cl_constrain = dirichletClusters_constrained(orig.data, k=4, max.iter=5, tolerance=.0001, plot.iter=TRUE)
table( cl_constrain$cluster )
plot(cl_constrain$Longitude, cl_constrain$Latitude, col=cl_constrain$cluster, pch=16, cex=.6,
xlab="Longitude",ylab="Latitude")
library(maps)
map("state", add=T)
points(cl_constrain$centers[,c(2,1)], pch=4, cex=2, col='orange', lwd=4)
答案 0 :(得分:1)
ELKI中有一个相同大小的集群k-means变体。
在this tutorial中详细解释。
我见过很多人要求使用这样的聚类算法,但我不认为使用这样的算法可以很好地支持理论。
对于您的用例,您还遇到地理坐标问题:k-means使用均值,但均值可能与您的距离函数不一致。考虑经度-179°和+ 178°处的两个点。 K-means将使用这两个的平均值,-0.5°作为聚类中心。一个更合理的聚类中心选择是在地球的另一侧+ 179.5°处。
如果您的数据被限制在相当小的区域,它可能仍然有效。 为了获得更好的质量,您可能希望将数据映射到适当的UTM区域。在一个UTM区域内,欧几里德距离是距离的合理近似值。