这个问题与我的矢量量化作业有关。
我已经实现了一种相当经典的算法来检测点簇的中心。输入数据中有若干个簇(簇的数量和输入点的总数是已知的),我需要找到每个簇的中心,但我不知道哪些点属于哪个簇。因此,只要我能把每个初始中心放在某个簇的内部或附近(比其他任何初始中心都更靠近该簇),我的算法就可以通过迭代收敛到正确的中心。
但是我不知道如何正确地初始化。目前我采用随机初始化,并检查是否有两个中心彼此距离太近,以及是否有中心距离所有输入点都太远。但这种方法很难调参:要么花费大量计算时间,要么得不到正确的中心。
我的想法很简单:随机初始化,然后检查该点是否位于某个簇内。有人知道该怎么做吗?我无法构造多边形,因为我不知道簇的边界。我更希望得到C语言的实现,但只提供思路也可以!
编辑:输入数据的示例:
我的代码:
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <stdlib.h>
#include <time.h>
#include <float.h>
/* Number of full passes over the input data made by the training loop in main. */
#define TRAINING_CYCLE 10000
/* Fraction of the point-to-centroid distance by which the nearest centroid is moved on each update. */
#define LEARNING_RATE 0.001
/* NOTE(review): not referenced by any live code; only a commented-out formula mentions it. */
#define CENTROID_DISTANCE_SCALE 0.7 //used for setting a minimal distance between centroids
/* Fraction of the data-extent diagonal meant as a centroid-to-input-point distance threshold. */
#define CENTROID_POINT_SCALE 0.1
/* Fraction of the average cluster size (dataSize / nbOfClusters) that isCountNearPoints requires near a centroid. */
#define CLUSTER_SIZE_PERCENTAGE 0.3
//User: REMOVED
//Password: REMOVED
/* Home-grown boolean type: false is 0 and true is 1. */
typedef enum {
    false = 0,
    true = 1
} bool;
/* A 2-D location; used both for input samples and for centroids. */
struct point {
    double x; /* first coordinate */
    double y; /* second coordinate */
};
typedef struct point point;
/* Number of clusters (centroids) to look for; assigned in main from the input. */
int nbOfClusters;
/* First coordinate of every input sample, indexed 0 .. dataSize-1. */
double in1[1000];
/* Second coordinate of every input sample, indexed 0 .. dataSize-1. */
double in2[1000];
/* Centroid table; only the first nbOfClusters entries are used. */
point centers[100];
/* Number of input samples stored in in1/in2. */
int dataSize = 0;
/* Data extent filled in by main: maxX1/maxY1 track positive coordinates,
 * maxX2/maxY2 track negative ones. All start at zero. */
double maxX1 = 0;
double maxY1 = 0;
double maxX2 = 0;
double maxY2 = 0;
/* Per-axis error tolerance; NOTE(review): not referenced by the visible code. */
double deltaX = 0;
double deltaY = 0;
/* Returns the magnitude of n: n itself when n >= 0, otherwise n multiplied by -1. */
double getAbs(double n){
    return (n >= 0) ? n : (-1) * n;
}
/*
 * Returns the index in centers[] of the centroid closest to p1.
 *
 * Squared Euclidean distances are compared, so no square root is taken.
 * Ties keep the lowest index; 0 is returned when nbOfClusters is 0.
 */
int findNearestCentroid(point p1){
    int bestIndex = 0;
    double bestSquared = DBL_MAX;
    for (int c = 0; c < nbOfClusters; c++){
        const double dx = p1.x - centers[c].x;
        const double dy = p1.y - centers[c].y;
        const double squared = dx * dx + dy * dy;
        if (squared < bestSquared){
            bestSquared = squared;
            bestIndex = c;
        }
    }
    return bestIndex;
}
/* Returns the Euclidean distance between p1 and p2. */
double getDistance(point p1, point p2){
    const double dx = p1.x - p2.x;
    const double dy = p1.y - p2.y;
    return sqrt(dx * dx + dy * dy);
}
/*
 * Reports whether any two distinct centroids are closer than minDistance.
 *
 * Returns true as soon as one such pair is found, false when every pair of
 * the first nbOfClusters centroids is at least minDistance apart.
 */
bool isCentroidsNear(double minDistance){
    for (int a = 0; a < nbOfClusters; a++){
        for (int b = 0; b < nbOfClusters; b++){
            if (a == b){
                continue; /* a centroid is never compared with itself */
            }
            if (getDistance(centers[a], centers[b]) < minDistance){
                return true;
            }
        }
    }
    return false;
}
point findNearestInput(int centroid){ //returns the location in the table of the nearest centroid to the argument point
double distance=DBL_MAX;
point returnPoint;
int nearest=0;
for (int i=0; i<nbOfClusters; i++){
double distance_temp = (in1[i]-centers[centroid].x)*(in1[i]-centers[centroid].x)+(in2[i]-centers[centroid].y)*(in2[i]-centers[centroid].y);
if ( distance_temp < distance){
distance=distance_temp;
nearest=i;
}
}
returnPoint.x=in1[nearest];
returnPoint.y=in2[nearest];
return returnPoint;
}
bool isPointNear(double minDistance){
for(int i=0; i<nbOfClusters; i++){
double distance=getDistance(findNearestInput(i),centers[i]); //the distance to the nearest point
if(distance>minDistance){
return true;
}
}
return false;
}
/*
 * Reports whether some centroid has too few input points around it.
 *
 * For each centroid, counts the inputs strictly closer than `distance` and
 * compares that count with CLUSTER_SIZE_PERCENTAGE of the average cluster
 * size (dataSize / nbOfClusters, integer division). Returns true as soon as
 * one centroid falls below the threshold, false otherwise.
 *
 * The count is restarted for every centroid; previously it accumulated
 * across centroids, so later centroids were judged on a running total.
 */
bool isCountNearPoints(double distance){
    for (int i = 0; i < nbOfClusters; i++){
        int counter = 0;
        for (int j = 0; j < dataSize; j++){
            point p;
            p.x = in1[j];
            p.y = in2[j];
            if (getDistance(p, centers[i]) < distance){
                counter++;
            }
        }
        /* the number of points that the centroid should be near to */
        const int minNearPoints = (int)(dataSize / nbOfClusters * CLUSTER_SIZE_PERCENTAGE);
        if (counter < minNearPoints){
            return true;
        }
    }
    return false;
}
/* Upper bound on random re-draws of the initial centroids before the last draw is kept. */
enum { MAX_INIT_ATTEMPTS = 10000 };

/*
 * Reads the problem from stdin: one digit '1'..'9' giving the number of
 * clusters, followed by "x,y" pairs. Reading stops at the first pair that
 * does not parse, at end of input, or when in1/in2 are full.
 * Returns false when the cluster-count digit is missing or invalid.
 */
static bool readInput(void){
    char digit;
    if (scanf("%c", &digit) != 1 || digit < '1' || digit > '9'){
        return false;
    }
    nbOfClusters = digit - '0';

    const int capacity = (int)(sizeof in1 / sizeof in1[0]);
    /* Require both fields: comparing with EOF alone loops forever on malformed input. */
    while (dataSize < capacity
           && scanf("%lf,%lf", &in1[dataSize], &in2[dataSize]) == 2){
        dataSize++;
    }
    return true;
}

/*
 * Updates maxX1/maxY1 (largest positive coordinate) and maxX2/maxY2 (most
 * negative coordinate) from the input and returns the diagonal of that
 * extent. The four trackers start at 0, so the extent always contains the origin.
 */
static double computeExtentDiagonal(void){
    for (int i = 0; i < dataSize; i++){
        if (in1[i] > maxX1){
            maxX1 = in1[i];
        }
        if (in2[i] > maxY1){
            maxY1 = in2[i];
        }
        if (in1[i] < maxX2){
            maxX2 = in1[i];
        }
        if (in2[i] < maxY2){
            maxY2 = in2[i];
        }
    }
    const double width = maxX1 - maxX2;
    const double height = maxY1 - maxY2;
    return sqrt(width * width + height * height);
}

/*
 * Places each centroid on a distinct, randomly chosen input point and
 * re-draws until no two centroids are closer than minDistance.
 * After MAX_INIT_ATTEMPTS unsuccessful draws the last draw is kept, so an
 * unsatisfiable minDistance cannot hang the program.
 * Requires dataSize >= nbOfClusters.
 * isPointNear / isCountNearPoints could be OR-ed into the acceptance test
 * as further criteria.
 */
static void pickInitialCentroids(double minDistance){
    int randomInputs[50];
    for (int attempt = 0; attempt < MAX_INIT_ATTEMPTS; attempt++){
        for (int i = 0; i < nbOfClusters; i++){
            bool isSame;
            do {
                /* rand() % dataSize is always a valid index in [0, dataSize) */
                randomInputs[i] = rand() % dataSize;
                isSame = false;
                for (int j = 0; j < i; j++){
                    if (randomInputs[j] == randomInputs[i]){
                        isSame = true;
                        break;
                    }
                }
            } while (isSame);
            centers[i].x = in1[randomInputs[i]];
            centers[i].y = in2[randomInputs[i]];
        }
        if (!isCentroidsNear(minDistance)){
            return; /* centroids are well separated */
        }
    }
    fprintf(stderr, "warning: could not separate initial centroids by %f; using last draw\n",
            minDistance);
}

/*
 * Online training: for TRAINING_CYCLE passes, every input point pulls its
 * nearest centroid LEARNING_RATE of the way towards itself.
 * The vector form below moves the centroid by LEARNING_RATE * distance along
 * the line to the point, without computing a slope, so it stays finite when
 * the point coincides with the centroid or lies vertically above/below it.
 */
static void trainCentroids(void){
    for (int cycle = 0; cycle < TRAINING_CYCLE; cycle++){
        for (int i = 0; i < dataSize; i++){
            point p1;
            p1.x = in1[i];
            p1.y = in2[i];
            const int nearPt = findNearestCentroid(p1);
            centers[nearPt].x += LEARNING_RATE * (p1.x - centers[nearPt].x);
            centers[nearPt].y += LEARNING_RATE * (p1.y - centers[nearPt].y);
        }
    }
}

/*
 * Reads the cluster count and the input points from stdin, initialises the
 * centroids on random input points, trains them and prints one "x,y" line
 * per centroid. Returns 0 on success and EXIT_FAILURE on unusable input.
 */
int main(void)
{
    if (!readInput()){
        fprintf(stderr, "error: expected a cluster count between 1 and 9 as first character\n");
        return EXIT_FAILURE;
    }
    if (dataSize < nbOfClusters){
        fprintf(stderr, "error: need at least %d input points, got %d\n", nbOfClusters, dataSize);
        return EXIT_FAILURE;
    }

    /* Seed the generator so each run draws different initial centroids. */
    srand((unsigned)time(NULL));

    /* Floating-point division: 1/nbOfClusters in integer arithmetic is 0 for every count above 1. */
    const double minDistance = computeExtentDiagonal() / nbOfClusters;

    pickInitialCentroids(minDistance);
    trainCentroids();

    /* printing the results */
    for (int i = 0; i < nbOfClusters; i++){
        printf("%lf,%lf\n", centers[i].x, centers[i].y);
    }
    return 0;
}
答案 0 :(得分:0)
通常的做法是从现有数据点中选取初始中心,而不是在整个空间中均匀随机生成。
由于在您的数据模型中,每个点都属于某一个簇,从现有点中选取初始中心就可以解决您这个(描述得有些模糊的)问题,不是吗?