EM algorithm not working correctly

Time: 2018-05-18 08:00:36

Tags: statistics processing expectation-maximization

I'm trying to implement a simple EM algorithm. So far it seems to work well, except for the small problem that the variances quickly shrink towards zero and the model collapses onto the mean of the data. (If I don't update the variances, it does converge to the mean!)

As far as I can tell, this happens because points close to the centre are "weighted" too heavily, which drives the algorithm to keep lowering the variance until it shrinks to zero. When I change the formula from the first version to the second (the tweak marked in calculateProbabilityFast below), the algorithm behaves much better (apart from a slightly too-high variance, which is expected). Is this a problem with my code?
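
Here is a quick numeric check of that intuition, as a self-contained sketch (1-D, a single Gaussian, made-up numbers): weighting each sample by its own density under the current guess and re-estimating shrinks the variance on every pass, which matches the collapse I'm seeing.

import math, random

# Weight each sample by its (un-normalised) density under the current
# guess, then re-estimate the variance from those weights.
true_var = 25.0
xs = [random.gauss(0, math.sqrt(true_var)) for _ in range(100000)]

guess_var = true_var  # start the guess at the truth to isolate the effect
for step in range(5):
    w = [math.exp(-0.5 * x * x / guess_var) for x in xs]
    guess_var = sum(wi * x * x for wi, x in zip(w, xs)) / sum(w)
    print(step, guess_var)  # ~12.5, ~8.3, ~6.25, ~5.0, ... never 25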

class DataPoint {
  int nDims; // Number of dimensions
  float[] data;
  DataPoint(int n) {nDims = n; data = new float[n];}
  DataPoint(float[] d) {nDims = d.length; data = d;}
}

float sum(float[] d) {float ret = 0; for (int i = 0; i < d.length; ++i) {ret += d[i];} return ret;}
float[] sub(float[] f, float[] u) {float[] ret = new float[f.length]; for (int i = 0; i < f.length; ++i) {ret[i] = f[i] - u[i];} return ret;}
float distSq(float[] d) {float ret = 0; for (int i = 0; i < d.length; ++i) {ret += d[i]*d[i];} return ret;}
float distSq(float[][] d) {float ret = 0; for (int i = 0; i < d.length; ++i) {ret += distSq(d[i]);} return ret;}

float det(float[][] mat) {
  if (mat.length == 2 && mat[0].length == 2) {
    float det = (mat[0][0] * mat[1][1]) - (mat[0][1] * mat[1][0]);
    return det;
  }
  throw new RuntimeException("Det has to be 2x2");
}

float[][] inverse(float[][] mat) {
  if (mat.length == 2 && mat[0].length == 2) {
    float det = mat[0][0] * mat[1][1] - mat[0][1] * mat[1][0];
    float[][] ret = {{mat[1][1]/det, -mat[0][1]/det}, {-mat[1][0]/det, mat[0][0]/det}};
    return ret;
  }
  throw new RuntimeException("Inverse has to be 2x2");
}

class GMM {
  int number;
  int dims;
  float[] weights;
  float[][] means;
  float[][][] covariances;
  float[][][] invCov;


  GMM(int gNo, int noDimensions) {
    number = gNo;
    dims = noDimensions;
    weights = new float[gNo];
    means = new float[gNo][noDimensions];
    covariances = new float[gNo][noDimensions][noDimensions];
    invCov      = new float[gNo][noDimensions][noDimensions];

    // Initialise to random values.
    for (int i = 0; i < gNo; ++i) {
      weights[i] = random(0, 1);
      for (int j = 0; j < noDimensions; ++j) {
        means[i][j] = random(-100,100);
        covariances[i][j][j] = 100;
      }
      // Invert once per Gaussian, after the whole diagonal is set
      // (inverting inside the j-loop used a half-initialised matrix).
      invCov[i] = inverse(covariances[i]);
    }
    normaliseWeights();
  }

  float[][] EStep(DataPoint[] data) {
    // For each data point, return the probability of each Gaussian having generated it
    // Arguments: n-dimensional data
    float[][] ret = new float[number][data.length];

    for (int Gauss = 0; Gauss < number; ++Gauss) {
      for (int i = 0; i < data.length; ++i) {
        ret[Gauss][i] = calculateProbabilityFast(data[i], Gauss);
      }
    }
    return ret;
  }

  void MStep(DataPoint[] data, float[][] dataProbabilities) {
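    // Re-estimate each Gaussian's mean, covariance and weight from the
    // E-step probabilities; probSum acts as the effective point count.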
    for (int Gauss = 0; Gauss < number; ++Gauss) {
      means[Gauss] = new float[data[0].nDims]; // Reset the mean vector to zero
      float probSum = 0;
      for (int i = 0; i < dataProbabilities[Gauss].length; ++i) {
        probSum += dataProbabilities[Gauss][i];
        for (int j = 0; j < means[Gauss].length; ++j) {
          means[Gauss][j] += data[i].data[j] * dataProbabilities[Gauss][i];
        }
      }
      for (int i = 0; i < means[Gauss].length; ++i) {
        means[Gauss][i] /= probSum; // Normalise
      }
      // Means[Gauss] has been updated

      // Now for covariance.... :x
      covariances[Gauss] = new float[data[0].nDims][data[0].nDims];
      for (int m = 0; m < data[0].nDims; ++m) {
        for (int n = 0; n < data[0].nDims; ++n) {
          for (int i = 0; i < dataProbabilities[Gauss].length; ++i) {
            covariances[Gauss][m][n] += (data[i].data[m]-means[Gauss][m])*(data[i].data[n]-means[Gauss][n])*dataProbabilities[Gauss][i];
          }
        }
      }
      // Normalise the accumulated covariance matrix by the probability sum.
      for (int m = 0; m < data[0].nDims; ++m) {
        for (int n = 0; n < data[0].nDims; ++n) {
          covariances[Gauss][m][n] /= probSum;
        }
      }
      // Update inverses
      invCov[Gauss] = inverse(covariances[Gauss]);
      weights[Gauss] = probSum;
    }
    normaliseWeights();
  }

  float calculateProbabilityFast(DataPoint x, int Gauss) {
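    // Multivariate normal density: exp(-0.5 * (x-mu)' * invCov * (x-mu)),
    // divided by the normalising constant (2*pi)^(d/2) * sqrt(det(cov)).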
    float ret = pow(TWO_PI, dims/2.0)*sqrt(det(covariances[Gauss]));
    float exponent = 0;
    for (int i = 0; i < x.nDims; ++i) {
      float temp = 0;
      for (int j = 0; j < x.nDims; ++j) {
        temp += (x.data[j] - means[Gauss][j])*invCov[Gauss][i][j];
      }
      exponent += temp*(x.data[i] - means[Gauss][i]);
    }
    exponent = exp(-0.5*exponent);
    // ==================================================================
    // If I change this line HERE to -0.3*exponent, everything works fine
    // ==================================================================
    //print(exponent); print(","); println(ret);
    return exponent/ret;
  }



  void normaliseWeights() {
    float sum = sum(weights);
    for (int i = 0; i < number; ++i) {weights[i] /= sum;}
  }

  void display() {
    ellipseMode(CENTER);
    for (int i = 0; i < number; ++i) {
      //strokeWeight(weights[i]*100);
      strokeWeight(5);
      stroke(color(255, 0, 0));
      point(means[i][0], means[i][1]);
      noFill();
      strokeWeight(1.5);
      ellipse(means[i][0], means[i][1], (covariances[i][0][0]), (covariances[i][1][1]));
      ellipse(means[i][0], means[i][1], (covariances[i][0][0]*2), (covariances[i][1][1]*2));
      fill(0);
    }
  }
}

DataPoint[] data;

final int size = 10000;

GMM MixModel;

void setup() {
  // Hidden gaussians
  size(800,600);
  MixModel = new GMM(1, 2); // 1 gaussians, 2 dimensions.
  data = new DataPoint[size];
  int gNo = 1;
  float gxMeans[] = new float[gNo];
  float gxVars[]  = new float[gNo];
  float gyMeans[] = new float[gNo];
  float gyVars[]  = new float[gNo];
  float covars[]  = new float[gNo];
  for (int i = 0; i < gNo; ++i) {
    gxMeans[i] = random(-100, 100);
    gxVars[i] =  random(5, 40);
    gyMeans[i] = random(-100, 100);
    gyVars[i] =  random(5, 40); // Actually std. devs!! 
    covars[i] = 0;//random(-1, 1);
    println("Vars: " + str(pow(gxVars[i], 2)) + ", " + str(pow(gyVars[i], 2)));
    println("Covar: " + str(covars[i]));
  }
  for (int i = 0; i < size; ++i) {
    int gauss = (int)random(gNo);
    data[i] = new DataPoint(2);
    data[i].data[0] = randomGaussian()*gxVars[gauss] + gxMeans[gauss];
    data[i].data[1] = (randomGaussian()*gyVars[gauss])*(1-abs(covars[gauss]))+(gyVars[gauss]*covars[gauss]*(data[i].data[0]-gxMeans[gauss])/gxVars[gauss]) + gyMeans[gauss];
  }


  frameRate(5); // Let's see what's happening!
}


void draw() {
  translate(width/2, height/2); // set 0,0 at centre
  background(color(255, 255, 255));
  stroke(0);
  strokeWeight(1);
  for (int i = 0; i < size; ++i) {
    point(data[i].data[0], data[i].data[1]);
  }
  MixModel.display();
  float[][] dataProbs = MixModel.EStep(data);
  MixModel.MStep(data, dataProbs);
  print(MixModel.covariances[0][0][0]); print(", ");
  println(MixModel.covariances[0][1][1]);
}

EDIT: A complete, minimal working example. The variance still converges to 0 here too, so does this suggest I'm getting something wrong in the algorithm itself?

import random, statistics, math

hiddenMu = random.uniform(-100, 100)
hiddenVar = random.uniform(10, 30)
dataLen = 10000

data = [random.gauss(hiddenMu, hiddenVar) for i in range(dataLen)]

hiddenVar **= 2 # Make it the actual variance rather than std. dev.

print("Variance: " + str(hiddenVar) + ", actual: " + str(statistics.variance(data)))
print("Mean    : " + str(hiddenMu ) + ", actual: " + str(statistics.mean    (data)))

guessMu = random.uniform(-100, 100)
guessVar = 100

print("Initial mu guess:  " + str(guessMu))
print("Initial var guess: " + str(guessVar))

# perform iterations

numIters = 100

for i in range(numIters):

    dataProbs = [math.exp(-0.5*((x-guessMu)**2)/guessVar)/((2*math.pi*guessVar)**0.5) for x in data]

    guessMu = sum(map(lambda x: x[0]*x[1], zip(dataProbs, data)))/sum(dataProbs)
    guessVar = sum(map(lambda x: x[0]*((x[1]-guessMu)**2), zip(dataProbs, data)))/sum(dataProbs)

    print(str(i) + " mu guess:  " + str(guessMu))
    print(str(i) + " var guess: " + str(guessVar))
    print()

EDIT 2: Could it be that I need Bessel's correction? (Multiplying the result by n/(n-1).) If so, how would I apply it when the sum of the probabilities themselves can be less than 1?
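
For what it's worth, here is a sketch of the weighted analogue I found, assuming so-called reliability weights (which is what per-point probabilities are); the helper name is my own:

def weighted_var_unbiased(xs, ws, mu):
    # Weighted analogue of Bessel's correction for "reliability" weights:
    # divide by V1 - V2/V1 instead of n - 1, where V1 and V2 are the sums
    # of the weights and of the squared weights.  This stays well-defined
    # even when the weights sum to less than 1.
    V1 = sum(ws)
    V2 = sum(w * w for w in ws)
    return sum(w * (x - mu) ** 2 for w, x in zip(ws, xs)) / (V1 - V2 / V1)

# e.g. in the example above: weighted_var_unbiased(data, dataProbs, guessMu)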

1 Answer:

Answer 0 (score: 0)

For anyone else who hits the same problem: I now understand the condition for assigning points to Gaussians in a GMM. You may also see NaNs show up in your program, as in the question EM algorithm code is not working.

Instead of assigning each point the probability of belonging to a Gaussian using the formula listed above, you need to compute that probability under every Gaussian and then normalise across them. This means that when a particular Gaussian is the only one that could have generated a point, the point becomes fully assigned to it: its probability becomes 1, even if it initially had only a tiny chance of coming from that distribution.
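
A minimal sketch of that fix (assuming the 1-D setting of the question's Python example, extended to two Gaussians; all names and starting values here are mine, not from the original code):

import math, random

def density(x, mu, var):
    # Gaussian pdf N(x; mu, var)
    return math.exp(-0.5 * (x - mu) ** 2 / var) / math.sqrt(2 * math.pi * var)

# Toy data from two hidden 1-D Gaussians (made-up parameters).
data = ([random.gauss(-50, 10) for _ in range(5000)] +
        [random.gauss(40, 20) for _ in range(5000)])

K = 2
mus = [random.uniform(-100, 100) for _ in range(K)]
variances = [100.0] * K
weights = [1.0 / K] * K

for it in range(100):
    # E-step: the responsibility of Gaussian k for point x is its weighted
    # density normalised over ALL Gaussians, so each row sums to 1.
    resp = []
    for x in data:
        row = [weights[k] * density(x, mus[k], variances[k]) for k in range(K)]
        total = sum(row)  # guard against 0 in real code; underflow here is where NaNs come from
        resp.append([r / total for r in row])

    # M-step: responsibility-weighted means, variances and mixture weights.
    for k in range(K):
        rk = [row[k] for row in resp]
        s = sum(rk)
        mus[k] = sum(r * x for r, x in zip(rk, data)) / s
        variances[k] = sum(r * (x - mus[k]) ** 2 for r, x in zip(rk, data)) / s
        weights[k] = s / len(data)

print(mus, variances, weights)

With a single Gaussian the normalised responsibility of every point is exactly 1, so the M-step reduces to the plain sample mean and variance, and nothing collapses.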