批量梯度下降没有收敛

时间:2016-06-14 16:05:50

标签: java classification gradient-descent

我在实现批量和随机梯度下降方面迈出了第一步。

这是我的实施:

package ch.learning;

import java.util.*;

import org.jzy3d.analysis.AbstractAnalysis;
import org.jzy3d.analysis.AnalysisLauncher;
import org.jzy3d.chart.factories.AWTChartComponentFactory;
import org.jzy3d.colors.Color;
import org.jzy3d.colors.ColorMapper;
import org.jzy3d.colors.colormaps.ColorMapRainbow;
import org.jzy3d.maths.Coord3d;
import org.jzy3d.maths.Range;
import org.jzy3d.plot3d.builder.*;
import org.jzy3d.plot3d.builder.concrete.*;
import org.jzy3d.plot3d.primitives.Scatter;
import org.jzy3d.plot3d.primitives.Shape;
import org.jzy3d.plot3d.rendering.canvas.Quality;


import org.apache.commons.math3.analysis.function.Sigmoid;


public class LogisticReg_GradientDescent {

    private List<double[]> trainingExamples = new LinkedList<double[]>();
    private static final int sizeTrainingset = 1000;
    private volatile  double[] theta = {10, 10, 10, 10 };
    // Configurable compoenent of step size during theata update
    private final double alpha = 0.01;
    // Amount of iteration in Batch Gradient Descent
    private static final int iterations = 10000;
    private static final int printsAtStartAndEnd = 5;

    private void buildTrainingExample(int amount) {

        // Area of the house
        double areaMin = 80;
        double areaMax = 1000;
        double areaRange = areaMax - areaMin;

        // Distance to center
        double distanceMin = 10;
        double distanceMax = 10000;
        double distanceRange = distanceMax - distanceMin;

        // Generate training examples with prices
        for (int i = 0; i < amount; i++) {
            double[] example = new double[5];
            example[0] = 1.0;
            example[1] = areaMin + Math.random() * areaRange;
            example[2] = distanceMin + Math.random() * distanceRange;
            // Price is a feature as well in this logistic regression example
            double price = 0;
            price += _priceComponent(example[1], areaRange);
            price += _priceComponent(example[2], distanceRange);
            // price += _priceComponent(example[3], yocRange);
            example[3] = price;
            example[4] = (price>200000)?0:1;
            trainingExamples.add(example);
        }
    }

    // Random price according with some range constraints
    private double _priceComponent(double value, double range) {
        if (value <= range / 3)
            return 50000 + 50000 * Math.random() * 0.1;
        if (value <= (range / 3 * 2))
            return 100000 + 100000 * Math.random() * 0.1;
        return 150000 + 150000 * Math.random() * 0.1;
    }

    private double classificationByHypothesis(double[] features) {
        // Scaling
        double scalingF0 = features[0];
        double scalingF1 = (features[1] - 80) / (920);
        double scalingF2 = (features[2] - 10) / (9990);
        double scalingF3 = (features[3] - 50000) / (400000);

        double z = this.theta[0] * scalingF0 + this.theta[1] * scalingF1 + this.theta[2] * scalingF2
                + this.theta[3] * scalingF3;

        double ret = 1 / (1 + Math.pow(Math.E, -z));
        return ret;
    }

    // Costfunction: Mean squared error function
    private double gradientBatch_costs() {

        double costs = this.trainingExamples.stream().mapToDouble(l -> {
            double costsint;
            if (l[4] == 0) {

                costsint = -Math.log(1 - classificationByHypothesis(l));
            } else {

                costsint = -Math.log(classificationByHypothesis(l));
            }
            return costsint;
        }).sum();

        return costs / this.trainingExamples.size();
    }

    // Theta Update with Batch Gradient Descent
    private void gradientBatch_thetaUpdate(int amount) {
        for (int i = 0; i < amount; i++) {

            double partialDerivative0 = this.trainingExamples.stream()
                    .mapToDouble(l -> (classificationByHypothesis(l) - l[4]) * l[0]).sum();
            double tmpTheta0 = this.theta[0] - (this.alpha * partialDerivative0 / this.trainingExamples.size());

            double partialDerivative1 = this.trainingExamples.stream()
                    .mapToDouble(l -> (classificationByHypothesis(l) - l[4]) * l[1]).sum();
            double tmpTheta1 = this.theta[1] - (this.alpha * partialDerivative1 / this.trainingExamples.size());

            double partialDerivative2 = this.trainingExamples.stream()
                    .mapToDouble(l -> (classificationByHypothesis(l) - l[4]) * l[2]).sum();
            double tmpTheta2 = this.theta[2] - (this.alpha * partialDerivative2 / this.trainingExamples.size());

            double partialDerivative3 = this.trainingExamples.stream()
                    .mapToDouble(l -> (classificationByHypothesis(l) - l[4]) * l[3]).sum();
            double tmpTheta3 = this.theta[3] - (this.alpha * partialDerivative3 / this.trainingExamples.size());

            this.theta = new double[] { tmpTheta0, tmpTheta1, tmpTheta2, tmpTheta3 };
        }
    }

    // Theta update with Stochastic Gradient Descent
    private void gradientStochastic_thetaUpdate(double[] feature) {
        double tmpTheta0 = this.theta[0] - this.alpha * (classificationByHypothesis(feature) - feature[4]) * feature[0];
        double tmpTheta1 = this.theta[1] - this.alpha * (classificationByHypothesis(feature) - feature[4]) * feature[1];
        double tmpTheta2 = this.theta[2] - this.alpha * (classificationByHypothesis(feature) - feature[4]) * feature[2];
        double tmpTheta3 = this.theta[3] - this.alpha * (classificationByHypothesis(feature) - feature[4]) * feature[3];
        this.theta = new double[] { tmpTheta0, tmpTheta1, tmpTheta2, tmpTheta3 };
    }

    private void resetTheta() {
        this.theta = new double[] {0.00001, 0.00001, 0.00001, 0.00001};
    }

    private void printSummary(int iteration) {
        System.out.println(String.format("%s \t\t Theta: %f \t %f \t %f \t %f \t Costs: %f", iteration, this.theta[0],
                this.theta[1], this.theta[2], this.theta[3], this.gradientBatch_costs()));
    }

    public static void main(String[] args) {
        LogisticReg_GradientDescent d = new LogisticReg_GradientDescent();

        // Batch and Stochastic Gradient Descent use the same training example
        d.buildTrainingExample(sizeTrainingset);




        System.out.println("Batch Gradient Descent");
        d.printSummary(0);

        System.out.println(String.format("First %s iterations", printsAtStartAndEnd));
        for (int i = 1; i <= iterations; i++) {
            d.gradientBatch_thetaUpdate(1);
            d.printSummary(i);
        }

        System.out.println("Some examples are:");
        System.out.println(String.format("The 1:%s, Area:%s, Distance:%s, Price:%s, Classification:%s", d.trainingExamples.get(0)[0],d.trainingExamples.get(0)[1],d.trainingExamples.get(0)[2],d.trainingExamples.get(0)[3],d.trainingExamples.get(0)[4]));
        System.out.println(String.format("The 1:%s, Area:%s, Distance:%s, Price:%s, Classification:%s", d.trainingExamples.get(500)[0],d.trainingExamples.get(500)[1],d.trainingExamples.get(500)[2],d.trainingExamples.get(500)[3],d.trainingExamples.get(500)[4]));

         try {
                AnalysisLauncher.open(d.new SurfaceDemo());
            } catch (Exception e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
        }
        class SurfaceDemo extends AbstractAnalysis{

            @Override
            public void init(){
                double x;
                double y;
                double z;
                float a;
                Coord3d[] points = new Coord3d[trainingExamples.size()];
                Color[]   colors = new Color[trainingExamples.size()];

                for(int i=0; i<trainingExamples.size(); i++){
                    x = trainingExamples.get(i)[1]; // Area
                    y = trainingExamples.get(i)[2]; // Distance to center
                    z = trainingExamples.get(i)[3]; // price
                    points[i] = new Coord3d(x, y, z);

                    a = 1f;
                    if(trainingExamples.get(i)[4]==1){
                        colors[i] =new Color(0,0,0,a);
                    }else{
                        colors[i]= new Color(250,0,0,a);
                    }
                }

                Scatter scatter = new Scatter(points, colors);
                scatter.setWidth(4);

                Mapper mapper = new Mapper() {
                    @Override
                    public double f(double x, double y) {

                       return (-theta[0]-theta[1]*x-theta[2]*y)/theta[3];
                    }
                };

                // Create the object to represent the function over the given range.
                Range rangeX = new Range(0, 1000);
                Range rangeY = new Range(0, 10000);
                int steps = 10;
                final Shape surface = Builder.buildOrthonormal(new OrthonormalGrid(rangeX, steps, rangeY, steps), mapper);
                surface.setColorMapper(new ColorMapper(new ColorMapRainbow(), surface.getBounds().getZmin(), surface.getBounds().getZmax(), new Color(1, 1, 1, .5f)));
                surface.setFaceDisplayed(true);
                surface.setWireframeDisplayed(false);


                chart = AWTChartComponentFactory.chart(Quality.Advanced, getCanvasType());
                chart.getScene().add(scatter);
                chart.getScene().add(surface);

            }
    }
}

图形表示看起来像

plot training instances by org.jzy3d.plot3d

所以我用org.jzy3d.plot3d绘制生成的训练实例。 我们看到x(房子的面积),y(到市中心的距离)和z(价格)。 分类使红色(负类 - &gt;未售出)和黑色(正类 - &gt;销售)。

在生成的培训实例中,分类仅取决于价格,您可以在此处看到:

 example[4] = (price>200000)?0:1;

问题,我不明白的是

我想绘制分类器的决策边界。 决定边界取决于Theta的优化组件。 (使用批量梯度下降)。 所以我尝试用这段代码绘制决策边界平面:

Mapper mapper = new Mapper() {
                    @Override
                    public double f(double x, double y) {

                       return (-theta[0]-theta[1]*x-theta[2]*y)/theta[3];
                    }
                };

由于

  

theta [0] * 1 + theta [1] * x + theta [2] * y + theta [3] * z = 0

所以

  

z = - (theta [0] * 1 + theta [1] * x + theta [2] * y)/ theta [3]

我希望我的决定在红色和黑色区域之间的边界平面。 相反,它会被z = 0挂起。

我不知道,要么我不能以正确的方式绘制这个决策边界平面,要么我的优化参数是糟糕的。 此外,我不知道如何选择一个好的初始θ矢量。 现在我用

private volatile  double[] theta = {1, 1, 1, 1 };

我将alpha设置为0.0001

private final double alpha = 0.0001;

这是最大可能的Alpha,我的成本函数不会跳转,而且sigmoid实现不会返回无穷大。 我已经在

进行了功能扩展
private double classificationByHypothesis(double[] features) {

    // Scaling
    double scalingF0 = features[0];
    double scalingF1 = (features[1] - 80) / (920);
    double scalingF2 = (features[2] - 10) / (9990);
    double scalingF3 = (features[3] - 50000) / (400000);

    double z = this.theta[0] * scalingF0 + this.theta[1] * scalingF1 + this.theta[2] * scalingF2
            + this.theta[3] * scalingF3;

    double ret = 1 / (1 + Math.pow(Math.E, -z));
    return ret;
}

给定初始theta和alpha等于0.0001的最后五次迭代是

  

9996,Theta:1.057554,-6.340981,-6.242139,8.145087,费用:0.359108

     

9997,Theta:1.057560,-6.341234,-6.242345,8.145576,费用:0.359109

     

9998,Theta:1.057565,-6.341487,-6.242552,8.146065,费用:0.359110

     

9999,Theta:1.057571,-6.341740,-6.242758,8.146553,费用:0.359112

     

10000,Theta:1.057576,-6.341993,-6.242965,8.147042,费用:0.359113

生成的培训实例的一些示例是

  

面积:431.50139030510206,距离:8591.341686012887,   价钱:255049.1280388437,分类:0.0

     

面积:727.4042972310916,距离:4364.710136408952,   价钱:258385.59452489938,分类:0.0

感谢任何提示!

0 个答案:

没有答案