Spark LinearRegressionSummary "normal" summary

Posted: 2017-10-11 19:49:01

Tags: apache-spark-mllib

According to LinearRegressionSummary (Spark 2.1.0 JavaDoc), the p-values are only available for the "normal" solver.

  This value is only available when using the "normal" solver.

What exactly is the "normal" solver?

I am doing this:

import org.apache.spark.ml.{Pipeline, PipelineModel} 
import org.apache.spark.ml.evaluation.RegressionEvaluator 
import org.apache.spark.ml.feature.VectorAssembler 
import org.apache.spark.ml.regression.LinearRegressionModel 
import org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel, ParamGridBuilder} 
import org.apache.spark.sql.functions._ 
import org.apache.spark.sql.{DataFrame, SparkSession}
    .
    .
    .
val (trainingData, testData): (DataFrame, DataFrame) = 
  com.acme.pta.accuracy.Util.splitData(output, testProportion)
    .
    .
    .
val lr = 
  new org.apache.spark.ml.regression.LinearRegression()
  .setSolver("normal").setMaxIter(maxIter)

val pipeline = new Pipeline()
  .setStages(Array(lr))

val paramGrid = new ParamGridBuilder()
  .addGrid(lr.elasticNetParam, Array(0.2, 0.4, 0.8, 0.9))
  .addGrid(lr.regParam, Array(0.6, 0.3, 0.1, 0.01))
  .build()

val cv = new CrossValidator()
  .setEstimator(pipeline)
  .setEvaluator(evaluator)
  .setEstimatorParamMaps(paramGrid)
  .setNumFolds(numFolds) // Use 3+ in practice

val cvModel: CrossValidatorModel = cv.fit(trainingData)

val pipelineModel: PipelineModel = cvModel.bestModel.asInstanceOf[PipelineModel]
val lrModel: LinearRegressionModel = 
  pipelineModel.stages(0).asInstanceOf[LinearRegressionModel]

val modelSummary = lrModel.summary
Holder.log.info("lrModel.summary: " + modelSummary)
try {
  Holder.log.info("feature p values: ")
  // Exception occurs on line below.
  val featuresAndPValues = features.zip(lrModel.summary.pValues)
  featuresAndPValues.foreach(
    (featureAndPValue: (String, Double)) => 
    Holder.log.info(
      "feature: " + featureAndPValue._1 + ": " + featureAndPValue._2))
} catch {
  case _: java.lang.UnsupportedOperationException 
            => Holder.log.error("Cannot compute p-values")
}

I am still getting an UnsupportedOperationException.

The exception message is:

  No p-value available for this LinearRegressionModel

Is there something else I need to do? I am using:

  "org.apache.spark" %% "spark-mllib" % "2.1.1"

Does that version support pValues?

1 answer:

Answer 0 (score: 7)

Update

TL;DR

Solution 1

In the stock LinearRegression, p-values and the other "normal" statistics only show up when one of the parameters elasticNetParam or regParam is zero. So you can change either

.addGrid( lr.elasticNetParam, Array( 0.0 ) )

or

.addGrid( lr.regParam, Array( 0.0 ) )
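
With either of those pinned to zero, the summary statistics get computed. A minimal check (a sketch, assuming lrModel is the fitted LinearRegressionModel from your code):

    // Hedged sketch: with elasticNetParam = 0.0 (ridge) or regParam = 0.0,
    // accessing pValues should no longer throw UnsupportedOperationException.
    scala.util.Try(lrModel.summary.pValues) match {
      case scala.util.Success(p) => println("p-values: " + p.mkString(", "))
      case scala.util.Failure(e) => println("no p-values: " + e.getMessage)
    }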

Solution 2

Make a custom version of LinearRegression that explicitly uses:
  1. the "normal" solver for the regression, and
  2. the Cholesky solver for WeightedLeastSquares.

I made this class an extension of the ml.regression package.

    package org.apache.spark.ml.regression
    
    import scala.collection.mutable
    
    import org.apache.spark.SparkException
    import org.apache.spark.internal.Logging
    import org.apache.spark.ml.feature.Instance
    import org.apache.spark.ml.linalg.{Vector, Vectors}
    import org.apache.spark.ml.optim.WeightedLeastSquares
    import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators}
    import org.apache.spark.ml.util._
    import org.apache.spark.mllib.linalg.VectorImplicits._
    import org.apache.spark.rdd.RDD
    import org.apache.spark.sql.{DataFrame, Dataset, Row}
    import org.apache.spark.sql.functions._
    
    class CholeskyLinearRegression ( override val uid: String )
        extends Regressor[ Vector, CholeskyLinearRegression, LinearRegressionModel ]
        with LinearRegressionParams with DefaultParamsWritable with Logging {
    
        import CholeskyLinearRegression._
    
        def this() = this(Identifiable.randomUID("linReg"))
    
        def setRegParam(value: Double): this.type = set(regParam, value)
        setDefault(regParam -> 0.0)
    
        def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value)
        setDefault(fitIntercept -> true)
    
        def setStandardization(value: Boolean): this.type = set(standardization, value)
        setDefault(standardization -> true)
    
        def setElasticNetParam(value: Double): this.type = set(elasticNetParam, value)
        setDefault(elasticNetParam -> 0.0)
    
        def setMaxIter(value: Int): this.type = set(maxIter, value)
        setDefault(maxIter -> 100)
    
        def setTol(value: Double): this.type = set(tol, value)
        setDefault(tol -> 1E-6)
    
        def setWeightCol(value: String): this.type = set(weightCol, value)
    
        def setSolver(value: String): this.type = set(solver, value)
        setDefault(solver -> Auto)
    
        def setAggregationDepth(value: Int): this.type = set(aggregationDepth, value)
        setDefault(aggregationDepth -> 2)
    
        override protected def train(dataset: Dataset[_]): LinearRegressionModel = {
    
            // Extract the number of features before deciding optimization solver.
            val numFeatures = dataset.select(col($(featuresCol))).first().getAs[Vector](0).size
            val w = if (!isDefined(weightCol) || $(weightCol).isEmpty) lit(1.0) else col($(weightCol))
    
            val instances: RDD[Instance] = 
                dataset
                .select( col( $(labelCol) ), w, col( $(featuresCol) ) )
                .rdd.map {
                    case Row(label: Double, weight: Double, features: Vector) =>
                    Instance(label, weight, features)
                }
    
            // if (($(solver) == Auto &&
            //   numFeatures <= WeightedLeastSquares.MAX_NUM_FEATURES) || $(solver) == Normal) {
            // For low dimensional data, WeightedLeastSquares is more efficient since the
            // training algorithm only requires one pass through the data. (SPARK-10668)
    
            val optimizer = new WeightedLeastSquares( 
                $(fitIntercept), 
                $(regParam),
                elasticNetParam = $(elasticNetParam), 
                $(standardization), 
                true,
                solverType = WeightedLeastSquares.Cholesky, 
                maxIter = $(maxIter), 
                tol = $(tol)
            )
    
            val model = optimizer.fit(instances)
    
            val lrModel = copyValues(new LinearRegressionModel(uid, model.coefficients, model.intercept))
            val (summaryModel, predictionColName) = lrModel.findSummaryModelAndPredictionCol()
    
            val trainingSummary = new LinearRegressionTrainingSummary(
                summaryModel.transform(dataset),
                predictionColName,
                $(labelCol),
                $(featuresCol),
                summaryModel,
                model.diagInvAtWA.toArray,
                model.objectiveHistory
            )
    
            lrModel
            .setSummary( Some( trainingSummary ) )
    
            lrModel
        }
    
        override def copy(extra: ParamMap): CholeskyLinearRegression = defaultCopy(extra)
    }
    
    object CholeskyLinearRegression 
        extends DefaultParamsReadable[CholeskyLinearRegression] {
    
        override def load(path: String): CholeskyLinearRegression = super.load(path)
    
        val MAX_FEATURES_FOR_NORMAL_SOLVER: Int = WeightedLeastSquares.MAX_NUM_FEATURES
    
        /** String name for "auto". */
        private[regression] val Auto = "auto"
    
        /** String name for "normal". */
        private[regression] val Normal = "normal"
    
        /** String name for "l-bfgs". */
        private[regression] val LBFGS = "l-bfgs"
    
        /** Set of solvers that LinearRegression supports. */
        private[regression] val supportedSolvers = Array(Auto, Normal, LBFGS)
    }
    

    You just paste it into a separate file in your project and then change LinearRegression to CholeskyLinearRegression in your code:

    val lr = new CholeskyLinearRegression() // new LinearRegression()
            .setSolver( "normal" )
            .setMaxIter( maxIter )
    

    It works with non-zero params and provides pValues. Tested with the following param grid:

    val paramGrid = new ParamGridBuilder()
            .addGrid( lr.elasticNetParam, Array( 0.2, 0.4, 0.8, 0.9 ) )
            .addGrid( lr.regParam, Array( 0.6, 0.3, 0.1, 0.01 ) )
            .build()
    

    Full investigation

    Initially I thought the main issue was that the model is not being fully preserved. The trained model is not retained after fitting in CrossValidator, which is understandable because of memory consumption. There is an ongoing debate on how it should be resolved; see the JIRA issue.

    As you can see in the comments section, I tried extracting the parameters from the best model in order to run it again. Then I found out that the model summary is fine; it is just that for some parameters diagInvAtWA has a length of 1 and is basically zero.

    For ridge regression, i.e. Tikhonov regularization (elasticNet = 0), and any regParam, pValues and the other "normal" statistics can be computed, but for the Lasso method and anything in between (elastic net) they cannot. The same goes for regParam = 0: with any elasticNet, p-values are computed.
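
    As a quick illustration (a sketch, reusing the training DataFrame from the reproduction code below):

    // Ridge (elasticNetParam = 0.0): the CholeskySolver is picked,
    // so p-values are available.
    val ridge = new LinearRegression()
        .setSolver( "normal" ).setRegParam( 0.3 ).setElasticNetParam( 0.0 )
    println( ridge.fit( training ).summary.pValues.mkString( ", " ) )

    // Elastic net (both non-zero): the QuasiNewtonSolver is picked,
    // and accessing pValues throws UnsupportedOperationException.
    val net = new LinearRegression()
        .setSolver( "normal" ).setRegParam( 0.3 ).setElasticNetParam( 0.5 )
    scala.util.Try( net.fit( training ).summary.pValues )
        .failed.foreach( e => println( e.getMessage ) )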

    Why

    LinearRegression uses the Weighted Least Squares optimizer for the "normal" solver, with solverType = WeightedLeastSquares.Auto. This optimizer has two options for the solver algorithm: QuasiNewton or Cholesky. The former is selected only when both regParam and elasticNetParam are non-zero.

    val solver = if (
        ( solverType == WeightedLeastSquares.Auto && 
            elasticNetParam != 0.0 && 
            regParam != 0.0 ) ||
        ( solverType == WeightedLeastSquares.QuasiNewton ) ) {
    
        ...
        new QuasiNewtonSolver(fitIntercept, maxIter, tol, effectiveL1RegFun)
    } else {
        new CholeskySolver
    }
    

    So, with your param grid, the QuasiNewtonSolver will always be used, because there is no combination of regParam and elasticNetParam where either of them is zero.
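
    The selection logic boils down to a simple predicate (usesQuasiNewton is a hypothetical helper written here only to illustrate the condition):

    // For solverType == Auto, the QuasiNewtonSolver is chosen iff both
    // parameters are non-zero; otherwise the CholeskySolver is used.
    def usesQuasiNewton( regParam: Double, elasticNetParam: Double ): Boolean =
        elasticNetParam != 0.0 && regParam != 0.0

    usesQuasiNewton( 0.6, 0.2 ) // true  -> QuasiNewton, no p-values
    usesQuasiNewton( 0.3, 0.0 ) // false -> Cholesky, p-values available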

    And we know that in order to get pValues and the other "normal" statistics, such as the t-statistic or the std. error of coefficients, the diagonal of the matrix (A^T * W * A)^-1 (diagInvAtWA) must not be a vector with just one zero. This condition is set in the definition of pValues.
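
    That check looks roughly like this (paraphrased from the Spark 2.x source; the actual computation in the else branch is omitted):

    lazy val pValues: Array[Double] = {
      if (diagInvAtWA.length == 1 && diagInvAtWA(0) == 0) {
        throw new UnsupportedOperationException(
          "No p-value available for this LinearRegressionModel")
      } else {
        // two-sided p-values derived from tValues and the Student's
        // t-distribution
        ...
      }
    }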

    diagInvAtWA is a vector of the diagonal elements of a packed upper triangular matrix (solution.aaInv).

    val diagInvAtWA = solution.aaInv.map { inv => ...
    

    For the Cholesky solver this matrix is calculated, but for QuasiNewton it is not; the second parameter of NormalEquationSolution is this matrix.

    Technically, you could create your own version of LinearRegression that uses the Cholesky solver explicitly, which is what Solution 2 above does.

    Reproduction

    In this example I used the data sample_linear_regression_data.txt from here.

    Complete code for reproduction:

    import org.apache.spark._
    
    import org.apache.spark.ml.{Pipeline, PipelineModel} 
    import org.apache.spark.ml.evaluation.{RegressionEvaluator, BinaryClassificationEvaluator}
    import org.apache.spark.ml.feature.VectorAssembler 
    import org.apache.spark.ml.regression.{LinearRegressionModel, LinearRegression}
    import org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel, ParamGridBuilder} 
    import org.apache.spark.sql.functions._ 
    import org.apache.spark.sql.{DataFrame, SparkSession}
    import org.apache.spark.ml.param.ParamMap
    
    object Main {
    
        def main( args: Array[ String ] ): Unit = {
    
            val spark =
                SparkSession
                .builder()
                .appName( "SO" )
                .master( "local[*]" )
                .config( "spark.driver.host", "localhost" )
                .getOrCreate()
    
            import spark.implicits._
    
            val data = 
                spark
                .read
                .format( "libsvm" )
                .load( "./sample_linear_regression_data.txt" )
    
            val Array( training, test ) = 
                data
                .randomSplit( Array( 0.9, 0.1 ), seed = 12345 )
    
            val maxIter = 10
    
            val lr = new LinearRegression()
                .setSolver( "normal" )
                .setMaxIter( maxIter )
    
            val paramGrid = new ParamGridBuilder()
                // .addGrid( lr.elasticNetParam, Array( 0.2, 0.4, 0.8, 0.9 ) )
                .addGrid( lr.elasticNetParam, Array( 0.0 ) )
                .addGrid( lr.regParam, Array( 0.6, 0.3, 0.1, 0.01 ) )
                .build()
    
            val pipeline = new Pipeline()
                .setStages( Array( lr ) )
    
            val cv = new CrossValidator()
                .setEstimator( pipeline )
                .setEvaluator( new RegressionEvaluator )
                .setEstimatorParamMaps( paramGrid )
                .setNumFolds( 2 )  // Use 3+ in practice
    
            val cvModel = 
                cv
                .fit( training )
    
            val pipelineModel: PipelineModel = 
                cvModel
                .bestModel
                .asInstanceOf[ PipelineModel ]
    
            val lrModel: LinearRegressionModel = 
                pipelineModel
                .stages( 0 )
                .asInstanceOf[ LinearRegressionModel ]
    
            // Technically there is a way to use exact ParamMap
            // to build a new LR but for the simplicity I'll 
            // get and set them explicitly
    
            // lrModel.params.foreach( ( param ) => {
    
            //     println( param )
            // } )
    
            // val bestLr = new LinearRegression()
            //     .setSolver( "normal" )
            //     .setMaxIter( maxIter )
            //     .setRegParam( lrModel.getRegParam )
            //     .setElasticNetParam( lrModel.getElasticNetParam )
    
            // val bestLrModel = bestLr.fit( training )
    
            val modelSummary = 
                lrModel
                .summary
    
            println( "lrModel pValues: " + modelSummary.pValues.mkString( ", " ) )
    
            spark.stop()
        }
    }
    

    Original

    There are three solver algorithms available: auto, normal, and l-bfgs (cf. the supportedSolvers array above).

    coefficientStandardErrors, tValues and pValues are only available when using the "normal" solver, because they are all based on diagInvAtWA, the diagonal of the matrix (A^T * W * A)^-1.
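
    As a usage sketch (assuming lrModel was fit on the Cholesky path), all three can be read off the training summary:

    val s = lrModel.summary
    println( "std. errors: " + s.coefficientStandardErrors.mkString( ", " ) )
    println( "t-values:    " + s.tValues.mkString( ", " ) )
    println( "p-values:    " + s.pValues.mkString( ", " ) )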