Question

我是 Spark Streaming 的初学者，正在尝试使用 Scala 实现流式线性回归示例。我在搜索时发现了很多使用 RDD 的流式机器学习算法示例。但是，是否可以使用 Dataset（在 Spark 2.0.1 中引入）来代替 RDD 进行流式处理？有没有办法验证代码使用的是 RDD 还是 Dataset？我已经在下面贴出了我的代码。非常感谢任何帮助。

import scala.language.reflectiveCalls
import scopt.OptionParser
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.sql.{ DataFrame, SparkSession }
import com.sun.xml.internal.ws.wsdl.writer.document.Import
import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import java.lang.Boolean

/**
 * Command-line example that fits a [[LinearRegression]] model with elastic-net
 * regularization on a training DataFrame and prints evaluation results for both
 * the training and the test split.
 *
 * Note: the training below goes through the DataFrame-based `spark.ml` API and is
 * a one-shot batch fit. A [[StreamingContext]] is created, but no DStream is ever
 * defined on it and it is never started, so nothing in this program is streamed.
 */
object LinearRegressionExample {

  /**
   * Command-line parameters of the example.
   *
   * @param input           input path to labeled examples (required positional argument,
   *                        so the `null` default is never observed after a successful parse)
   * @param testInput       input path to the test dataset; if given, `fracTest` is ignored
   * @param dataFormat      data format of the input, "libsvm" by default
   * @param regParam        regularization parameter
   * @param elasticNetParam elastic-net mixing parameter (0 = L2 penalty, 1 = L1 penalty)
   * @param maxIter         maximum number of iterations
   * @param tol             convergence tolerance of the iterations
   * @param fracTest        fraction of data to hold out for testing
   */
  case class Params(
      input: String = null,
      testInput: String = "",
      dataFormat: String = "libsvm",
      regParam: Double = 0.0,
      elasticNetParam: Double = 0.0,
      maxIter: Int = 100,
      tol: Double = 1E-6,
      fracTest: Double = 0.2) extends AbstractParams[Params]

  /**
   * Parses the command line into [[Params]] and runs the example.
   * Exits the JVM with status 1 when the arguments cannot be parsed.
   *
   * @param args raw command-line arguments
   */
  def main(args: Array[String]): Unit = {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("LinearRegressionExample") {
      head("LinearRegressionExample: an example Linear Regression with Elastic-Net app.")
      opt[Double]("regParam")
        .text(s"regularization parameter, default: ${defaultParams.regParam}")
        .action((x, c) => c.copy(regParam = x))
      opt[Double]("elasticNetParam")
        .text(s"ElasticNet mixing parameter. For alpha = 0, the penalty is an L2 penalty. " +
          s"For alpha = 1, it is an L1 penalty. For 0 < alpha < 1, the penalty is a combination of " +
          s"L1 and L2, default: ${defaultParams.elasticNetParam}")
        .action((x, c) => c.copy(elasticNetParam = x))
      opt[Int]("maxIter")
        .text(s"maximum number of iterations, default: ${defaultParams.maxIter}")
        .action((x, c) => c.copy(maxIter = x))
      opt[Double]("tol")
        .text(s"the convergence tolerance of iterations, Smaller value will lead " +
          s"to higher accuracy with the cost of more iterations, default: ${defaultParams.tol}")
        .action((x, c) => c.copy(tol = x))
      opt[Double]("fracTest")
        .text(s"fraction of data to hold out for testing. If given option testInput, " +
          s"this option is ignored. default: ${defaultParams.fracTest}")
        .action((x, c) => c.copy(fracTest = x))
      opt[String]("testInput")
        .text(s"input path to test dataset. If given, option fracTest is ignored." +
          s" default: ${defaultParams.testInput}")
        .action((x, c) => c.copy(testInput = x))
      opt[String]("dataFormat")
        .text("data format: libsvm (default), dense (deprecated in Spark v1.1)")
        .action((x, c) => c.copy(dataFormat = x))
      arg[String]("<input>")
        .text("input path to labeled examples")
        .required()
        .action((x, c) => c.copy(input = x))
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case None         => sys.exit(1)
    }
  }

  /**
   * Trains a linear regression model with the given parameters, prints the training
   * time, the learned coefficients and intercept, and the evaluation results on the
   * training and test data, then shuts Spark down.
   *
   * @param params parsed command-line parameters
   */
  def run(params: Params): Unit = {
    // The `s` interpolator is required here; without it the application name would
    // contain the literal text "$params".
    val appName = s"LinearRegressionExample with $params"
    val conf = new SparkConf().setMaster("local[2]").setAppName(appName)

    // Creating the StreamingContext also creates the SparkContext (local[2]) that the
    // SparkSession below picks up through getOrCreate(). No stream is defined on it.
    val ssc = new StreamingContext(conf, Seconds(1))
    val spark = SparkSession
      .builder
      .appName(appName)
      .getOrCreate()

    try {
      println(s"LinearRegressionExample with parameters:\n$params")

      // Load training and test data.
      // NOTE(review): DecisionTreeExample is defined outside this file; presumably it
      // returns a (training, test) pair of DataFrames with "features"/"label" columns.
      val (training: DataFrame, test: DataFrame) = DecisionTreeExample.loadDatasets(
        params.input, params.dataFormat, params.testInput, "regression", params.fracTest)

      val lir = new LinearRegression()
        .setFeaturesCol("features")
        .setLabelCol("label")
        .setRegParam(params.regParam)
        .setElasticNetParam(params.elasticNetParam)
        .setMaxIter(params.maxIter)
        .setTol(params.tol)

      // Train the model and measure wall-clock training time in seconds.
      val startTime = System.nanoTime()
      val lirModel = lir.fit(training)
      val elapsedTime = (System.nanoTime() - startTime) / 1e9
      println(s"Training time: $elapsedTime seconds")

      // Print the weights and intercept for linear regression.
      println(s"Weights: ${lirModel.coefficients} Intercept:     ${lirModel.intercept}")

      println("Training data results:")
      DecisionTreeExample.evaluateRegressionModel(lirModel, training, "label")
      println("Test data results:")
      DecisionTreeExample.evaluateRegressionModel(lirModel, test, "label")
    } finally {
      // Release resources even when loading, training or evaluation fails.
      // The streaming context is marked stopped first; the session then stops the
      // shared SparkContext.
      try ssc.stop(stopSparkContext = false)
      finally spark.stop()
    }
  }
}

Spark Streaming 2.0.1 - 数据集和RDD

0 个答案: