Spark 1.6.1: creating a DataFrame from an RDD[Array[Error]]

Asked: 2016-05-12 13:47:36

Tags: scala apache-spark apache-spark-sql spark-dataframe

I've run into a problem while trying to create a DataFrame in a Scala application I'm writing.

The problem is a Scala compile error saying that toDF is not a member of the RDD. I've seen answers suggesting that the case class be defined outside of main and that the implicits be imported after the sqlContext declaration, but even that doesn't work for me.

This is what I have so far:

import scala.collection.mutable.ArrayBuffer
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
import org.apache.spark.sql._

object ErrorParser {

    case class Error(time: String, status: String, statusType: String, host: String, message: String)

    def splitError(line: String) : Array[String] = {

        var array:Array[String] = new Array[String](5)

        ...

        return array

    }

    def filterErrors(errors: Array[Array[String]]) : Array[Array[String]] = {

        var filteredErrors = ArrayBuffer[Array[String]]()

        ...

        return filteredErrors.toArray
    }

    def main(args: Array[String]) {

        val conf = new SparkConf().setAppName("ErrorParserAPI")
        val sc = new SparkContext(conf)

        val sqlContext = new org.apache.spark.sql.SQLContext(sc)
        import sqlContext.implicits._

        var logs = sc.textFile("hdfs://hadoop-master:9000/logs/data/logs/server.*")
        var errors = logs.filter(line => line.contains("ERROR"))

        val errors1 = errors.map(line => splitError(line))
        val filteredErrors = filterErrors(errors1.collect)

        val dfErrors = filteredErrors.map(p => Error(p(0).split(":")(0) + ":" + p(0).split(":")(1), p(1), p(2), p(3), p(4)))
        val filteredRDD = sc.parallelize(dfErrors)
        var errorDF = filteredRDD.toDF()

        errorDF.write.json("hdfs://hadoop-master:9000/results/errorParserResult")

   }

}

I'm stumped, because things like this work fine in the spark-shell.

I've also seen answers suggesting changing the RDD into an RDD[Row] and then using

sc.createDataFrame(rdd, scheme)

but I can't figure out how I would go about doing that.
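I imagine it would look something like the sketch below (the schema fields and names are just my own guess, mirroring the Error case class), but I'm not sure this is the right way to do it:

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StructType, StructField, StringType}

// explicit schema mirroring the Error case class (all columns as strings)
val schema = StructType(Seq(
    StructField("time", StringType, true),
    StructField("status", StringType, true),
    StructField("statusType", StringType, true),
    StructField("host", StringType, true),
    StructField("message", StringType, true)
))

// turn each Array[String] into a Row, in the same column order as the schema
val rowRDD = sc.parallelize(filteredErrors).map(p => Row(p(0), p(1), p(2), p(3), p(4)))

// in Spark 1.6, createDataFrame is a method on the SQLContext, not on the SparkContext
val errorDF = sqlContext.createDataFrame(rowRDD, schema)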

Any help is greatly appreciated!

This is my .sbt file:

name := "ErrorParserAPI"
version := "1.0"
scalaVersion := "2.11.7"
libraryDependencies ++= Seq(
        "org.apache.spark" % "spark-core_2.10" % "1.6.1",
        "org.apache.spark" % "spark-sql_2.10" % "1.6.1"
)
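
(I'm not sure whether it matters that scalaVersion is 2.11.7 while the Spark dependencies point at the _2.10 artifacts; I've also seen the dependencies written with %% so that sbt picks the artifact matching scalaVersion, roughly like this:)

// %% appends the Scala binary version automatically instead of hard-coding _2.10
libraryDependencies ++= Seq(
        "org.apache.spark" %% "spark-core" % "1.6.1",
        "org.apache.spark" %% "spark-sql" % "1.6.1"
)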

EDIT: typo

1 Answer:

Answer 0 (score: 1):

I just copied your code and pasted it into my Eclipse, and it works fine without any compilation errors. If you are using Eclipse, you could try cleaning and refreshing the project.

import scala.Array.canBuildFrom
import scala.collection.mutable.ArrayBuffer
import scala.reflect.runtime.universe

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object ErrorParser {


  def filterErrors(errors: Array[Array[String]]): Array[Array[String]] = {

    var filteredErrors = ArrayBuffer[Array[String]]()

    return filteredErrors.toArray
  }

  def main(args: Array[String]) {



    val conf = new SparkConf().setAppName("ErrorParserAPI")
    val sc = new SparkContext(conf)

    val sqlContext = new org.apache.spark.sql.SQLContext(sc)
    import sqlContext.implicits._

    var logs = sc.textFile("hdfs://hadoop-master:9000/logs/data/logs/server.*")
    var errors = logs.filter(line => line.contains("ERROR"))

    val errors1 = errors.map(line => splitError(line))
    val filteredErrors = filterErrors(errors1.collect)

    val dfErrors = filteredErrors.map(p => Error(p(0).split(":")(0) + ":" + p(0).split(":")(1), p(1), p(2), p(3), p(4)))
    val filteredRDD = sc.parallelize(dfErrors)
    var errorDF = filteredRDD.toDF()
  }

  case class Error(time: String, status: String, statusType: String, host: String, message: String)

  def splitError(line: String): Array[String] = {

    var array: Array[String] = new Array[String](5)

    return array

  }
}