I've run into a problem trying to create a DataFrame in a Scala application I'm writing.
The problem shows up when compiling the Scala: the error says that toDF is not a member of RDD. I've seen answers suggesting that the case class definition be moved out of main and that the implicits be imported after the sqlContext declaration, but even that doesn't work for me.
This is what I have so far:
import scala.collection.mutable.ArrayBuffer
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
import org.apache.spark.sql._

object ErrorParser {

  case class Error(time: String, status: String, statusType: String, host: String, message: String)

  def splitError(line: String): Array[String] = {
    var array: Array[String] = new Array[String](5)
    ...
    return array
  }

  def filterErrors(errors: Array[Array[String]]): Array[Array[String]] = {
    var filteredErrors = ArrayBuffer[Array[String]]()
    ...
    return filteredErrors.toArray
  }

  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("ErrorParserAPI")
    val sc = new SparkContext(conf)
    val sqlContext = new org.apache.spark.sql.SQLContext(sc)
    import sqlContext.implicits._

    var logs = sc.textFile("hdfs://hadoop-master:9000/logs/data/logs/server.*")
    var errors = logs.filter(line => line.contains("ERROR"))
    val errors1 = errors.map(line => splitError(line))
    val filteredErrors = filterErrors(errors1.collect)
    val dfErrors = filteredErrors.map(p => Error(p(0).split(":")(0) + ":" + p(0).split(":")(1), p(1), p(2), p(3), p(4)))
    val filteredRDD = sc.parallelize(dfErrors)
    var errorDF = filteredRDD.toDF()

    errorDF.write.json("hdfs://hadoop-master:9000/results/errorParserResult")
  }
}
I'm stumped, because in the spark-shell exactly this kind of thing works.
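To illustrate what I mean, this is roughly the kind of session that works for me in the 1.6 spark-shell, where sc and sqlContext already exist (the Line case class is just a throwaway example, not from my application):

import sqlContext.implicits._

// a throwaway case class defined straight in the shell
case class Line(status: String, message: String)

// toDF() resolves fine here thanks to the imported implicits
val df = sc.parallelize(Seq(Line("ERROR", "disk full"))).toDF()
df.show()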
I've also seen answers suggesting converting the RDD to an instance of RDD[Row] and then using
sc.createDataFrame(rdd, scheme)
but I can't figure out how I would go about doing that.
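From reading around, my best guess at what that approach would look like is the sketch below (untested; sc, sqlContext and dfErrors are the ones from my code above, and the schema field names just mirror my Error case class):

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StringType, StructField, StructType}

// turn the parsed errors into an RDD[Row] instead of relying on toDF()
val rowRDD = sc.parallelize(dfErrors).map(e =>
  Row(e.time, e.status, e.statusType, e.host, e.message))

// describe every column explicitly; all fields here are plain strings
val schema = StructType(Seq(
  StructField("time", StringType, nullable = true),
  StructField("status", StringType, nullable = true),
  StructField("statusType", StringType, nullable = true),
  StructField("host", StringType, nullable = true),
  StructField("message", StringType, nullable = true)))

// createDataFrame takes the RDD[Row] plus the schema and does not need the implicits
val errorDF = sqlContext.createDataFrame(rowRDD, schema)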
Any help is greatly appreciated!
This is my .sbt file:
name := "ErrorParserAPI"
version := "1.0"
scalaVersion := "2.11.7"
libraryDependencies ++= Seq(
  "org.apache.spark" % "spark-core_2.10" % "1.6.1",
  "org.apache.spark" % "spark-sql_2.10" % "1.6.1"
)
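For reference, the same dependencies can also be written with sbt's %% operator, which appends the Scala binary version from scalaVersion to the artifact names automatically; this is just an alternative way of writing that part of the build file, not what my project currently uses:

// %% makes the artifact suffix (_2.11 here, from scalaVersion) follow the project's Scala version
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % "1.6.1",
  "org.apache.spark" %% "spark-sql" % "1.6.1"
)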
EDIT: typo
Answer 0 (score: 1)
I just copied your code and pasted it into my Eclipse, and it worked fine without any compilation errors. If you are using Eclipse, you could try cleaning and refreshing the project.
import scala.Array.canBuildFrom
import scala.collection.mutable.ArrayBuffer
import scala.reflect.runtime.universe

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object ErrorParser {

  def filterErrors(errors: Array[Array[String]]): Array[Array[String]] = {
    var filteredErrors = ArrayBuffer[Array[String]]()
    return filteredErrors.toArray
  }

  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("ErrorParserAPI")
    val sc = new SparkContext(conf)
    val sqlContext = new org.apache.spark.sql.SQLContext(sc)
    import sqlContext.implicits._

    var logs = sc.textFile("hdfs://hadoop-master:9000/logs/data/logs/server.*")
    var errors = logs.filter(line => line.contains("ERROR"))
    val errors1 = errors.map(line => splitError(line))
    val filteredErrors = filterErrors(errors1.collect)
    val dfErrors = filteredErrors.map(p => Error(p(0).split(":")(0) + ":" + p(0).split(":")(1), p(1), p(2), p(3), p(4)))
    val filteredRDD = sc.parallelize(dfErrors)
    var errorDF = filteredRDD.toDF()
  }

  case class Error(time: String, status: String, statusType: String, host: String, message: String)

  def splitError(line: String): Array[String] = {
    var array: Array[String] = new Array[String](5)
    return array
  }
}