我在 Scala 脚本中编写了一个简单的 ETL 过程,并用 `spark-shell -i rawetl.scala` 运行,但是得到了异常“Table not found BaseTable”。我也检查过输入文件,文件是被正确选取的。
以下是示例代码
// Fixed: both lines were missing a closing parenthesis and would not compile.
// Split each input line on commas, producing an RDD of string arrays.
val logData: RDD[Array[String]] = sc.textFile("/path/data.txt", 2).map(line => line.split(","))
// Key each record by its first field, pairing it with the 2nd and 3rd columns.
logData.map(a => (a.head, (a(1), a(2))))
注意:如果我在 spark shell 中(不通过脚本)逐条运行下面的命令,我能得到正确的输出,没有任何异常。
import java.io.File
import sqlContext.implicits._
import scala.io.Source
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
import org.apache.spark.sql.SQLContext
import sys.process._
/** One row of the base page table parsed from the raw NHT feed; every column is kept as a raw string. */
case class pageRow(
  affiliateid: String,
  pageurl: String,
  alertedbrandsafety: String,
  blockedbrandsafety: String,
  grossimpressions: String,
  spider_bot: String,
  invalidbrowser: String,
  outlieractivity: String,
  day: String)
object batch_nht {
  def main() {
    processRawNHT()
  }

  /**
   * Reads the newest raw file from HDFS, projects selected columns into
   * pageRow records, registers them as a temp table and dumps the contents.
   */
  def processRawNHT() {
    // Locate the newest file under the raw directory via the hadoop CLI.
    val rawFile = "hadoop fs -ls /tmp/XXX/rawDB/" #| "tail -1" !!
    val fileName = rawFile.substring(rawFile.indexOf("/"))
    val filePathName = "hdfs://AAAAA:8020" + fileName.trim()
    println(filePathName)
    // Fixed: do NOT construct a second SparkContext inside `spark-shell -i` —
    // the shell already owns one, and two active contexts in a JVM is not
    // supported; the table registered on one context is invisible to the other
    // ("Table not found"). getOrCreate reuses the shell's context.
    val sc = SparkContext.getOrCreate(new SparkConf().setAppName("analyzeBlog"))
    val sqlContext = new org.apache.spark.sql.SQLContext(sc)
    // toDF() needs the implicits of THIS sqlContext instance; a top-of-script
    // `import sqlContext.implicits._` cannot resolve before it exists.
    import sqlContext.implicits._
    // Split on commas and strip embedded double quotes from every field.
    val inviewraw = sc.textFile(filePathName).map(_.split(",")).map(x => x.map(_.replace("\"", "")))
    // NOTE(review): both branches of the original if/else on r(13) produced the
    // same tuple, so the branch was dead — collapsed to a single projection.
    // Confirm whether field 13 was meant to select a different column set.
    val base_people = inviewraw.map(r => (r(5), r(32), r(48), r(49), r(14), r(71), r(72), r(73), r(0)))
    val logs_base_page_schemaRDD = base_people.map(p => pageRow(p._1, p._2, p._3, p._4, p._5, p._6, p._7, p._8, p._9)).toDF()
    logs_base_page_schemaRDD.registerTempTable("baseTable")
    sqlContext.sql("select * from baseTable").collect().foreach(println)
  }
}
// Script entry point: executed immediately when the file is run with `spark-shell -i`.
batch_nht.main()
请说明脚本中出了什么问题?
答案 0(得分:1)
以下是经过测试的代码片段。您原来的写法不适用于以编程方式(programmatic)使用 Scala 操作 Spark 的场景。
import java.io.File
import sqlContext.implicits._
import scala.io.Source
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
import org.apache.spark.sql.SQLContext
import sys.process._
import org.apache.spark.sql.Row;
import org.apache.spark.sql.types.{StructType,StructField,StringType};
object batch_nht {
  def main() {
    processRawNHT()
  }

  /**
   * Loads the newest pipe-delimited cards file from HDFS, applies an explicit
   * three-column string schema, registers it as a temp table and prints the
   * first ten rows. Relies on the `sc` provided by spark-shell.
   */
  def processRawNHT() {
    // Take the last entry of the directory listing — i.e. the newest file.
    val listing = "hadoop fs -ls /user/cloudera/cards/" #| "tail -1" !!
    val hdfsPath = listing.substring(listing.indexOf("/"))
    val filePathName = "hdfs://quickstart.cloudera:8020" + hdfsPath.trim()
    println(filePathName)

    val schemaString = "color|suit|pip"
    val sqlContext = new org.apache.spark.sql.SQLContext(sc)

    // One nullable string column per pipe-separated field name.
    val fields = schemaString.split("\\|").map { name =>
      StructField(name, StringType, nullable = true)
    }
    val schema = StructType(fields)

    // Each line becomes a Row of its first three pipe-separated values.
    val deck = sc.textFile(filePathName).map(_.split("\\|"))
    val base_deckRDD = deck.map { cols => Row(cols(0), cols(1), cols(2)) }
    val cardsDataFrame = sqlContext.createDataFrame(base_deckRDD, schema)

    cardsDataFrame.registerTempTable("deck_of_cards")
    val firstTen = sqlContext.sql("select * from deck_of_cards limit 10")
    firstTen.map(row => (row(0), row(1), row(2))).collect().foreach(println)
  }
}
// Script entry point: executed immediately when the file is run with `spark-shell -i`.
batch_nht.main()