我正在尝试根据某些分隔符或正则表达式模式拆分列。 这是输入文件。
c1 c2
jan$100 u1*21/10/15
feb$200 u2*30/12/15
mar$300$400 u1*26/2/16
无法找到错误原因。这是原始代码。
override def execute(dataFrames: List[DataFrame], sc: SparkContext, sqlContext: SQLContext, params: String, productId: Int) : List[DataFrame] = {
val explodeData = parse(params).extractOpt[ExplodeDataSchema].get
logger.info(s"EXPLODE - explodeData --> $explodeData")
val inputDf = dataFrames.head
val outputDfList = explodeData.explodeExpr flatMap{
explodeItem =>
explodeItem.outputCol map{
outputColItem =>
logger.info("(split("+explodeItem.inputCol+", '"+explodeItem.delimeter+"'))["+outputColItem.index+"]")
(outputColItem.outputCol,inputDf.select(expr("(split("+explodeItem.inputCol+", '"+explodeItem.delimeter+"'))["+outputColItem.index+"]").cast("string").as(outputColItem.outputCol)))
}
}
val outCol = outputDfList map{
x => x._1
}
logger.info(outCol.toString)
var outputDf = inputDf
logger.info(outputDf.columns.toList.toString)
logger.info(outputDf.schema.toString)
for(i <- outputDfList){
logger.info(i._1)
logger.info(i._2.columns.toList.toString)
logger.info(i._2.schema.toString)
//error comes in this line
outputDf = outputDf.withColumn(i._1, i._2.col(i._1))
}
List(outputDf)
}
}
这些是相关的日志声明。
17/02/06 10:21:10 INFO Explode$: EXPLODE - explodeData --> ExplodeDataSchema(List(SchemaDetectData(s3n://com.tookitaki.dev.product-2/uploads/generic_data/explodedata3.csv,None,None)),List(ExplodeExpr(c1,\$,List(OutputExpr(0,month), OutputExpr(1,sales), OutputExpr(2,sales2))), ExplodeExpr(c2,\*,List(OutputExpr(0,user), OutputExpr(1,date)))))
17/02/06 10:21:10 INFO Explode$: (split(c1, '\$'))[0]
17/02/06 10:21:10 INFO Explode$: (split(c1, '\$'))[1]
17/02/06 10:21:10 INFO Explode$: (split(c1, '\$'))[2]
17/02/06 10:21:11 INFO Explode$: (split(c2, '\*'))[0]
17/02/06 10:21:11 INFO Explode$: (split(c2, '\*'))[1]
17/02/06 10:21:11 INFO Explode$: List(month, sales, sales2, user, date)
17/02/06 10:21:11 INFO Explode$: List(c1, c2)
17/02/06 10:21:11 INFO Explode$: StructType(StructField(c1,StringType,true), StructField(c2,StringType,true))
17/02/06 10:21:11 INFO Explode$: month
17/02/06 10:21:11 INFO Explode$: List(month)
17/02/06 10:21:11 INFO Explode$: StructType(StructField(month,StringType,true))
17/02/06 10:21:11 ERROR Main$: resolved attribute(s) month#2 missing from c1#0,c2#1 in operator !Project [c1#0,c2#1,month#2 AS month#7];
org.apache.spark.sql.AnalysisException: resolved attribute(s) month#2 missing from c1#0,c2#1 in operator !Project [c1#0,c2#1,month#2 AS month#7];
at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$class.failAnalysis(CheckAnalysis.scala:38)
at org.apache.spark.sql.catalyst.analysis.Analyzer.failAnalysis(Analyzer.scala:44)
at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1.apply(CheckAnalysis.scala:183)
at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1.apply(CheckAnalysis.scala:50)
at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:121)
at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$class.checkAnalysis(CheckAnalysis.scala:50)
at org.apache.spark.sql.catalyst.analysis.Analyzer.checkAnalysis(Analyzer.scala:44)
at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:34)
at org.apache.spark.sql.DataFrame.<init>(DataFrame.scala:133)
at org.apache.spark.sql.DataFrame.org$apache$spark$sql$DataFrame$$withPlan(DataFrame.scala:2126)
at org.apache.spark.sql.DataFrame.select(DataFrame.scala:707)
at org.apache.spark.sql.DataFrame.withColumn(DataFrame.scala:1188)
at com.tookitaki.aip.transformation.operations.Explode$$anonfun$execute$1.apply(Explode.scala:67)
at com.tookitaki.aip.transformation.operations.Explode$$anonfun$execute$1.apply(Explode.scala:62)
at scala.collection.immutable.List.foreach(List.scala:318)
at com.tookitaki.aip.transformation.operations.Explode$.execute(Explode.scala:62)
at com.tookitaki.aip.transformation.ExecuteTransformationJob$.run(ExecuteTransformationJob.scala:27)
at com.tookitaki.aip.transformation.Main$.processTransformationJobs(Main.scala:111)
at com.tookitaki.aip.transformation.Main$.main(Main.scala:61)
at com.tookitaki.aip.transformation.Main.main(Main.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.worker.DriverWrapper$.main(DriverWrapper.scala:58)
at org.apache.spark.deploy.worker.DriverWrapper.main(DriverWrapper.scala)
17/02/06 10:21:11 INFO Main$: Job failed
请帮助删除错误。