I want to extract some fields from a CSV file and put them into a JSON file. If AdditionalIdentifier does not contain any value, I want to use an empty array, i.e. [], but when saving to the JSON file I get null inserted between the brackets, i.e. [null]. I tried the otherwise option with when, but it shows a data type mismatch error. Can anybody help me?

While selecting the fields I tried to check whether a field's value is null, and if not, create an array with some values, otherwise an empty array, but it does not work. Pasting the error I get below:
Exception in thread "main" org.apache.spark.sql.AnalysisException: cannot resolve 'CASE WHEN (`ISIN` IS NOT NULL) THEN named_struct('IdentifierType', 'ISIN', 'IdentifierValue', `ISIN`) ELSE [] END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type;;
'Project [ObservationID#30, PublicationDate#11, substring(ObservationDate#37, 1, 10) AS ObservationDate#72, ObservationType#32, RIC#10, array(CASE WHEN isnotnull(ISIN#28) THEN named_struct(IdentifierType, ISIN, IdentifierValue, ISIN#28) ELSE [] END) AS AdditionalIdentifier#75]
+- Relation[RIC#10,PublicationDate#11,Type#12,Price#13,Volume#14,Qualifiers#15,SeqNo#16,ExchangeTime#17,ExchangeDate#18,AccVolume#19,OriginalExchangeDate#20,OriginalPrice#21,OriginalVolume#22,OriginalSeqNo#23,OriginalExchangeTime#24,TradePriceCurrency#25,TradeID#26,OriginalTradeID#27,ISIN#28,GMTOffset#29,ObservationID#30,OriginalObservationDate#31,ObservationType#32,TradeEventType#33,... 7 more fields] csv
at org.apache.spark.sql.catalyst.analysis.package$AnalysisErrorAt.failAnalysis(package.scala:42)
at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1$$anonfun$apply$3.applyOrElse(CheckAnalysis.scala:115)
at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1$$anonfun$apply$3.applyOrElse(CheckAnalysis.scala:107)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:278)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:278)
at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:70)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:277)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:275)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:275)
at org.apache.spark.sql.catalyst.trees.TreeNode.org$apache$spark$sql$catalyst$trees$TreeNode$$mapChild$2(TreeNode.scala:295)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4$$anonfun$apply$13.apply(TreeNode.scala:354)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.AbstractTraversable.map(Traversable.scala:104)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:354)
at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:187)
at org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:324)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:275)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:275)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:275)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:326)
at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:187)
at org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:324)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:275)
at org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$transformExpressionsUp$1.apply(QueryPlan.scala:93)
at org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$transformExpressionsUp$1.apply(QueryPlan.scala:93)
at org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$1.apply(QueryPlan.scala:105)
at org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$1.apply(QueryPlan.scala:105)
at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:70)
at org.apache.spark.sql.catalyst.plans.QueryPlan.transformExpression$1(QueryPlan.scala:104)
at org.apache.spark.sql.catalyst.plans.QueryPlan.org$apache$spark$sql$catalyst$plans$QueryPlan$$recursiveTransform$1(QueryPlan.scala:116)
at org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$org$apache$spark$sql$catalyst$plans$QueryPlan$$recursiveTransform$1$2.apply(QueryPlan.scala:121)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.AbstractTraversable.map(Traversable.scala:104)
at org.apache.spark.sql.catalyst.plans.QueryPlan.org$apache$spark$sql$catalyst$plans$QueryPlan$$recursiveTransform$1(QueryPlan.scala:121)
at org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$2.apply(QueryPlan.scala:126)
at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:187)
at org.apache.spark.sql.catalyst.plans.QueryPlan.mapExpressions(QueryPlan.scala:126)
at org.apache.spark.sql.catalyst.plans.QueryPlan.transformExpressionsUp(QueryPlan.scala:93)
at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1.apply(CheckAnalysis.scala:107)
at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1.apply(CheckAnalysis.scala:85)
at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:127)
at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$class.checkAnalysis(CheckAnalysis.scala:85)
at org.apache.spark.sql.catalyst.analysis.Analyzer.checkAnalysis(Analyzer.scala:95)
at org.apache.spark.sql.catalyst.analysis.Analyzer$$anonfun$executeAndCheck$1.apply(Analyzer.scala:108)
at org.apache.spark.sql.catalyst.analysis.Analyzer$$anonfun$executeAndCheck$1.apply(Analyzer.scala:105)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.markInAnalyzer(AnalysisHelper.scala:201)
at org.apache.spark.sql.catalyst.analysis.Analyzer.executeAndCheck(Analyzer.scala:105)
at org.apache.spark.sql.execution.QueryExecution.analyzed$lzycompute(QueryExecution.scala:57)
at org.apache.spark.sql.execution.QueryExecution.analyzed(QueryExecution.scala:55)
at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:47)
at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:79)
at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$withPlan(Dataset.scala:3407)
at org.apache.spark.sql.Dataset.select(Dataset.scala:1335)
at com.tr.edp.bulk.frtb.processor.SvdsFileProcessor.process(SvdsFileProcessor.scala:58)
at com.tr.edp.bulk.frtb.executer.SVDSFileRunner$.main(SVDSFileRunner.scala:28)
at com.tr.edp.bulk.frtb.executer.SVDSFileRunner.main(SVDSFileRunner.scala)
Here is my code:
val result = svdsFile.select(
  col("ObservationID"),
  col("PublicationDate"),
  col("ObservationDate").substr(1, 10).alias("ObservationDate"), // extracting the required fields
  col("ObservationType"),
  col("RIC"),
  array(
    when(col("ISIN").isNotNull,
      struct(lit("ISIN").alias("IdentifierType"),
             col("ISIN").alias("IdentifierValue")))
      .otherwise(Array.empty[Long]))
    .alias("AdditionalIdentifier"))
If I remove the otherwise part, this is the output I get in the AdditionalIdentifier field:
{
"ObservationID": "6B86B27355FF34375747F56ASD4567GH",
"PublicationDate": "8/8/2018",
"ObservationDate": "08/08/2018",
"ObservationType": "Trade",
"RIC": "AAME.gF",
"AdditionalIdentifier": [{
"IdentifierType": "ISIN",
"IdentifierValue": "hdsjdjs"
}]
} {
"ObservationID": "4G5FD6HSUR48HFEDKH5HFSFHFG8888",
"PublicationDate": "12/18/2018",
"ObservationDate": "8/9/201812",
"ObservationType": "Mkt. Condition",
"RIC": "AAME.gF",
"AdditionalIdentifier": [{
"IdentifierType": "ISIN",
"IdentifierValue": "hkcsdhsdhc"
}]
} {
"ObservationID": "8FGH45JKGH78HG90BHUIDF4GH56FYT",
"PublicationDate": "12/18/2018",
"ObservationDate": "8/10/20181",
"ObservationType": "Trade",
"RIC": "gAOI.DF",
"AdditionalIdentifier": [null]
} {
"ObservationID": "8GH56FGHR798HFGRTY465HFJ89HG74",
"PublicationDate": "12/18/2018",
"ObservationDate": "8/11/20181",
"ObservationType": "Market Price",
"RIC": "gIIQ.DF",
"AdditionalIdentifier": [{
"IdentifierType": "ISIN",
"IdentifierValue": "sddvgsdag"
}]
} {
"ObservationID": "8DFG73HRR7HDHD7HEJF7HDE7HEH66",
"PublicationDate": "12/18/2018",
"ObservationDate": "8/12/20181",
"ObservationType": "Market Price",
"RIC": "ggNE.DF",
"AdditionalIdentifier": [null]
}
There should not be a null between the brackets; if the value is empty, it should show as [].
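From the error message it looks like the THEN branch produces a struct while my otherwise branch (Array.empty[Long]) is an array of longs, so Spark cannot coerce them to a common type. Is the fix to move array(...) inside the when and give the otherwise branch an empty array cast to the same element type? Something like this (an untested sketch, reusing my svdsFile and column names from above):

import org.apache.spark.sql.functions._

// DDL string for the array type the THEN branch produces, so the
// empty array in the ELSE branch can be cast to the exact same type.
val idArrayType = "array<struct<IdentifierType:string,IdentifierValue:string>>"

val result = svdsFile.select(
  col("ObservationID"),
  col("PublicationDate"),
  col("ObservationDate").substr(1, 10).alias("ObservationDate"),
  col("ObservationType"),
  col("RIC"),
  when(col("ISIN").isNotNull,
      // non-null ISIN: one-element array holding the identifier struct
      array(struct(lit("ISIN").alias("IdentifierType"),
                   col("ISIN").alias("IdentifierValue"))))
    // null ISIN: empty array cast to the matching struct element type
    .otherwise(array().cast(idArrayType))
    .alias("AdditionalIdentifier"))

My understanding is that the [null] I see without otherwise happens because rows where ISIN is null make the when expression evaluate to null, and wrapping that in array(...) serializes as [null].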