I am building a map-of-structs column ("DEFINITION") by querying a Hive table and merging its columns. I then group these records by the id column to build a map for those ids. Later this gets joined to other DataFrames before being written back to a Hive table.
import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql._
import org.apache.spark.sql.types.{StructType, StructField, StringType, MapType, ArrayType, LongType}
import scala.collection.Map
import scala.collection.JavaConversions._
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.spark.rdd.RDD
val eschema = new StructType(Array(
  StructField("id", LongType, nullable = false),
  StructField("DEFINITION", MapType(StringType, StructType(List(
    StructField("a", LongType, true),
    StructField("b", StringType, true),
    StructField("c", StringType, true),
    StructField("d", StringType, true),
    StructField("e", StringType, true),
    StructField("f", StringType, true),
    StructField("g", StringType, true),
    StructField("h", StringType, true),
    StructField("i", StringType, true),
    StructField("j", StringType, true),
    StructField("k", StringType, true)
  ))))
))
val etrans = sqlContext.sql("""
  select id,
         map(table.col1, named_struct(
           "a", table.col2, "b", table.col3, "c", table.col4,
           "d", table.col5, "e", table.col6, "f", table.col7,
           "g", table.col8, "h", table.col9, "i", table.col10,
           "j", table.col11, "k", table.col12)) AS DEFINITION
  from table""")
val aggregatedRdd: RDD[Row] = etrans.rdd
  .groupBy(r => r.getAs[Long]("id"))
  .map(row => Row(row._1,
    row._2.map(_.getAs[Map[String, List[(String, Any)]]]("DEFINITION")).toList))
val aggregatedDf = sqlContext.createDataFrame(aggregatedRdd, eschema)
aggregatedDf.registerTempTable("event")
aggregatedDf.printSchema()
aggregatedDf.show()
I am getting the following MatchError:
ERROR Executor: Exception in task 0.0 in stage 83.0 (TID 3652)
scala.MatchError: List(Map(qwe -> [204,,abc,,positive,False,everywhere,always_record,counter,xyz,disabled]), Map(N/A -> [20,,something,,null,null,null,null,null,null,null]), Map(xyz -> [220,,something,,positive,False,everywhere,always_record,counter,xyz,enabled])) (of class scala.collection.immutable.$colon$colon)
at org.apache.spark.sql.catalyst.CatalystTypeConverters$MapConverter.toCatalystImpl(CatalystTypeConverters.scala:201)
at org.apache.spark.sql.catalyst.CatalystTypeConverters$MapConverter.toCatalystImpl(CatalystTypeConverters.scala:193)
at org.apache.spark.sql.catalyst.CatalystTypeConverters$CatalystTypeConverter.toCatalyst(CatalystTypeConverters.scala:102)
Answer 0 (score: 0):
The class in the MatchError is:
class scala.collection.immutable.$colon$colon
which is the runtime representation of an ArrayType value (Scala's ::, the cons cell of an immutable List).
I think the problem is that there is no class cast from ArrayType to List[(String, Any)]:
_.getAs[Map[String, List[(String, Any)]]]("DEFINITION")
The getAs() you are using is not recursive; it is just an asInstanceOf[Map[...]]. Here is the definition of getMap(), which might work better:
def getMap[K, V](i: Int): scala.collection.Map[K, V] = getAs[Map[K, V]](i)
Then you can have a second-level class cast that runs over the Map's values and converts from the unknown V to List[(String, Any)].
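Below is a minimal sketch of what that two-level cast could look like, assuming Spark 1.x as in the question. The per-value asInstanceOf[Row] is an illustration rather than a confirmed fix (at runtime each named_struct value arrives as an org.apache.spark.sql.Row):

// Sketch: getMap() performs the first, untyped cast; each map value is then
// cast explicitly instead of relying on a single nested getAs[...].
val aggregatedRdd: RDD[Row] = etrans.rdd
  .groupBy(r => r.getAs[Long]("id"))
  .map { case (id, rows) =>
    val defs = rows.map { r =>
      val m = r.getMap[String, Any](r.fieldIndex("DEFINITION"))
      m.mapValues(_.asInstanceOf[Row]).toMap   // second-level cast over the values
    }.toList
    Row(id, defs)
  }

Note that the stack trace shows a whole List of maps being handed to a MapConverter: after groupBy, each id carries a List of maps, while eschema declares DEFINITION as a plain MapType. Declaring DEFINITION as ArrayType(MapType(StringType, <the struct type>)) would likely also be needed before createDataFrame accepts these rows.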