Parsing JSON into a DataFrame in Scala

Asked: 2018-06-26 11:40:51

Tags: scala apache-spark

Sample JSON:

 "alternateId": [
    {
        "type": "POPID",
        "value": "1-7842-0759-001"
    },
    {
        "type": "CAMID",
        "value": "CAMID 0000-0002-7EC1-02FF-O-0000-0000-2"
    },
    {
        "type": "ProgrammeUuid",
        "value": "1ddb01e2-6146-4e10-bba9-dde40d0ad886"
    }
]

I want to add two columns, POPID and CAMID, to an existing DataFrame. Both values need to be parsed out of this JSON structure, and I don't know how to parse it. Could you tell me what changes I need to make to the fetchField method below? The current approach assumes POPID is in the first position and CAMID in the second, but in the real JSON each of them can appear in any of the three positions inside alternateId.

val fetchCAMID_udf = udf(fetchCAMID _)
val fetchPOPID_udf = udf(fetchPOPID _)

var updatedDf = // DataFrame initialization

updatedDf = updatedDf.withColumn("CAMID", fetchCAMID_udf(col("alternate_id")))
updatedDf = updatedDf.withColumn("POPID", fetchPOPID_udf(col("alternate_id")))
updatedDf.show(10, false)


def fetchCAMID(jsonStr: String): String = {
  fetchField(jsonStr, "CAMID")
}

def fetchPOPID(jsonStr: String): String = {
  fetchField(jsonStr, "POPID")
}


def fetchField(jsonStr: String, fieldName: String): String = {
  try {
    implicit val formats = DefaultFormats
    val extractedField = jsonStr match {
      case "(unknown)" => jsonStr
      case _ =>
        val json = JsonMethods.parse(jsonStr)
        val resultExtracted = json \\ fieldName
        val result = resultExtracted match {
          case _: JString => resultExtracted.extract[String]
          case _: JInt    => resultExtracted.extract[Int].toString
          case _: JObject => "(unknown)"
        }
        result
    }
    extractedField
  } catch {
    case e: Exception =>
      log.error(s"Fetch field failed. Field name: $fieldName . Json: $jsonStr")
      "(unknown)"
  }
}

2 Answers

Answer 0 (score: 1)

Change your fetchField function to the following:

def fetchField(jsonStr: String, fieldName: String): String = {
  try {
    // The column only holds the "alternateId": [...] fragment, so wrap it in
    // braces to make it a valid JSON object before parsing.
    val json = JsonMethods.parse("{" + jsonStr + "}")
    // Collect every "type" string and every "value" string under alternateId
    // and pair them up positionally.
    val types  = json \ "alternateId" \ "type"  \\ classOf[JString]
    val values = json \ "alternateId" \ "value" \\ classOf[JString]
    val typeAndValue = types.zip(values)
    // Keep the pair whose type matches the requested field and return its value.
    typeAndValue.filter(_._1 == fieldName).map(_._2).toList(0)
  } catch {
    case e: Exception =>
      "(unknown)"
  }
}

Then your CAMID and POPID columns will be populated.
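
For reference, a quick standalone check of this version might look like the sketch below. It only reuses the alternateId fragment from the top of the question as a string literal and assumes the same json4s imports as the original code:

// Illustrative check only: the literal is the alternateId fragment from the
// question, without surrounding braces, exactly as fetchField expects it.
val sample =
  """"alternateId": [
    |  {"type": "POPID", "value": "1-7842-0759-001"},
    |  {"type": "CAMID", "value": "CAMID 0000-0002-7EC1-02FF-O-0000-0000-2"},
    |  {"type": "ProgrammeUuid", "value": "1ddb01e2-6146-4e10-bba9-dde40d0ad886"}
    |]""".stripMargin

fetchField(sample, "POPID") // expected: 1-7842-0759-001
fetchField(sample, "CAMID") // expected: CAMID 0000-0002-7EC1-02FF-O-0000-0000-2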

Answer 1 (score: 0)

You can read the JSON with Spark and extract the fields using regular Spark operations:

val df = spark.read.option("multiLine", true).json("test.json")

df.select($"alternateId".getItem(0).as("pop"), $"alternateId".getItem(1).as("cam"))
  .select($"pop.value".as("POPID"), $"cam.value".as("CAMID"))
  .show()

+---------------+--------------------+
|          POPID|               CAMID|
+---------------+--------------------+
|1-7842-0759-001|CAMID 0000-0002-7...|
+---------------+--------------------+
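
Since the question notes that POPID and CAMID can sit at any of the three positions inside alternateId, a position-independent variant of the same idea is to explode the array and filter on the type field instead of using getItem. A sketch, assuming the spark session and the df read above; the column names idType and idValue are just illustrative:

import org.apache.spark.sql.functions.{explode, first}
import spark.implicits._

// Turn each {type, value} struct into its own row, keep only the two ids of
// interest, then pivot them back into POPID/CAMID columns.
val ids = df
  .select(explode($"alternateId").as("alt"))
  .select($"alt.type".as("idType"), $"alt.value".as("idValue"))
  .filter($"idType".isin("POPID", "CAMID"))
  .groupBy()                                  // group by a real key column if your frame has one
  .pivot("idType", Seq("POPID", "CAMID"))
  .agg(first("idValue"))

ids.show(false)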