我有一个json字符串,例如:
{"sequence":89,"id":8697344444103393,"trackingInfo":{"location":"Browse","row":0,"trackId":14170286,"listId":"cd7c2c7a-00f6-4035-867f-d1dd7d89972d_6625365X3XX1505943605585","videoId":80000778,"rank":0,"requestId":"ac12f4e1-5644-46af-87d1-ec3b92ce4896-4071171"},"type":["Play","Action","Session"],"time":527636408955},1],
{"sequence":155,"id":8697389381205360,"trackingInfo":{"location":"Browse","row":0,"trackId":14170286,"listId":"cd7c2c7a-00f6-4035-867f-d1dd7d89972d_6625365X3XX1505943605585","videoId":80000778,"rank":0,"requestId":"ac12f4e1-5644-46af-87d1-ec3b92ce4896-4071171"},"type":["Play","Action","Session"],"time":527637858607},1],
{"sequence":136,"id":8697374208897843,"trackingInfo":{"location":"Browse","row":0,"trackId":14170286,"listId":"cd7c2c7a-00f6-4035-867f-d1dd7d89972d_6625365X3XX1505943605585","videoId":80000778,"rank":0,"requestId":"ac12f4e1-5644-46af-87d1-ec3b92ce4896-4071171"},"type":["Play","Action","Session"],"time":527637405129},1],
{"sequence":189,"id":8697413135394406,"trackingInfo":{"row":0,"trackId":14272744,"requestId":"284929d9-6147-4924-a19f-4a308730354c-3348447","rank":0,"videoId":80075830,"location":"PostPlay\/Next"},"type":["Play","Action","Session"],"time":527638558756},1],
{"sequence":130,"id":8697373887446384,"trackingInfo":{"location":"Browse","row":0,"trackId":14170286,"listId":"cd7c2c7a-00f6-4035-867f-d1dd7d89972d_6625365X3XX1505943605585","videoId":80000778,"rank":0,"requestId":"ac12f4e1-5644-46af-87d1-ec3b92ce4896-4071171"},"type":["Play","Action","Session"],"time":527637394083}]
这里最好的方法是什么?我尝试了:
val rdd = sc.parallelize(Seq(jsonString)).flatMap(_.split("}"))
val trackingRdd = rdd.filter(_.contains("trackingInfo"))
此尝试的示例输出是:
,{"sequence":89,"id":8697344444103393,"trackingInfo":{"location":"Browse","row":0,"trackId":14170286,"listId":"cd7c2c7a-00f6-4035-867f-d1dd7d89972d_6625365X3XX1505943605585","videoId":80000778,"rank":0,"requestId":"ac12f4e1-5644-46af-87d1-ec3b92ce4896-4071171"
正如你所看到的那样,除了"type":["Play","Action","Session"],"time":527636408955},1]
之外,我几乎已经得到了我想要的所有数据(只是缺少结尾的 })
感谢任何帮助
答案 0 :(得分:1)
我们可以使用JSON结构读取数据,例如:
scala> val df=spark.read.json(sc.parallelize(Seq(jsonString))).select(explode(col("reverseDeltas"))).select(explode(col("col"))).map(_.getString(0)).filter(_.indexOf('{')>=0)
warning: there was one deprecation warning; re-run with -deprecation for details
df: org.apache.spark.sql.Dataset[String] = [value: string]
scala> spark.read.json(df).filter(col("trackingInfo").isNotNull).select("trackingInfo").toJSON.show(false)
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|value |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|{"trackingInfo":{"listId":"cd7c2c7a-00f6-4035-867f-d1dd7d89972d_6625365X3XX1505943605585","location":"Browse","rank":0,"requestId":"ac12f4e1-5644-46af-87d1-ec3b92ce4896-4071171","row":0,"trackId":14170286,"videoId":80000778}}|
|{"trackingInfo":{"listId":"cd7c2c7a-00f6-4035-867f-d1dd7d89972d_6625365X3XX1505943605585","location":"Browse","rank":0,"requestId":"ac12f4e1-5644-46af-87d1-ec3b92ce4896-4071171","row":0,"trackId":14170286,"videoId":80000778}}|
|{"trackingInfo":{"listId":"cd7c2c7a-00f6-4035-867f-d1dd7d89972d_6625365X3XX1505943605585","location":"Browse","rank":0,"requestId":"ac12f4e1-5644-46af-87d1-ec3b92ce4896-4071171","row":0,"trackId":14170286,"videoId":80000778}}|
|{"trackingInfo":{"listId":"cd7c2c7a-00f6-4035-867f-d1dd7d89972d_6625365X3XX1505943605585","location":"Browse","rank":0,"requestId":"ac12f4e1-5644-46af-87d1-ec3b92ce4896-4071171","row":0,"trackId":14170286,"videoId":80000778}}|
|{"trackingInfo":{"location":"PostPlay/Next","rank":0,"requestId":"284929d9-6147-4924-a19f-4a308730354c-3348447","row":0,"trackId":14272744,"videoId":80075830}} |
|{"trackingInfo":{"listId":"cd7c2c7a-00f6-4035-867f-d1dd7d89972d_6625365X3XX1505943605585","location":"Browse","rank":0,"requestId":"ac12f4e1-5644-46af-87d1-ec3b92ce4896-4071171","row":0,"trackId":14170286,"videoId":80000778}}|
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
scala>