我有以下数据条目:
{" UID":" XA8O3jlfAxxc""事件" {"轮廓" {" LAST_UPDATE" :1502287200"簇" {" 129":11," 17":13," 99&#34:4," 1":9," 162":12," 161":11," 233":11," 120":6 " 61":12," 115":8," 168":10," 220":10," 135& #34;:6," 231":10," 109":3," 89":9," 140":11, #&34; 113":9," 124":3," 35":10," 155":8," 131&# 34;:7," 11":2" 207":3," 91":2" 167":3,& #34; 212":12," 77":11," 174":13," 154":11," 23&#34 ;:12," 13":6," 157":12," 235":11," 159":12,&# 34; 138":13," 199":11," 111":1," 41":6," 211" :12," 15":10," 47":3," 209":10," 173":13,&#34 ; 56":14," 101":13," 45":2" 169":14," 86&#34 ;: 12}"段" {" 11":6," 21":9," 7":12," 17& #34;:13," 22":13," 1":10," 18":14," 16":13, #&34; 13":12," 23":11," 6":8," 3":11," 9&# 34;:12," 12":13," 15":2" 14":8," 8":14,& #34; 4":12," 10":6," 5":12}" geoloc" {"国家&#34 ;:" ES""经度":2.81908"纬度":41.9781}" sociodemos" {" 11&# 34;:6," 21":11," 7":12," 2":5," 22":5,& #34; 18":3," 16":10," 13&#34:4," 23":10," 6&#34 ;:11," 3":12," 9&#34:4," 12&#34:4," 20":3,&# 34; 15":6," 14":6," 8&#34:4," 4":9," 24" :10," 5":11}}" WAM" {" TECHNO" {"浏览器":"其他& #34;"装置":"移动"" OS":" Android和#34;" ISP":& #34;西班牙电信"}" LAST_UPDATE":1502568000," WCM" {"转换":[{" LAST_UPDATE&#34 ;: 1502564400" ID":" 1"}]}}}} {" UID":" Mq0tCKsYwzMy""事件" {"轮廓" {" LAST_UPDATE":1502456400, #&34;簇" {" 170":10," 32":6," 63":10," 90&#34 ;:2" 7":2" 227":5," 119&#34:4," 200":5,&# 34; 180&#34:4," 18":1," 179":2" 162":2" 125" :1," 16":8," 84":9," 190":7," 161":10,&#34 ; 61":7," 115":5," 220":12," 20":8," 92&#34 ;: 2," 231":2" 109":7," 103":9," 151&#34:4," 89":2" 113":8," 35":3," 189":9," 11":14 " 207":11," 91":3," 167":7," 77":10," 174& #34;:3," 157&#34:4," 29":7," 203":11," 210":7, #&34; 138":12," 97":3," 199":8," 41":13," 15&# 34;:7," 153&#34:4," 56":6," 45":10," 101":8,& #34; 86":2" 54":5," 237&#34:4," 67":9," 129&#34 ;:5," 2":10," 17 ":1," 1":6," 136":5," 186":10," 110":3 " 82":9," 25":2" 28":12," 120&#34:4," 75& #34;:6," 168":8," 177":2" 140":5," 124":8, #&34; 155":12," 131":2" 53":10," 181":10," 122&# 34;:11," 79":3," 212":6," 
154":3," 13":10,& #34; 23":8," 235":7," 126":3," 159":2" 85&#34 :4" 3":10," 185":11," 183":13," 111":3,&# 34; 9":13," 51":8," 47":3," 209":3," 216" :3," 1000":3," 37":11," 132":3," 169":2&#34 ; 117":5," 5" 10},"段" {" 11":10," 21&#34 ;: 8," 7":10," 17":13," 2":9," 22":13," 1":11," 18":2" 16":14," 13":9," 23":5 " 6":5," 25":3," 3":10," 9":8," 12& #34;:10," 15":10," 14":12," 8":6," 4":13, &# 34; 10&#34:4," 19":10," 5" 10}," geoloc" {"国家" :" ES""经度": - 3.70358"纬度":40.4167}" sociodemos" {" 11&# 34;:3," 21":6," 7":10," 2":10," 22":5,& #34; 18":6," 23":6," 16":6," 13":7," 6&#34 ; 6" 3":11," 9":7," 12&#34:4," 14&#34:4,&# 34; 15":3," 20":7," 8":9," 4":12," 24" :14," 5":12}}" WAM" {" TECHNO" {"浏览器":"铬, #34;"装置":"移动"" OS":" Android和#34;" ISP":& #34;西班牙电信"}" LAST_UPDATE":1502575200," WCM" {"转换":[{" LAST_UPDATE&#34 ;: 1502560800" ID":" 1"}]}}}} {" UID":" 1NaQF91h10rU""事件" {" WAM" {" TECHNO":{& #34;浏览器":"铬""装置":"移动"" OS":"的Android&# 34;," ISP":"其他"}" LAST_UPDATE":1502571600," WCM" {"转换&#34 ;:[{" LAST_UPDATE":1502568000," ID":" 1"}]}}}}
我只对 clusters 字段的信息感兴趣:"clusters":{"簇编号":亲和力,...}
我用下面的代码提取了这些信息:
val trafico = sqlContext.read.json("/weborama/WAM_files/*/*")
val traficoRDD = trafico.selectExpr(List("events.profile.clusters"): _*).filter("clusters is not null").rdd
输出:
[[9,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,11,2,NULL,NULL,NULL,如图4所示,空,如图5所示,空值,如图4所示,空值,如图5所示,空,3,NULL,NULL,NULL,8,NULL,NULL,NULL,NULL,NULL,6,NULL,NULL,NULL,NULL,NULL,10,NULL,NULL ,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,11,7,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL ,NULL,NULL,3,13,7,5,6,NULL,NULL,8,11,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL ,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,8,NULL,NULL,8,NULL,NULL,NULL,NULL,12,NULL,NULL ,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,12,3,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,10,NULL,NULL,11,5,空,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL ,NULL,NULL,NULL,NULL,NULL,6,NULL,NULL,NULL,NULL,NULL,NULL,6,NULL,NULL,NULL,12,NULL,NULL,NULL,NULL,NULL,NULL,9,空5,NULL,NULL,8,空,6,5,10,NULL,6,NULL,NULL,NULL,NULL,NULL,13,12,NULL,NULL,NULL,NULL,NULL,NU LL,NULL,NULL,NULL,NULL,如图8所示,空,7,6,NULL,NULL,NULL,NULL,NULL,9,NULL,NULL,NULL,NULL,NULL,NULL,6,3,NULL,NULL, NULL,NULL,NULL,NULL,NULL] [[2,NULL,NULL,8,NULL,NULL,11,空,8,NULL,NULL,NULL,NULL,NULL,NULL,NULL,12,NULL,NULL,NULL,NULL,NULL,NULL,NULL, NULL,NULL,NULL,NULL,NULL,1,NULL,NULL,NULL,1,3,NULL,NULL,NULL,NULL,10,空,1,NULL,NULL,NULL,NULL,NULL,NULL,8,空,12,NULL,NULL,8,NULL,NULL,NULL,NULL,12,NULL,NULL,4,NULL,NULL,NULL,NULL,4,NULL,NULL,12,空,8,NULL,NULL, NULL,NULL,3,13,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,4,13,NULL,NULL,NULL,NULL,NULL,NULL,NULL, NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,7,NULL,NULL,NULL,NULL,4,NULL,NULL,NULL,NULL,NULL,NULL,NULL, NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,9,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,9,空值,如图3所示,空值,如图6所示,NULL,NULL, NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL, 
NULL,NULL,NULL,NULL,NULL,NULL,13,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,12,空,1, NULL,NULL,如图4所示,零,2,4,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,2,NULL,NULL,NULL,NULL,NUL升,NULL,NULL,NULL,NULL,NULL,NULL,4,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,1,NULL,NULL,NULL,NULL,NULL,13,14,空NULL,NULL,NULL,NULL,NULL,NULL,9]]
我想分析每个亲和力值对应多少个簇,并做一些其他统计。
为此我想创建一个键值对 RDD(亲和力,簇编号列表)。有人能帮助我吗?期望的结果类似于:
(1,[129,99,17])
(2,[63,80,3])
.
.
.
(14,[222,69])
谢谢!
答案 0 :(得分:0)
您读取 json 的方式是正确的,这会创建一个 dataframe。所以
val trafico = sqlContext.read.json("/weborama/WAM_files/*/*")
将创建一个 dataframe,其 schema 如下:
root
|-- events: struct (nullable = true)
| |-- profile: struct (nullable = true)
| | |-- clusters: struct (nullable = true)
| | | |-- 1: long (nullable = true)
| | | |-- 101: long (nullable = true)
| | | |-- 109: long (nullable = true)
| | | |-- 11: long (nullable = true)
| | | |-- 111: long (nullable = true)
| | | |-- 113: long (nullable = true)
| | | |-- 115: long (nullable = true)
| | | |-- 120: long (nullable = true)
| | | |-- 124: long (nullable = true)
| | | |-- 129: long (nullable = true)
| | | |-- 13: long (nullable = true)
| | | |-- 131: long (nullable = true)
| | | |-- 135: long (nullable = true)
| | | |-- 138: long (nullable = true)
| | | |-- 140: long (nullable = true)
| | | |-- 15: long (nullable = true)
| | | |-- 154: long (nullable = true)
| | | |-- 155: long (nullable = true)
| | | |-- 157: long (nullable = true)
| | | |-- 159: long (nullable = true)
| | | |-- 161: long (nullable = true)
| | | |-- 162: long (nullable = true)
| | | |-- 167: long (nullable = true)
| | | |-- 168: long (nullable = true)
| | | |-- 169: long (nullable = true)
| | | |-- 17: long (nullable = true)
| | | |-- 173: long (nullable = true)
| | | |-- 174: long (nullable = true)
| | | |-- 199: long (nullable = true)
| | | |-- 207: long (nullable = true)
| | | |-- 209: long (nullable = true)
| | | |-- 211: long (nullable = true)
| | | |-- 212: long (nullable = true)
| | | |-- 220: long (nullable = true)
| | | |-- 23: long (nullable = true)
| | | |-- 231: long (nullable = true)
| | | |-- 233: long (nullable = true)
| | | |-- 235: long (nullable = true)
| | | |-- 35: long (nullable = true)
| | | |-- 41: long (nullable = true)
| | | |-- 45: long (nullable = true)
| | | |-- 47: long (nullable = true)
| | | |-- 56: long (nullable = true)
| | | |-- 61: long (nullable = true)
| | | |-- 77: long (nullable = true)
| | | |-- 86: long (nullable = true)
| | | |-- 89: long (nullable = true)
| | | |-- 91: long (nullable = true)
| | | |-- 99: long (nullable = true)
| | |-- geoloc: struct (nullable = true)
| | | |-- country: string (nullable = true)
| | | |-- latitude: double (nullable = true)
| | | |-- longitude: double (nullable = true)
| | |-- last_update: long (nullable = true)
| | |-- segments: struct (nullable = true)
| | | |-- 1: long (nullable = true)
| | | |-- 10: long (nullable = true)
| | | |-- 11: long (nullable = true)
| | | |-- 12: long (nullable = true)
| | | |-- 13: long (nullable = true)
| | | |-- 14: long (nullable = true)
| | | |-- 15: long (nullable = true)
| | | |-- 16: long (nullable = true)
| | | |-- 17: long (nullable = true)
| | | |-- 18: long (nullable = true)
| | | |-- 21: long (nullable = true)
| | | |-- 22: long (nullable = true)
| | | |-- 23: long (nullable = true)
| | | |-- 3: long (nullable = true)
| | | |-- 4: long (nullable = true)
| | | |-- 5: long (nullable = true)
| | | |-- 6: long (nullable = true)
| | | |-- 7: long (nullable = true)
| | | |-- 8: long (nullable = true)
| | | |-- 9: long (nullable = true)
| | |-- sociodemos: struct (nullable = true)
| | | |-- 11: long (nullable = true)
| | | |-- 12: long (nullable = true)
| | | |-- 13: long (nullable = true)
| | | |-- 14: long (nullable = true)
| | | |-- 15: long (nullable = true)
| | | |-- 16: long (nullable = true)
| | | |-- 18: long (nullable = true)
| | | |-- 2: long (nullable = true)
| | | |-- 20: long (nullable = true)
| | | |-- 21: long (nullable = true)
| | | |-- 22: long (nullable = true)
| | | |-- 23: long (nullable = true)
| | | |-- 24: long (nullable = true)
| | | |-- 3: long (nullable = true)
| | | |-- 4: long (nullable = true)
| | | |-- 5: long (nullable = true)
| | | |-- 6: long (nullable = true)
| | | |-- 7: long (nullable = true)
| | | |-- 8: long (nullable = true)
| | | |-- 9: long (nullable = true)
| |-- wam: struct (nullable = true)
| | |-- last_update: long (nullable = true)
| | |-- techno: struct (nullable = true)
| | | |-- browser: string (nullable = true)
| | | |-- device: string (nullable = true)
| | | |-- isp: string (nullable = true)
| | | |-- os: string (nullable = true)
| | |-- wcm: struct (nullable = true)
| | | |-- conversion: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- id: string (nullable = true)
| | | | | |-- last_update: long (nullable = true)
|-- uid: string (nullable = true)
现在,由于您只对 clusters 字段感兴趣,可以只选择该字段下的所有列:
val clusters = trafico.select("events.profile.clusters.*")
这会得到一个 dataframe,其 schema 为:
root
|-- 1: long (nullable = true)
|-- 101: long (nullable = true)
|-- 109: long (nullable = true)
|-- 11: long (nullable = true)
|-- 111: long (nullable = true)
|-- 113: long (nullable = true)
|-- 115: long (nullable = true)
|-- 120: long (nullable = true)
|-- 124: long (nullable = true)
|-- 129: long (nullable = true)
|-- 13: long (nullable = true)
|-- 131: long (nullable = true)
|-- 135: long (nullable = true)
|-- 138: long (nullable = true)
|-- 140: long (nullable = true)
|-- 15: long (nullable = true)
|-- 154: long (nullable = true)
|-- 155: long (nullable = true)
|-- 157: long (nullable = true)
|-- 159: long (nullable = true)
|-- 161: long (nullable = true)
|-- 162: long (nullable = true)
|-- 167: long (nullable = true)
|-- 168: long (nullable = true)
|-- 169: long (nullable = true)
|-- 17: long (nullable = true)
|-- 173: long (nullable = true)
|-- 174: long (nullable = true)
|-- 199: long (nullable = true)
|-- 207: long (nullable = true)
|-- 209: long (nullable = true)
|-- 211: long (nullable = true)
|-- 212: long (nullable = true)
|-- 220: long (nullable = true)
|-- 23: long (nullable = true)
|-- 231: long (nullable = true)
|-- 233: long (nullable = true)
|-- 235: long (nullable = true)
|-- 35: long (nullable = true)
|-- 41: long (nullable = true)
|-- 45: long (nullable = true)
|-- 47: long (nullable = true)
|-- 56: long (nullable = true)
|-- 61: long (nullable = true)
|-- 77: long (nullable = true)
|-- 86: long (nullable = true)
|-- 89: long (nullable = true)
|-- 91: long (nullable = true)
|-- 99: long (nullable = true)
这些列名(column names)就是您想要的配对 RDD 中的键。所以您可以将它们保存为
val clusterNames = clusters.schema.fieldNames
将是
Array[1, 101, 109, 11, 111, 113, 115, 120, 124, 129, 13, 131, 135, 138, 140, 15, 154, 155, 157, 159, 161, 162, 167, 168, 169, 17, 173, 174, 199, 207, 209, 211, 212, 220, 23, 231, 233, 235, 35, 41, 45, 47, 56, 61, 77, 86, 89, 91, 99]
所需配对 RDD 中的值是 clusters 这个 dataframe 中每一列(column)的收集列表。您可以按如下方式获得这些值:
import org.apache.spark.sql.functions._
val collectedClusters = clusters.select(clusterNames.map(x => collect_list(col(x))) : _*).rdd.flatMap(_.toSeq.toList).collect
其结果将是
Array[WrappedArray(9), WrappedArray(13), WrappedArray(3), WrappedArray(2), WrappedArray(1), WrappedArray(9), WrappedArray(8), WrappedArray(6), WrappedArray(3), WrappedArray(11), WrappedArray(6), WrappedArray(7), WrappedArray(6), WrappedArray(13), WrappedArray(11), WrappedArray(10), WrappedArray(11), WrappedArray(8), WrappedArray(12), WrappedArray(12), WrappedArray(11), WrappedArray(12), WrappedArray(3), WrappedArray(10), WrappedArray(14), WrappedArray(13), WrappedArray(13), WrappedArray(13), WrappedArray(11), WrappedArray(3), WrappedArray(10), WrappedArray(12), WrappedArray(12), WrappedArray(10), WrappedArray(12), WrappedArray(10), WrappedArray(11), WrappedArray(11), WrappedArray(10), WrappedArray(6), WrappedArray(2), WrappedArray(3), WrappedArray(14), WrappedArray(12), WrappedArray(11), WrappedArray(12), WrappedArray(9), WrappedArray(2), WrappedArray(4)]
最后一步是创建配对的 RDD,可以使用 zip 来完成:
clusterNames.zip(collectedClusters)
这样您就应该得到所需的配对 RDD 了。
希望这个答案对您有帮助。