Spark Row keyBy值

时间:2017-11-14 14:56:19

标签: sql scala apache-spark dataframe rdd

我有以下数据条目:

  

{" UID":" XA8O3jlfAxxc""事件" {"轮廓" {" LAST_UPDATE" :1502287200"簇" {" 129":11," 17":13," 99&#34:4," 1":9," 162":12," 161":11," 233":11," 120":6 " 61":12," 115":8," 168":10," 220":10," 135& #34;:6," 231":10," 109":3," 89":9," 140":11, #&34; 113":9," 124":3," 35":10," 155":8," 131&# 34;:7," 11":2" 207":3," 91":2" 167":3,& #34; 212":12," 77":11," 174":13," 154":11," 23&#34 ;:12," 13":6," 157":12," 235":11," 159":12,&# 34; 138":13," 199":11," 111":1," 41":6," 211" :12," 15":10," 47":3," 209":10," 173":13,&#34 ; 56":14," 101":13," 45":2" 169":14," 86&#34 ;: 12}"段" {" 11":6," 21":9," 7":12," 17& #34;:13," 22":13," 1":10," 18":14," 16":13, #&34; 13":12," 23":11," 6":8," 3":11," 9&# 34;:12," 12":13," 15":2" 14":8," 8":14,& #34; 4":12," 10":6," 5":12}" geoloc" {"国家&#34 ;:" ES""经度":2.81908"纬度":41.9781}" sociodemos" {" 11&# 34;:6," 21":11," 7":12," 2":5," 22":5,& #34; 18":3," 16":10," 13&#34:4," 23":10," 6&#34 ;:11," 3":12," 9&#34:4," 12&#34:4," 20":3,&# 34; 15":6," 14":6," 8&#34:4," 4":9," 24" :10," 5":11}}" WAM" {" TECHNO" {"浏览器":"其他& #34;"装置":"移动"" OS":" Android和#34;" ISP":& #34;西班牙电信"}" LAST_UPDATE":1502568000," WCM" {"转换":[{" LAST_UPDATE&#34 ;: 1502564400" ID":" 1"}]}}}}   {" UID":" Mq0tCKsYwzMy""事件" {"轮廓" {" LAST_UPDATE":1502456400, #&34;簇" {" 170":10," 32":6," 63":10," 90&#34 ;:2" 7":2" 227":5," 119&#34:4," 200":5,&# 34; 180&#34:4," 18":1," 179":2" 162":2" 125" :1," 16":8," 84":9," 190":7," 161":10,&#34 ; 61":7," 115":5," 220":12," 20":8," 92&#34 ;: 2," 231":2" 109":7," 103":9," 151&#34:4," 89":2" 113":8," 35":3," 189":9," 11":14 " 207":11," 91":3," 167":7," 77":10," 174& #34;:3," 157&#34:4," 29":7," 203":11," 210":7, #&34; 138":12," 97":3," 199":8," 41":13," 15&# 34;:7," 153&#34:4," 56":6," 45":10," 101":8,& #34; 86":2" 54":5," 237&#34:4," 67":9," 129&#34 ;:5," 2":10," 17 ":1," 1":6," 136":5," 186":10," 110":3 " 82":9," 25":2" 28":12," 120&#34:4," 75& #34;:6," 168":8," 177":2" 140":5," 124":8, #&34; 155":12," 131":2" 53":10," 181":10," 122&# 34;:11," 79":3," 212":6," 
154":3," 13":10,& #34; 23":8," 235":7," 126":3," 159":2" 85&#34 :4" 3":10," 185":11," 183":13," 111":3,&# 34; 9":13," 51":8," 47":3," 209":3," 216" :3," 1000":3," 37":11," 132":3," 169":2&#34 ; 117":5," 5" 10},"段" {" 11":10," 21&#34 ;: 8," 7":10," 17":13," 2":9," 22":13," 1":11," 18":2" 16":14," 13":9," 23":5 " 6":5," 25":3," 3":10," 9":8," 12& #34;:10," 15":10," 14":12," 8":6," 4":13, &# 34; 10&#34:4," 19":10," 5" 10}," geoloc" {"国家" :" ES""经度": - 3.70358"纬度":40.4167}" sociodemos" {" 11&# 34;:3," 21":6," 7":10," 2":10," 22":5,& #34; 18":6," 23":6," 16":6," 13":7," 6&#34 ; 6" 3":11," 9":7," 12&#34:4," 14&#34:4,&# 34; 15":3," 20":7," 8":9," 4":12," 24" :14," 5":12}}" WAM" {" TECHNO" {"浏览器":"铬, #34;"装置":"移动"" OS":" Android和#34;" ISP":& #34;西班牙电信"}" LAST_UPDATE":1502575200," WCM" {"转换":[{" LAST_UPDATE&#34 ;: 1502560800" ID":" 1"}]}}}}   {" UID":" 1NaQF91h10rU""事件" {" WAM" {" TECHNO":{& #34;浏览器":"铬""装置":"移动"" OS":"的Android&# 34;," ISP":"其他"}" LAST_UPDATE":1502571600," WCM" {"转换&#34 ;:[{" LAST_UPDATE":1502568000," ID":" 1"}]}}}}

我只对 clusters(簇)字段的信息感兴趣:"clusters":{"簇号":亲和力,...}

我用这句话收集了这些信息:

val trafico = sqlContext.read.json("/weborama/WAM_files/*/*")
val traficoRDD  = trafico.selectExpr(List("events.profile.clusters"): _*).filter("clusters is not null").rdd

输出:

  

[[9,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,11,2,NULL,NULL,NULL,如图4所示,空,如图5所示,空值,如图4所示,空值,如图5所示,空,3,NULL,NULL,NULL,8,NULL,NULL,NULL,NULL,NULL,6,NULL,NULL,NULL,NULL,NULL,10,NULL,NULL ,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,11,7,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL ,NULL,NULL,3,13,7,5,6,NULL,NULL,8,11,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL ,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,8,NULL,NULL,8,NULL,NULL,NULL,NULL,12,NULL,NULL ,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,12,3,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,10,NULL,NULL,11,5,空,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL ,NULL,NULL,NULL,NULL,NULL,6,NULL,NULL,NULL,NULL,NULL,NULL,6,NULL,NULL,NULL,12,NULL,NULL,NULL,NULL,NULL,NULL,9,空5,NULL,NULL,8,空,6,5,10,NULL,6,NULL,NULL,NULL,NULL,NULL,13,12,NULL,NULL,NULL,NULL,NULL,NU LL,NULL,NULL,NULL,NULL,如图8所示,空,7,6,NULL,NULL,NULL,NULL,NULL,9,NULL,NULL,NULL,NULL,NULL,NULL,6,3,NULL,NULL, NULL,NULL,NULL,NULL,NULL]   [[2,NULL,NULL,8,NULL,NULL,11,空,8,NULL,NULL,NULL,NULL,NULL,NULL,NULL,12,NULL,NULL,NULL,NULL,NULL,NULL,NULL, NULL,NULL,NULL,NULL,NULL,1,NULL,NULL,NULL,1,3,NULL,NULL,NULL,NULL,10,空,1,NULL,NULL,NULL,NULL,NULL,NULL,8,空,12,NULL,NULL,8,NULL,NULL,NULL,NULL,12,NULL,NULL,4,NULL,NULL,NULL,NULL,4,NULL,NULL,12,空,8,NULL,NULL, NULL,NULL,3,13,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,4,13,NULL,NULL,NULL,NULL,NULL,NULL,NULL, NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,7,NULL,NULL,NULL,NULL,4,NULL,NULL,NULL,NULL,NULL,NULL,NULL, NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,9,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,9,空值,如图3所示,空值,如图6所示,NULL,NULL, NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL, 
NULL,NULL,NULL,NULL,NULL,NULL,13,N​​ULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,12,空,1, NULL,NULL,如图4所示,零,2,4,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,2,NULL,NULL,NULL,NULL,NUL升,NULL,NULL,NULL,NULL,NULL,NULL,4,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,1,NULL,NULL,NULL,NULL,NULL,13,14,空NULL,NULL,NULL,NULL,NULL,NULL,9]]

我想分析每个亲和力值下有多少个簇重复出现,以及其他一些统计。

为此,我想创建一个键值对 RDD(pair RDD),形式为(亲和力,簇号列表)。有人能帮助我吗?

(1,[129,99,17])
(2,[63,80,3])
.
.
.
(14,[222,69])

谢谢!

1 个答案:

答案 0 :(得分:0)

您读取 JSON 的方式是正确的,这会创建一个 dataframe。因此

val trafico = sqlContext.read.json("/weborama/WAM_files/*/*")

将创建一个 dataframe,其 schema 如下

root
 |-- events: struct (nullable = true)
 |    |-- profile: struct (nullable = true)
 |    |    |-- clusters: struct (nullable = true)
 |    |    |    |-- 1: long (nullable = true)
 |    |    |    |-- 101: long (nullable = true)
 |    |    |    |-- 109: long (nullable = true)
 |    |    |    |-- 11: long (nullable = true)
 |    |    |    |-- 111: long (nullable = true)
 |    |    |    |-- 113: long (nullable = true)
 |    |    |    |-- 115: long (nullable = true)
 |    |    |    |-- 120: long (nullable = true)
 |    |    |    |-- 124: long (nullable = true)
 |    |    |    |-- 129: long (nullable = true)
 |    |    |    |-- 13: long (nullable = true)
 |    |    |    |-- 131: long (nullable = true)
 |    |    |    |-- 135: long (nullable = true)
 |    |    |    |-- 138: long (nullable = true)
 |    |    |    |-- 140: long (nullable = true)
 |    |    |    |-- 15: long (nullable = true)
 |    |    |    |-- 154: long (nullable = true)
 |    |    |    |-- 155: long (nullable = true)
 |    |    |    |-- 157: long (nullable = true)
 |    |    |    |-- 159: long (nullable = true)
 |    |    |    |-- 161: long (nullable = true)
 |    |    |    |-- 162: long (nullable = true)
 |    |    |    |-- 167: long (nullable = true)
 |    |    |    |-- 168: long (nullable = true)
 |    |    |    |-- 169: long (nullable = true)
 |    |    |    |-- 17: long (nullable = true)
 |    |    |    |-- 173: long (nullable = true)
 |    |    |    |-- 174: long (nullable = true)
 |    |    |    |-- 199: long (nullable = true)
 |    |    |    |-- 207: long (nullable = true)
 |    |    |    |-- 209: long (nullable = true)
 |    |    |    |-- 211: long (nullable = true)
 |    |    |    |-- 212: long (nullable = true)
 |    |    |    |-- 220: long (nullable = true)
 |    |    |    |-- 23: long (nullable = true)
 |    |    |    |-- 231: long (nullable = true)
 |    |    |    |-- 233: long (nullable = true)
 |    |    |    |-- 235: long (nullable = true)
 |    |    |    |-- 35: long (nullable = true)
 |    |    |    |-- 41: long (nullable = true)
 |    |    |    |-- 45: long (nullable = true)
 |    |    |    |-- 47: long (nullable = true)
 |    |    |    |-- 56: long (nullable = true)
 |    |    |    |-- 61: long (nullable = true)
 |    |    |    |-- 77: long (nullable = true)
 |    |    |    |-- 86: long (nullable = true)
 |    |    |    |-- 89: long (nullable = true)
 |    |    |    |-- 91: long (nullable = true)
 |    |    |    |-- 99: long (nullable = true)
 |    |    |-- geoloc: struct (nullable = true)
 |    |    |    |-- country: string (nullable = true)
 |    |    |    |-- latitude: double (nullable = true)
 |    |    |    |-- longitude: double (nullable = true)
 |    |    |-- last_update: long (nullable = true)
 |    |    |-- segments: struct (nullable = true)
 |    |    |    |-- 1: long (nullable = true)
 |    |    |    |-- 10: long (nullable = true)
 |    |    |    |-- 11: long (nullable = true)
 |    |    |    |-- 12: long (nullable = true)
 |    |    |    |-- 13: long (nullable = true)
 |    |    |    |-- 14: long (nullable = true)
 |    |    |    |-- 15: long (nullable = true)
 |    |    |    |-- 16: long (nullable = true)
 |    |    |    |-- 17: long (nullable = true)
 |    |    |    |-- 18: long (nullable = true)
 |    |    |    |-- 21: long (nullable = true)
 |    |    |    |-- 22: long (nullable = true)
 |    |    |    |-- 23: long (nullable = true)
 |    |    |    |-- 3: long (nullable = true)
 |    |    |    |-- 4: long (nullable = true)
 |    |    |    |-- 5: long (nullable = true)
 |    |    |    |-- 6: long (nullable = true)
 |    |    |    |-- 7: long (nullable = true)
 |    |    |    |-- 8: long (nullable = true)
 |    |    |    |-- 9: long (nullable = true)
 |    |    |-- sociodemos: struct (nullable = true)
 |    |    |    |-- 11: long (nullable = true)
 |    |    |    |-- 12: long (nullable = true)
 |    |    |    |-- 13: long (nullable = true)
 |    |    |    |-- 14: long (nullable = true)
 |    |    |    |-- 15: long (nullable = true)
 |    |    |    |-- 16: long (nullable = true)
 |    |    |    |-- 18: long (nullable = true)
 |    |    |    |-- 2: long (nullable = true)
 |    |    |    |-- 20: long (nullable = true)
 |    |    |    |-- 21: long (nullable = true)
 |    |    |    |-- 22: long (nullable = true)
 |    |    |    |-- 23: long (nullable = true)
 |    |    |    |-- 24: long (nullable = true)
 |    |    |    |-- 3: long (nullable = true)
 |    |    |    |-- 4: long (nullable = true)
 |    |    |    |-- 5: long (nullable = true)
 |    |    |    |-- 6: long (nullable = true)
 |    |    |    |-- 7: long (nullable = true)
 |    |    |    |-- 8: long (nullable = true)
 |    |    |    |-- 9: long (nullable = true)
 |    |-- wam: struct (nullable = true)
 |    |    |-- last_update: long (nullable = true)
 |    |    |-- techno: struct (nullable = true)
 |    |    |    |-- browser: string (nullable = true)
 |    |    |    |-- device: string (nullable = true)
 |    |    |    |-- isp: string (nullable = true)
 |    |    |    |-- os: string (nullable = true)
 |    |    |-- wcm: struct (nullable = true)
 |    |    |    |-- conversion: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- id: string (nullable = true)
 |    |    |    |    |    |-- last_update: long (nullable = true)
 |-- uid: string (nullable = true)

现在,因为您只对clusters字段感兴趣

val clusters = trafico.select("events.profile.clusters.*")

这会得到一个 dataframe,其 schema 为

root
 |-- 1: long (nullable = true)
 |-- 101: long (nullable = true)
 |-- 109: long (nullable = true)
 |-- 11: long (nullable = true)
 |-- 111: long (nullable = true)
 |-- 113: long (nullable = true)
 |-- 115: long (nullable = true)
 |-- 120: long (nullable = true)
 |-- 124: long (nullable = true)
 |-- 129: long (nullable = true)
 |-- 13: long (nullable = true)
 |-- 131: long (nullable = true)
 |-- 135: long (nullable = true)
 |-- 138: long (nullable = true)
 |-- 140: long (nullable = true)
 |-- 15: long (nullable = true)
 |-- 154: long (nullable = true)
 |-- 155: long (nullable = true)
 |-- 157: long (nullable = true)
 |-- 159: long (nullable = true)
 |-- 161: long (nullable = true)
 |-- 162: long (nullable = true)
 |-- 167: long (nullable = true)
 |-- 168: long (nullable = true)
 |-- 169: long (nullable = true)
 |-- 17: long (nullable = true)
 |-- 173: long (nullable = true)
 |-- 174: long (nullable = true)
 |-- 199: long (nullable = true)
 |-- 207: long (nullable = true)
 |-- 209: long (nullable = true)
 |-- 211: long (nullable = true)
 |-- 212: long (nullable = true)
 |-- 220: long (nullable = true)
 |-- 23: long (nullable = true)
 |-- 231: long (nullable = true)
 |-- 233: long (nullable = true)
 |-- 235: long (nullable = true)
 |-- 35: long (nullable = true)
 |-- 41: long (nullable = true)
 |-- 45: long (nullable = true)
 |-- 47: long (nullable = true)
 |-- 56: long (nullable = true)
 |-- 61: long (nullable = true)
 |-- 77: long (nullable = true)
 |-- 86: long (nullable = true)
 |-- 89: long (nullable = true)
 |-- 91: long (nullable = true)
 |-- 99: long (nullable = true)

这些 column names 正是您在配对 RDD 中想要的内容。所以您可以将它们保存为

val clusterNames = clusters.schema.fieldNames

将是

Array[1, 101, 109, 11, 111, 113, 115, 120, 124, 129, 13, 131, 135, 138, 140, 15, 154, 155, 157, 159, 161, 162, 167, 168, 169, 17, 173, 174, 199, 207, 209, 211, 212, 220, 23, 231, 233, 235, 35, 41, 45, 47, 56, 61, 77, 86, 89, 91, 99]

配对 RDD 中还需要的是上面 clusters dataframe 中每个 column 的收集列表(collected list)。您可以通过以下方式获得

import org.apache.spark.sql.functions._
val collectedClusters = clusters.select(clusterNames.map(x => collect_list(col(x))) : _*).rdd.flatMap(_.toSeq.toList).collect

其结果将是

Array[WrappedArray(9), WrappedArray(13), WrappedArray(3), WrappedArray(2), WrappedArray(1), WrappedArray(9), WrappedArray(8), WrappedArray(6), WrappedArray(3), WrappedArray(11), WrappedArray(6), WrappedArray(7), WrappedArray(6), WrappedArray(13), WrappedArray(11), WrappedArray(10), WrappedArray(11), WrappedArray(8), WrappedArray(12), WrappedArray(12), WrappedArray(11), WrappedArray(12), WrappedArray(3), WrappedArray(10), WrappedArray(14), WrappedArray(13), WrappedArray(13), WrappedArray(13), WrappedArray(11), WrappedArray(3), WrappedArray(10), WrappedArray(12), WrappedArray(12), WrappedArray(10), WrappedArray(12), WrappedArray(10), WrappedArray(11), WrappedArray(11), WrappedArray(10), WrappedArray(6), WrappedArray(2), WrappedArray(3), WrappedArray(14), WrappedArray(12), WrappedArray(11), WrappedArray(12), WrappedArray(9), WrappedArray(2), WrappedArray(4)]

最后一步是创建配对的RDD,可以使用 zip 来实现

clusterNames.zip(collectedClusters)

这样您就应该得到所需的配对RDD

(注意:由于前面已经调用了 collect,zip 的结果是驱动端的本地 Array,其中每个元素是(簇号,该簇的亲和力列表),而不是分布式 RDD;如果需要真正的 RDD,或者需要以亲和力作为键,还需要再做一次转换。)

我希望答案很有帮助