I have a file with the following structure. The first column is the nodeID; after the ":" come the nodes connected to that nodeID. Each nodeID can have multiple connections.
0: 5305811,
1: 4798401,
2: 7922543,
3: 7195074,
4: 6399935,
5: 5697217,
6: 5357407,
7: 4798401,
8: 629131,5330605,6481451,6280292,6909396,7325128,
...
How can I transform this so it can be imported into a GraphFrame?
Answer (score: 1)
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.LongType
import org.graphframes.GraphFrame
import scala.util.Try

val spark = SparkSession.builder()
  .master("local[2]")
  .appName("test")
  .getOrCreate()

// connectedComponents requires a checkpoint directory; fall back to /tmp
spark.sparkContext.setCheckpointDir(
  spark.conf.getOption("spark.checkpointdir").getOrElse("/tmp"))

import spark.implicits._

// Trim each token and keep only those that parse as Long; this also drops
// the empty string produced by the trailing comma on each input line.
val cleanIds = udf((ids: Seq[String]) => ids.flatMap(x => Try(x.trim.toLong).toOption))

// Split each line on ":" into the node id and its comma-separated links.
val ds = spark
  .read
  .option("mode", "PERMISSIVE")
  .option("header", "false")
  .option("delimiter", ":")
  .csv("src/main/resources/connections.txt")
  .toDF("id", "links")
  .select(
    'id.cast(LongType),
    cleanIds(split(trim('links), ",")).as("links"))
  .cache()

// Vertices are the distinct ids from the first column. Note that nodes
// appearing only on the right-hand side (like 7 below) are not included.
val vertices = ds.select('id).distinct()

// One edge per (id, link) pair, produced by exploding the links array.
val edges = ds.select(
  'id.as("src"),
  explode('links).as("dst")
)

val graphFrame = GraphFrame(vertices, edges)
val connectedComponents = graphFrame.connectedComponents.run()

// Group the result by component and collect the member vertices.
connectedComponents
  .groupBy('component).agg(
    collect_list(struct('id)).as("vertices")
  ).show(false)
Given an input like this:
0: 5,6,
1: 4,
2: 3,4,5,
3: 2,
4: 2,1,
5: 2,0,
6: 0,7,
10: 11,13,
11: 12,14,
12: 13,14,
13: 10,12,
14: 11,12,
this creates a vertices DataFrame like this:
+---+
| id|
+---+
| 0|
| 6|
| 5|
| 1|
| 10|
| 3|
| 12|
| 11|
| 2|
| 4|
| 13|
| 14|
+---+
and edges like this (the input yields 23 edges in total, but show() displays at most 20 rows by default, so the last three are cut off here):
+---+---+
|src|dst|
+---+---+
| 0| 5|
| 0| 6|
| 1| 4|
| 2| 3|
| 2| 4|
| 2| 5|
| 3| 2|
| 4| 2|
| 4| 1|
| 5| 2|
| 5| 0|
| 6| 0|
| 6| 7|
| 10| 11|
| 10| 13|
| 11| 12|
| 11| 14|
| 12| 13|
| 12| 14|
| 13| 10|
+---+---+
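Note that the input lists most connections in both directions (e.g. 2 -> 3 and 3 -> 2), so the edge list contains mirrored pairs. connectedComponents treats the graph as undirected, so the duplicates are harmless, but if you want a deduplicated undirected edge list anyway, a minimal sketch reusing the edges DataFrame from above:

// Normalize each pair so the smaller id is always src, then deduplicate.
val undirectedEdges = edges
  .select(least('src, 'dst).as("src"), greatest('src, 'dst).as("dst"))
  .distinct()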
and connected components like this:
+---------+-----------------------------------+
|component|vertices |
+---------+-----------------------------------+
|0 |[[0], [6], [5], [1], [3], [2], [4]]|
|10 |[[10], [12], [11], [13], [14]] |
+---------+-----------------------------------+
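Node 7 is absent from the components above because the vertices DataFrame is built only from the first column, and 7 never occurs as a source. If you also want destination-only nodes to count as vertices, a sketch assuming the same ds and edges as above:

// Take ids from both the first column and the exploded links.
val allVertices = ds.select('id)
  .union(ds.select(explode('links).as("id")))
  .distinct()
val fullGraph = GraphFrame(allVertices, edges)

With vertex 7 present, the edge 6 -> 7 would pull it into component 0.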