当我运行以下代码时,我希望e2和r仅返回那些将validation属性设置为1的边。但是,两个结果都显示了将validation设置为0或1的行,因为它完全忽略了过滤。 测试到关系字段的相同代码确实按预期工作。 有谁知道为什么e2和r继续显示验证为0的行?
我尝试将验证字段设置为整数和字符串类型。两者显示相同的结果。
def split_func():
i = np.random.randint(0, high=100)
if i < 30:
return '1'
else:
return '0'
split_udf = F.udf(split_func, t.StringType())
v = sqlContext.createDataFrame([
("a", "Alice", 34),
("b", "Bob", 36),
("c", "Charlie", 30),
("d", "David", 29),
("e", "Esther", 32),
("f", "Fanny", 36),
("g", "Gabby", 60)
], ["id", "name", "age"])
# Edge DataFrame
e = sqlContext.createDataFrame([
("a", "b", "friend"),
("b", "c", "follow"),
("c", "b", "follow"),
("f", "c", "follow"),
("e", "f", "follow"),
("e", "d", "friend"),
("d", "a", "friend"),
("a", "e", "friend")
], ["src", "dst", "relationship"])
e = e.withColumn('validation',split_udf())
e.show(10)
# Create a GraphFrame
g = GraphFrame(v, e)
r = g.filterEdges("validation = '1'")
r.edges.show()
paths = g.find("(a)-[e]->(b)")\
.filter("e.validation = '1'")
e2 = paths.select("e.src", "e.dst", "e.validation")
e2.show(3)
paths = g.find("(a)-[e]->(b)")\
.filter("e.relationship = 'friend'")
e3 = paths.select("e.src", "e.dst", "e.relationship")
e3.show(3)
下面是输出
e.show(10)
+---+---+------------+----------+
|src|dst|relationship|validation|
+---+---+------------+----------+
| a| b| friend| 0|
| b| c| follow| 1|
| c| b| follow| 0|
| f| c| follow| 1|
| e| f| follow| 0|
| e| d| friend| 1|
| d| a| friend| 0|
| a| e| friend| 1|
+---+---+------------+----------+
r.edges.show()
+---+---+------------+----------+
|src|dst|relationship|validation|
+---+---+------------+----------+
| b| c| follow| 0|<-- I would expect only to see validation = 1
| a| e| friend| 0|
+---+---+------------+----------+
e2.show(3)
+---+---+----------+
|src|dst|validation|
+---+---+----------+
| a| e| 0| <-- I would expect only to see validation = 1
| e| d| 0|
| f| c| 1|
+---+---+----------+
only showing top 3 rows
e3.show(3)
+---+---+------------+
|src|dst|relationship|
+---+---+------------+
| a| e| friend|<-- works as expected
| e| d| friend|
| a| b| friend|
+---+---+------------+
only showing top 3 rows