I have a PipelinedRDD and I have tried several ways to print it. I use map to apply the function `nodes` to every element of the data RDD, so `data.map(nodes)` is a PipelinedRDD. The `nodes` function returns a string.
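To make "several printing methods" concrete, the attempts looked roughly like this (only a sketch, not the exact code of every attempt; `data` and `nodes` are the RDD and function defined in the full code below):
mapped = data.map(lambda x: nodes(x[0]))   # the PipelinedRDD of strings
print type(mapped)                         # <class 'pyspark.rdd.PipelinedRDD'>
for s in mapped.take(5):                   # the exception appears as soon as an action like take()/collect() runs
    print s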
Here is my code:
import warnings
warnings.filterwarnings('ignore')
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import Row  # needed for the Row(...) call near the end
# Load data
def nodes(node):
    # BFS from `node` over the comments_questions_sdf edge DataFrame,
    # then join the reached node ids into one comma-separated string
    def spark_bfs(G, origins, max_depth):
        schema = StructType([
            StructField("node", IntegerType(), True)
        ])
        frontier_nodes_sdf = spark.createDataFrame(origins, schema)
        # frontier_nodes_sdf.show()
        G = G.cache()
        frontier_nodes_sdf.cache()
        for i in range(max_depth):
            # nodes reachable in one hop from the current frontier
            reachable_nodes_sdf = G.join(frontier_nodes_sdf, G.from_node == frontier_nodes_sdf.node).select(G.to_node)
            if (i == 0):
                visited_nodes_sdf = frontier_nodes_sdf
                final_sdf = visited_nodes_sdf.withColumn("Depth", F.lit(i))
            # drop edges that touch the current frontier so they are not traversed again
            G = G.join(frontier_nodes_sdf, G.from_node == frontier_nodes_sdf.node, 'leftanti')
            G = G.join(frontier_nodes_sdf, G.to_node == frontier_nodes_sdf.node, 'leftanti')
            frontier_nodes_sdf = reachable_nodes_sdf
            frontier_nodes_sdf = frontier_nodes_sdf.distinct().withColumnRenamed("to_node", "node")
            # newly reached nodes (not visited before), recorded with their depth
            reached_sdf = reachable_nodes_sdf.join(visited_nodes_sdf, reachable_nodes_sdf.to_node == visited_nodes_sdf.node, 'leftanti')
            visited_nodes_sdf = visited_nodes_sdf.unionAll(frontier_nodes_sdf)
            final_sdf = final_sdf.unionAll(reached_sdf.withColumn("Depth", F.lit(i + 1)))
        G.unpersist()
        frontier_nodes_sdf.unpersist()
        return_sdf = final_sdf
        # return_sdf = return_sdf.withColumnRenamed("to_node","node")
        # return_sdf = return_sdf.withColumnRenamed("from_node","depth")
        # return_sdf = return_sdf.select("depth","node")
        return return_sdf

    origin_map = [{'node': node}]
    bfs_sdf = spark_bfs(comments_questions_sdf, origin_map, 2)
    result = bfs_sdf.select('node').rdd.map(lambda row: row[0]).collect()
    # concatenate the reached node ids into a single comma-separated string
    p = ''
    for i in range(len(result)):
        result[i] = str(result[i]) + ','
        p += result[i]
    print type(p[0:len(p) - 1])
    return p[0:len(p) - 1]
spark = SparkSession.builder.appName('Graphs').getOrCreate()
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)
comments_questions_sdf = spark.read.load('/Users/chloe/Downloads/sx-stackoverflow-c2q.txt', format="text")
comments_questions_sdf.createOrReplaceTempView('comments_questions_sdf_view')
comments_questions_sdf = spark.sql('SELECT CAST(split(value, " ")[0] AS int) AS from_node, CAST(split(value, " ")[1] AS int) AS to_node, "comment-on-question" AS edge_type FROM comments_questions_sdf_view ')
data = sc.parallelize([
    (1, ([1])),
    (2, ([6]))
])  # a small toy RDD; nodes() is applied to the first element of each tuple
print 'aaa', type(data), data.collect()
dd = data.map(lambda x: nodes(x[0])).map(lambda x: Row(x)).toDF('a')
print(type(dd))
The output is:
pyspark.sql.utils.ParseException: u"\nmismatched input '<EOF>' expecting {'SELECT', 'FROM', 'ADD', 'AS', 'ALL', 'ANY', 'DISTINCT', 'WHERE', 'GROUP', 'BY', 'GROUPING', 'SETS', 'CUBE', 'ROLLUP', 'ORDER', 'HAVING', 'LIMIT', 'AT', 'OR', 'AND', 'IN', NOT, 'NO', 'EXISTS', 'BETWEEN', 'LIKE', RLIKE, 'IS', 'NULL', 'TRUE', 'FALSE', 'NULLS', 'ASC', 'DESC', 'FOR', 'INTERVAL', 'CASE', 'WHEN', 'THEN', 'ELSE', 'END', 'JOIN', 'CROSS', 'OUTER', 'INNER', 'LEFT', 'SEMI', 'RIGHT', 'FULL', 'NATURAL', 'ON', 'PIVOT', 'LATERAL', 'WINDOW', 'OVER', 'PARTITION', 'RANGE', 'ROWS', 'UNBOUNDED', 'PRECEDING', 'FOLLOWING', 'CURRENT', 'FIRST', 'AFTER', 'LAST', 'ROW', 'WITH', 'VALUES', 'CREATE', 'TABLE', 'DIRECTORY', 'VIEW', 'REPLACE', 'INSERT', 'DELETE', 'INTO', 'DESCRIBE', 'EXPLAIN'......
I just want to see the contents of the RDD, along the lines of the sketch below. Can anyone help me? Thanks!
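For clarity, this is the kind of inspection I am hoping to end up with (again only a sketch; `dd` never actually gets built because of the exception above):
dd.show()                 # display the DataFrame built from the mapped RDD
print dd.rdd.collect()    # or pull the rows back to the driver and print them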