我正在使用PySpark并从Kafka Broker获取数据。
以下代码可以帮助我做到这一点:
import json
import sys
from pyspark import Row
from pyspark.sql.types import *
from pyspark import SparkContext
from pyspark.sql.functions import *
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils, TopicAndPartition
from pymongo import MongoClient
# Local Spark context using all available cores.
sc = SparkContext("local[*]", appName="TwitterStreamKafka")
# Streaming context with a 1-second micro-batch interval.
ssc = StreamingContext(sc, 1)
# Receiver-based Kafka stream: ZooKeeper at localhost:2181, consumer group
# "spark-streaming-consumer", topic "twitter" with one receiver thread.
tweets = KafkaUtils.createStream(ssc, "localhost:2181", "spark-streaming-consumer", {"twitter": 1})
# Each record is a (key, value) pair; print a sample of every batch.
tweets.pprint()
ssc.start()
ssc.awaitTermination()  # block until the streaming job is stopped
使用以下代码从推文中提取JSON:
tweet_json = tweets.map(lambda x: json.loads(x[1]))
现在我想将此 tweet_json 插入MongoDB。但无法做到这一点。
查看了Spark-Mongo Connector的文档,它说明需要DataFrame才能存储到MongoDB中。
但 tweet_json 的类型为 “pyspark.streaming.dstream.TransformedDStream”
如何将其转换为Dataframe以存储到MongoDB中?
或
如何使用PySpark将获取的推文保存到MongoDB
提前谢谢!
在程序中建议编辑后,我用以下方式编辑了它:
import json
import sys
from pyspark import Row
from pyspark.sql.types import *
from pyspark import SparkContext
from pyspark.sql.functions import *
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils, TopicAndPartition
from pymongo import MongoClient
def convert(rdd):
    """Turn an RDD of (key, json_string) Kafka records into a DataFrame.

    A plain RDD gains ``.toDF()`` only after a SparkSession/SQLContext has
    been created — calling it first is exactly what produced
    ``AttributeError: 'PipelinedRDD' object has no attribute 'toDF'``.
    Build (or reuse) the active session and call createDataFrame instead.
    """
    spark = SparkSession.builder.getOrCreate()
    # x is a (kafka_key, json_payload) pair; Row(**dict) yields named columns.
    return spark.createDataFrame(rdd.map(lambda x: Row(**json.loads(x[1]))))
# Local Spark context using all available cores.
sc = SparkContext("local[*]", appName="TwitterStreamKafka")
# Streaming context with a 1-second micro-batch interval.
ssc = StreamingContext(sc, 1)
# Receiver-based Kafka stream (ZooKeeper quorum, consumer group, {topic: threads}).
tweets = KafkaUtils.createStream(ssc, "localhost:2181", "spark-streaming-consumer", {"twitter": 1})
# Print a sample of each micro-batch for debugging.
tweets.pprint()
# Persist every micro-batch to MongoDB via the Spark-Mongo connector.
# The 'database'/'collection' options take string values — the original
# passed bare identifiers (Twitter, RestTwitter), a NameError at runtime.
tweets.foreachRDD(lambda rdd: convert(rdd).write
                  .format('com.mongodb.spark.sql.DefaultSource').mode('append')
                  .option('database', 'Twitter').option('collection', 'RestTwitter').save())
ssc.start()
ssc.awaitTermination()  # block until the streaming job is stopped
然后以下面的方式初始化spark:
spark-submit --conf "spark.mongodb.input.uri=mongodb://127.0.0.1/TwitterDB.RestData?readPreference=primaryPreferred" --conf "spark.mongodb.output.uri=mongodb://127.0.0.1/TwitterDB.RestData" --packages org.mongodb.spark:mongo-spark-connector_2.11:2.0.0 --jars spark-streaming-kafka-0-8-assembly_2.11-2.0.0.jar PySparkTwitter.py
它引发了以下错误:
文件 "/home/hduser/test/PySparkTwitter.py",第24行,在 convert 中: df_json = rdd.map(lambda x: json.loads(x[1])).toDF() AttributeError: 'PipelinedRDD'对象没有属性'toDF'
CreateStream的数据格式:
(None, '{"lang": "en", "id": 967004613332303873, "favorited": false, "possibly_sensitive": false, "is_quote_status": false, "geo": null, "user": {"lang": "en", "profile_use_background_image": false, "profile_banner_url": "https://pbs.twimg.com/profile_banners/19908749/1517325981", "is_translator": false, "id": 19908749, "profile_sidebar_border_color": "FFFFFF", "favourites_count": 1912, "profile_background_tile": false, "profile_image_url_https": "https://pbs.twimg.com/profile_images/921307498409230336/VWkCvBcu_normal.jpg", "friends_count": 1960, "profile_background_image_url": "http://pbs.twimg.com/profile_background_images/222053103/twitter-image-bg.jpg", "has_extended_profile": false, "profile_link_color": "AB0D0D", "screen_name": "ZNConsulting", "geo_enabled": true, "url": "", "profile_text_color": "000000", "default_profile": false, "utc_offset": 3600, "is_translation_enabled": false, "statuses_count": 5833, "profile_background_image_url_https": "https://pbs.twimg.com/profile_background_images/222053103/twitter-image-bg.jpg", "verified": false, "name": "ZN Consulting", "notifications": false, "protected": false, "id_str": "19908749", "translator_type": "none", "profile_image_url": "http://pbs.twimg.com/profile_images/921307498409230336/VWkCvBcu_normal.jpg", "created_at": "Mon Feb 02 14:28:00 +0000 2009", "time_zone": "Brussels", "listed_count": 253, "follow_request_sent": false, "location": "Brussels; Our website:", "profile_background_color": "F7F7F7", "contributors_enabled": false, "entities": {"url": {"urls": [{"display_url": "znconsulting.com", "expanded_url": "http://znconsulting.com", "url": "", "indices": [0, 23]}]}, "description": {"urls": []}}, "default_profile_image": false, "following": false, "followers_count": 2312, "description": "The #digital communication agency in #Brussels. Strategy, digital campaigns, analysis & #socialmedia. 
#Hyperthinking to give you the #DigitalAdvantage \\ud83d\\ude0e", "profile_sidebar_fill_color": "C7C7C7"}, "in_reply_to_user_id_str": null, "contributors": null, "retweet_count": 0, "text": "Following user feedback, #Google is now blocking intrusive ads with a built-in adblocker in #Chrome \\u26d4\\n\\n", "retweeted": false, "truncated": false, "in_reply_to_user_id": null, "id_str": "967004613332303873", "source": "<a href=\\"https://about.twitter.com/products/tweetdeck\\" rel=\\"nofollow\\">TweetDeck</a>", "created_at": "Fri Feb 23 11:54:00 +0000 2018", "metadata": {"iso_language_code": "en", "result_type": "recent"}, "in_reply_to_screen_name": null, "in_reply_to_status_id_str": null, "entities": {"symbols": [], "urls": [{"display_url": "theguardian.com/technology/201\\u2026", "expanded_url": "https://www.theguardian.com/technology/2018/feb/15/google-adblocker-chrome-browser", "url": "", "indices": [103, 126]}], "user_mentions": [], "hashtags": [{"text": "Google", "indices": [25, 32]}, {"text": "Chrome", "indices": [92, 99]}]}, "coordinates": null, "in_reply_to_status_id": null, "place": null, "favorite_count": 0}')
答案 0(得分:2):
您必须将rdds的dstream转换为数据帧的dstream。对于此类情况,请使用 .foreachRDD 。
from pyspark.sql import SQLContext
# Creating an SQLContext registers it with the SparkContext, which is what
# makes .toDF() available on plain RDDs further down.
sql = SQLContext(sc)
def _construct_key(previous_key, separator, new_key):
if previous_key:
return "{}{}{}".format(previous_key, separator, new_key)
else:
return new_key
def _flatten_JSON(nested_dict, separator='_', root_keys_to_ignore=set()):
assert isinstance(nested_dict, dict)
assert isinstance(separator, str)
flattened_dict = dict()
def _flatten(object_, key):
if isinstance(object_, dict):
for object_key in object_:
if not (not key and object_key in root_keys_to_ignore):
_flatten(object_[object_key], _construct_key(key, separator, object_key))
elif isinstance(object_, list) or isinstance(object_, set):
for index, item in enumerate(object_):
_flatten(item, _construct_key(key, separator, index))
else:
flattened_dict[key] = object_
_flatten(nested_dict, None)
return flattened_dict
def convert(rdd):
    """Flatten every (key, json_string) record and lift the result to a DataFrame."""
    flattened = rdd.map(lambda record: _flatten_JSON(json.loads(record[1])))
    return flattened.toDF()
def write_mongo(rdd):
    """Append one micro-batch of tweets to MongoDB.

    Failures (e.g. an empty batch that cannot be converted to a
    DataFrame) are logged instead of aborting the streaming job.
    NAME / COLLECTION_MONGODB are placeholders defined by the caller.
    """
    import traceback
    try:
        convert(rdd).write\
            .format('com.mongodb.spark.sql.DefaultSource').mode('append')\
            .option('database', NAME).option('collection', COLLECTION_MONGODB).save()
    except Exception:
        # The original bare `except: pass` silently hid every failure
        # (even an undefined NAME); print the traceback for debugging.
        traceback.print_exc()
tweets.foreachRDD(lambda rdd: write_mongo(rdd))
此外,您需要根据您的版本提供conf和包以及spark-submit,
/bin/spark-submit --conf "spark.mongodb.input.uri=mongodb://127.0.0.1/DATABASE.COLLECTION_NAME?readPreference=primaryPreferred"
--conf "spark.mongodb.output.uri=mongodb://127.0.0.1/DATABASE.COLLECTION_NAME"
--packages org.mongodb.spark:mongo-spark-connector_2.11:2.2.0
tester.py
这是createStream获取数据的格式
(None, '{"lang": "en", "id": 967004613332303873, "favorited": false, "possibly_sensitive": false, "is_quote_status": false, "geo": null, "user": {"lang": "en", "profile_use_background_image": false, "profile_banner_url": "https://pbs.twimg.com/profile_banners/19908749/1517325981", "is_translator": false, "id": 19908749, "profile_sidebar_border_color": "FFFFFF", "favourites_count": 1912, "profile_background_tile": false, "profile_image_url_https": "https://pbs.twimg.com/profile_images/921307498409230336/VWkCvBcu_normal.jpg", "friends_count": 1960, "profile_background_image_url": "http://pbs.twimg.com/profile_background_images/222053103/twitter-image-bg.jpg", "has_extended_profile": false, "profile_link_color": "AB0D0D", "screen_name": "ZNConsulting", "geo_enabled": true, "url": "", "profile_text_color": "000000", "default_profile": false, "utc_offset": 3600, "is_translation_enabled": false, "statuses_count": 5833, "profile_background_image_url_https": "https://pbs.twimg.com/profile_background_images/222053103/twitter-image-bg.jpg", "verified": false, "name": "ZN Consulting", "notifications": false, "protected": false, "id_str": "19908749", "translator_type": "none", "profile_image_url": "http://pbs.twimg.com/profile_images/921307498409230336/VWkCvBcu_normal.jpg", "created_at": "Mon Feb 02 14:28:00 +0000 2009", "time_zone": "Brussels", "listed_count": 253, "follow_request_sent": false, "location": "Brussels; Our website:", "profile_background_color": "F7F7F7", "contributors_enabled": false, "entities": {"url": {"urls": [{"display_url": "znconsulting.com", "expanded_url": "http://znconsulting.com", "url": "", "indices": [0, 23]}]}, "description": {"urls": []}}, "default_profile_image": false, "following": false, "followers_count": 2312, "description": "The #digital communication agency in #Brussels. Strategy, digital campaigns, analysis & #socialmedia. 
#Hyperthinking to give you the #DigitalAdvantage \\ud83d\\ude0e", "profile_sidebar_fill_color": "C7C7C7"}, "in_reply_to_user_id_str": null, "contributors": null, "retweet_count": 0, "text": "Following user feedback, #Google is now blocking intrusive ads with a built-in adblocker in #Chrome \\u26d4\\n\\n", "retweeted": false, "truncated": false, "in_reply_to_user_id": null, "id_str": "967004613332303873", "source": "<a href=\\"https://about.twitter.com/products/tweetdeck\\" rel=\\"nofollow\\">TweetDeck</a>", "created_at": "Fri Feb 23 11:54:00 +0000 2018", "metadata": {"iso_language_code": "en", "result_type": "recent"}, "in_reply_to_screen_name": null, "in_reply_to_status_id_str": null, "entities": {"symbols": [], "urls": [{"display_url": "theguardian.com/technology/201\\u2026", "expanded_url": "https://www.theguardian.com/technology/2018/feb/15/google-adblocker-chrome-browser", "url": "", "indices": [103, 126]}], "user_mentions": [], "hashtags": [{"text": "Google", "indices": [25, 32]}, {"text": "Chrome", "indices": [92, 99]}]}, "coordinates": null, "in_reply_to_status_id": null, "place": null, "favorite_count": 0}')