在PySpark中解析JSON数据并保存到MongoDB

时间:2018-02-22 15:05:12

标签: json mongodb apache-spark pyspark spark-dataframe

我正在使用PySpark并从Kafka Broker获取数据。

以下代码可以帮助我做到这一点:

import json
import sys
from pyspark import Row
from pyspark.sql.types import *
from pyspark import SparkContext
from pyspark.sql.functions import *
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils, TopicAndPartition
from pymongo import MongoClient

# Create the SparkContext; "local[*]" runs Spark locally with one worker
# thread per available core.
sc = SparkContext("local[*]", appName="TwitterStreamKafka")

# Create the StreamingContext on top of it; the second argument (1) is the
# batch interval in seconds.
ssc = StreamingContext(sc, 1)

# Receiver-based Kafka stream: ZooKeeper quorum at localhost:2181, consumer
# group "spark-streaming-consumer", topic "twitter" read with 1 thread.
tweets = KafkaUtils.createStream(ssc, "localhost:2181", "spark-streaming-consumer", {"twitter": 1})

# Print the first few records of every batch to stdout.
tweets.pprint()

# Start the streaming computation, then block until it is stopped.
# All DStream operations must be registered before start() is called.
ssc.start()
ssc.awaitTermination()

使用以下代码从推文中提取JSON:

# Each Kafka record arrives as a (key, value) pair; the tweet's JSON text is
# the value at index 1, so parse that element into a Python object.
tweet_json = tweets.map(lambda record: json.loads(record[1]))

现在我想将此 tweet_json 插入MongoDB。但无法做到这一点。

我查看了 Spark-MongoDB 连接器(Mongo Spark Connector)的文档,其中说明需要先得到 DataFrame 才能把数据存入 MongoDB。

tweet_json 的类型为 “pyspark.streaming.dstream.TransformedDStream”

如何将其转换为Dataframe以存储到MongoDB中?

如何使用PySpark将获取的推文保存到MongoDB

提前谢谢!

根据建议,我把程序修改成了下面这样:

import json
import sys
from pyspark import Row
from pyspark.sql.types import *
from pyspark import SparkContext
from pyspark.sql.functions import *
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils, TopicAndPartition
from pymongo import MongoClient
def convert(rdd):
    """Turn one micro-batch of Kafka records into a Spark DataFrame.

    Args:
        rdd: RDD of ``(key, value)`` pairs as produced by
            ``KafkaUtils.createStream``; ``value`` is the tweet's JSON text.

    Returns:
        A DataFrame built from the parsed JSON objects.
    """
    # RDD.toDF() is only attached to RDDs once a SparkSession/SQLContext has
    # been created; calling it without one raises
    # "'PipelinedRDD' object has no attribute 'toDF'". Obtain (or lazily
    # create) the session from the RDD's own SparkContext configuration and
    # build the DataFrame through it explicitly.
    spark = SparkSession.builder.config(conf=rdd.context.getConf()).getOrCreate()
    parsed = rdd.map(lambda record: json.loads(record[1]))
    return spark.createDataFrame(parsed)
# Create the SparkContext; "local[*]" runs Spark locally with one worker
# thread per available core.
sc = SparkContext("local[*]", appName="TwitterStreamKafka")
# Create the StreamingContext; the second argument (1) is the batch interval
# in seconds.
ssc = StreamingContext(sc, 1)
# Receiver-based Kafka stream: ZooKeeper quorum at localhost:2181, consumer
# group "spark-streaming-consumer", topic "twitter" read with 1 thread.
tweets = KafkaUtils.createStream(ssc, "localhost:2181", "spark-streaming-consumer", {"twitter": 1})
# Print the first few records of every batch to stdout.
tweets.pprint()
# Append every micro-batch to MongoDB. The database and collection names must
# be string literals: the bare identifiers Twitter / RestTwitter were
# undefined names and raised NameError as soon as a batch was processed.
# NOTE(review): these options override the database/collection given in
# spark.mongodb.output.uri -- confirm which names are actually intended.
tweets.foreachRDD(lambda rdd: convert(rdd).write
                  .format('com.mongodb.spark.sql.DefaultSource').mode('append')
                  .option('database', 'Twitter').option('collection', 'RestTwitter').save())
# Start the streaming computation, then block until it is stopped.
ssc.start()
ssc.awaitTermination()

然后用下面的命令通过 spark-submit 提交作业:

# The connector reads its settings from spark.mongodb.input.uri and
# spark.mongodb.output.uri; query options such as readPreference follow a "?".
spark-submit --conf "spark.mongodb.input.uri=mongodb://127.0.0.1/TwitterDB.RestData?readPreference=primaryPreferred" --conf "spark.mongodb.output.uri=mongodb://127.0.0.1/TwitterDB.RestData" --packages org.mongodb.spark:mongo-spark-connector_2.11:2.0.0 --jars spark-streaming-kafka-0-8-assembly_2.11-2.0.0.jar PySparkTwitter.py

它引发了以下错误:

  File "/home/hduser/test/PySparkTwitter.py", line 24, in convert
    df_json = rdd.map(lambda x: json.loads(x[1])).toDF()
AttributeError: 'PipelinedRDD' object has no attribute 'toDF'

CreateStream的数据格式:

(None, '{"lang": "en", "id": 967004613332303873, "favorited": false, "possibly_sensitive": false, "is_quote_status": false, "geo": null, "user": {"lang": "en", "profile_use_background_image": false, "profile_banner_url": "https://pbs.twimg.com/profile_banners/19908749/1517325981", "is_translator": false, "id": 19908749, "profile_sidebar_border_color": "FFFFFF", "favourites_count": 1912, "profile_background_tile": false, "profile_image_url_https": "https://pbs.twimg.com/profile_images/921307498409230336/VWkCvBcu_normal.jpg", "friends_count": 1960, "profile_background_image_url": "http://pbs.twimg.com/profile_background_images/222053103/twitter-image-bg.jpg", "has_extended_profile": false, "profile_link_color": "AB0D0D", "screen_name": "ZNConsulting", "geo_enabled": true, "url": "", "profile_text_color": "000000", "default_profile": false, "utc_offset": 3600, "is_translation_enabled": false, "statuses_count": 5833, "profile_background_image_url_https": "https://pbs.twimg.com/profile_background_images/222053103/twitter-image-bg.jpg", "verified": false, "name": "ZN Consulting", "notifications": false, "protected": false, "id_str": "19908749", "translator_type": "none", "profile_image_url": "http://pbs.twimg.com/profile_images/921307498409230336/VWkCvBcu_normal.jpg", "created_at": "Mon Feb 02 14:28:00 +0000 2009", "time_zone": "Brussels", "listed_count": 253, "follow_request_sent": false, "location": "Brussels; Our website:", "profile_background_color": "F7F7F7", "contributors_enabled": false, "entities": {"url": {"urls": [{"display_url": "znconsulting.com", "expanded_url": "http://znconsulting.com", "url": "", "indices": [0, 23]}]}, "description": {"urls": []}}, "default_profile_image": false, "following": false, "followers_count": 2312, "description": "The #digital communication agency in #Brussels. Strategy, digital campaigns, analysis & #socialmedia. 
#Hyperthinking to give you the #DigitalAdvantage \\ud83d\\ude0e", "profile_sidebar_fill_color": "C7C7C7"}, "in_reply_to_user_id_str": null, "contributors": null, "retweet_count": 0, "text": "Following user feedback, #Google is now blocking intrusive ads with a built-in adblocker in #Chrome \\u26d4\\n\\n", "retweeted": false, "truncated": false, "in_reply_to_user_id": null, "id_str": "967004613332303873", "source": "<a href=\\"https://about.twitter.com/products/tweetdeck\\" rel=\\"nofollow\\">TweetDeck</a>", "created_at": "Fri Feb 23 11:54:00 +0000 2018", "metadata": {"iso_language_code": "en", "result_type": "recent"}, "in_reply_to_screen_name": null, "in_reply_to_status_id_str": null, "entities": {"symbols": [], "urls": [{"display_url": "theguardian.com/technology/201\\u2026", "expanded_url": "https://www.theguardian.com/technology/2018/feb/15/google-adblocker-chrome-browser", "url": "", "indices": [103, 126]}], "user_mentions": [], "hashtags": [{"text": "Google", "indices": [25, 32]}, {"text": "Chrome", "indices": [92, 99]}]}, "coordinates": null, "in_reply_to_status_id": null, "place": null, "favorite_count": 0}')

1 个答案:

答案 0 :(得分:2)

您必须把 DStream 中的每个 RDD 转换为 DataFrame 再写入 MongoDB。对于这种情况,请使用 .foreachRDD:

from pyspark.sql import SQLContext

# Instantiate an SQLContext on the existing SparkContext `sc`. PySpark only
# attaches toDF() to RDDs once a SQLContext/SparkSession has been created;
# without this line convert() fails with the AttributeError shown in the
# question ('PipelinedRDD' object has no attribute 'toDF').
sql = SQLContext(sc)

def _construct_key(previous_key, separator, new_key):
    """Join *new_key* onto *previous_key* using *separator*.

    A falsy *previous_key* (for example ``None`` at the root level) means
    there is no prefix yet, so *new_key* is returned unchanged.
    """
    if not previous_key:
        return new_key
    return "{}{}{}".format(previous_key, separator, new_key)

def _flatten_JSON(nested_dict, separator='_', root_keys_to_ignore=None):
    """Flatten a nested JSON-like dict into a single-level dict.

    Nested dict keys and list/set positions are joined with *separator*, so
    ``{"user": {"id": 1}, "tags": ["a", "b"]}`` becomes
    ``{"user_id": 1, "tags_0": "a", "tags_1": "b"}``.

    Args:
        nested_dict: The dict to flatten.
        separator: String placed between the parts of a composed key.
        root_keys_to_ignore: Optional collection of top-level keys to leave
            out of the result. Defaults to ignoring nothing.

    Returns:
        A new flat dict mapping composed keys to leaf values.

    Raises:
        AssertionError: If *nested_dict* is not a dict or *separator* is not
            a str (only while assertions are enabled).

    Notes:
        Empty dicts, lists and sets contribute no key at all, and two
        different paths that compose to the same key overwrite each other.
    """
    assert isinstance(nested_dict, dict)
    assert isinstance(separator, str)
    # A None sentinel replaces the former mutable default (``set()``), which
    # would have been one shared object across all calls.
    if root_keys_to_ignore is None:
        root_keys_to_ignore = frozenset()
    flattened_dict = {}

    def _flatten(object_, key):
        if isinstance(object_, dict):
            for object_key, value in object_.items():
                # The ignore list applies only while there is no key prefix
                # yet, i.e. at the root of the structure.
                if not key and object_key in root_keys_to_ignore:
                    continue
                _flatten(value, _construct_key(key, separator, object_key))
        elif isinstance(object_, (list, set)):
            # Sequence items are keyed by their position.
            for index, item in enumerate(object_):
                _flatten(item, _construct_key(key, separator, index))
        else:
            flattened_dict[key] = object_

    _flatten(nested_dict, None)
    return flattened_dict

def convert(rdd):
    """Parse and flatten each record's JSON payload, then build a DataFrame.

    Each record is indexed at position 1 to get the JSON text, which is
    decoded with ``json.loads`` and flattened with ``_flatten_JSON``.
    """
    def _to_flat_dict(record):
        return _flatten_JSON(json.loads(record[1]))

    return rdd.map(_to_flat_dict).toDF()

def write_mongo(rdd):
    """Append one micro-batch of tweets to MongoDB on a best-effort basis.

    The batch is converted with ``convert`` and written through the MongoDB
    Spark connector data source in append mode. A failing batch is logged and
    skipped so that the stream keeps running.

    Args:
        rdd: The RDD for the current batch interval.

    NAME and COLLECTION_MONGODB are not defined in this snippet; they must be
    module-level strings holding the target database and collection names.
    """
    # Imported locally so the snippet stays self-contained.
    import logging

    try:
        # Intervals without new Kafka messages produce empty RDDs, from which
        # no DataFrame schema can be inferred -- nothing to write, so skip.
        if rdd.isEmpty():
            return
        (convert(rdd).write
            .format('com.mongodb.spark.sql.DefaultSource')
            .mode('append')
            .option('database', NAME)
            .option('collection', COLLECTION_MONGODB)
            .save())
    except Exception:
        # The original bare ``except: pass`` hid every failure (including
        # undefined names and connector errors). Stay best-effort, but leave
        # a traceback, and no longer swallow KeyboardInterrupt/SystemExit.
        logging.getLogger(__name__).exception(
            "Failed to write tweet batch to MongoDB")

# Run write_mongo on the RDD of every batch interval.
tweets.foreachRDD(write_mongo)

此外,运行 spark-submit 时,您还需要根据自己所用的版本提供相应的 --conf 配置和 --packages 依赖包:

# The connector reads its settings from spark.mongodb.input.uri and
# spark.mongodb.output.uri. Each continued line must end with a backslash.
/bin/spark-submit --conf "spark.mongodb.input.uri=mongodb://127.0.0.1/DATABASE.COLLECTION_NAME?readPreference=primaryPreferred" \
                  --conf "spark.mongodb.output.uri=mongodb://127.0.0.1/DATABASE.COLLECTION_NAME" \
                  --packages org.mongodb.spark:mongo-spark-connector_2.11:2.2.0 \
                  tester.py

这是createStream获取数据的格式

(None, '{"lang": "en", "id": 967004613332303873, "favorited": false, "possibly_sensitive": false, "is_quote_status": false, "geo": null, "user": {"lang": "en", "profile_use_background_image": false, "profile_banner_url": "https://pbs.twimg.com/profile_banners/19908749/1517325981", "is_translator": false, "id": 19908749, "profile_sidebar_border_color": "FFFFFF", "favourites_count": 1912, "profile_background_tile": false, "profile_image_url_https": "https://pbs.twimg.com/profile_images/921307498409230336/VWkCvBcu_normal.jpg", "friends_count": 1960, "profile_background_image_url": "http://pbs.twimg.com/profile_background_images/222053103/twitter-image-bg.jpg", "has_extended_profile": false, "profile_link_color": "AB0D0D", "screen_name": "ZNConsulting", "geo_enabled": true, "url": "", "profile_text_color": "000000", "default_profile": false, "utc_offset": 3600, "is_translation_enabled": false, "statuses_count": 5833, "profile_background_image_url_https": "https://pbs.twimg.com/profile_background_images/222053103/twitter-image-bg.jpg", "verified": false, "name": "ZN Consulting", "notifications": false, "protected": false, "id_str": "19908749", "translator_type": "none", "profile_image_url": "http://pbs.twimg.com/profile_images/921307498409230336/VWkCvBcu_normal.jpg", "created_at": "Mon Feb 02 14:28:00 +0000 2009", "time_zone": "Brussels", "listed_count": 253, "follow_request_sent": false, "location": "Brussels; Our website:", "profile_background_color": "F7F7F7", "contributors_enabled": false, "entities": {"url": {"urls": [{"display_url": "znconsulting.com", "expanded_url": "http://znconsulting.com", "url": "", "indices": [0, 23]}]}, "description": {"urls": []}}, "default_profile_image": false, "following": false, "followers_count": 2312, "description": "The #digital communication agency in #Brussels. Strategy, digital campaigns, analysis & #socialmedia. 
#Hyperthinking to give you the #DigitalAdvantage \\ud83d\\ude0e", "profile_sidebar_fill_color": "C7C7C7"}, "in_reply_to_user_id_str": null, "contributors": null, "retweet_count": 0, "text": "Following user feedback, #Google is now blocking intrusive ads with a built-in adblocker in #Chrome \\u26d4\\n\\n", "retweeted": false, "truncated": false, "in_reply_to_user_id": null, "id_str": "967004613332303873", "source": "<a href=\\"https://about.twitter.com/products/tweetdeck\\" rel=\\"nofollow\\">TweetDeck</a>", "created_at": "Fri Feb 23 11:54:00 +0000 2018", "metadata": {"iso_language_code": "en", "result_type": "recent"}, "in_reply_to_screen_name": null, "in_reply_to_status_id_str": null, "entities": {"symbols": [], "urls": [{"display_url": "theguardian.com/technology/201\\u2026", "expanded_url": "https://www.theguardian.com/technology/2018/feb/15/google-adblocker-chrome-browser", "url": "", "indices": [103, 126]}], "user_mentions": [], "hashtags": [{"text": "Google", "indices": [25, 32]}, {"text": "Chrome", "indices": [92, 99]}]}, "coordinates": null, "in_reply_to_status_id": null, "place": null, "favorite_count": 0}')