Cannot save a DataFrame to MongoDB

Time: 2019-08-25 21:09:22

Tags: java scala dataframe apache-spark apache-spark-sql

I have Scala code that reads data from Twitter with the help of streaming, and I want to do the same thing in Java. I am trying to serialize the data with the help of Jackson, but I get an error on this line:

MongoSpark.save(dataFrame, writeConfig);

It is underlined with "Cannot resolve method save(org.apache.spark.sql.DataFrame, com.mongodb.spark.config.WriteConfig)". Can I do the same thing in some other way? I am also confused by this Scala line:

MongoSpark.save(rawTweetsDF.coalesce(1).write.format("org.apache.spark.sql.json").option("forensicdb", "LiveRawTweets").mode("append"), writeConfig)

Can I do the same in Java? P.S. I am using Spark version 1.6.2.

object tweetstreamingmodel {
 //***********************************************************************************
 @transient
 @volatile private
 var spark_SparkSession: SparkSession = _ //Equivalent of SQLContext
 val naivemodelpth = "/home/smulwa/data/naiveBayesModel"
 case class SummaryStats(Recall: Double, Precision: Double, F1measure: Double, Accuracy: Double)
 var tweetcategory: String = _
 //***********************************************************************************
 def main(args: Array[String]) {
   try {
    var totalTweets: Long = 0
    if (spark_SparkSession == null) {
     spark_SparkSession = SentUtilities.getSparkSession() //Get Spark Session Object
    }
    val spark_streamcontext = SentUtilities.getSparkStreamingContext(spark_SparkSession.sparkContext)
    spark_streamcontext.checkpoint("hdfs://KENBO-SPK08.forensics.net:54310/checkpoint/")
    // Load Naive Bayes Model from local drive.
    val sqlcontext = spark_SparkSession.sqlContext //Create SQLContext from SparkSession Object
    import sqlcontext.implicits._
    val twitteroAuth: Some[OAuthAuthorization] = OAuthUtilities.getTwitterOAuth()
    val tweetfilters = MongoScalaUtil.getTweetFilters(spark_SparkSession)
    val Twitterstream: DStream[Status] = TwitterUtils.createStream(spark_streamcontext, twitteroAuth, tweetfilters,
     StorageLevel.MEMORY_AND_DISK_SER).filter(_.getLang() == "en")

    Twitterstream.foreachRDD {
     rdd =>
      if (rdd != null && !rdd.isEmpty() && !rdd.partitions.isEmpty) {
       saveRawTweetsToMongoDB(rdd)
       rdd.foreachPartition {
        partitionOfRecords =>
         if (!partitionOfRecords.isEmpty) {
          partitionOfRecords.foreach(record =>
            MongoScalaUtil.SaveRawtweetstoMongodb(record.toString, record.getUser.getId, record.getId, SentUtilities.getStreamDate(), SentUtilities.getStreamTime())) //mongo_utilities.save(record.toString, spark_SparkSession.sparkContext)
         }
       }
      }
    }



    val jacksonObjectMapper: ObjectMapper = new ObjectMapper()
    // @param rdd -- RDD of Status objects to save.
    def saveRawTweetsToMongoDB(rdd: RDD[Status]): Unit = {
     try {
      val sqlContext = spark_SparkSession.sqlContext
      val tweet = rdd.map(status => jacksonObjectMapper.writeValueAsString(status))
      val rawTweetsDF = sqlContext.read.json(tweet)
      val readConfig: ReadConfig = ReadConfig(Map("uri" ->
       "mongodb://10.0.10.100:27017/forensicdb.LiveRawTweets?readPreference=primaryPreferred"))
      val writeConfig: WriteConfig = WriteConfig(Map("uri" ->
       "mongodb://10.0.10.100:27017/forensicdb.LiveRawTweets"))
      MongoSpark.save(rawTweetsDF.coalesce(1).write.format("org.apache.spark.sql.json").option("forensicdb",
       "LiveRawTweets").mode("append"), writeConfig)
     } catch {
       case e: Exception => println("Error Saving tweets to Mongodb: " + e)
     }
    }
    }
  }
 }

And the Java analogue:

public class Main {

 // Set system credentials for access to twitter
 private static void setTwitterOAuth() {
  System.setProperty("twitter4j.oauth.consumerKey", TwitterCredentials.consumerKey);
  System.setProperty("twitter4j.oauth.consumerSecret", TwitterCredentials.consumerSecret);
  System.setProperty("twitter4j.oauth.accessToken", TwitterCredentials.accessToken);
  System.setProperty("twitter4j.oauth.accessTokenSecret", TwitterCredentials.accessTokenSecret);
 }


 public static void main(String[] args) {
  setTwitterOAuth();

  SparkConf conf = new SparkConf().setMaster("local[2]")
   .setAppName("SparkTwitter");


  // Spark contexts
  JavaSparkContext sparkContext = new JavaSparkContext(conf);
  JavaStreamingContext jssc = new JavaStreamingContext(sparkContext, new Duration(1000));

   JavaReceiverInputDStream<Status> twitterStream = TwitterUtils.createStream(jssc);

   // Stream that contains just tweets in English
   JavaDStream<Status> enTweetsDStream = twitterStream.filter((status) -> "en".equalsIgnoreCase(status.getLang()));
   enTweetsDStream.persist(StorageLevel.MEMORY_AND_DISK());

  enTweetsDStream.foreachRDD(rdd -> {
   if (rdd != null && !rdd.isEmpty() && !rdd.partitions().isEmpty()) {
     saveRawTweetsToMongoDb(rdd, sparkContext);
   }
  });

  enTweetsDStream.print();
  jssc.start();
  jssc.awaitTermination();
 }

  static void saveRawTweetsToMongoDb(JavaRDD<Status> rdd, JavaSparkContext sparkContext) {
   try {
    ObjectMapper objectMapper = new ObjectMapper();
    Function<Status, String> toJsonString = status -> objectMapper.writeValueAsString(status);
    SQLContext sqlContext = new SQLContext(sparkContext);
    JavaRDD<String> tweet = rdd.map(toJsonString);

    DataFrame dataFrame = sqlContext.read().json(tweet);

    // Settings for reading
    Map<String, String> readOverrides = new HashMap<>();
    readOverrides.put("uri", "mongodb://127.0.0.1/forensicdb.LiveRawTweets");
    readOverrides.put("readPreference", "primaryPreferred");
    ReadConfig readConfig = ReadConfig.create(sparkContext).withJavaOptions(readOverrides);

    // Settings for writing
    Map<String, String> writeOverrides = new HashMap<>();
    writeOverrides.put("uri", "mongodb://127.0.0.1/forensicdb.LiveRawTweets");
    WriteConfig writeConfig = WriteConfig.create(sparkContext).withJavaOptions(writeOverrides);
   MongoSpark.write(dataFrame).option("collection", "LiveRawTweets").mode("append").save();
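    // The next line is the one the IDE underlines: Cannot resolve method save(DataFrame, WriteConfig)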
   MongoSpark.save(dataFrame, writeConfig);

  } catch (Exception e) {
   System.out.println("Error saving to database");
  }
  }
 }
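
For reference, here is a minimal sketch of how the failing call might be rewritten so that it matches the overload the Scala snippet above relies on, i.e. save(DataFrameWriter, WriteConfig) rather than save(DataFrame, WriteConfig). This is only an illustration under that assumption; it has not been checked against the 1.x connector's Javadoc, and the collection name is simply taken from the existing MongoSpark.write(...) line:

// Sketch only: pass a DataFrameWriter (dataFrame.write()) plus the WriteConfig,
// mirroring the Scala call MongoSpark.save(rawTweetsDF.coalesce(1).write..., writeConfig).
MongoSpark.save(
    dataFrame.coalesce(1)
        .write()
        .option("collection", "LiveRawTweets")
        .mode("append"),
    writeConfig);

Note that the method above already contains MongoSpark.write(dataFrame).option("collection", "LiveRawTweets").mode("append").save(), which is the writer-based variant, so keeping both that line and a second save call would write the DataFrame twice.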

0 Answers:

There are no answers yet.