Error "state should be: open" when reading and writing the same DB with Spark mongo-hadoop

Asked: 2015-08-27 04:50:38

Tags: mongodb scala hadoop apache-spark

Question: I have an RDD collected from Kafka; after mapping and reducing it I want to write the result into the database db_aggregate, collection date_xx.

While mapping, I need to read from that same database to get the previously written result.

In other words, to write record A I need record B (written earlier) to compute A, and only then write A to the db.

I think the problem is that, because I read db_aggregate while writing new records, the db client or cursor may get closed by one of the two actions (the write or the read).

I am using Spark 1.4.1, mongo-hadoop 1.4.1 and MongoDB 2.6.
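
Conceptually, the per-key update I want to do for each batch looks like the minimal sketch below. The Totals type and the readPrevious / writeCurrent helpers are placeholders for illustration only; the real lookup and write go through mongo-hadoop in the functions that follow.

// Sketch of the intended per-key update (illustrative names, not the real API):
case class Totals(auctions: Long, wins: Long, budgetSpent: Long)

def updateKey(key: String,
              batch: Totals,
              readPrevious: String => Option[Totals],        // reads B, written in an earlier batch
              writeCurrent: (String, Totals) => Unit): Unit = {
  val prev = readPrevious(key).getOrElse(Totals(0L, 0L, 0L))
  val merged = Totals(prev.auctions + batch.auctions,
                      prev.wins + batch.wins,
                      prev.budgetSpent + batch.budgetSpent)  // A = B + this batch
  writeCurrent(key, merged)                                  // write A back to the db
}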

The functions:

def getPreviousAggregate(campaignId: String, publisher: String, width: Int, height: Int,
                           date: Int, month: Int, year: Int): BasicBSONObject = {
    findLatestAggregate(campaignId, publisher, width, height, date, month, year) match {
      case Some(toReturn) => return toReturn
      case None => {
        println("Not found previous date ....")
        val previousDate = Calendar.getInstance();
        previousDate.set(year, month, date)
        previousDate.add(Calendar.DATE, -1)
        val _date = previousDate.get(Calendar.DATE)
        val _month = previousDate.get(Calendar.MONTH)
        val _year = previousDate.get(Calendar.YEAR)
        findLatestAggregate(campaignId, publisher, width, height, _date, _month, _year) match {
          case Some(toReturn) => return toReturn
          case None => {
          }
        }
      }
    }
    null
  }

  def findLatestAggregate(campaignId: String, publisher: String, width: Int, height: Int,
                          date: Int, month: Int, year: Int): Option[BasicBSONObject] = {
    val config = new Configuration()
    val outDb = DB_AGGREGATE + "_%02d_%s".format(month, year)
    val collName: String = COLL_AGGREGATE + "_%02d".format(date)
    val mongoInputUri = "mongodb://%s:%s/%s.%s".format(DB_STATISTIC_HOST, DB_STATISTIC_PORT, outDb, collName)
    config.set("mongo.input.uri", mongoInputUri)
    try {
      val aggregate = sc.newAPIHadoopRDD(config,
        classOf[MongoInputFormat],
        classOf[Object],
        classOf[BSONObject])
      val res = aggregate.sortBy(k => k._2.get("timestamp").toString, true).filter(r =>
        //        Integer.parseInt(r._2.get("timestamp").toString) <= timestamp - BATCH_TIME
        //          &&
        Integer.parseInt(r._2.get("width").toString) == width
          && Integer.parseInt(r._2.get("height").toString) == height
          && r._2.get("publisher").toString == publisher
          && r._2.get("campaignId").toString == campaignId
      ).map(x => x._2).take(1)

      if (res.nonEmpty) {
        println("\nfound previous record")
        val bson = new BasicBSONObject()
        val collect: BSONObject = res(0)
        bson.put("totalBudgetSpent", collect.get("totalBudgetSpent"))
        bson.put("totalAuctions", collect.get("totalAuctions"))
        bson.put("totalWin", collect.get("totalWin"))
        return Some(bson)
      }
    }
    catch {
      case ex: MongoCommandException => {
        println(ex.getMessage)
      }
    }
    None
  }

The main action is here:

// store aggregate data in mongo
    val outDb = DB_AGGREGATE + "_%02d_%s".format(month, year)
    val _config = new Configuration()

    val collName: String = COLL_AGGREGATE + "_%02d".format(date)
    val mongoOutputAggregateUri: String = "mongodb://%s:%s/%s.%s".format(DB_STATISTIC_HOST, DB_STATISTIC_PORT, outDb, collName)
    _config.set("mongo.output.uri", mongoOutputAggregateUri)
    bidWinBSON.map(x => {
      (x._1, (totalAuctions, totalWin, totalBudgetSpent))
    }).reduceByKey((a, b) => {
      (a._1 + b._1,
        a._2 + b._2,
        a._3 + b._3)
    }).map(x => {
      val timestamp: java.lang.Long = java.lang.Long.parseLong(x._1._1.toString)
      val campaignId = x._1._2.toString
      val publisher = x._1._3
      val width = x._1._4
      val height = x._1._5

      //      get previous aggregate
      val previousResult = getPreviousAggregate(campaignId, publisher, width, height, date, month, year)
      if (previousResult != null) {
        print("\n\ngetPreviousAggregate\n\n")
        totalBudgetSpent += java.lang.Long.parseLong(previousResult.get("totalBudgetSpent").toString)
        totalAuctions += java.lang.Long.parseLong(previousResult.get("totalAuctions").toString)
        totalWin += java.lang.Long.parseLong(previousResult.get("totalWin").toString)
      }
      println("\nAggregate ............................")
      val bson = new BasicBSONObject()
      bson.put("timestamp", timestamp)
      bson.put("campaignId", campaignId)
      bson.put("publisher", publisher)
      bson.put("width", width)
      bson.put("height", height)
      bson.put("totalAuctions", totalAuctions)
      bson.put("totalWin", totalWin)
      bson.put("totalBudgetSpent", totalBudgetSpent)
      (null, bson)
    }).saveAsNewAPIHadoopFile("file:///xxx",
        classOf[Any],
        classOf[Any],
        classOf[MongoOutputFormat[Any, Any]],
        _config)

I get this error:

15/08/27 10:35:44 ERROR Executor: Exception in task 0.0 in stage 19.0 (TID 23)
java.lang.IllegalStateException: state should be: open
    at com.mongodb.assertions.Assertions.isTrue(Assertions.java:70)
    at com.mongodb.connection.BaseCluster.selectServer(BaseCluster.java:79)
    at com.mongodb.binding.ClusterBinding$ClusterBindingConnectionSource.<init>(ClusterBinding.java:75)
    at com.mongodb.binding.ClusterBinding$ClusterBindingConnectionSource.<init>(ClusterBinding.java:71)
    at com.mongodb.binding.ClusterBinding.getWriteConnectionSource(ClusterBinding.java:68)
    at com.mongodb.operation.OperationHelper.withConnection(OperationHelper.java:175)
    at com.mongodb.operation.MixedBulkWriteOperation.execute(MixedBulkWriteOperation.java:141)
    at com.mongodb.operation.MixedBulkWriteOperation.execute(MixedBulkWriteOperation.java:72)
    at com.mongodb.Mongo.execute(Mongo.java:745)
    at com.mongodb.Mongo$2.execute(Mongo.java:728)
    at com.mongodb.DBCollection.executeBulkWriteOperation(DBCollection.java:1968)
    at com.mongodb.DBCollection.executeBulkWriteOperation(DBCollection.java:1962)
    at com.mongodb.BulkWriteOperation.execute(BulkWriteOperation.java:98)
    at com.mongodb.hadoop.output.MongoOutputCommitter.commitTask(MongoOutputCommitter.java:133)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1$$anonfun$12.apply(PairRDDFunctions.scala:1045)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1$$anonfun$12.apply(PairRDDFunctions.scala:1014)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
    at org.apache.spark.scheduler.Task.run(Task.scala:70)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    at java.lang.Thread.run(Thread.java:745)
15/08/27 10:35:44 WARN TaskSetManager: Lost task 0.0 in stage 19.0 (TID 23, localhost): java.lang.IllegalStateException: state should be: open
    ... (same stack trace as the ERROR above)
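
Given my suspicion above that the concurrent read and write interfere with each other, one restructuring I am considering (a sketch only, not a verified fix) is to read the previous day's aggregates once on the driver, collect them into a map, and broadcast that map, so that no Mongo connection is opened inside the per-record closure. The key tuple, the field names, and the prevInputUri placeholder below are assumptions based on the schema used above.

import org.apache.hadoop.conf.Configuration
import com.mongodb.hadoop.MongoInputFormat
import org.bson.BSONObject

// Sketch only: load the previous aggregates once on the driver and broadcast them.
// Assumes one document per (campaignId, publisher, width, height) in that collection;
// if there can be several, the latest by "timestamp" should be picked before .toMap.
val prevConfig = new Configuration()
prevConfig.set("mongo.input.uri", prevInputUri)   // e.g. the previous db_aggregate_xx.date_xx

val previousTotals: Map[(String, String, Int, Int), (Long, Long, Long)] =
  sc.newAPIHadoopRDD(prevConfig, classOf[MongoInputFormat], classOf[Object], classOf[BSONObject])
    .map { case (_, doc) =>
      val key = (doc.get("campaignId").toString, doc.get("publisher").toString,
                 doc.get("width").toString.toInt, doc.get("height").toString.toInt)
      val totals = (doc.get("totalAuctions").toString.toLong,
                    doc.get("totalWin").toString.toLong,
                    doc.get("totalBudgetSpent").toString.toLong)
      (key, totals)
    }
    .collect()
    .toMap

val previousTotalsBc = sc.broadcast(previousTotals)
// Inside the aggregation map, previousTotalsBc.value.get(key) would then replace
// the getPreviousAggregate call, keeping all Mongo reads on the driver.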

0 Answers:

No answers yet.