The scenario:
After mapping, I collect an RDD from Kafka and reduce it, and I want to write the result into the database db_aggregate, collection date_xx.
While mapping I need to read from that same database to get the previous result.
That is, to write record A I need the result of B (written earlier) for the calculation, and only then is A written to the db.
I think the problem I am facing is that, while I am writing new records and reading db_aggregate at the same time, the db cursor may get closed by either the write or the read action.
I am using Spark 1.4.1, mongo-hadoop 1.4.1 and MongoDB 2.6.
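Just to make the "A needs B's result" dependency concrete, here is a minimal sketch (an illustration only, not my actual code; only the three counter names match the job below, Aggregate and nextAggregate are hypothetical):

// Illustration only: the new aggregate A is the previously written aggregate B
// plus the totals of the current batch. Aggregate / nextAggregate are hypothetical names.
case class Aggregate(totalAuctions: Long, totalWin: Long, totalBudgetSpent: Long)

def nextAggregate(previous: Option[Aggregate], batch: Aggregate): Aggregate =
  previous match {
    case Some(b) => Aggregate(
      b.totalAuctions + batch.totalAuctions,
      b.totalWin + batch.totalWin,
      b.totalBudgetSpent + batch.totalBudgetSpent)
    case None => batch // no earlier record for this key yet
  }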
The functions:
def getPreviousAggregate(campaignId: String, publisher: String, width: Int, height: Int,
    date: Int, month: Int, year: Int): BasicBSONObject = {
  findLatestAggregate(campaignId, publisher, width, height, date, month, year) match {
    case Some(toReturn) => return toReturn
    case None =>
      println("Not found previous date ....")
      // fall back to the previous day's collection
      val previousDate = Calendar.getInstance()
      previousDate.set(year, month, date)
      previousDate.add(Calendar.DATE, -1)
      val _date = previousDate.get(Calendar.DATE)
      val _month = previousDate.get(Calendar.MONTH)
      val _year = previousDate.get(Calendar.YEAR)
      findLatestAggregate(campaignId, publisher, width, height, _date, _month, _year) match {
        case Some(toReturn) => return toReturn
        case None =>
      }
  }
  null
}
def findLatestAggregate(campaignId: String, publisher: String, width: Int, height: Int,
    date: Int, month: Int, year: Int): Option[BasicBSONObject] = {
  val config = new Configuration()
  val outDb = DB_AGGREGATE + "_%02d_%s".format(month, year)
  val collName: String = COLL_AGGREGATE + "_%02d".format(date)
  val mongoInputUri = "mongodb://%s:%s/%s.%s".format(DB_STATISTIC_HOST, DB_STATISTIC_PORT, outDb, collName)
  config.set("mongo.input.uri", mongoInputUri)
  try {
    // builds a new RDD over the Mongo collection for this single lookup
    val aggregate = sc.newAPIHadoopRDD(config,
      classOf[MongoInputFormat],
      classOf[Object],
      classOf[BSONObject])
    val res = aggregate.sortBy(k => k._2.get("timestamp").toString, true).filter(r =>
      // Integer.parseInt(r._2.get("timestamp").toString) <= timestamp - BATCH_TIME
      // &&
      Integer.parseInt(r._2.get("width").toString) == width
        && Integer.parseInt(r._2.get("height").toString) == height
        && r._2.get("publisher").toString == publisher
        && r._2.get("campaignId").toString == campaignId
    ).map(x => x._2).take(1)
    if (res.nonEmpty) {
      println("\nfound previous record")
      val bson = new BasicBSONObject()
      val collect: BSONObject = res(0)
      bson.put("totalBudgetSpent", collect.get("totalBudgetSpent"))
      bson.put("totalAuctions", collect.get("totalAuctions"))
      bson.put("totalWin", collect.get("totalWin"))
      return Some(bson)
    }
  } catch {
    case ex: MongoCommandException =>
      println(ex.getMessage)
  }
  None
}
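For reference, the same "latest matching document" lookup could also be expressed as a plain query through the MongoDB Java driver instead of building a new RDD for every call. This is only a sketch, not the code that produced the error; it assumes DB_STATISTIC_HOST / DB_STATISTIC_PORT point at a reachable mongod and that width/height are stored as numbers:

import com.mongodb.{BasicDBObject, MongoClient}

// Sketch only: a direct-driver equivalent of findLatestAggregate.
def findLatestAggregateDirect(campaignId: String, publisher: String, width: Int, height: Int,
    date: Int, month: Int, year: Int): Option[BasicBSONObject] = {
  val client = new MongoClient(DB_STATISTIC_HOST, DB_STATISTIC_PORT.toInt)
  try {
    val coll = client
      .getDB(DB_AGGREGATE + "_%02d_%s".format(month, year))
      .getCollection(COLL_AGGREGATE + "_%02d".format(date))
    val query = new BasicDBObject("campaignId", campaignId)
      .append("publisher", publisher)
      .append("width", width)
      .append("height", height)
    // newest document first, keep only one
    val cursor = coll.find(query).sort(new BasicDBObject("timestamp", -1)).limit(1)
    if (cursor.hasNext) {
      val doc = cursor.next()
      val bson = new BasicBSONObject()
      bson.put("totalBudgetSpent", doc.get("totalBudgetSpent"))
      bson.put("totalAuctions", doc.get("totalAuctions"))
      bson.put("totalWin", doc.get("totalWin"))
      Some(bson)
    } else {
      None
    }
  } finally {
    client.close()
  }
}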
The main action is here:
// store aggregate data in mongo
val outDb = DB_AGGREGATE + "_%02d_%s".format(month, year)
val _config = new Configuration()
val collName: String = COLL_AGGREGATE + "_%02d".format(date)
val mongoOutputAggregateUri: String = "mongodb://%s:%s/%s.%s".format(DB_STATISTIC_HOST, DB_STATISTIC_PORT, outDb, collName)
_config.set("mongo.output.uri", mongoOutputAggregateUri)
bidWinBSON.map(x => {
  (x._1, (totalAuctions, totalWin, totalBudgetSpent))
}).reduceByKey((a, b) => {
  (a._1 + b._1,
    a._2 + b._2,
    a._3 + b._3)
}).map(x => {
  val timestamp: java.lang.Long = java.lang.Long.parseLong(x._1._1.toString)
  val campaignId = x._1._2.toString
  val publisher = x._1._3
  val width = x._1._4
  val height = x._1._5
  // get previous aggregate
  val previousResult = getPreviousAggregate(campaignId, publisher, width, height, date, month, year)
  if (previousResult != null) {
    print("\n\ngetPreviousAggregate\n\n")
    totalBudgetSpent += java.lang.Long.parseLong(previousResult.get("totalBudgetSpent").toString)
    totalAuctions += java.lang.Long.parseLong(previousResult.get("totalAuctions").toString)
    totalWin += java.lang.Long.parseLong(previousResult.get("totalWin").toString)
  }
  println("\nAggregate ............................")
  val bson = new BasicBSONObject()
  bson.put("timestamp", timestamp)
  bson.put("campaignId", campaignId)
  bson.put("publisher", publisher)
  bson.put("width", width)
  bson.put("height", height)
  bson.put("totalAuctions", totalAuctions)
  bson.put("totalWin", totalWin)
  bson.put("totalBudgetSpent", totalBudgetSpent)
  (null, bson)
}).saveAsNewAPIHadoopFile("file:///xxx",
  classOf[Any],
  classOf[Any],
  classOf[MongoOutputFormat[Any, Any]],
  _config)
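For illustration, here is a sketch (not the code that produced the error, and untested) of the same save step with the per-record lookup done through the direct driver query above instead of getPreviousAggregate, which goes through sc.newAPIHadoopRDD inside the map closure. `reduced` stands for the RDD produced by reduceByKey above:

// Sketch only: keys are (timestamp, campaignId, publisher, width, height),
// values are (totalAuctions, totalWin, totalBudgetSpent), as in the job above.
reduced.map { case ((ts, campaignId, publisher, width, height), (auctions, wins, spent)) =>
  // direct driver lookup instead of getPreviousAggregate / sc.newAPIHadoopRDD
  val prev = findLatestAggregateDirect(campaignId.toString, publisher.toString, width, height, date, month, year)
  val bson = new BasicBSONObject()
  bson.put("timestamp", java.lang.Long.parseLong(ts.toString))
  bson.put("campaignId", campaignId.toString)
  bson.put("publisher", publisher)
  bson.put("width", width)
  bson.put("height", height)
  bson.put("totalAuctions", auctions + prev.map(_.get("totalAuctions").toString.toLong).getOrElse(0L))
  bson.put("totalWin", wins + prev.map(_.get("totalWin").toString.toLong).getOrElse(0L))
  bson.put("totalBudgetSpent", spent + prev.map(_.get("totalBudgetSpent").toString.toLong).getOrElse(0L))
  (null, bson)
}.saveAsNewAPIHadoopFile("file:///xxx",
  classOf[Any],
  classOf[Any],
  classOf[MongoOutputFormat[Any, Any]],
  _config)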
And I get this error:
15/08/27 10:35:44 ERROR Executor: Exception in task 0.0 in stage 19.0 (TID 23)
java.lang.IllegalStateException: state should be: open
at com.mongodb.assertions.Assertions.isTrue(Assertions.java:70)
at com.mongodb.connection.BaseCluster.selectServer(BaseCluster.java:79)
at com.mongodb.binding.ClusterBinding$ClusterBindingConnectionSource.<init>(ClusterBinding.java:75)
at com.mongodb.binding.ClusterBinding$ClusterBindingConnectionSource.<init>(ClusterBinding.java:71)
at com.mongodb.binding.ClusterBinding.getWriteConnectionSource(ClusterBinding.java:68)
at com.mongodb.operation.OperationHelper.withConnection(OperationHelper.java:175)
at com.mongodb.operation.MixedBulkWriteOperation.execute(MixedBulkWriteOperation.java:141)
at com.mongodb.operation.MixedBulkWriteOperation.execute(MixedBulkWriteOperation.java:72)
at com.mongodb.Mongo.execute(Mongo.java:745)
at com.mongodb.Mongo$2.execute(Mongo.java:728)
at com.mongodb.DBCollection.executeBulkWriteOperation(DBCollection.java:1968)
at com.mongodb.DBCollection.executeBulkWriteOperation(DBCollection.java:1962)
at com.mongodb.BulkWriteOperation.execute(BulkWriteOperation.java:98)
at com.mongodb.hadoop.output.MongoOutputCommitter.commitTask(MongoOutputCommitter.java:133)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1$$anonfun$12.apply(PairRDDFunctions.scala:1045)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1$$anonfun$12.apply(PairRDDFunctions.scala:1014)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
15/08/27 10:35:44 WARN TaskSetManager: Lost task 0.0 in stage 19.0 (TID 23, localhost): java.lang.IllegalStateException: state should be: open
at com.mongodb.assertions.Assertions.isTrue(Assertions.java:70)
at com.mongodb.connection.BaseCluster.selectServer(BaseCluster.java:79)
at com.mongodb.binding.ClusterBinding$ClusterBindingConnectionSource.<init>(ClusterBinding.java:75)
at com.mongodb.binding.ClusterBinding$ClusterBindingConnectionSource.<init>(ClusterBinding.java:71)
at com.mongodb.binding.ClusterBinding.getWriteConnectionSource(ClusterBinding.java:68)
at com.mongodb.operation.OperationHelper.withConnection(OperationHelper.java:175)
at com.mongodb.operation.MixedBulkWriteOperation.execute(MixedBulkWriteOperation.java:141)
at com.mongodb.operation.MixedBulkWriteOperation.execute(MixedBulkWriteOperation.java:72)
at com.mongodb.Mongo.execute(Mongo.java:745)
at com.mongodb.Mongo$2.execute(Mongo.java:728)
at com.mongodb.DBCollection.executeBulkWriteOperation(DBCollection.java:1968)
at com.mongodb.DBCollection.executeBulkWriteOperation(DBCollection.java:1962)
at com.mongodb.BulkWriteOperation.execute(BulkWriteOperation.java:98)
at com.mongodb.hadoop.output.MongoOutputCommitter.commitTask(MongoOutputCommitter.java:133)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1$$anonfun$12.apply(PairRDDFunctions.scala:1045)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1$$anonfun$12.apply(PairRDDFunctions.scala:1014)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)