I am running a Spark application that reads messages from a very large (~7M row) HBase table, processes each message, and writes the results back to the same table. The application works fine on small tables, but on this large table it fails with the java.lang.OutOfMemoryError shown in the stack trace below.
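For reference, the job is launched through a plain spark-submit; I have left out my exact resource settings, so the jar name and the numbers below are only placeholders rather than the real values:

spark-submit \
  --class com.vocp.userProfile.runner \
  --master yarn \
  --deploy-mode cluster \
  --num-executors 10 \
  --executor-cores 2 \
  --executor-memory 4g \
  --driver-memory 2g \
  userProfile.jar -inputTable <tableName> -outputTable <tableName> -newbatch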
Code:
package com.vocp.userProfile
import java.util.{ArrayList, Arrays}
import java.net.MalformedURLException

import gate.util.GateException

import org.apache.hadoop.hbase.{HBaseConfiguration, HConstants, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.KeyValue.Type
import org.apache.hadoop.hbase.client.{HBaseAdmin, Put, Scan}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.{MultiTableOutputFormat, TableInputFormat, TableOutputFormat}
import org.apache.hadoop.hbase.protobuf.ProtobufUtil
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.util.{Base64, Bytes}
import org.apache.hadoop.mapreduce.Job

import org.apache.log4j.{Level, Logger}

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD

import org.codehaus.jettison.json.{JSONArray, JSONObject}

import com.vocp.userProfile.UserProfile.{Utility, controller_java, pipeline}
import com.vocp.userProfile.Configuration.{VOCPConstants, VocpConfiguration}
/**
 * Created by sahil on 12/4/17.
 */
class runner {
  var log: Logger = Logger.getLogger(classOf[runner])
  val conf = VocpConfiguration.create()
}

object runner {
  val run = new runner
  val conf = run.conf
  val log = run.log

  @throws(classOf[Exception])
  def nlpAnnotationExtraction(batchString: String): Int = {
    log.info("In Main Object..")
    // Initializing Spark Context
    val sc = new SparkContext(new SparkConf().setAppName("TheExorcist"))
    sc.addFile(conf.get(VOCPConstants.GATE_PLUGIN_ARCHIVE), true)
    val batchId =
      if (batchString == "newbatch")
        java.lang.Long.toString(System.currentTimeMillis())
      else batchString
    conf.set("batchId", batchId)
    try {
      conf.set(TableInputFormat.INPUT_TABLE, conf.get(VOCPConstants.INPUTTABLE))
      conf.set(TableOutputFormat.OUTPUT_TABLE, conf.get(VOCPConstants.OUTPUTTABLE))
      val job: Job = Job.getInstance(conf, "NLPAnnotationJob")
      job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, conf.get(VOCPConstants.OUTPUTTABLE))
      job.setOutputFormatClass(classOf[MultiTableOutputFormat])

      // Create the output table if it does not exist yet.
      val admin = new HBaseAdmin(conf)
      if (!admin.isTableAvailable(conf.get(VOCPConstants.OUTPUTTABLE))) {
        val tableDesc = new HTableDescriptor(TableName.valueOf(conf.get(VOCPConstants.OUTPUTTABLE)))
        admin.createTable(tableDesc)
      }

      // Full scan over the configured column families, serialized into the
      // job configuration for TableInputFormat.
      val scan = new Scan()
      scan.setCaching(5000)
      scan.setCacheBlocks(false)
      val inputCfs = conf.get(VOCPConstants.INPUTCOLUMNFAMILIES).split(",")
      inputCfs.foreach { x => scan.addFamily(Bytes.toBytes(x)) }
      val proto = ProtobufUtil.toScan(scan)
      val scan_string = Base64.encodeBytes(proto.toByteArray)
      job.getConfiguration().set(TableInputFormat.SCAN, scan_string)

      val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
        classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
        classOf[org.apache.hadoop.hbase.client.Result])
      // Keep only rows that still have to be processed
      // (to_process != "0" and no user_flag set yet).
      val processedFilteredRDD = hBaseRDD.map(x => x._2)
        .filter { result =>
          val to_process: String = Bytes.toString(result.getValue(Bytes.toBytes("p"),
            Bytes.toBytes("to_process")))
          val user_flag: String = Bytes.toString(result.getValue(Bytes.toBytes("p"),
            Bytes.toBytes("user_flag")))
          (to_process != "0" && user_flag == null)
        }

      // Extract (rowkey, message) pairs.
      val messageRDD = processedFilteredRDD.map { result =>
        val message = Bytes.toString(result.getValue(Bytes.toBytes("p"),
          Bytes.toBytes("message")))
        (Bytes.toString(result.getRow()), message)
      }

      // Drop rows whose message is null or empty.
      val filterRDD = messageRDD.filter { x =>
        var flag = true
        if (x._2 == null || x._2.toString.trim.length == 0) {
          flag = false
        }
        (flag)
      }
val pluginHome = "plugins"
val pipelineRDD = filterRDD.mapPartitions { iter =>
val pipe = new pipeline(pluginHome)
iter.map { x =>
val result = pipe.exec(x._2.toLowerCase())
val resultJson: JSONObject = new JSONObject(result.trim())
var Json: JSONArray = resultJson.get("cancerStage").asInstanceOf[JSONArray]
val CancerStage: String = controller_java.getEntity(Json, "cancerStage")
Json = resultJson.get("age").asInstanceOf[JSONArray]
val Age: String = controller_java.getEntity(Json, "age")
Json = resultJson.get("location").asInstanceOf[JSONArray]
val Location: String = controller_java.getEntity(Json, "location")
Json = resultJson.get("her2Status").asInstanceOf[JSONArray]
val Her2Status: String = controller_java.getEntity(Json, "her2Status")
(x._1, x._2, Age, Location, Her2Status, CancerStage)
}
}
val outputBroadcast = sc.broadcast(conf.get(VOCPConstants.OUTPUTTABLE))
val inputBroadcast = sc.broadcast(conf.get(VOCPConstants.INPUTTABLE))
val filterPipelineRDD = pipelineRDD.filter { x =>
var flag = true
if (x._3 == null && x._4 == null && x._5 == null && x._6 == null) flag = false
(flag)
}
val newRDD = filterPipelineRDD.flatMap{x => convertToPut(x, outputBroadcast, inputBroadcast)}
newRDD.saveAsNewAPIHadoopDataset(job.getConfiguration())
return 0
}
}

  def convertToPut(row: (String, String, String, String, String, String),
                   output_table: Broadcast[String], input_table: Broadcast[String]):
      List[(ImmutableBytesWritable, Put)] = {
    var putList: List[(ImmutableBytesWritable, Put)] = List()
    val rowkey = row._1
    val message = row._2
    val age = row._3
    val location = row._4
    val her2status = row._5
    val cancerStage = row._6

    val put = new Put(Bytes.toBytes(rowkey))
    val put_input = new Put(Bytes.toBytes(rowkey))
    val flagCFDataBytes = Bytes.toBytes("p")
    val pfamily = Bytes.toBytes("data")

    put.add(flagCFDataBytes, Bytes.toBytes("message"), Bytes.toBytes(message))
    put_input.add(flagCFDataBytes, Bytes.toBytes("user_flag"), Bytes.toBytes("1"))
    if (age != null)
      put.add(pfamily, Bytes.toBytes("age"), Bytes.toBytes(age))
    if (location != null)
      put.add(pfamily, Bytes.toBytes("location"), Bytes.toBytes(location))
    if (her2status != null)
      put.add(pfamily, Bytes.toBytes("her2Status"), Bytes.toBytes(her2status))
    if (cancerStage != null)
      put.add(pfamily, Bytes.toBytes("cancerStage"), Bytes.toBytes(cancerStage))

    // MultiTableOutputFormat routes each Put by the table name carried in the
    // key: the extracted entities go to the output table and the user_flag
    // marker goes back to the input table.
    putList = putList :+ (new ImmutableBytesWritable(Bytes.toBytes(output_table.value)), put)
    putList = putList :+ (new ImmutableBytesWritable(Bytes.toBytes(input_table.value)), put_input)
    return putList
  }

  def pipeLineExecute(args: Array[String]): Int = {
    var batchString = ""
    val usage = "Usage: NLPAnnotationController" + " -inputTable tableName -outputTable tableName" +
      " -batchId batchId / -newbatch \n"
    if (args.length == 0) {
      System.err.println(usage)
      return -1
    }
    var i = 0
    while (i < args.length) {
      if ("-inputTable" == args(i)) {
        conf.set(VOCPConstants.INPUTTABLE, args(i + 1))
        i = i + 2
      } else if ("-outputTable" == args(i)) {
        conf.set(VOCPConstants.OUTPUTTABLE, args(i + 1))
        i = i + 2
      } else if ("-batchId" == args(i)) {
        batchString = args(i + 1)
        i = i + 2
      } else if ("-newbatch" == args(i)) {
        batchString = "newbatch"
        i = i + 1
      } else {
        throw new IllegalArgumentException("arg " + args(i) + " not recognized")
      }
    }
    val result = nlpAnnotationExtraction(batchString)
    result
  }

  def main(args: Array[String]) {
    val res = pipeLineExecute(args)
    System.exit(res)
  }
}
Error stack trace:
java.lang.OutOfMemoryError: Java heap space
at java.util.HashMap.newNode(HashMap.java:1742)
at java.util.HashMap.putVal(HashMap.java:630)
at java.util.HashMap.putMapEntries(HashMap.java:514)
at java.util.HashMap.putAll(HashMap.java:784)
at gate.annotation.AnnotationSetImpl.<init>(AnnotationSetImpl.java:164)
at gate.jape.SinglePhaseTransducer.attemptAdvance(SinglePhaseTransducer.java:572)
at gate.jape.SinglePhaseTransducer.transduce(SinglePhaseTransducer.java:338)
at gate.jape.MultiPhaseTransducer.transduce(MultiPhaseTransducer.java:188)
at gate.jape.Batch.transduce(Batch.java:204)
at gate.creole.Transducer.execute(Transducer.java:166)
at gate.util.Benchmark.executeWithBenchmarking(Benchmark.java:291)
at gate.creole.SerialController.runComponent(SerialController.java:225)
at gate.creole.SerialController.executeImpl(SerialController.java:157)
at gate.creole.SerialAnalyserController.executeImpl(SerialAnalyserController.java:223)
at gate.creole.SerialAnalyserController.execute(SerialAnalyserController.java:126)
at com.vocp.userProfile.UserProfile.pipeline.exec(pipeline.java:75)
at com.vocp.userProfile.runner$$anonfun$5$$anonfun$apply$1.apply(runner.scala:117)
at com.vocp.userProfile.runner$$anonfun$5$$anonfun$apply$1.apply(runner.scala:116)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$14.hasNext(Iterator.scala:389)
at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:371)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1$$anonfun$12$$anonfun$apply$4.apply$mcV$sp(PairRDDFunctions.scala:1111)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1$$anonfun$12$$anonfun$apply$4.apply(PairRDDFunctions.scala:1111)
Things I have already done:

Any help would be appreciated. Thanks in advance.