java.lang.OutOfMemoryError: Java heap space in a Spark application

Date: 2017-04-13 05:31:05

Tags: scala apache-spark hbase

I am running a Spark application that reads messages from a very large (~7M) HBase table, processes them, and writes the results back to the same table. The application works fine with small tables, but on this table I get the error above.

Code:

package com.vocp.userProfile


import java.net.MalformedURLException
import java.util.{ArrayList, Arrays}

import gate.util.GateException

import org.apache.hadoop.hbase.{HBaseConfiguration, HConstants, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.KeyValue.Type
import org.apache.hadoop.hbase.client.{HBaseAdmin, Put, Scan}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.{MultiTableOutputFormat, TableInputFormat, TableOutputFormat}
import org.apache.hadoop.hbase.protobuf.ProtobufUtil
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.util.{Base64, Bytes}
import org.apache.hadoop.mapreduce.Job
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.codehaus.jettison.json.{JSONArray, JSONObject}

import com.vocp.userProfile.Configuration.{VOCPConstants, VocpConfiguration}
import com.vocp.userProfile.UserProfile.{Utility, controller_java, pipeline}

/**
  * Created by sahil on 12/4/17.
  */

class runner {
  var log: Logger = Logger.getLogger(classOf[runner])
  val conf = VocpConfiguration.create()
}

object runner {
  val run = new runner
  val conf = run.conf
  val log = run.log

  @throws(classOf[Exception])
  def nlpAnnotationExtraction(batchString: String): Int = {

    log.info("In Main Object..")

    //Initializing Spark Context
    val sc = new SparkContext(new SparkConf().setAppName("TheExorcist"))
    sc.addFile(conf.get(VOCPConstants.GATE_PLUGIN_ARCHIVE), true)
    val batchId =
      if (batchString == "newbatch")
        java.lang.Long.toString(System.currentTimeMillis())
      else batchString

    conf.set("batchId", batchId)
    try {

      conf.set(TableInputFormat.INPUT_TABLE, conf.get(VOCPConstants.INPUTTABLE))
      conf.set(TableOutputFormat.OUTPUT_TABLE, conf.get(VOCPConstants.OUTPUTTABLE))

      val job: Job = Job.getInstance(conf, "NLPAnnotationJob")
      job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, conf.get(VOCPConstants.OUTPUTTABLE))
      job.setOutputFormatClass(classOf[MultiTableOutputFormat])

      val admin = new HBaseAdmin(conf)
      if (!admin.isTableAvailable(conf.get(VOCPConstants.OUTPUTTABLE))) {
        val tableDesc = new HTableDescriptor(TableName.valueOf(conf.get(VOCPConstants.OUTPUTTABLE)))
        admin.createTable(tableDesc)
      }
      // Configure the scan: fetch 5000 rows per RPC and skip the region servers' block cache.
      val scan = new Scan()
      scan.setCaching(5000)
      scan.setCacheBlocks(false)

      val inputCfs = conf.get(VOCPConstants.INPUTCOLUMNFAMILIES).split(",")
      inputCfs.foreach { x => scan.addFamily(Bytes.toBytes(x)) }

      val proto = ProtobufUtil.toScan(scan)
      val scan_string = Base64.encodeBytes(proto.toByteArray)
      job.getConfiguration().set(TableInputFormat.SCAN, scan_string)

      // Read the input HBase table as an RDD of (row key, Result) pairs.
      val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
        classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
        classOf[org.apache.hadoop.hbase.client.Result])
      val processedFilteredRDD = hBaseRDD.map(x => x._2)
        .filter{ result =>
          val to_process : String = Bytes.toString(result.getValue(Bytes.toBytes("p"),
            Bytes.toBytes("to_process")))
          val user_flag : String = Bytes.toString(result.getValue(Bytes.toBytes("p"),
            Bytes.toBytes("user_flag")))
          (to_process != "0" && user_flag == null)
        }
      val messageRDD = processedFilteredRDD.map { result =>
        val message = Bytes.toString(result.getValue(Bytes.toBytes("p"),
          Bytes.toBytes("message")))
        (Bytes.toString(result.getRow()), message)
      }
      val filterRDD = messageRDD.filter { x =>
        var flag = true
        if (x._2 == null || x._2.toString.trim.length == 0) {
          flag = false
        }
        (flag)
      }

      val pluginHome = "plugins"
      // Instantiate the GATE pipeline once per partition and run it on every message.
      val pipelineRDD = filterRDD.mapPartitions { iter =>
        val pipe = new pipeline(pluginHome)
        iter.map { x =>
          val result = pipe.exec(x._2.toLowerCase())
          val resultJson: JSONObject = new JSONObject(result.trim())
          var Json: JSONArray = resultJson.get("cancerStage").asInstanceOf[JSONArray]
          val CancerStage: String = controller_java.getEntity(Json, "cancerStage")
          Json = resultJson.get("age").asInstanceOf[JSONArray]
          val Age: String = controller_java.getEntity(Json, "age")
          Json = resultJson.get("location").asInstanceOf[JSONArray]
          val Location: String = controller_java.getEntity(Json, "location")
          Json = resultJson.get("her2Status").asInstanceOf[JSONArray]
          val Her2Status: String = controller_java.getEntity(Json, "her2Status")
          (x._1, x._2, Age, Location, Her2Status, CancerStage)
        }
      }
      val outputBroadcast = sc.broadcast(conf.get(VOCPConstants.OUTPUTTABLE))
      val inputBroadcast = sc.broadcast(conf.get(VOCPConstants.INPUTTABLE))

      val filterPipelineRDD = pipelineRDD.filter { x =>
        var flag = true
        if (x._3 == null && x._4 == null && x._5 == null && x._6 == null) flag = false
        (flag)
      }
      val newRDD = filterPipelineRDD.flatMap{x => convertToPut(x, outputBroadcast, inputBroadcast)}
      newRDD.saveAsNewAPIHadoopDataset(job.getConfiguration())
      return 0
    }
  }
  // Build one Put for the output table (extracted entities) and one for the input
  // table (marking the row as processed via user_flag = "1").
  def convertToPut(row: (String, String, String,  String, String, String),
                   output_table: Broadcast[String], input_table: Broadcast[String]):
  List[(ImmutableBytesWritable, Put)] = {
    var putList: List[(ImmutableBytesWritable, Put)] = List()
    val rowkey = row._1
    val message = row._2
    val age = row._3
    val location = row._4
    val her2status = row._5
    val cancerStage = row._6
    val put = new Put(Bytes.toBytes(rowkey))
    val put_input = new Put(Bytes.toBytes(rowkey))
    val flagCFDataBytes = Bytes.toBytes("p")
    val pfamily = Bytes.toBytes("data")
    put.add(flagCFDataBytes, Bytes.toBytes("message"), Bytes.toBytes(message))
    put_input.add(flagCFDataBytes, Bytes.toBytes("user_flag"), Bytes.toBytes("1"))
    if(age != null)
      put.add(pfamily, Bytes.toBytes("age"), Bytes.toBytes(age))
    if(location != null)
      put.add(pfamily, Bytes.toBytes("location"), Bytes.toBytes(location))
    if(her2status != null)
      put.add(pfamily, Bytes.toBytes("her2Status"), Bytes.toBytes(her2status))
    if(cancerStage != null)
      put.add(pfamily, Bytes.toBytes("cancerStage"), Bytes.toBytes(cancerStage))
    putList = putList:+(new ImmutableBytesWritable(Bytes.toBytes(output_table.value)), put)
    putList = putList:+(new ImmutableBytesWritable(Bytes.toBytes(input_table.value)), put_input)
    return putList
  }
  def pipeLineExecute(args: Array[String]): Int = {

    var batchString = ""
    val usage = "Usage: NLPAnnotationController" + " -inputTable tableName -outputTable tableName" +
      " -batchId batchId / -newbatch \n"
    if (args.length == 0) {
      System.err.println(usage)
      return -1
    }

    var i = 0
    while ( i < args.length){
      if ("-inputTable" == args(i)) {
        conf.set(VOCPConstants.INPUTTABLE, args(i+1))
        i=i+2
      } else if ("-outputTable" == args(i)) {
        conf.set(VOCPConstants.OUTPUTTABLE, args(i+1))
        i=i+2
      } else if ("-batchId" == args(i)) {
        batchString = args(i+1)
        i=i+2
      } else if ("-newbatch" == args(i)) {
        batchString = "newbatch"
        i=i+1
      } else {
        throw new IllegalArgumentException("arg " + args(i) + " not recognized")
      }

    }
    val result = nlpAnnotationExtraction(batchString)
    result

  }

  def main(args: Array[String]) {
    val res = pipeLineExecute(args)
    System.exit(res)
  }
}

Error stack trace:

java.lang.OutOfMemoryError: Java heap space
        at java.util.HashMap.newNode(HashMap.java:1742)
        at java.util.HashMap.putVal(HashMap.java:630)
        at java.util.HashMap.putMapEntries(HashMap.java:514)
        at java.util.HashMap.putAll(HashMap.java:784)
        at gate.annotation.AnnotationSetImpl.<init>(AnnotationSetImpl.java:164)
        at gate.jape.SinglePhaseTransducer.attemptAdvance(SinglePhaseTransducer.java:572)
        at gate.jape.SinglePhaseTransducer.transduce(SinglePhaseTransducer.java:338)
        at gate.jape.MultiPhaseTransducer.transduce(MultiPhaseTransducer.java:188)
        at gate.jape.Batch.transduce(Batch.java:204)
        at gate.creole.Transducer.execute(Transducer.java:166)
        at gate.util.Benchmark.executeWithBenchmarking(Benchmark.java:291)
        at gate.creole.SerialController.runComponent(SerialController.java:225)
        at gate.creole.SerialController.executeImpl(SerialController.java:157)
        at gate.creole.SerialAnalyserController.executeImpl(SerialAnalyserController.java:223)
        at gate.creole.SerialAnalyserController.execute(SerialAnalyserController.java:126)
        at com.vocp.userProfile.UserProfile.pipeline.exec(pipeline.java:75)
        at com.vocp.userProfile.runner$$anonfun$5$$anonfun$apply$1.apply(runner.scala:117)
        at com.vocp.userProfile.runner$$anonfun$5$$anonfun$apply$1.apply(runner.scala:116)
        at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
        at scala.collection.Iterator$$anon$14.hasNext(Iterator.scala:389)
        at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:371)
        at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1$$anonfun$12$$anonfun$apply$4.apply$mcV$sp(PairRDDFunctions.scala:1111)
        at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1$$anonfun$12$$anonfun$apply$4.apply(PairRDDFunctions.scala:1111)

Things I have already tried:

  1. Increased the executor memory (--executor-memory 10g); a programmatic equivalent is sketched below.
  2. Optimized the code. I still suspect that the GATE processing is what consumes the memory and causes the exception.

Any help would be appreciated. Thanks in advance.
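For reference, a minimal sketch of the programmatic equivalent of the flag in item 1 (assuming Spark on YARN; the memory-overhead value is only an illustration, not something I have actually tuned):

import org.apache.spark.{SparkConf, SparkContext}

// Rough programmatic equivalent of spark-submit --executor-memory 10g.
// These settings only take effect if applied before the SparkContext is created.
val sparkConf = new SparkConf()
  .setAppName("TheExorcist")
  .set("spark.executor.memory", "10g")               // executor JVM heap size
  .set("spark.yarn.executor.memoryOverhead", "2048") // off-heap overhead in MB (YARN only, illustrative value)
val sc = new SparkContext(sparkConf)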

1 Answer:

Answer 0 (score: 0):

I would suggest that you reduce the input RDD to a half or a quarter of its size and see whether the job then succeeds:

Performing operations only on subset of a RDD
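A minimal sketch of the idea, assuming the scan and hBaseRDD values from your code (the row-key bounds are placeholders for whatever your keys look like):

// Option 1: restrict the scan to part of the table before building the job
// (placeholder row keys - substitute a range that matches your key format).
scan.setStartRow(Bytes.toBytes("00000000"))
scan.setStopRow(Bytes.toBytes("01ffffff"))

// Option 2: read everything, but keep only a random quarter of the rows for this test run.
val quarterRDD = hBaseRDD.sample(withReplacement = false, fraction = 0.25)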