Java heap space in Spark MLlib

Posted: 2016-01-06 09:11:35

Tags: scala apache-spark apache-spark-mllib

I have the following code, which computes some metrics for a random forest classification via cross-validation.

import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
import scala.collection.parallel.mutable.ParArray

// numFolds, numClasses, categoricalFeaturesInfo, best_values_array,
// CV_areaUnderROC, CV_areaUnderPR and logWarning are members of the
// enclosing class (not shown).
def run(data: RDD[LabeledPoint], metric: String = "PR") = {

    val cv_data: Array[(RDD[LabeledPoint], RDD[LabeledPoint])] = MLUtils.kFold(data, numFolds, 0)

    val result: Array[(Double, Double)] = cv_data.par.map { case (training, validation) =>
      training.persist(org.apache.spark.storage.StorageLevel.MEMORY_ONLY)
      validation.persist(org.apache.spark.storage.StorageLevel.MEMORY_ONLY)

      // Train and evaluate one model per parameter combination
      val res: ParArray[(Double, Double)] = CV_params.par.map { p =>
        val model = RandomForest.trainClassifier(
          training, numClasses, categoricalFeaturesInfo,
          p(0).asInstanceOf[Int],    // numTrees
          p(3).asInstanceOf[String], // featureSubsetStrategy
          p(4).asInstanceOf[String], // impurity
          p(1).asInstanceOf[Int],    // maxDepth
          p(2).asInstanceOf[Int])    // maxBins
        // Prediction on the validation fold: (score, label) pairs
        // (standard equivalent of the custom predictWithLabels helper)
        val labelAndPreds: RDD[(Double, Double)] =
          validation.map(point => (model.predict(point.features), point.label))
        // Metrics computation
        val bcm = new BinaryClassificationMetrics(labelAndPreds)
        (bcm.areaUnderROC() / numFolds, bcm.areaUnderPR() / numFolds)
      }

      training.unpersist()
      validation.unpersist()
      res
    }.reduce((s1, s2) => s1.zip(s2).map(t => (t._1._1 + t._2._1, t._1._2 + t._2._2))).toArray

    val cv_roc = result.map(_._1)
    val cv_pr = result.map(_._2)

    // Extract the parameter combination that maximizes the chosen metric
    val which_max = (metric match {
      case "ROC" => cv_roc
      case "PR"  => cv_pr
      case _ =>
        logWarning("Metric set to the default one: PR")
        cv_pr
    }).zipWithIndex.maxBy(_._1)._2

    best_values_array = CV_params(which_max)
    CV_areaUnderROC = cv_roc
    CV_areaUnderPR = cv_pr
}

val numTrees = Array(50)
val maxDepth = Array(30)
val maxBins = Array(100)
val featureSubsetStrategy = Array("sqrt")
val impurity = Array("gini")

val CV_params: Array[Array[Any]] = {
    for (a <- numTrees; b <- maxDepth; c <- maxBins; d <- featureSubsetStrategy;
         e <- impurity) yield Array(a, b, c, d, e)
}

run(data, "PR")

It runs on a YARN cluster with 50 containers (26 GB of memory in total). The data argument is an RDD[LabeledPoint]. I use Kryo serialization and a default parallelism of 1000.
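
For reference, here is a minimal sketch of that configuration (the two property keys are standard Spark settings; the application name is made up):

    import org.apache.spark.{SparkConf, SparkContext}

    val conf = new SparkConf()
      .setAppName("RandomForestCV") // hypothetical name
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") // Kryo serialization
      .set("spark.default.parallelism", "1000")                              // default parallelism
    val sc = new SparkContext(conf)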

For smaller data it works, but for my real data (600,000 points) I get the following error:

Exception in thread "dag-scheduler-event-loop" java.lang.OutOfMemoryError: Java heap space
at java.util.Arrays.copyOf(Arrays.java:2271)
at java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:113)
at java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:93)
at java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:140)
at java.io.ObjectOutputStream$BlockDataOutputStream.write(ObjectOutputStream.java:1841)
at java.io.ObjectOutputStream.defaultWriteFields(ObjectOutputStream.java:1533)
at java.io.ObjectOutputStream.writeSerialData(ObjectOutputStream.java:1508)
at java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1431)
at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1177)
at java.io.ObjectOutputStream.defaultWriteFields(ObjectOutputStream.java:1547)
at java.io.ObjectOutputStream.writeSerialData(ObjectOutputStream.java:1508)
at java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1431)
at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1177)
... (these four frames repeat several more times)

I cannot pinpoint the source of the error, since the total allocated memory (26 GB) is far higher than what the job actually consumes (I checked in the Spark web UI).

Any help would be greatly appreciated. Thanks!

1 Answer:

Answer 0 (score: 0)

Just a guess, but one unusual thing you are doing is submitting many jobs at once by calling .par. Note that Spark normally achieves parallelism in a different way: you submit a single job, and that job is broken into many tasks which can run in parallel.

In principle there is nothing wrong with what you are doing, and it can be useful when the parallelism within a single job is small: in that case, submitting one job at a time cannot use the cluster effectively. On the other hand, simply using .par may submit too many jobs in parallel. That convenience method keeps submitting jobs in an attempt to keep the driver busy (to a first approximation, anyway); but in fact, in Spark it is not unusual for the driver to sit relatively idle while it waits for the cluster to do the heavy lifting. So while the driver may have plenty of CPU available, it may be using a lot of memory for the bookkeeping needed to prepare 1000 simultaneous jobs (I am not sure exactly how many jobs get generated).
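
(Note that the OutOfMemoryError above is thrown in the "dag-scheduler-event-loop" thread, which runs in the driver rather than in an executor. So if this guess is right, raising executor memory will not help; raising driver memory, e.g. with spark-submit --driver-memory 4g, might. The 4g figure is only illustrative.)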

If you do want to submit jobs in parallel, it may help to limit it to a smaller number, e.g. only 2 to 4 jobs at a time, as in the sketch below.
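
Here is a minimal sketch of one way to cap that number with the parallel-collections API (this assumes the Scala 2.10/2.11 parallel collections in use at the time; on Scala 2.12+ the pool class is java.util.concurrent.ForkJoinPool instead, and evaluateFold is a hypothetical stand-in for the per-fold training and evaluation code from the question):

    import scala.collection.parallel.ForkJoinTaskSupport
    import scala.concurrent.forkjoin.ForkJoinPool

    // Give the parallel collection a fixed-size pool so that at most
    // maxConcurrentJobs folds submit Spark jobs at the same time.
    val maxConcurrentJobs = 4 // illustrative; try 2 to 4 as suggested above
    val par_cv_data = cv_data.par
    par_cv_data.tasksupport = new ForkJoinTaskSupport(new ForkJoinPool(maxConcurrentJobs))

    val result = par_cv_data.map { case (training, validation) =>
      evaluateFold(training, validation) // hypothetical helper: train and evaluate one fold
    }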