SPARK: java.lang.NegativeArraySizeException when trying to load a big graph in GraphX

Date: 2016-01-11 20:22:22

Tags: java scala graph apache-spark spark-graphx

I am trying to load a big graph (60GB) with GraphX in Spark 1.4.1, running in local mode with 16 threads. The driver memory in spark-defaults.conf is set to 500GB. The machine I am working on has 590341 MB free (as reported by free -m), which is about 576GB.
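
For completeness, the driver memory is set with the standard spark.driver.memory key; the relevant line in spark-defaults.conf looks like this:

    spark.driver.memory    500g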

Some details about the graph I am trying to load: it is the Friendster graph downloaded from snap.stanford.edu. Its actual size is 30GB, because Friendster is a directed graph, but I doubled the edges to create an undirected version (which is why the size doubled: 60 = 2 * 30; a sketch of that doubling step appears after the code below). I load the graph in Scala with GraphLoader.edgeListFile, as follows:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.GraphLoader
import org.apache.spark.storage.StorageLevel

def main(args: Array[String]): Unit = {
  // Create the Spark configuration and context: local mode with 16 threads
  val conf = new SparkConf().setAppName("My App").setMaster("local[16]")
  val sc = new SparkContext(conf)

  val currentDir = System.getProperty("user.dir") // get the current directory
  val edgeFile = "file://" + currentDir + "/friendsterUndirected.txt"

  // Load the edge list as a graph: canonicalOrientation = false,
  // numEdgePartitions = 1, edge and vertex storage both MEMORY_AND_DISK
  val graph = GraphLoader.edgeListFile(sc, edgeFile, false, 1,
    StorageLevel.MEMORY_AND_DISK, StorageLevel.MEMORY_AND_DISK)
}
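
As mentioned above, the undirected version was produced by emitting every edge in both directions. The exact preprocessing script is not important here, but a minimal sketch of the idea (the file paths and the use of a plain Spark job here are illustrative assumptions, not my exact script) would look like this:

    import org.apache.spark.{SparkConf, SparkContext}

    // Minimal sketch: turn a directed edge list ("src dst" per line) into an
    // undirected one by emitting each edge in both directions.
    object DoubleEdges {
      def main(args: Array[String]): Unit = {
        val sc = new SparkContext(
          new SparkConf().setAppName("DoubleEdges").setMaster("local[16]"))
        val edges = sc.textFile("file:///path/to/friendster.txt")
        val doubled = edges
          .filter(line => !line.startsWith("#") && line.trim.nonEmpty) // skip SNAP comment lines
          .flatMap { line =>
            val p = line.trim.split("\\s+")
            Seq(p(0) + " " + p(1), p(1) + " " + p(0)) // original edge plus its reverse
          }
        doubled.saveAsTextFile("file:///path/to/friendsterUndirected")
        sc.stop()
      }
    }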

The problem is that the graph cannot be loaded; I get a java.lang.NegativeArraySizeException, as shown below. I tried the same code with a smaller graph (~4GB) and it loaded successfully. Any ideas?

16/01/10 13:04:09 WARN TaskSetManager: Stage 0 contains a task of very large size (440 KB). The maximum recommended task size is 100 KB.
16/01/10 13:32:56 ERROR Executor: Exception in task 0.0 in stage 0.0 (TID 0)
java.lang.NegativeArraySizeException
at java.lang.reflect.Array.newArray(Native Method)
at java.lang.reflect.Array.newInstance(Array.java:75)
at scala.reflect.ClassTag$class.newArray(ClassTag.scala:62)
at scala.reflect.ClassTag$$anon$1.newArray(ClassTag.scala:144)
at org.apache.spark.util.collection.PrimitiveVector.copyArrayWithLength(PrimitiveVector.scala:87)
at org.apache.spark.util.collection.PrimitiveVector.resize(PrimitiveVector.scala:74)
at org.apache.spark.util.collection.PrimitiveVector.$plus$eq(PrimitiveVector.scala:41)
at org.apache.spark.graphx.impl.EdgePartitionBuilder$mcI$sp.add$mcI$sp(EdgePartitionBuilder.scala:34)
at org.apache.spark.graphx.GraphLoader$$anonfun$1$$anonfun$apply$1.apply(GraphLoader.scala:87)
at org.apache.spark.graphx.GraphLoader$$anonfun$1$$anonfun$apply$1.apply(GraphLoader.scala:76)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at org.apache.spark.graphx.GraphLoader$$anonfun$1.apply(GraphLoader.scala:76)
at org.apache.spark.graphx.GraphLoader$$anonfun$1.apply(GraphLoader.scala:74)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1$$anonfun$apply$18.apply(RDD.scala:703)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1$$anonfun$apply$18.apply(RDD.scala:703)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
at org.apache.spark.CacheManager.getOrCompute(CacheManager.scala:69)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:242)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)

16/01/10 13:32:56 WARN TaskSetManager: Lost task 0.0 in stage 0.0 (TID 0, localhost): java.lang.NegativeArraySizeException
at java.lang.reflect.Array.newArray(Native Method)
at java.lang.reflect.Array.newInstance(Array.java:75)
at scala.reflect.ClassTag$class.newArray(ClassTag.scala:62)
at scala.reflect.ClassTag$$anon$1.newArray(ClassTag.scala:144)
at org.apache.spark.util.collection.PrimitiveVector.copyArrayWithLength(PrimitiveVector.scala:87)
at org.apache.spark.util.collection.PrimitiveVector.resize(PrimitiveVector.scala:74)
at org.apache.spark.util.collection.PrimitiveVector.$plus$eq(PrimitiveVector.scala:41)
at org.apache.spark.graphx.impl.EdgePartitionBuilder$mcI$sp.add$mcI$sp(EdgePartitionBuilder.scala:34)
at org.apache.spark.graphx.GraphLoader$$anonfun$1$$anonfun$apply$1.apply(GraphLoader.scala:87)
at org.apache.spark.graphx.GraphLoader$$anonfun$1$$anonfun$apply$1.apply(GraphLoader.scala:76)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at org.apache.spark.graphx.GraphLoader$$anonfun$1.apply(GraphLoader.scala:76)
at org.apache.spark.graphx.GraphLoader$$anonfun$1.apply(GraphLoader.scala:74)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1$$anonfun$apply$18.apply(RDD.scala:703)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1$$anonfun$apply$18.apply(RDD.scala:703)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
at org.apache.spark.CacheManager.getOrCompute(CacheManager.scala:69)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:242)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)

16/01/10 13:32:56 ERROR TaskSetManager: Task 0 in stage 0.0 failed 1 times; aborting job
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0, localhost): java.lang.NegativeArraySizeException
at java.lang.reflect.Array.newArray(Native Method)
at java.lang.reflect.Array.newInstance(Array.java:75)
at scala.reflect.ClassTag$class.newArray(ClassTag.scala:62)
at scala.reflect.ClassTag$$anon$1.newArray(ClassTag.scala:144)
at org.apache.spark.util.collection.PrimitiveVector.copyArrayWithLength(PrimitiveVector.scala:87)
at org.apache.spark.util.collection.PrimitiveVector.resize(PrimitiveVector.scala:74)
at org.apache.spark.util.collection.PrimitiveVector.$plus$eq(PrimitiveVector.scala:41)
at org.apache.spark.graphx.impl.EdgePartitionBuilder$mcI$sp.add$mcI$sp(EdgePartitionBuilder.scala:34)
at org.apache.spark.graphx.GraphLoader$$anonfun$1$$anonfun$apply$1.apply(GraphLoader.scala:87)
at org.apache.spark.graphx.GraphLoader$$anonfun$1$$anonfun$apply$1.apply(GraphLoader.scala:76)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at org.apache.spark.graphx.GraphLoader$$anonfun$1.apply(GraphLoader.scala:76)
at org.apache.spark.graphx.GraphLoader$$anonfun$1.apply(GraphLoader.scala:74)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1$$anonfun$apply$18.apply(RDD.scala:703)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1$$anonfun$apply$18.apply(RDD.scala:703)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
at org.apache.spark.CacheManager.getOrCompute(CacheManager.scala:69)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:242)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1273)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1264)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1263)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1263)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:730)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1457)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1418)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
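
One more detail that may be relevant (this is my own back-of-the-envelope reasoning, not a confirmed diagnosis): with numEdgePartitions = 1, all edges go into a single EdgePartitionBuilder, whose PrimitiveVector backing array is indexed by an Int, as the stack trace shows. The SNAP page reports roughly 1.8 billion directed edges for Friendster, so the doubled graph has about 3.6 billion edges, which exceeds Int.MaxValue:

    // Rough arithmetic only; the edge count is the one reported by SNAP.
    val directedEdges = 1806067135L            // Friendster edges as listed on snap.stanford.edu
    val undirectedEdges = 2L * directedEdges   // ~3.6 billion after doubling
    println(undirectedEdges > Int.MaxValue)    // prints true: too many edges for one Int-indexed array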

0 Answers:

There are no answers yet.