我写了一个 Spark 作业(Spark 1.3,Cloudera 5.4),它遍历一个 Avro 文件,并针对每条记录发起一次 HiveContext 查询:
// Driver script: loads Avro records from `path`, keeps rows where fieldA = 'X'
// and fieldB is non-null, issues one Hive query per remaining row, and records
// every match both on stdout and in a local result file.
val path = "/user/foo/2016/03/07/ALL"
// Second argument `false` means the result file is overwritten, not appended to.
val fw2 = new FileWriter("/home.nfs/Foo/spark-query-result.txt", false)
val conf = new SparkConf().setAppName("App")
val sc = new SparkContext(conf)
// Let the input format descend into sub-directories below `path`.
sc.hadoopConfiguration.set("mapreduce.input.fileinputformat.input.dir.recursive","true")
val sqlSc = new SQLContext(sc)
import sqlSc.implicits._
val df = sqlSc.load(path, "com.databricks.spark.avro").cache()
val hc = new HiveContext(sc)
try {
  df.filter("fieldA = 'X'").select($"fieldA", $"fieldB", $"fieldC").rdd.toLocalIterator.filter(x => x(1) != null).foreach { x =>
    // NOTE(review): the statement has no projection list ("select from"); the
    // intended columns are not visible here, so the text is left unchanged.
    // NOTE(review): values are interpolated directly into the SQL text; confirm
    // the inputs are trusted or escape them before building the query.
    val query = s"select from hive_table where fieldA = ${x(0)} and fieldB='${x(1)}' and fieldC=${x(2)}"
    val df1 = hc.sql(query)
    df1.rdd.toLocalIterator.foreach { r =>
      val line = s"For ${x(0)} Found ${r(0)}\n"
      println(line)
      // Write through the writer declared above; the original referenced an
      // undefined `fw1` here.
      fw2.write(line)
    }
  }
} finally {
  // Always flush and release the result file, even when a query fails part-way.
  fw2.close()
}
作业运行了 2 小时,随后出现如下错误:
16/03/08 12:35:53 WARN TaskSetManager: Lost task 17.0 in stage 34315.0 (TID 82258, foo-cloudera04.foo.com): java.io.IOException: Filesystem closed
at org.apache.hadoop.hdfs.DFSClient.checkOpen(DFSClient.java:794)
at org.apache.hadoop.hdfs.DFSInputStream.readWithStrategy(DFSInputStream.java:833)
at org.apache.hadoop.hdfs.DFSInputStream.read(DFSInputStream.java:897)
at java.io.DataInputStream.read(DataInputStream.java:100)
at org.apache.hadoop.util.LineReader.fillBuffer(LineReader.java:180)
at org.apache.hadoop.util.LineReader.readDefaultLine(LineReader.java:216)
at org.apache.hadoop.util.LineReader.readLine(LineReader.java:174)
at org.apache.hadoop.mapred.LineRecordReader.next(LineRecordReader.java:246)
....
16/03/08 12:35:53 INFO TaskSetManager: Starting task 0.0 in stage 34315.0 (TID 82260, foo-cloudera09.foo.com, NODE_LOCAL, 1420 bytes)
16/03/08 12:35:53 INFO TaskSetManager: Finished task 67.0 in stage 34314.0 (TID 82256) in 1298 ms on foo-cloudera09.foo.com (42/75)
16/03/08 12:35:53 INFO BlockManagerInfo: Added broadcast_12501_piece0 in memory on foo-cloudera09.foo.com:43893 (size: 6.5 KB, free: 522.8 MB)
16/03/08 12:35:53 INFO BlockManagerInfo: Added broadcast_12499_piece0 in memory on foo-cloudera09.foo.com:43893 (size: 44.2 KB, free: 522.7 MB)
16/03/08 12:35:53 INFO TaskSetManager: Starting task 17.1 in stage 34315.0 (TID 82261, foo-cloudera04.foo.com, NODE_LOCAL, 1420 bytes)
16/03/08 12:35:53 WARN TaskSetManager: Lost task 19.0 in stage 34315.0 (TID 82259, foo-cloudera04.foo.com): java.io.FileNotFoundException: /data/1/yarn/nm/usercache/Foo.Bar/appcache/application_1456200816465_188203/blockmgr-79a08609-56ae-490e-afc9-0f0143441a76/27/temp_shuffle_feb9ae13-6cb0-4a19-a60f-8c433f30e0e0 (No such file or directory)
at java.io.FileOutputStream.open0(Native Method)
at java.io.FileOutputStream.open(FileOutputStream.java:270)
at java.io.FileOutputStream.<init>(FileOutputStream.java:213)
at org.apache.spark.storage.DiskBlockObjectWriter.open(BlockObjectWriter.scala:130)
at org.apache.spark.util.collection.ExternalSorter$$anonfun$spillToPartitionFiles$1.apply(ExternalSorter.scala:360)
at org.apache.spark.util.collection.ExternalSorter$$anonfun$spillToPartitionFiles$1.apply(ExternalSorter.scala:355)
at scala.Array$.fill(Array.scala:267)
at org.apache.spark.util.collection.ExternalSorter.spillToPartitionFiles(ExternalSorter.scala:355)
16/03/08 12:35:53 INFO TaskSetManager: Starting task 19.1 in stage 34315.0 (TID 82262, foo-cloudera04.foo.com, NODE_LOCAL, 1420 bytes)
16/03/08 12:35:53 WARN TaskSetManager: Lost task 17.1 in stage 34315.0 (TID 82261, foo-cloudera04.foo.com): java.io.FileNotFoundException: /data/1/yarn/nm/usercache/Foo.Bar/appcache/application_1456200816465_188203/blockmgr-79a08609-56ae-490e-afc9-0f0143441a76/13/temp_shuffle_2f89df35-9e35-4558-a0f2-1f7353d3f9b0 (No such file or directory)
at java.io.FileOutputStream.open0(Native Method)
at java.io.FileOutputStream.open(FileOutputStream.java:270)
at java.io.FileOutputStream.<init>(FileOutputStream.java:213)
at org.apache.spark.storage.DiskBlockObjectWriter.open(BlockObjectWriter.scala:130)
at org.apache.spark.util.collection.ExternalSorter$$anonfun$spillToPartitionFiles$1.apply(ExternalSorter.scala:360)
at org.apache.spark.util.collection.ExternalSorter$$anonfun$spillToPartitionFiles$1.apply(ExternalSorter.scala:355)