Unable to read a JSON file with Spark in Java

Asked: 2016-08-03 07:12:31

Tags: java json hadoop apache-spark apache-spark-sql

Please help; I am a complete newbie to Spark and Hadoop in general. My code looks like this:

import java.io.IOException;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;

public static void main(String[] args) throws IOException {

    // Application jars shipped to the cluster with the job.
    String[] jars = {"D:\\customJars\\sparky.jar", "D:\\customJars\\guava-19.0.jar"};
    System.setProperty("hadoop.home.dir", "D:\\hadoop-common-2.2.0-bin-master");

    SparkConf sparkConf = new SparkConf().setAppName("com.nucleus.spark.MlibPOC")
            .setMaster("spark://10.1.50.165:7077")
            .setJars(jars);

    JavaSparkContext jsc = new JavaSparkContext(sparkConf);
    SQLContext sqlContext = new SQLContext(jsc);

    // The exception below is thrown here, while reading the JSON file.
    DataFrame df = sqlContext.read().json("src/com/nucleus/spark/generated.json");
}

My Spark cluster is deployed as a single-node cluster on 10.1.50.165, which runs RHEL6. When I run this simple code, I get the following exception while it tries to read the JSON file:

  

    Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 0.0 failed 4 times, most recent failure: Lost task 1.3 in stage 0.0 (TID 5, 10.1.50.165): java.io.FileNotFoundException: File file:/d:/WorkSpace2/SparkHadoopProject/src/com/nucleus/spark/generated.json does not exist
        at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:534)
        at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:747)
        at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:524)
        at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:409)
        at org.apache.hadoop.fs.ChecksumFileSystem$ChecksumFSInputChecker.<init>(ChecksumFileSystem.java:140)
        at org.apache.hadoop.fs.ChecksumFileSystem.open(ChecksumFileSystem.java:341)
        at org.apache.hadoop.fs.FileSystem.open(FileSystem.java:766)
        at org.apache.hadoop.mapred.LineRecordReader.<init>(LineRecordReader.java:108)
        at org.apache.hadoop.mapred.TextInputFormat.getRecordReader(TextInputFormat.java:67)
        at org.apache.spark.rdd.HadoopRDD$$anon$1.<init>(HadoopRDD.scala:237)
        at org.apache.spark.rdd.HadoopRDD.compute(HadoopRDD.scala:208)
        at org.apache.spark.rdd.HadoopRDD.compute(HadoopRDD.scala:101)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
        at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
        at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
        at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
        at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
        at org.apache.spark.scheduler.Task.run(Task.scala:89)
        at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:227)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
        at java.lang.Thread.run(Thread.java:745)

    Driver stacktrace:
        at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1431)
        at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1419)
        at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1418)
        at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
        at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
        at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1418)
        at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
        at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
        at scala.Option.foreach(Option.scala:236)
        at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:799)
        at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1640)
        at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599)
        at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588)
        at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
        at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620)
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:1832)
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:1952)
        at org.apache.spark.rdd.RDD$$anonfun$reduce$1.apply(RDD.scala:1025)
        at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
        at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
        at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
        at org.apache.spark.rdd.RDD.reduce(RDD.scala:1007)
        at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1.apply(RDD.scala:1150)
        at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
        at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
        at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
        at org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1127)
        at org.apache.spark.sql.execution.datasources.json.InferSchema$.infer(InferSchema.scala:65)
        at org.apache.spark.sql.execution.datasources.json.JSONRelation$$anonfun$4.apply(JSONRelation.scala:114)
        at org.apache.spark.sql.execution.datasources.json.JSONRelation$$anonfun$4.apply(JSONRelation.scala:109)
        at scala.Option.getOrElse(Option.scala:120)
        at org.apache.spark.sql.execution.datasources.json.JSONRelation.dataSchema$lzycompute(JSONRelation.scala:109)
        at org.apache.spark.sql.execution.datasources.json.JSONRelation.dataSchema(JSONRelation.scala:108)
        at org.apache.spark.sql.sources.HadoopFsRelation.schema$lzycompute(interfaces.scala:636)
        at org.apache.spark.sql.sources.HadoopFsRelation.schema(interfaces.scala:635)
        at org.apache.spark.sql.execution.datasources.LogicalRelation.<init>(LogicalRelation.scala:37)
        at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:125)
        at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:109)
        at org.apache.spark.sql.DataFrameReader.json(DataFrameReader.scala:244)
        at com.nucleus.spark.MlibPOC.main(MlibPOC.java:44)
    Caused by: java.io.FileNotFoundException: File file:/d:/WorkSpace2/SparkHadoopProject/src/com/nucleus/spark/generated.json does not exist
        at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:534)
        at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:747)
        at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:524)
        at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:409)
        at org.apache.hadoop.fs.ChecksumFileSystem$ChecksumFSInputChecker.<init>(ChecksumFileSystem.java:140)
        at org.apache.hadoop.fs.ChecksumFileSystem.open(ChecksumFileSystem.java:341)
        at org.apache.hadoop.fs.FileSystem.open(FileSystem.java:766)
        at org.apache.hadoop.mapred.LineRecordReader.<init>(LineRecordReader.java:108)
        at org.apache.hadoop.mapred.TextInputFormat.getRecordReader(TextInputFormat.java:67)
        at org.apache.spark.rdd.HadoopRDD$$anon$1.<init>(HadoopRDD.scala:237)
        at org.apache.spark.rdd.HadoopRDD.compute(HadoopRDD.scala:208)
        at org.apache.spark.rdd.HadoopRDD.compute(HadoopRDD.scala:101)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
        at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
        at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
        at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
        at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
        at org.apache.spark.scheduler.Task.run(Task.scala:89)
        at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:227)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
        at java.lang.Thread.run(Thread.java:745)

1 Answer:

Answer 0 (score: 0):

This works with Spark 1.6. Note the file:/// URI: the path is resolved on the machine(s) where Spark actually runs the tasks, so the file must exist there; it cannot live only in the IDE workspace on the machine that submits the job:

    scala> val jtex = sqlContext.read.json("file:///opt/test.json")
    jtex: org.apache.spark.sql.DataFrame = [_corrupt_record: string, age: string, id: string, name: string]

    scala> val jtex = sqlContext.read.format("json").option("samplingRatio", "1.0").load("file:///opt/test.json")
    jtex: org.apache.spark.sql.DataFrame = [age: string, id: string, name: string]

    scala> jtex.show()
    +---+----+-------+
    |age|  id|   name|
    +---+----+-------+
    | 25|1201| satish|
    | 28|1202|krishna|
    | 39|1203|  amith|
    | 23|1204|  javed|
    | 23|1205| prudvi|
    +---+----+-------+
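
Applied to the question's setup, a minimal Java sketch of the same fix might look like the following. The class name JsonReadFix, the /opt/generated.json path, and the hdfs:// address are illustrative assumptions, not taken from the question; the essential point is that the path handed to the reader is opened by the executors on 10.1.50.165, not by the Windows machine that submits the job, so the file must exist on the cluster node (or on a shared store such as HDFS):

    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.sql.DataFrame;
    import org.apache.spark.sql.SQLContext;

    public class JsonReadFix {
        public static void main(String[] args) {
            SparkConf conf = new SparkConf()
                    .setAppName("com.nucleus.spark.MlibPOC")
                    .setMaster("spark://10.1.50.165:7077");
            JavaSparkContext jsc = new JavaSparkContext(conf);
            SQLContext sqlContext = new SQLContext(jsc);

            // Read from a location the executors can see. Assumption: the file
            // was copied to /opt/generated.json on the cluster node 10.1.50.165.
            DataFrame df = sqlContext.read()
                    .format("json")
                    .option("samplingRatio", "1.0")
                    .load("file:///opt/generated.json");

            // Alternative (hypothetical namenode address): keep the file on HDFS
            // so every worker reads the same copy.
            // DataFrame df = sqlContext.read()
            //         .json("hdfs://10.1.50.165:9000/data/generated.json");

            df.show();
            jsc.stop();
        }
    }

Either way, copying generated.json out of the Eclipse src folder and onto storage the cluster can reach is the actual fix; the reader calls themselves are unchanged from the answer above.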