Here is my file:
package org.apache.spark.rdd;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
public class DataPreperation {
    public static void main(String[] args) {
        SparkConf config = new SparkConf().setMaster("local").setAppName("DataPreperation");
        JavaSparkContext sc = new JavaSparkContext(config);

        JavaRDD<String> custRDD = sc.textFile("Data/customer.csv");
        JavaRDD<String> transRDD = sc.textFile("Data/transection.csv");

        // Identify distinct rows in customer.csv
        JavaPairRDD<String, String> CustKP = custRDD.mapToPair(new PairFunction<String, String, String>() {
            public Tuple2<String, String> call(String x) throws Exception {
                return new Tuple2<String, String>(x.split(",")[0], x);
            }
        });
        //System.out.println(CustKP.count() + " All rows 25");
        //System.out.println(CustKP.keys().distinct() + " distinct rows 25");

        JavaPairRDD<String, String> CustKPReduced = CustKP.reduceByKey(new Function2<String, String, String>() {
            public String call(String x, String y) throws Exception {
                return y;
            }
        });
        //System.out.println(CustKPReduced.count() + " distinct rows 21");
        //System.out.println(CustKPReduced.collect());

        JavaPairRDD<String, String> transKP = transRDD.mapToPair(new PairFunction<String, String, String>() {
            public Tuple2<String, String> call(String x) throws Exception {
                return new Tuple2<String, String>(x.split(",")[1], x);
            }
        });

        JavaPairRDD<String, String> transKpDist = transKP.reduceByKey(new Function2<String, String, String>() {
            public String call(String x, String y) throws Exception {
                return y;
            }
        });

        JavaPairRDD<String, Tuple2<String, String>> CustTransKP = CustKPReduced.join(transKpDist);
        //System.out.println(CustTransKP.count());
        //System.out.println(CustKPReduced.take(10));
        //System.out.println("Customer Distinct Rows by Key :" + CustKPReduced.count());
        //System.out.println("Total Joined table Rows : " + CustTransKP.count());
        //System.out.println("Distinct Joined Table Rows :" + CustTransKP.distinct().count());
        //System.out.println("Transaction total rows + Distinct Rows:" + transKP.count() + " +" + transKP.distinct().count());
        //JavaRDD<String> subKeys = CustKPReduced.subtractByKey(CustTransKP).keys();
        //System.out.println(subKeys.distinct().count());
        //JavaRDD<String> TotalCustKeys = CustTransKP.distinct().keys(); // 22797
        //JavaRDD<String> TotalKeys = subKeys.union(TotalCustKeys);
        //System.out.println(TotalKeys.count());
        //TotalKeys.coalesce(1).saveAsTextFile("Data/Total_Keys");
        //System.out.println(CustTransKP.take(1));
        //JavaRDD<String> transKeys = transKP.distinct().keys();

        JavaRDD<Tuple2<String, String>> transId = CustTransKP.values();
        JavaRDD<String> transKey = transId.map(new Function<Tuple2<String, String>, String>() {
            public String call(Tuple2<String, String> x) throws Exception {
                return x._1().split(",")[3]; // If I change [3] here to [2] or [1], no exception is thrown.
            }
        });

        CustTransKP.coalesce(1).saveAsTextFile("Data/CustTransKP");
        transId.coalesce(1).saveAsTextFile("Data/transId");
        transKey.coalesce(1).saveAsTextFile("Data/trans_Key");
        //System.out.println("Count of transKey:" + transKey.count());
        //System.out.println("First 10: " + transKey.take(10));
    }
}
Here is the output:
16/01/06 09:05:05 ERROR Executor: Exception in task 0.0 in stage 8.0 (TID 4)
java.lang.ArrayIndexOutOfBoundsException: 3
at org.apache.spark.rdd.DataPreperation$5.call(DataPreperation.java:93)
at org.apache.spark.rdd.DataPreperation$5.call(DataPreperation.java:1)
at org.apache.spark.api.java.JavaPairRDD$$anonfun$toScalaFunction$1.apply(JavaPairRDD.scala:1027)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$13.next(Iterator.scala:372)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$6.apply$mcV$sp(PairRDDFunctions.scala:1109)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$6.apply(PairRDDFunctions.scala:1108)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$6.apply(PairRDDFunctions.scala:1108)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1206)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13.apply(PairRDDFunctions.scala:1116)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13.apply(PairRDDFunctions.scala:1095)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:88)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
16/01/06 09:05:05 WARN TaskSetManager: Lost task 0.0 in stage 8.0 (TID 4, localhost): java.lang.ArrayIndexOutOfBoundsException: 3
at org.apache.spark.rdd.DataPreperation$5.call(DataPreperation.java:93)
at org.apache.spark.rdd.DataPreperation$5.call(DataPreperation.java:1)
at org.apache.spark.api.java.JavaPairRDD$$anonfun$toScalaFunction$1.apply(JavaPairRDD.scala:1027)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$13.next(Iterator.scala:372)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$6.apply$mcV$sp(PairRDDFunctions.scala:1109)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$6.apply(PairRDDFunctions.scala:1108)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$6.apply(PairRDDFunctions.scala:1108)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1206)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13.apply(PairRDDFunctions.scala:1116)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13.apply(PairRDDFunctions.scala:1095)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:88)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
16/01/06 09:05:05 ERROR TaskSetManager: Task 0 in stage 8.0 failed 1 times; aborting job
16/01/06 09:05:05 INFO TaskSchedulerImpl: Removed TaskSet 8.0, whose tasks have all completed, from pool
16/01/06 09:05:05 INFO TaskSchedulerImpl: Cancelling stage 8
16/01/06 09:05:05 INFO DAGScheduler: ResultStage 8 (main at <unknown>:0) failed in 8.285 s
16/01/06 09:05:05 INFO DAGScheduler: Job 2 failed: main at <unknown>:0, took 8.317993 s
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 8.0 failed 1 times, most recent failure: Lost task 0.0 in stage 8.0 (TID 4, localhost): java.lang.ArrayIndexOutOfBoundsException: 3
at org.apache.spark.rdd.DataPreperation$5.call(DataPreperation.java:93)
at org.apache.spark.rdd.DataPreperation$5.call(DataPreperation.java:1)
at org.apache.spark.api.java.JavaPairRDD$$anonfun$toScalaFunction$1.apply(JavaPairRDD.scala:1027)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$13.next(Iterator.scala:372)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$6.apply$mcV$sp(PairRDDFunctions.scala:1109)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$6.apply(PairRDDFunctions.scala:1108)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$6.apply(PairRDDFunctions.scala:1108)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1206)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13.apply(PairRDDFunctions.scala:1116)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13.apply(PairRDDFunctions.scala:1095)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:88)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1280)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1268)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1267)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1267)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:697)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1493)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1455)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1444)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1813)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1826)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1903)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1124)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1065)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1065)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:306)
at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1065)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply$mcV$sp(PairRDDFunctions.scala:989)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:965)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:965)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:306)
at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:965)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply$mcV$sp(PairRDDFunctions.scala:897)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:897)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:897)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:306)
at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:896)
at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply$mcV$sp(RDD.scala:1426)
at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1405)
at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1405)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:306)
at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1405)
at org.apache.spark.api.java.JavaRDDLike$class.saveAsTextFile(JavaRDDLike.scala:522)
at org.apache.spark.api.java.AbstractJavaRDDLike.saveAsTextFile(JavaRDDLike.scala:47)
at org.apache.spark.rdd.DataPreperation.main(DataPreperation.java:98)
Caused by: java.lang.ArrayIndexOutOfBoundsException: 3
at org.apache.spark.rdd.DataPreperation$5.call(DataPreperation.java:93)
at org.apache.spark.rdd.DataPreperation$5.call(DataPreperation.java:1)
at org.apache.spark.api.java.JavaPairRDD$$anonfun$toScalaFunction$1.apply(JavaPairRDD.scala:1027)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$13.next(Iterator.scala:372)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$6.apply$mcV$sp(PairRDDFunctions.scala:1109)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$6.apply(PairRDDFunctions.scala:1108)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$6.apply(PairRDDFunctions.scala:1108)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1206)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13.apply(PairRDDFunctions.scala:1116)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13.apply(PairRDDFunctions.scala:1095)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:88)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
16/01/06 09:05:05 INFO SparkContext: Invoking stop() from shutdown hook
16/01/06 09:05:05 INFO SparkUI: Stopped Spark web UI at http://192.168.100.35:4040
16/01/06 09:05:05 INFO DAGScheduler: Stopping DAGScheduler
16/01/06 09:05:05 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
16/01/06 09:05:05 INFO MemoryStore: MemoryStore cleared
16/01/06 09:05:05 INFO BlockManager: BlockManager stopped
16/01/06 09:05:05 INFO BlockManagerMaster: BlockManagerMaster stopped
16/01/06 09:05:05 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped!
16/01/06 09:05:05 INFO SparkContext: Successfully stopped SparkContext
16/01/06 09:05:05 INFO ShutdownHookManager: Shutdown hook called
16/01/06 09:05:05 INFO ShutdownHookManager: Deleting directory /tmp/spark-b90705cb-50d2-40fc-9518-e0aed907f570
transId is the RDD of values from the pair RDD CustTransKP, which is built by joining the two files customer.csv and transaction.csv.
Whenever I try to access an element of transaction.csv by returning x._1().split(",")[3], it throws the exception above, whereas returning x._1().split(",")[2] works fine.
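For reference, a small diagnostic sketch (hypothetical, not part of the file above) that maps each joined value to its field count instead of indexing into it; collecting the distinct counts shows whether some rows have fewer than four fields:

JavaRDD<Integer> fieldCounts = transId.map(new Function<Tuple2<String, String>, Integer>() {
    public Integer call(Tuple2<String, String> x) throws Exception {
        // The default split drops trailing empty strings, so short rows show up here.
        return x._1().split(",").length;
    }
});
System.out.println(fieldCounts.distinct().collect()); // e.g. [3, 4] would explain why index 3 fails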
Answer (score: 2):
It looks like your split is not doing quite what you expect. Try split(",", -1), so that empty elements between separators are kept in your final RDD and all of the arrays have the same number of elements.
split(",", -1) essentially means: keep the trailing empty strings. The default, split(regex), is equivalent to split(regex, 0), which discards them.
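A minimal, self-contained sketch of that difference, using a made-up row with empty trailing fields:

import java.util.Arrays;

public class SplitDemo {
    public static void main(String[] args) {
        // Hypothetical CSV row whose last two fields are empty.
        String line = "1001,2015-12-01,C42,,";

        String[] dropped = line.split(",");   // default limit 0: trailing empty strings removed
        String[] kept = line.split(",", -1);  // limit -1: trailing empty strings kept

        System.out.println(Arrays.toString(dropped)); // [1001, 2015-12-01, C42] -> dropped[3] throws ArrayIndexOutOfBoundsException
        System.out.println(Arrays.toString(kept));    // [1001, 2015-12-01, C42, , ] -> kept[3] is simply ""
    }
}

If the failing rows look like this (empty trailing fields rather than genuinely missing columns), applying the same change in the map function, x._1().split(",", -1)[3], returns an empty string instead of throwing.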