I have the following code in my Spark application. It is supposed to filter out genes from a set of CSV files. I load the CSV files into a Spark RDD. When I run the jar with spark-submit, I get a Task not serializable exception.
public class AttributeSelector {

    public static final String path = System.getProperty("user.dir") + File.separator;
    public static Queue<Instances> result = new LinkedBlockingQueue<>();
    private static final Logger LOGGER = LoggerFactory.getLogger(AttributeSelector.class);

    int[] selectAttributes(Instances data) {
        int[] indexes = null;
        AttributeSelection filter = new AttributeSelection();
        CfsSubsetEval evaluator = new CfsSubsetEval();
        filter.setEvaluator(evaluator);
        BestFirst search = new BestFirst();
        filter.setSearch(search);
        try {
            filter.SelectAttributes(data);
            indexes = filter.selectedAttributes();
        } catch (Exception e) {
            System.out.println("Error when resampling input data with selected attributes!");
            e.printStackTrace();
        }
        return indexes;
    }

    public void selectData(Instances data, int[] indexes) {
        Instances newData = data;
        Remove remove = new Remove();
        remove.setAttributeIndicesArray(indexes);
        remove.setInvertSelection(true);
        try {
            remove.setInputFormat(data);
            newData = Filter.useFilter(data, remove);
            result.add(newData);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    private Instances getInputInstance(File fileName) {
        CSVLoader loader = new CSVLoader();
        Instances instance = null;
        try {
            loader.setSource(fileName);
            instance = loader.getDataSet();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return instance;
    }

    private void writeMergedOutput() {
        LOGGER.info("Started merging results");
        Instances finalResult = result.poll();
        while (!result.isEmpty()) {
            finalResult = Instances.mergeInstances(finalResult, result.poll());
        }
        try {
            BufferedWriter writer = new BufferedWriter(new FileWriter(path + "Output" + ".arff"));
            writer.write(finalResult.toString());
            writer.flush();
            writer.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
        LOGGER.info("Finished merging results");
    }

    public static void main(String[] args) {
        long start = System.currentTimeMillis();
        try {
            LOGGER.info("Loading data");
            AttributeSelector attributeSelector = new AttributeSelector();
            attributeSelector.run(path + "Parts");
        } catch (Exception e) {
            e.printStackTrace();
        }
        long end = System.currentTimeMillis();
        LOGGER.info("Execution time: " + (end - start));
    }

    public void run(String sourceDir) {
        String master = "local[*]";
        SparkConf conf = new SparkConf()
                .setAppName(AttributeSelector.class.getName())
                .setMaster(master);
        JavaSparkContext context = new JavaSparkContext(conf);
        JavaFutureAction<Void> task = context.wholeTextFiles(sourceDir)
                .foreachAsync(new VoidFunction<Tuple2<String, String>>() {
                    @Override
                    public void call(Tuple2<String, String> fileInfo) throws Exception {
                        File file = new File(fileInfo._1);
                        Instances instance = getInputInstance(file);
                        instance.setClassIndex(instance.numAttributes() - 1);
                        int[] indices = selectAttributes(instance);
                        selectData(instance, indices);
                        LOGGER.info("Finished executing: " + fileInfo._1);
                    }
                });
        while (!task.isDone()) {
        }
        writeMergedOutput();
        context.close();
    }
}
What is causing this exception, and how do I fix it?
The exception I get is:
org.apache.spark.SparkException: Task not serializable
    at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:298)
    at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:288)
    at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:108)
    at org.apache.spark.SparkContext.clean(SparkContext.scala:2094)
    at org.apache.spark.rdd.AsyncRDDActions$$anonfun$foreachAsync$1.apply(AsyncRDDActions.scala:126)
    at org.apache.spark.rdd.AsyncRDDActions$$anonfun$foreachAsync$1.apply(AsyncRDDActions.scala:125)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
    at org.apache.spark.rdd.AsyncRDDActions.foreachAsync(AsyncRDDActions.scala:125)
    at org.apache.spark.api.java.JavaRDDLike$class.foreachAsync(JavaRDDLike.scala:732)
    at org.apache.spark.api.java.AbstractJavaRDDLike.foreachAsync(JavaRDDLike.scala:45)
    at geneselection.AttributeSelector.run(AttributeSelector.java:129)
    at geneselection.AttributeSelector.main(AttributeSelector.java:110)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:738)
    at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:187)
    at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:212)
    at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:126)
    at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.io.NotSerializableException: geneselection.AttributeSelector
Serialization stack:
    - object not serializable (class: geneselection.AttributeSelector, value: geneselection.AttributeSelector@5d43409a)
    - field (class: geneselection.AttributeSelector$1, name: this$0, type: class geneselection.AttributeSelector)
    - object (class geneselection.AttributeSelector$1, geneselection.AttributeSelector$1@210308d5)
    - field (class: org.apache.spark.api.java.JavaRDDLike$$anonfun$foreachAsync$1, name: f$15, type: interface org.apache.spark.api.java.function.VoidFunction)
    - object (class org.apache.spark.api.java.JavaRDDLike$$anonfun$foreachAsync$1, <function1>)
    at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
    at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:46)
    at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:100)
    at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:295)
    ... 22 more
Answer 0 (score: 2)
According to the Java documentation for java.io.NotSerializableException:

    Thrown when an instance is required to have a Serializable interface.

The serialization stack in your trace pinpoints the instance: the anonymous VoidFunction (geneselection.AttributeSelector$1) keeps an implicit this$0 reference to its enclosing AttributeSelector object, and Spark must serialize that whole closure before shipping it to the executors. So you need geneselection.AttributeSelector to implement java.io.Serializable (documentation).
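A minimal sketch of that fix, keeping the rest of the class exactly as posted (the explicit serialVersionUID is conventional when opting into Java serialization, not strictly required):

import java.io.Serializable;

// Declaring the class Serializable lets Spark serialize the implicit
// this$0 reference that the anonymous VoidFunction captures.
public class AttributeSelector implements Serializable {

    private static final long serialVersionUID = 1L;

    // ... rest of the class unchanged. The static fields (path, result,
    // LOGGER) are not part of instance state, so they are not serialized
    // and do not need to be Serializable themselves.
}

Alternatively, you can avoid the capture entirely by moving getInputInstance, selectAttributes, and selectData into their own small class that implements Serializable (or making them static), so the closure no longer references the outer instance. One caveat either way: the static result queue is only visible to writeMergedOutput because you run with local[*], where driver and executors share one JVM; on a real cluster each executor would fill its own copy of that queue and the driver would have nothing to merge.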