我正在使用以下类来序列化和反序列化我的类。
import java.io.*;
public final class Serialization {
public static void writeObject(Object obj, String path){
try (ObjectOutputStream oos =
new ObjectOutputStream(new FileOutputStream(path))) {
oos.writeObject(obj);
//System.out.println("Done");
} catch (Exception ex) {
ex.printStackTrace();
}
}
public static Object readObject(String path){
Object obj = null;
FileInputStream fin = null;
ObjectInputStream ois = null;
try {
fin = new FileInputStream(path);
ois = new ObjectInputStream(fin);
obj = ois.readObject();
} catch (Exception ex) {
ex.printStackTrace();
} finally {
if (fin != null) {
try {
fin.close();
} catch (IOException e) {
e.printStackTrace();
}
}
if (ois != null) {
try {
ois.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
return obj;
}
}
我有一个实现Serializable
接口的类:TextCategorizator。我试图将此类用作分类模型。因此,为了序列化这个类的对象,我使用
TextCategorizator tc = new TextCategorizator(trainingFiles, vecFile);
Serialization.writeObject(tc, MODEL_PATH);
然后当我尝试用
读取这个序列化对象时TextCategorizator model = (TextCategorizator) Serialization.readObject(MODEL_PATH);
我得到了以下异常跟踪:
java.io.OptionalDataException
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1373)
at java.io.ObjectInputStream.readObject(ObjectInputStream.java:373)
at java.util.HashMap.readObject(HashMap.java:1402)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at java.io.ObjectStreamClass.invokeReadObject(ObjectStreamClass.java:1058)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:1909)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1808)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1353)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2018)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:1942)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1808)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1353)
at java.io.ObjectInputStream.readObject(ObjectInputStream.java:373)
at utils.Serialization.readObject(Serialization.java:27)
at Main.main(Main.java:33)
引发异常的部分是:
obj = ois.readObject();
当我查看此异常的reference page时,它表示在此异常中有两个选项用eof标志和长度变量表示。 我打印出来看看。 eof为true,长度为0.这意味着,根据参考页面,
尝试通过类定义的readObject或readExternal方法读取可消耗数据的结尾。在这种情况下,OptionalDataException的eof字段设置为true,length字段设置为0.
之前我使用过这些方法而且我没有面对这个例外。什么是错的,“读过去”究竟是什么意思?
编辑: TextCategorizator类在这里:
import utils.FileUtils;
import java.io.File;
import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
public class TextCategorizator implements Serializable {
private Map<String, String> wordVectors;
private Map<File, List<List<Double>>> docVectors;
private Map<File, String> trainingFiles;
private Set<String> classes;
public TextCategorizator(Map<File, String> trainingFiles, String trainedVectors) {
wordVectors = new HashMap<>();
docVectors = new HashMap<>();
classes = new HashSet<>();
this.trainingFiles = trainingFiles;
List<String> lines = FileUtils.readFileAsList(new File(trainedVectors));
System.out.println("> Reading word vector file.");
lines.parallelStream().forEach(line -> {
String name = line.substring(0, line.indexOf(' '));
wordVectors.put(name, line);
});
train(trainingFiles);
}
private void train(Map<File, String> trainingFiles) {
System.out.println("> Starting training parallel.");
trainingFiles.entrySet().parallelStream().forEach(entry -> {
docVectors.put(entry.getKey(), getVectorsOfDoc(entry.getKey()));
classes.add(entry.getValue());
});
}
private List<List<Double>> getVectorsOfDoc(File doc) {
List<List<Double>> lists = new ArrayList<>();
List<Double> resultVecAvg = new ArrayList<>();
List<Double> resultVecMax = new ArrayList<>();
List<Double> resultVecMin = new ArrayList<>();
int vecSize = 100;
for (int i = 0; i < vecSize; i++) {
resultVecAvg.add(0.0);
resultVecMax.add(0.0);
resultVecMin.add(0.0);
}
String[] words = FileUtils.readWords(doc);
for (String word : words) {
String line = wordVectors.get(word);
if (line != null) {
List<Double> vec = new ArrayList<>();
String[] tokens = line.split(" ");
for (int i = 1; i < tokens.length; i++) {
vec.add(Double.parseDouble(tokens[i]));
}
for (int i = 0; i < vec.size(); i++) {
resultVecAvg.set(i, resultVecAvg.get(i) + (vec.get(i) / vecSize));
resultVecMax.set(i, Math.max(resultVecMax.get(i), vec.get(i)));
resultVecMin.set(i, Math.min(resultVecMin.get(i), vec.get(i)));
}
}
}
lists.add(resultVecAvg); lists.add(resultVecMax); lists.add(resultVecMin);
return lists;
}
private void getCosineSimilarities(List<Double> givenVec, int option, Map<File, Double> distances) {
for (Map.Entry<File, List<List<Double>>> entry : docVectors.entrySet()) {
List<Double> vec = null;
if (option == 1) // AVG
vec = entry.getValue().get(0);
else if (option == 2) // MAX
vec = entry.getValue().get(1);
else if (option == 3) // MIN
vec = entry.getValue().get(2);
distances.put(entry.getKey(), cosSimilarity(givenVec, vec));
}
}
private double cosSimilarity(List<Double> vec1, List<Double> vec2) {
double norm1 = 0.0;
double norm2 = 0.0;
double dotProduct = 0.0;
for (int i = 0; i < vec1.size(); i++) {
norm1 += Math.pow(vec1.get(i), 2);
norm2 += Math.pow(vec2.get(i), 2);
dotProduct += vec1.get(i) * vec2.get(i);
}
return dotProduct / (Math.sqrt(norm1) * Math.sqrt(norm2));
}
// from http://stackoverflow.com/questions/109383/sort-a-mapkey-value-by-values-java
private <K, V extends Comparable<? super V>> Map<K, V>
sortByValue(Map<K, V> map, boolean reverse) {
return map.entrySet()
.stream()
.sorted((reverse ?
Map.Entry.comparingByValue(Collections.reverseOrder()) :
Map.Entry.comparingByValue()))
.collect(Collectors.toMap(
Map.Entry::getKey,
Map.Entry::getValue,
(e1, e2) -> e1,
LinkedHashMap::new
));
}
private int countClass(List<File> files, String c) {
int counter = 0;
for (File file : files) {
if (trainingFiles.get(file).equals(c))
++counter;
}
return counter;
}
public Map.Entry<String, Integer> classifyKnn(File file, int k, int option) {
List<List<Double>> vecs = getVectorsOfDoc(file);
List<Double> vec = getProperVector(vecs, option);
Map<File, Double> distances = new HashMap<>();
getCosineSimilarities(vec, option, distances);
distances = sortByValue(distances, true);
List<File> sortedFiles = new ArrayList<>(distances.keySet());
sortedFiles = sortedFiles.subList(0, k);
Map<String, Integer> counts = new HashMap<>();
for (String category : classes) {
counts.put(category, countClass(sortedFiles, category));
}
ArrayList<Map.Entry<String, Integer>> resultList =
new ArrayList(sortByValue(counts, true).entrySet());
return resultList.get(0);
}
private List<Double> getProperVector(List<List<Double>> lists, int option) {
List<Double> vec = null;
if (option == 1) // AVG
vec = lists.get(0);
else if (option == 2) // MAX
vec = lists.get(1);
else if (option == 3) // MIN
vec = lists.get(2);
return vec;
}
public Map.Entry<String, Double> classifyRocchio(File file, int option) {
List<List<Double>> vecs = getVectorsOfDoc(file);
List<Double> vec = getProperVector(vecs, option);
Map<File, Double> distances = new HashMap<>();
getCosineSimilarities(vec, option, distances);
distances = sortByValue(distances, true);
List<Map.Entry<File, Double>> sortedFiles =
new ArrayList<>(distances.entrySet());
return new AbstractMap.SimpleEntry<>
(trainingFiles.get(sortedFiles.get(0).getKey()),
sortedFiles.get(0).getValue());
}
}
答案 0 :(得分:3)
最后,我得到了这个工作。问题是我试图序列化和反序列化的对象的大小(即wordVectors
是480 MB)。
为了解决这个问题,我使用了synchronized map。所以
wordVectors = new HashMap<>();
更改为
wordVectors = Collections.synchronizedMap(new HashMap<>());
我从here得到了这个想法。