我想要实现的是在句子中保存单词对,如果单词已经存在,我试图保存一个单词列表。 为了保存配对,因为我的数据集文件非常大,可能有数百万,我选择了orientdb。我不知道我是否正确接近它,但orientdb非常慢。经过8个小时的运行,它只成了12000个句子。 据我所知,主要的减速是在浏览集群。 附上我的代码,如果蚂蚁可以对我的方法提出任何指示,请。
public static void main(String[] args) {
// TODO Auto-generated method stub
Main m = new Main();
m.openDatabase();
m.readFile("train_v2.txt");
m.closeDatabase();
}
}
class Main {
ODatabaseDocumentTx db;
Map<String, Object> index;
List<Object> list = null;
String pairing[];
ODocument doc;
Main() {
}
public void closeDatabase() {
if (!db.isClosed()) {
db.close();
}
}
void openDatabase() {
db = new ODatabaseDocumentTx("local:/databases/model").open("admin",
"admin");
doc = new ODocument("final");
}
public void readFile(String filename) {
InputStream ins = null; // raw byte-stream
Reader r = null; // cooked reader
int i = 1;
BufferedReader br = null; // buffered for readLine()
try {
String s;
ins = new FileInputStream(filename);
r = new InputStreamReader(ins, "UTF-8"); // leave charset out
// for
// default
br = new BufferedReader(r);
while ((s = br.readLine()) != null) {
System.out.println("" + i);
createTermPair(s.replaceAll("[^\\w ]", "").trim());
i++;
}
} catch (Exception e) {
System.err.println(e.getMessage()); // handle exception
} finally {
closeDatabase();
if (br != null) {
try {
br.close();
} catch (Throwable t) { /* ensure close happens */
}
}
if (r != null) {
try {
r.close();
} catch (Throwable t) { /* ensure close happens */
}
}
if (ins != null) {
try {
ins.close();
} catch (Throwable t) { /* ensure close happens */
}
}
}
}
private void createTermPair(String phrase) {
phrase = phrase + " .";
String[] word = phrase.split(" ");
for (int i = 0; i < word.length - 1; i++) {
if (!word[i].trim().equalsIgnoreCase("")
&& !word[i + 1].trim().equalsIgnoreCase("")) {
String wordFirst = word[i].toLowerCase().trim();
String wordSecond = word[i + 1].toLowerCase().trim();
String pair = wordFirst + " " + wordSecond;
checkForPairAndWrite(pair);
}
}
}
private void checkForPairAndWrite(String pair) {
try {
pairing = pair.trim().split(" ");
if (!pairing[1].equalsIgnoreCase(" ")) {
index = new HashMap<String, Object>();
for (ODocument docr : db.browseCluster("final")) {
list = docr.field(pairing[0]);
}
if (list == null) {
list = new ArrayList<>();
}
list.add("" + pairing[1]);
if (list.size() >= 1)
index.put(pairing[0], list);
doc.fields(index);
doc.save();
}// for (int i = 0; i < list.size(); i++) {
// System.out.println("" + list.get(i));
// }
} catch (Exception e) {
}
return;
}
}