import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
public class Test {
List<String> knownWordsArrayList = new ArrayList<String>();
List<String> wordsArrayList = new ArrayList<String>();
List<String> newWordsArrayList = new ArrayList<String>();
String toFile = "";
public void readKnownWordsFile() {
try {
FileInputStream fstream2 = new FileInputStream("knownWords.txt");
BufferedReader br2 = new BufferedReader(new InputStreamReader(fstream2, "UTF-8"));
String strLine;
while ((strLine = br2.readLine()) != null) {
knownWordsArrayList.add(strLine.toLowerCase());
}
HashSet h = new HashSet(knownWordsArrayList);
// h.removeAll(knownWordsArrayList);
knownWordsArrayList = new ArrayList<String>(h);
// for (int i = 0; i < knownWordsArrayList.size(); i++) {
// System.out.println(knownWordsArrayList.get(i));
// }
} catch (Exception e) {
// TODO: handle exception
}
}
public void readFile() {
try {
// Open the file that is the first
// command line parameter
FileInputStream fstream = new FileInputStream("Smallville 4x02.de.srt");
BufferedReader br = new BufferedReader(new InputStreamReader(fstream));
String strLine;
String numberedLineRemoved = "";
String strippedInput = "";
String[] words;
String trimmedString = "";
String temp = "";
// Read File Line By Line
while ((strLine = br.readLine()) != null) {
temp = strLine.toLowerCase();
// Print the content on the console
numberedLineRemoved = numberedLine(temp);
strippedInput = numberedLineRemoved.replaceAll("\\p{Punct}", "");
if ((strippedInput.trim().length() != 0) || (!strippedInput.contains("")) || (strippedInput.contains(" "))) {
words = strippedInput.split("\\s+");
for (int i = 0; i < words.length; i++) {
if (words[i].trim().length() != 0) {
wordsArrayList.add(words[i]);
}
}
}
}
HashSet h = new HashSet(wordsArrayList);
h.removeAll(knownWordsArrayList);
newWordsArrayList = new ArrayList<String>(h);
// HashSet h = new HashSet(wordsArrayList);
// wordsArrayList.clear();
// newWordsArrayList.addAll(h);
for (int i = 0; i < newWordsArrayList.size(); i++) {
toFile = newWordsArrayList.get(i) + ".\n";
// System.out.println(newWordsArrayList.get(i) + ".");
System.out.println();
}
System.out.println(newWordsArrayList.size());
// Close the input stream
in.close();
} catch (Exception e) {// Catch exception if any
System.err.println("Error: " + e.getMessage());
}
}
public String numberedLine(String string) {
if (string.matches(".*\\d.*")) {
return "";
} else {
return string;
}
}
public void writeToFile() {
try {
// Create file
FileWriter fstream = new FileWriter("out.txt");
BufferedWriter out = new BufferedWriter(fstream);
out.write(toFile);
// Close the output stream
out.close();
} catch (Exception e) {// Catch exception if any
System.err.println("Error: " + e.getMessage());
}
}
public static void main(String[] args) {
Test test = new Test();
test.readKnownWordsFile();
test.readFile();
test.writeToFile();
}
}
如何从文件中阅读äöüß? string.toLowercase()也能正确处理这些吗? 当我打印包含任何äöüß的单词时,我该如何正确打印这个单词? 当我打印到控制台时,我得到了 Außerdem weiß 对于Außerdem 魏斯 我该如何解决这个问题?
我试过了:
BufferedReader br = new BufferedReader(new InputStreamReader(in, "UTF-8"));
但现在我得到了aufkl?ren而不是aufklären,以及其他地方的混乱。
更新了代码以查看它是否会正确打印在文件上,但我只是在文件中找到一个。
答案 0 :(得分:1)
您需要使用用于创建文件的字符集来读取文件。如果你在Windows机器上,那可能是cp1252。所以:
BufferedReader br = new BufferedReader(new InputStreamReader(in, "Cp1252"));
如果这不起作用,大多数文本编辑器都能告诉您给定文档使用的编码。