我有.sh,.txt,.sql,.pkb等文件,文件大小超过10 MB,这意味着超过10万行。
我想从这些文件中删除注释,然后进一步使用去掉注释后的内容。为此我编写了以下代码。
/**
* Strips comments from the given file and collects its declaration lines.
* <p>
* Three comment styles are recognised: block comments (slash-star ...
* star-slash), {@code --} line comments and {@code #} line comments.
* Every line containing {@code CREATE} (case-insensitive) is recorded as a
* declaration entry, optionally joined with the line that follows it.
*
* @param file the file to read
* @return an array whose leading elements are the upper-cased declaration
*         entries and whose last element is the upper-cased, comment-free
*         content of the whole file
* @throws IOException declared because reading the file may fail
*/
private static String[] filterContent(File file) throws IOException {
// NOTE(review): withoutComment is a plain String that is re-concatenated
// for every kept line below; each concatenation copies everything
// accumulated so far.
String withoutComment = "";
String declare = "";
String[] content;
// The whole file is loaded into memory as a list of lines up front.
// FileUtils is presumably Apache Commons IO -- confirm against the imports.
List<String> readLines = FileUtils.readLines(file);
int size = readLines.size();
System.out.println(file.getName() + " Files number of lines "+ size + " at "+new Date());
// Sized for the worst case of one declaration per line; unused slots stay
// null and are filtered out after the loop.
String[] declareLines = new String[size];
// Values of i at which the most recent /* and */ were seen.
int startComment = 0;
int endComment = 0;
// Set when a block comment opens while i == 0.
// NOTE(review): nothing in this method ever resets it to false.
Boolean check = false;
// Number of declaration entries stored so far.
int j = 0;
// Intended as the current line index.
// NOTE(review): the `continue` below skips the i++ at the end of the loop
// body, so i falls behind the real line number once a line is skipped.
int i=0;
// Process the content line by line
for (String line:readLines) {
// A */ means a block comment ends on this line: remember where, then
// drop everything up to and including the */
if (line.toString().contains("*/")) {
endComment = i;
int indexOf = line.indexOf("*/");
// NOTE(review): String.replace removes every occurrence of the prefix
// text, not only the leading one.
line = line.replace(line.substring(0, indexOf + 2), "");
}
// Skip the line when the bookkeeping says a block comment is still open:
// one was started and none has ended yet, or the last start is later than
// the last end, or the first-line flag is set.
if ((startComment > 0 && endComment == 0) || (endComment < startComment) || check)
continue;
// A /* means a block comment starts on this line: remember where, then
// drop everything from the /* to the end of the line
if (line.contains("/*")) {
startComment = i;
int indexOf = line.indexOf("/*");
line = line.replace(line.substring(indexOf), "");
if (i == 0)
check = true; // block comment opened while i was still 0
}
// A -- starts a single-line comment: drop everything from the -- to the
// end of the line
if (line.contains("--")) {
int indexOf = line.indexOf("--");
line = line.replace(line.substring(indexOf), "");
}
// A # starts a single-line comment: drop everything from the # to the
// end of the line
if (line.contains("#")) {
int indexOf = line.indexOf("#");
line = line.replace(line.substring(indexOf), "");
}
// All recognised comment text has been removed from the line; append what
// is left (if anything) to the comment-free content
if (!line.isEmpty())
withoutComment = withoutComment + line + " \n";
// A line containing CREATE (case-insensitive) is treated as a declaration
// and stored separately
if (line.toUpperCase().contains(("CREATE"))) {
// When the entry at index i + 1 exists and does not itself contain CREATE,
// the declaration is this line joined with that entry. That entry is the
// raw, unfiltered text from readLines.
// NOTE(review): because i can lag behind, i + 1 is not guaranteed to be
// the line that actually follows this one.
if (i < size - 1 && !readLines.get(i + 1).toString().toUpperCase().contains(("CREATE"))) {
declare = line + " " + readLines.get(i + 1).toString() + "\n";
} else if (i < size) {// Otherwise the declaration is
// this line on its own.
declare = line + "\n";
}
declareLines[j] = declare.toUpperCase();
j++;
}
i++;
}
System.out.println("Read lines "+ new Date());
// Drop the unused (null) slots, then allocate one extra slot at the end
// for the comment-free content.
List<String> list = new ArrayList<String>(Arrays.asList(declareLines));
list.removeAll(Collections.singleton(null));
content = list.toArray(new String[list.size() + 1]);
withoutComment = withoutComment.toUpperCase();
// j equals the number of declaration entries, i.e. the index of the extra slot.
content[j] = withoutComment;
System.out.println("Retruning uncommented content "+ new Date());
return content;
}
/**
 * Sample entry point: runs {@code filterContent} on {@code abc.txt} in the
 * current working directory.
 *
 * @param args command-line arguments (unused)
 * @throws IOException propagated from {@code filterContent}, which declares
 *         it; without this clause the call below does not compile
 */
public static void main(String[] args) throws IOException {
    String[] content = filterContent(new File("abc.txt"));
}
这段代码的问题是:文件很大时,它太慢了。对于10 MB的文件,删除注释需要6个多小时。(代码在SSH服务器上运行。)
我还可能有大小接近100 MB的文件,删除其中的注释需要好几天。如何才能更快地删除注释?
更新:这个问题不是重复问题,因为仅仅改变读取行的方式并不能解决它。真正拖慢处理过程的是其中的字符串操作,我需要一种能更快删除注释的方法。
答案 0 :(得分:0)
您可以创建几个完成工作的线程(需要正确拆分行)
答案 1 :(得分:0)
让这段代码更快的一些想法:
使用 InputStream 读取文件并直接逐行分析,把处理后的新 String 存入去掉注释的内容中。这样可以避免多次遍历文件内容(一次是创建 List<String> readLines,另一次是对它进行迭代)。
在设计上,您可以使用一个注释语法的映射(map),而不是这些重复的代码。
完成这些之后,速度应该会更快。当然,多线程也可能是一个解决方案,但这需要做一些检查,以确保不会在注释块中间拆分文件。所以,应当先改进代码,然后再考虑多线程。
答案 2 :(得分:0)
原来我的代码最大的问题是使用了 String。用哪种方法读取行并没有带来很大的差别,但使用 StringBuilder 代替 String 来存储去掉注释后的行,大大改变了性能。现在,同样的代码改用 StringBuilder 之后,只需几秒钟就能删除注释,而之前需要几个小时。
这是代码。为了获得更好的效果,我还把 List 改成了 BufferedReader。
/**
 * Strips comments from the given file and collects its declaration lines.
 * <p>
 * Recognised comment syntax: block comments (slash-star ... star-slash, which
 * may span several lines or occur several times on one line), {@code --} line
 * comments and {@code #} line comments. Whichever marker appears first on a
 * line wins, so a block-comment opener inside a line comment is ignored.
 * The scan is purely textual: markers inside string literals are treated as
 * comments too.
 * <p>
 * Every remaining non-empty line is appended to the comment-free content,
 * followed by {@code " \n"}. A line containing {@code CREATE}
 * (case-insensitive) starts a declaration entry. If the next processed line
 * does not contain {@code CREATE}, it is appended to that entry, separated by
 * a single space. Lines lying entirely inside a block comment are skipped and
 * never count as that "next line".
 *
 * @param file the file to read, decoded with the platform default charset
 * @return a mutable list holding the upper-cased declaration entries in file
 *         order, followed by one final element with the upper-cased,
 *         comment-free content of the whole file
 * @throws IOException if the file cannot be opened or read
 */
private static List<String> filterContent(File file) throws IOException {
    System.out.println(file.getName() + " at " + new Date());

    StringBuilder withoutComment = new StringBuilder();
    List<String> declareLines = new ArrayList<String>();
    // True while a block comment opened on an earlier line is still open.
    boolean inBlockComment = false;
    // True when the previous processed line started a declaration entry.
    boolean expectContinuation = false;

    BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
    try {
        // Reused for every line to avoid allocating a new buffer each time.
        StringBuilder code = new StringBuilder();
        String raw;
        while ((raw = reader.readLine()) != null) {
            // The line lies entirely inside an open block comment: ignore it
            // without disturbing the declaration bookkeeping.
            if (inBlockComment && raw.indexOf("*/") < 0) {
                continue;
            }

            code.setLength(0);
            inBlockComment = appendUncommented(raw, inBlockComment, code);
            String line = code.toString();

            if (!line.isEmpty()) {
                withoutComment.append(line).append(" \n");
            }

            String upper = line.toUpperCase();
            if (upper.contains("CREATE")) {
                declareLines.add(upper);
                expectContinuation = true;
            } else if (expectContinuation) {
                // Treat the line following a CREATE line as part of the
                // same declaration.
                int last = declareLines.size() - 1;
                declareLines.set(last, declareLines.get(last) + " " + upper);
                expectContinuation = false;
            }
        }
    } finally {
        // Release the file handle even when reading fails part-way through.
        reader.close();
    }
    System.out.println("Read lines " + new Date());

    declareLines.add(withoutComment.toString().toUpperCase());
    System.out.println("Retruning uncommented content " + new Date());
    return declareLines;
}

/**
 * Appends the non-comment portion of one line to {@code out}.
 *
 * @param line the raw line, without its line terminator
 * @param inBlockComment whether a block comment is already open at the
 *        start of the line
 * @param out receives the text of the line that is outside any comment
 * @return whether a block comment is still open at the end of the line
 */
private static boolean appendUncommented(String line, boolean inBlockComment, StringBuilder out) {
    final int length = line.length();
    int pos = 0;
    while (pos < length) {
        if (inBlockComment) {
            int close = line.indexOf("*/", pos);
            if (close < 0) {
                return true; // the rest of the line is still comment text
            }
            inBlockComment = false;
            pos = close + 2;
            continue;
        }

        int open = line.indexOf("/*", pos);
        int lineComment = firstMarker(line.indexOf("--", pos), line.indexOf("#", pos));
        if (lineComment >= 0 && (open < 0 || lineComment < open)) {
            // A line comment starts first: keep what precedes it, drop the rest.
            out.append(line, pos, lineComment);
            return false;
        }
        if (open < 0) {
            out.append(line, pos, length);
            return false;
        }
        // A block comment starts first: keep what precedes it and keep
        // scanning for its end on the same line.
        out.append(line, pos, open);
        inBlockComment = true;
        pos = open + 2;
    }
    return inBlockComment;
}

/**
 * Returns the smaller of two {@code indexOf} results, treating a negative
 * value as "not found"; returns a negative value only when both are.
 */
private static int firstMarker(int a, int b) {
    if (a < 0) {
        return b;
    }
    if (b < 0) {
        return a;
    }
    return Math.min(a, b);
}