此代码部分完全正常工作,但我想知道是否有任何优化方法? 它必须能够处理大文件,例如,格式化原始140kb .txt文件(12.5k字)需要2秒(用秒表类测量)。
也许我在这里使用了一些不好的技术或者有一些部分要简化?也许多线程? 不胜感激求助!
以下代码:
class TextManipulations
{
public string[] wordsDist; // main array, contains words in alphabetic order and output lines
public void TextFormat(string sourcePath) // creating method that will format our source text according to task
{
string textInput = System.IO.File.ReadAllText(sourcePath).ToLower(); // reading text from file, lowercased at start for precise search
MatchCollection m = Regex.Matches(textInput, @"\b[\w']+\b"); // exact search of all alphanumeric "words" including words with apostrophe
List<string> words = new List<string>(); // creating List<T> for containing unknown amount of words
foreach (Match match in m) // assigning all matches to List<string>
{
words.Add(match.ToString());
}
words.Sort(); // sorting words in alphabetic order
wordsDist = words.Distinct().ToArray(); // assigning words to main array without duplicates
System.IO.File.WriteAllLines(@"D:\output.txt", wordsDist); // writing words into txt file to edit in setLineNumbers method
}
public void setLineNumbers(string sourcePath) // creating method for adding line numbers
{
string[] linesOutput = new string[wordsDist.Count()]; // creating array that will contain line numbers
string[] lines = System.IO.File.ReadAllLines(sourcePath); // assigning source text by lines
for (int j = 0; j < wordsDist.Count(); j++) // main cycle checking each word for presence in each line
{
for (int i = 0; i < lines.Count(); i++)
{
if (Regex.IsMatch(lines[i].ToLower(), "\\b" + wordsDist[j] + "\\b")) // using ToLower() here, because we can't use it in line 33
{
linesOutput[j] += (i + 1).ToString() + ", "; // adding line numbers according to word
}
}
}
for (int i = 0; i < wordsDist.Count(); i++) // connection of two relative arrays
{
wordsDist[i] += "_______________________________" + linesOutput[i];
wordsDist[i] = wordsDist[i].Remove(wordsDist[i].Length - 2); // removing last ',' char
}
System.IO.File.WriteAllLines(@"D:\output.txt", wordsDist); // writing final output result into txt file
}
}
答案 0 :(得分:2)
我认为您应该使用特定软件检查性能,例如dotTrace。使用dotTrace,您可以了解应用程序中的性能损失。
答案 1 :(得分:2)
因为您有明确的要求,所以无法并行读写。
您可以做的最好的事情是删除Distinct()
和Regex
。