Looping through File1.txt and File2.txt is very slow. Both files are 280MB

Time: 2012-03-14 20:18:17

Tags: c# performance optimization

I have 2 large text files with 400,000 lines of text in each. For the current line in File1.txt, I need to find the line in File2.txt that contains the same userId. Once I've found the correct line in File2.txt, I do some calculations and write the line to a new text file.

The code I wrote for this runs extremely slowly. I've tried rewriting it in various ways, but it always bogs down and never finishes. How can I do this quickly?

private void btnExecute_Click(object sender, EventArgs e) {        
    string line1 = "";
    string line2 = "";

    //the new text file we are creating. Located in IVR_Text_Update\bin\Debug
    StreamWriter sw = new StreamWriter("NewFile.txt");

    //the new text file which contains the registrants which need removing
    StreamWriter sw_removeRegs = new StreamWriter("RemoveRegistrants.txt");

    //address has changed so we write the line to the address file
    StreamWriter sw_addressChange = new StreamWriter("AddressChanged.txt");

    List<string> lines_secondFile = new List<string>();

    using (StreamReader sr = new StreamReader(openFileDialog2.FileName)) {
        string line;
        while ((line = sr.ReadLine()) != null) {
            lines_secondFile.Add(line);
        }
    }

    //reader for the frozen file (File1.txt); sr1 was never declared in the
    //original snippet, so this assumes it comes from the first open-file dialog
    StreamReader sr1 = new StreamReader(openFileDialog1.FileName);

    //loop through the frozen file one line at a time
    while ((line1 = sr1.ReadLine()) != null) {
        //get the line from the update file, assign it to line2
        //function accepts (userId, List)
        line2 = getLine(line1.Substring(3, 8), lines_secondFile);

        //if line2 is null then userId was not found therefore we write
        //the line to Remove Registrants file
        if (line2 == null) {
            sw_removeRegs.Write(line1 + Environment.NewLine);
        }

        //address between the two lines was found to be different so we still write
        //them to the new text file but don't update codes
        else if (line1.Substring(93, 53) != line2.Substring(93, 53)) {
            sw_addressChange.Write(line1 + Environment.NewLine);
            sw.Write(line1 + Environment.NewLine);
        }

        //test for null then write the new line in our new text file
        else if ((line1 != null) && (line2 != null)) {
            sw.Write(line1.Substring(0, 608) +                    
                     line2.Substring(608, 9) +
                     line2.Substring(617, 9) +
                     line2.Substring(626, 9) +
                     line2.Substring(635, 9) +
                     line2.Substring(644, 9) +
                     line2.Substring(653, 9) +
                     line2.Substring(662, 9) +
                     line2.Substring(671, 9) +
                     line2.Substring(680, 9) +

                     line1.Substring(680, 19) + 
                     Environment.NewLine);
        }
    }

    textBox1.Text = "Finished.";
    sr1.Close();
    sw.Close();
    sw_removeRegs.Close();
    sw_addressChange.Close();
}

//returns the line from the update file which has the corresponding userId
//from the frozen file
string getLine(string userId, List<string> lines_secondFile) {

    foreach (string currentLine in lines_secondFile) {
        if (currentLine.Contains(userId)) {
            return currentLine;
        }
    }

    return null;
}

3 Answers:

Answer 0 (score: 4)

Instead of reading line by line, try reading the whole file at once. That is much faster than issuing many separate read requests against the file, because file access is far slower than memory access. Try File.ReadAllText.

That said, you should profile the code to find out exactly where the bottleneck is.
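A minimal sketch of that idea (using a small temporary stand-in file so it runs anywhere; in the real program you would pass the path from the open-file dialog): one File.ReadAllLines call pulls the whole file into memory, after which iteration costs no further disk reads.

```csharp
using System;
using System.IO;

class ReadAllAtOnceSketch
{
    static void Main()
    {
        // Build a small stand-in for File2.txt so the sketch is self-contained.
        string path = Path.Combine(Path.GetTempPath(), "File2_sample.txt");
        File.WriteAllLines(path, new[] { "row one", "row two", "row three" });

        // One call reads the whole file into memory; iterating the array
        // afterwards issues no further read requests to the disk.
        string[] lines = File.ReadAllLines(path);
        Console.WriteLine(lines.Length);   // 3

        File.Delete(path);
    }
}
```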

Answer 1 (score: 4)

Disk access speed aside, your current algorithm is O(n^2) - for each line in the first file, you do a linear lookup through the list to find the user ID. You could use some caching to avoid looking up the same user ID more than once; I'm assuming you have fewer than 400,000 users, so most lookups should be repeats:

private Dictionary<string, string> userMap = new Dictionary<string, string>();

string getLine(string userId, List<string> lines_secondFile)
{
    if (userMap.ContainsKey(userId))
        return userMap[userId];

    foreach (string currentLine in lines_secondFile)
    {
        if (currentLine.Contains(userId))
        {
            userMap.Add(userId, currentLine);
            return currentLine;
        }
    }
    return null;
}
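Going one step beyond caching (this is a sketch, not part of the answer above): since the userId sits at a fixed offset in each line - the question's code uses Substring(3, 8) - the second file can be indexed into a Dictionary in one pass, making every subsequent lookup a constant-time hash probe instead of a linear scan. The field offset and the sample line layout below are assumptions carried over from the question.

```csharp
using System;
using System.Collections.Generic;

class DictionaryIndexSketch
{
    static void Main()
    {
        // Stand-in lines; in the real program these come from File2.txt.
        // Assumed layout: the userId occupies characters 3..10, as in the question.
        var lines_secondFile = new List<string>
        {
            "xxx00000001 rest of record A",
            "xxx00000002 rest of record B"
        };

        // Build the index once: O(n) total, instead of O(n) per lookup.
        var index = new Dictionary<string, string>();
        foreach (string line in lines_secondFile)
        {
            string userId = line.Substring(3, 8);
            index[userId] = line;   // last occurrence wins on duplicate IDs
        }

        // Each lookup is now a single hash probe.
        string line2;
        bool found = index.TryGetValue("00000002", out line2);
        Console.WriteLine(found ? line2 : "not found");
    }
}
```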

Answer 2 (score: 0)

If you have the resources, you can put the entire file in memory, which should improve speed. Before C# 4 you had to use the WIN32 API for memory-mapped files, but C# 4 added System.IO.MemoryMappedFiles.MemoryMappedFile.

You could also implement a multithreaded approach to process parts of the file in parallel, but that adds extra complexity.
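A minimal memory-mapped-file sketch (again using a small temporary stand-in file): the file is mapped into the process address space and then read through an ordinary StreamReader. Note that a mapped view is rounded up to a page boundary, so the tail of the view is zero-padded; the sketch skips those padding bytes.

```csharp
using System;
using System.IO;
using System.IO.MemoryMappedFiles;

class MemoryMappedSketch
{
    static void Main()
    {
        // Small stand-in file so the sketch is self-contained.
        string path = Path.Combine(Path.GetTempPath(), "mmf_sample.txt");
        File.WriteAllText(path, "line A\nline B\n");

        // Map the file into memory (.NET 4+), then read it like a stream.
        using (var mmf = MemoryMappedFile.CreateFromFile(path, FileMode.Open))
        using (var stream = mmf.CreateViewStream())
        using (var reader = new StreamReader(stream))
        {
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                // The view is padded to a page boundary with '\0' bytes;
                // ignore the padding at the end of the mapping.
                string trimmed = line.TrimEnd('\0');
                if (trimmed.Length > 0)
                    Console.WriteLine(trimmed);
            }
        }

        File.Delete(path);
    }
}
```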