Question

我正在处理一个需要加载100+ GB文本文件的项目，其中一个步骤是对指定文件中的行进行计数。为了避免出现内存不足异常，我必须按照以下方式进行操作。有没有更快或更高效的方法来完成此任务？（我知道可以把它放在4个线程上运行，然后将合并的输出除以4，但我不知道这是否是最有效的方法。）

// Count lines by streaming them one at a time; File.ReadLines enumerates lazily,
// so the whole file is never held in memory.
// The counter is 64-bit: a 100+ GB file with short lines can hold more than
// uint.MaxValue (about 4.29 billion) lines, and a uint would silently wrap around.
long loadCount2 = 0;
foreach (var line in File.ReadLines(currentPath))
{
    loadCount2++;
}

程序最终部署后，计划在一台具有4个双核CPU和40 GB RAM的服务器上运行。当前，它运行在一台临时的小型4核、8 GB RAM服务器上。（我不知道线程在多个CPU上的表现如何。）

我测试了您的很多建议。

            // Benchmark 1: buffered raw-byte scan (CountLinesMaybe).
            Stopwatch sw2 = Stopwatch.StartNew();

            // File.OpenRead asks for read-only access and allows other readers.
            // File.Open(path, FileMode.Open) requests read/write access with no sharing,
            // which is unnecessary for counting and fails on read-only files.
            using (FileStream fs = File.OpenRead(json))
            {
                CountLinesMaybe(fs);
            }

            TimeSpan t = sw2.Elapsed;
            string answer = string.Format("{0:D2}h:{1:D2}m:{2:D2}s:{3:D3}ms", t.Hours, t.Minutes, t.Seconds, t.Milliseconds);
            Console.WriteLine(answer);
            sw2.Restart();
            loadCount2 = 0;

            // Benchmark 2: Parallel.ForEach over File.ReadLines.
            // "counter++" on a variable shared between threads is not atomic and loses
            // updates, so every worker accumulates a private subtotal and the subtotals
            // are merged atomically once per worker.
            long parallelCount = 0;
            Parallel.ForEach(
                File.ReadLines(json),
                () => 0L,
                (line, loopState, subtotal) => subtotal + 1,
                subtotal => System.Threading.Interlocked.Add(ref parallelCount, subtotal));

            t = sw2.Elapsed;
            answer = string.Format("{0:D2}h:{1:D2}m:{2:D2}s:{3:D3}ms", t.Hours, t.Minutes, t.Seconds, t.Milliseconds);
            Console.WriteLine(answer);
            sw2.Restart();
            loadCount2 = 0;

            // Benchmark 3: sequential File.ReadLines.
            foreach (var line in File.ReadLines(json))
            {
                loadCount2++;
            }

            t = sw2.Elapsed;
            answer = string.Format("{0:D2}h:{1:D2}m:{2:D2}s:{3:D3}ms", t.Hours, t.Minutes, t.Seconds, t.Milliseconds);
            Console.WriteLine(answer);
            sw2.Restart();
            loadCount2 = 0;

            // Benchmark 4: read one byte at a time and compare it with '\n'.
            int query = (int)Convert.ToByte('\n');
            using (var stream = File.OpenRead(json))
            {
                int current;
                // ReadByte returns -1 at end of stream.
                while ((current = stream.ReadByte()) != -1)
                {
                    if (current == query)
                    {
                        loadCount2++;
                    }
                }
            }

            t = sw2.Elapsed;
            answer = string.Format("{0:D2}h:{1:D2}m:{2:D2}s:{3:D3}ms", t.Hours, t.Minutes, t.Seconds, t.Milliseconds);
            Console.WriteLine(answer);
            Console.ReadKey();

    // Carriage return; one of the two characters CountLinesMaybe accepts as a line terminator.
    private const char CR = '\r';
    // Line feed; the other character CountLinesMaybe accepts as a line terminator.
    private const char LF = '\n';
    // NUL character; CountLinesMaybe uses it as the "nothing seen yet" sentinel
    // for both the detected terminator and the last character read.
    private const char NULL = (char)0;

    /// <summary>
    /// Counts the lines in <paramref name="stream"/> by scanning its raw bytes in 1 MiB chunks.
    /// </summary>
    /// <remarks>
    /// The first CR or LF byte encountered is adopted as the line terminator for the rest of
    /// the stream and only that byte is counted afterwards, so a CR LF pair counts once.
    /// If the last byte read is not CR, LF or NUL, one extra line is added for the
    /// unterminated final line. Bytes are compared one by one without any text decoding.
    /// </remarks>
    /// <param name="stream">Stream to read from its current position to the end.</param>
    /// <returns>The number of lines found; 0 when no bytes could be read.</returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="stream"/> is null.</exception>
    public static long CountLinesMaybe(Stream stream)
    {
        // Fail with a descriptive exception instead of a NullReferenceException from stream.Read.
        if (stream == null)
        {
            throw new System.ArgumentNullException(nameof(stream));
        }

        const int Stride = 4;

        var lineCount = 0L;
        var byteBuffer = new byte[1024 * 1024];
        var detectedEOL = NULL;
        var currentChar = NULL;

        int bytesRead;
        while ((bytesRead = stream.Read(byteBuffer, 0, byteBuffer.Length)) > 0)
        {
            var i = 0;

            // Phase 1: until a terminator has been detected, look at every byte for CR or LF.
            // The first one found is counted and becomes the terminator for the whole stream.
            for (; detectedEOL == NULL && i < bytesRead; i++)
            {
                currentChar = (char)byteBuffer[i];
                if (currentChar == LF || currentChar == CR)
                {
                    detectedEOL = currentChar;
                    lineCount++;
                }
            }

            // Phase 2: terminator known; compare four bytes per iteration (manually unrolled).
            // When phase 1 consumed the whole chunk, i == bytesRead and this loop is skipped.
            for (; i <= bytesRead - Stride; i += Stride)
            {
                if ((char)byteBuffer[i] == detectedEOL) { lineCount++; }
                if ((char)byteBuffer[i + 1] == detectedEOL) { lineCount++; }
                if ((char)byteBuffer[i + 2] == detectedEOL) { lineCount++; }

                // Remember the last byte of the group for the final unterminated-line check.
                currentChar = (char)byteBuffer[i + 3];
                if (currentChar == detectedEOL) { lineCount++; }
            }

            // Phase 3: the remaining (fewer than four) bytes of this chunk.
            for (; i < bytesRead; i++)
            {
                currentChar = (char)byteBuffer[i];
                if (currentChar == detectedEOL) { lineCount++; }
            }
        }

        // A final line without a trailing terminator still counts as a line.
        if (currentChar != LF && currentChar != CR && currentChar != NULL)
        {
            lineCount++;
        }
        return lineCount;
    }

结果显示出很大的进步，但我希望能达到20分钟。我想在功能更强大的服务器上运行这些方法，以了解使用更多CPU的效果。

第二次运行返回： 23分钟 25分钟 22分钟 29分钟

这表示各种方法之间实际上没有任何区别。（无法截屏，因为我删除了暂停，程序通过锁定屏幕继续运行）

Answer 1

基于ReadByte（并与换行符比较）的方法可能比ReadLine更快。例如，对于大小接近GB级别的文件：

// Time a byte-by-byte scan that counts '\n' bytes.
stopwatch = System.Diagnostics.Stopwatch.StartNew();
// 64-bit counter so very large files cannot wrap it around.
long count = 0;
int query = (int)Convert.ToByte('\n');
using (var stream = File.OpenRead(filepath))
{
    int current;
    // ReadByte returns -1 at end of stream.
    while ((current = stream.ReadByte()) != -1)
    {
        if (current == query)
        {
            count++;
        }
    }
}
// Report the counter that was actually accumulated above.
Console.WriteLine($"Using ReadByte,Time : {stopwatch.Elapsed.TotalMilliseconds},Count: {count}");

使用ReadByte，时间：8174.5661，计数：7555107

// Time line counting via the lazy File.ReadLines enumerator.
stopwatch = System.Diagnostics.Stopwatch.StartNew();
// 64-bit counter so very large files cannot wrap it around.
long loadCount2 = 0;
foreach (var line in File.ReadLines(filepath))
{
    loadCount2++;
}
// Report the counter that was actually accumulated above.
Console.WriteLine($"Using ReadLines, Time : {stopwatch.Elapsed.TotalMilliseconds},Count: {loadCount2}");

使用ReadLines，时间：27303.835，计数：7555107

Answer 2

当您开始使用大数据时，您需要一个功能更强大的计算系统来使运行速度更快。如果需要速度，请增加RAM以将所有数据保存在内存中。添加NVMe SSD并将数据文件存储在其中，以提高读取性能。

在软件方面，只需按大块读取文件，然后循环遍历缓冲区，检查每个字节是否为换行符。您无需对文本行进行任何处理，例如添加或删除字符、检查模式等。ReadLine需要动态创建用于保存各行的数据结构，开销太大。

您不需要那种开销，只需一次性分配一个大的固定大小的缓冲区，读入数据，然后遍历缓冲区查找换行符。用C语言编写还可以处理得更快。

使用C＃

2 个答案: