Question

统计文件中每个字节值出现次数的最快方法是什么？我需要处理大型二进制文件。

我想知道文件中每种字节值各出现了多少次（即0x00、0x01……0xff各自的数量）。

目的是在我的WPF十六进制编辑器用户控件（https://github.com/abbaye/WPFHexEditorControl ）中添加一个表示文件内容的图形，就像HxD十六进制编辑器中的那样。

此代码工作正常，但对于大文件来说速度很慢。

/// <summary>
/// Counts how many times each byte value (0x00..0xFF) occurs, reading one byte
/// at a time starting from position 0.
/// </summary>
/// <returns>
/// A dictionary with one entry per byte value (keys 0..255) mapping to its
/// occurrence count, or an empty dictionary when <c>IsOpen</c> is false.
/// </returns>
public Dictionary<int, long> GetByteCount()
{
    if (!IsOpen)
    {
        return new Dictionary<int, long>();
    }

    Position = 0;

    // Pre-populate every possible byte value so all 256 keys are always present.
    var cd = new Dictionary<int, long>(256);
    for (int value = 0; value <= 255; value++)
    {
        cd.Add(value, 0);
    }

    // The index is a long because Length can exceed int.MaxValue for large files
    // (presumably Length is a long - confirm); an int counter would overflow and
    // the loop would not terminate correctly. The bound is strictly less than
    // Length so no read is attempted past the last byte.
    for (long i = 0; i < Length; i++)
    {
        int currentByte = ReadByte();

        // -1 signals that no byte could be read; it must not be counted.
        if (currentByte != -1)
        {
            cd[currentByte]++;
        }

        // NOTE(review): kept from the original. If ReadByte() already advances
        // Position, this extra increment skips every other byte - confirm
        // against the ReadByte implementation.
        Position++;
    }

    return cd;
}

Answer 1

    /// <summary>
    /// Counts the occurrences of every byte value, reading from position 0 in
    /// chunks of at most 1 MiB.
    /// </summary>
    /// <returns>
    /// A 256-element array in which index <c>b</c> holds the number of times
    /// byte value <c>b</c> was counted, or <c>null</c> when <c>IsOpen</c> is false.
    /// </returns>
    public long[] GetByteCount()
    {
        if (!IsOpen)
        {
            return null;
        }

        const int chunkSize = 1048576; // 1 MiB
        var histogram = new long[256];
        Position = 0;

        while (!Eof)
        {
            // The last chunk is sized to the bytes remaining so no padding is counted.
            var remaining = Length - Position;
            var chunk = new byte[remaining <= chunkSize ? remaining : chunkSize];

            // NOTE(review): any value returned by Read is ignored; every element
            // of the chunk is counted regardless of how many bytes were filled.
            Read(chunk, 0, chunk.Length);

            for (var i = 0; i < chunk.Length; i++)
            {
                histogram[chunk[i]]++;
            }

            Position += chunkSize;
        }

        return histogram;
    }

Answer 2

我对David的解决方案做了一些优化。没有必要调用"Position"。我发现缓冲区长度和无缓冲读取模式影响不大，但在计数循环中用"for"代替"foreach"结构却带来了很大的差异。

结果

// Slower variant: iterates the first `count` elements of the buffer through Take(count).
// The run reported right after this snippet took about 51 seconds.
foreach (var b in buffer.Take(count))
{
    storedCnt[b]++;
}

file length is 4110217216
duration 00:00:51.1686821

结果

// Faster variant: plain indexed loop over the first `count` bytes of the buffer.
// The run reported right after this snippet took about 6 seconds.
for(var i = 0; i < count; i++)  
{
    storedCnt[buffer[i]]++;
}

file length 4110217216
duration 00:00:05.9695418

这是程序

/// <summary>
/// Times <c>FileBytesCounter.GetByteCount</c> on a fixed test file, then prints
/// the per-byte counts, the total number of bytes counted and the elapsed time.
/// </summary>
private static void Main()
{
    const string fileForCheck = @"D:\Data\System\en_visual_studio_enterprise_2015_x86_x64_dvd_6850497.iso";
    Debug.Assert(File.Exists(fileForCheck));

    var watch = new Stopwatch();
    long[] results;

    // The using block closes the file even if counting throws.
    using (var counter = new FileBytesCounter(fileForCheck))
    {
        // Only the counting itself is timed, not opening or closing the file.
        watch.Start();
        results = counter.GetByteCount();
        watch.Stop();
    }

    Console.WriteLine("results:");
    // Select's index overload supplies the byte value (b) for each count (c).
    Console.WriteLine(string.Join(", ", results.Select((c, b) => $"{b} -> {c}")));
    var sumBytes = results.Sum();
    // Sanity check: the counts must add up to the file size.
    Debug.Assert(new FileInfo(fileForCheck).Length == sumBytes);
    Console.WriteLine();
    Console.WriteLine($"file length {sumBytes}");
    Console.WriteLine($"duration {watch.Elapsed}");
}

这是类的定义

/// <summary>
/// A read-only <see cref="FileStream"/> that can count how often each byte
/// value occurs in the file it was opened on.
/// </summary>
internal class FileBytesCounter
    : FileStream
{
    // Raw flag value for unbuffered reads; currently not passed to the base constructor.
    private const FileOptions FileFlagNoBuffering = (FileOptions)0x20000000;

    // Size of both the FileStream buffer and the read chunk: 1 MiB.
    // (An alternative of 4 * 1024 * 16 was tried and left disabled in the original.)
    private const int CopyBufferSize = 1024 * 1024;

    /// <summary>
    /// Opens an existing file for reading.
    /// </summary>
    /// <param name="path">Path of the file to open.</param>
    /// <param name="share">Sharing mode passed to the underlying stream; defaults to <see cref="FileShare.Read"/>.</param>
    public FileBytesCounter(string path, FileShare share = FileShare.Read)
        : base(path, FileMode.Open, FileAccess.Read, share, CopyBufferSize)
    {
    }

    /// <summary>
    /// Reads the file from the start and counts every byte value.
    /// </summary>
    /// <returns>
    /// A 256-element array in which index <c>b</c> holds the number of
    /// occurrences of byte value <c>b</c>.
    /// </returns>
    public long[] GetByteCount()
    {
        var chunk = new byte[CopyBufferSize];
        var histogram = new long[256];

        Position = 0;

        for (;;)
        {
            int bytesRead = Read(chunk, 0, CopyBufferSize);
            if (bytesRead <= 0)
            {
                break;
            }

            // Only the bytes filled by this read are counted; an indexed loop is
            // used deliberately instead of enumerating the buffer.
            int index = 0;
            while (index < bytesRead)
            {
                histogram[chunk[index]]++;
                index++;
            }
        }

        return histogram;
    }
}

关于FileFlagNoBuffering，另请参阅 https://www.codeproject.com/Articles/172613/Fast-File-Copy-With-Managed-Code-UBCopy-update

Answer 3

看起来你想要这样的东西：

/// <summary>
/// Counts how many times each character occurs in a text file. The file is
/// decoded as text, so this counts characters rather than raw bytes.
/// </summary>
/// <param name="filePath">Path of the file to read.</param>
/// <returns>
/// A dictionary mapping each character found in the file to its number of
/// occurrences; characters that never occur have no entry.
/// </returns>
public Dictionary<char, long> GetCharCount(string filePath)
{
    const int bufferSize = 64 * 1024;

    var result = new Dictionary<char, long>();
    var buffer = new char[bufferSize];

    // Stream the file in fixed-size chunks instead of loading it into a single
    // string, so large files do not have to fit in memory at once.
    using (var reader = new StreamReader(filePath))
    {
        int read;
        while ((read = reader.Read(buffer, 0, buffer.Length)) > 0)
        {
            for (var i = 0; i < read; i++)
            {
                var c = buffer[i];

                // One lookup: count stays 0 when the character has not been seen yet.
                long count;
                result.TryGetValue(c, out count);
                result[c] = count + 1;
            }
        }
    }

    return result;
}

（C＃）计算文件中字节的最快方法是什么？

3 个答案: