统计文件中各字节值出现次数的最快方法是什么?我需要处理大型二进制文件。
我想知道文件中每个字节值(0x00、0x01、…、0xFF)各出现了多少次,
用于在我的WPF Hexeditor usercontrol https://github.com/abbaye/WPFHexEditorControl中添加带有文件表示的图形,就像在HxD hexeditor中一样。
此代码工作正常,但对于大文件来说速度很慢。
/// <summary>
/// Counts how many times each byte value is returned by <c>ReadByte</c> while
/// stepping through the source from position 0.
/// </summary>
/// <returns>
/// A dictionary holding one entry for every byte value 0..255 mapped to its count,
/// or an empty dictionary when <c>IsOpen</c> is false.
/// </returns>
public Dictionary<int, long> GetByteCount()
{
    if (!IsOpen)
    {
        return new Dictionary<int, long>();
    }

    Position = 0;

    // Pre-seed all 256 keys so every byte value is present even when its count is zero.
    var cd = new Dictionary<int, long>(256);
    for (int value = 0; value <= 255; value++)
    {
        cd.Add(value, 0);
    }

    // The counter is 64-bit on purpose: an int counter compared against a length above
    // int.MaxValue wraps to a negative value and the loop never ends on files > 2 GiB.
    // The bound is exclusive because offsets run from 0 to Length - 1.
    for (long i = 0; i < Length; i++)
    {
        int currentByte = ReadByte();
        if (currentByte != -1)
        {
            cd[currentByte]++;
        }

        // NOTE(review): kept from the original. If ReadByte already advances Position,
        // this extra increment skips every other byte - confirm against ReadByte.
        Position++;
    }

    return cd;
}
答案 0 :(得分:3)
/// <summary>
/// Get an array of long computing the total of each byte in the file.
/// The index into the array is the byte value (0..255); the element at that index
/// is the number of times that value was read.
/// </summary>
/// <returns>
/// A 256-element array of counts, or <c>null</c> when <c>IsOpen</c> is false.
/// </returns>
public long[] GetByteCount()
{
    if (!IsOpen)
    {
        return null;
    }

    const int bufferLength = 1048576; // bytes requested per Read call (1 MiB)

    var storedCnt = new long[256];

    // One buffer is reused for every chunk instead of allocating a new array per iteration.
    var buffer = new byte[bufferLength];

    Position = 0;
    while (!Eof)
    {
        var chunkStart = Position;
        var remaining = Length - chunkStart;
        if (remaining <= 0)
        {
            break;
        }

        var toRead = remaining < bufferLength ? (int)remaining : bufferLength;
        var read = Read(buffer, 0, toRead);
        if (read <= 0)
        {
            // Nothing was delivered; stop rather than spin or count stale buffer content.
            break;
        }

        // Count only the bytes Read reported. Slots beyond 'read' hold leftovers
        // from an earlier chunk (or zeros) and must not be tallied.
        for (var i = 0; i < read; i++)
        {
            storedCnt[buffer[i]]++;
        }

        // Set the absolute offset from the chunk start plus the bytes consumed, so the
        // next chunk begins right after this one whether or not Read moved Position
        // itself (Read is not visible here - NOTE(review): confirm).
        Position = chunkStart + read;
    }

    return storedCnt;
}
答案 1 :(得分:1)
我对David的解决方案做了一些优化。没有必要设置"Position"。我发现缓冲区大小和无缓冲读取模式影响不大,但在计数循环中用"for"代替"foreach"结构会带来很大的差别。
结果
// Benchmark variant: enumerate the first `count` bytes of the buffer through LINQ's Take()
// and bump the counter slot indexed by each byte value.
foreach (var b in buffer.Take(count))
{
storedCnt[b]++;
}
file length is 4110217216
duration 00:00:51.1686821
结果
// Benchmark variant: index the buffer directly for the first `count` bytes
// and bump the counter slot indexed by each byte value.
for(var i = 0; i < count; i++)
{
storedCnt[buffer[i]]++;
}
file length 4110217216
duration 00:00:05.9695418
这是程序
/// <summary>
/// Times <c>FileBytesCounter.GetByteCount</c> over a fixed file, prints the count for
/// every byte value, and checks that the counts add up to the file's length.
/// </summary>
private static void Main()
{
    const string fileForCheck = @"D:\Data\System\en_visual_studio_enterprise_2015_x86_x64_dvd_6850497.iso";
    Debug.Assert(File.Exists(fileForCheck));

    var watch = new Stopwatch();
    long[] results;

    // The using statement disposes the counter even when GetByteCount throws.
    // Only the counting call is timed, not opening or closing the file.
    using (var counter = new FileBytesCounter(fileForCheck))
    {
        watch.Start();
        results = counter.GetByteCount();
        watch.Stop();
    }

    Console.WriteLine("results:");
    // In Select's (element, index) overload the index is the byte value and the element its total.
    Console.WriteLine(string.Join(", ", results.Select((total, value) => $"{value} -> {total}")));

    var sumBytes = results.Sum();
    Debug.Assert(new FileInfo(fileForCheck).Length == sumBytes); // every byte was counted exactly once

    Console.WriteLine();
    Console.WriteLine($"file length {sumBytes}");
    Console.WriteLine($"duration {watch.Elapsed}");
}
这是该类的代码
/// <summary>
/// A read-only <see cref="FileStream"/> that can tally how often each byte value
/// occurs in the file it was opened on.
/// </summary>
internal class FileBytesCounter : FileStream
{
    // Raw option value cast to FileOptions; declared for experiments but not passed to the base constructor.
    private const FileOptions FileFlagNoBuffering = (FileOptions)0x20000000;

    // Used both as the FileStream buffer size and as the size of each read chunk.
    private const int CopyBufferSize = 1024 * 1024;

    /// <summary>
    /// Opens an existing file for reading.
    /// </summary>
    /// <param name="path">Path of the file to open.</param>
    /// <param name="share">Sharing mode handed to the underlying stream.</param>
    public FileBytesCounter(string path, FileShare share = FileShare.Read)
        : base(path, FileMode.Open, FileAccess.Read, share, CopyBufferSize)
    {
    }

    /// <summary>
    /// Reads the stream from offset 0 until Read reports no more data and counts every byte.
    /// </summary>
    /// <returns>A 256-element array; element <c>v</c> is the number of bytes with value <c>v</c>.</returns>
    public long[] GetByteCount()
    {
        var histogram = new long[256];
        var chunk = new byte[CopyBufferSize];

        Position = 0;
        while (true)
        {
            var bytesRead = Read(chunk, 0, CopyBufferSize);
            if (bytesRead <= 0)
            {
                break;
            }

            // Only the first bytesRead slots hold data from this call.
            var index = 0;
            while (index < bytesRead)
            {
                histogram[chunk[index]]++;
                index++;
            }
        }

        return histogram;
    }
}
另请参阅https://www.codeproject.com/Articles/172613/Fast-File-Copy-With-Managed-Code-UBCopy-update了解FileFlagNoBuffering
答案 2 :(得分:0)
看起来你想要这样的东西:
/// <summary>
/// Counts how many times each decoded character occurs in a text file.
/// </summary>
/// <remarks>
/// The file is decoded as text, so this counts characters rather than raw bytes.
/// It is read in fixed-size chunks so the whole content is never held in memory at once.
/// </remarks>
/// <param name="filePath">Path of the file to read.</param>
/// <returns>A dictionary mapping each character that occurs to its number of occurrences.</returns>
public Dictionary<char, long> GetCharCount(string filePath)
{
    var result = new Dictionary<char, long>();
    var buffer = new char[4096];

    // StreamReader(path) applies the same default decoding as File.ReadAllText(path).
    using (var reader = new StreamReader(filePath))
    {
        int read;
        while ((read = reader.Read(buffer, 0, buffer.Length)) > 0)
        {
            for (var i = 0; i < read; i++)
            {
                var c = buffer[i];

                // TryGetValue leaves 'seen' at 0 for a new character, giving one lookup
                // plus one store instead of ContainsKey followed by two indexer calls.
                long seen;
                result.TryGetValue(c, out seen);
                result[c] = seen + 1;
            }
        }
    }

    return result;
}