我需要读取一个大文件,删除一些空格字符,压缩内容并从中创建一个双SHA256哈希。为了避免将整个文件加载到内存中(这是不可能的,因为某些文件有数百MB)并避免性能瓶颈,我希望只读取一次整个文件。因此,我专门包装了FileStream
并完成了工作。 DeflaterOutputStream
来自SharpZipLib。
public class DeflateAndHashStream : Stream
{
private readonly FileStream _input;
private readonly MemoryStream _compressedFile;
private readonly DeflaterOutputStream _deflate;
public DeflateAndHashStream(FileStream input)
{
_input = input;
_compressedFile = new MemoryStream();
_deflate = new DeflaterOutputStream(_compressedFile);
}
public override void Flush()
{
throw new NotImplementedException();
}
public override long Seek(long offset, SeekOrigin origin)
{
throw new NotImplementedException();
}
public override void SetLength(long value)
{
throw new NotImplementedException();
}
public override int Read(byte[] buffer, int offset, int count)
{
byte[] internalBuffer = new byte[buffer.Length];
//read bufferbytes from the file
int readedBytes = _input.Read(internalBuffer, 0, buffer.Length);
//Are we done reading the file?
if (readedBytes == 0) return 0;
//remove whitespaces from internalBuffer
//method takes bytearray and cut whitespace chars in place
//returns the count of removed characters
//so this line corrects the readed bytes
readedBytes -= RemoveWhitespace(ref internalBuffer);
//make the result available for stream chaining
buffer = internalBuffer;
//in parall compress the file into internal memorystream
_deflate.Write(internalBuffer, 0, internalBuffer.Length);
return readedBytes;
}
public byte[] GetCompressedData()
{
_deflate.Flush();
_deflate.Finish();
return _compressedFile.ToArray();
}
public override void Write(byte[] buffer, int offset, int count)
{
throw new NotImplementedException();
}
public override bool CanRead => _input.CanRead;
public override bool CanSeek => _input.CanSeek;
public override bool CanWrite => _input.CanWrite;
public override long Length => _input.Length;
public override long Position
{
get => _input.Position;
set => _input.Position = value;
}
private int RemoveWhitespace(ref byte[] digest)
{
var output = new MemoryStream();
int removedWhiteSpaces = 0;
foreach (var actualByte in digest)
switch (actualByte)
{
case 10:
case 13:
case 26:
// ignore this character
removedWhiteSpaces++;
break;
default:
output.WriteByte(actualByte);
break;
}
digest = output.ToArray();
return removedWhiteSpaces;
}
}
我这样叫DeflateAndHashStream
:
public string[] CreateHashAndZipFile(string filePath)
{
if(!File.Exists(filePath)) throw new FileNotFoundException();
string[] result = new string[2];
using (FileStream fs = new FileStream(filePath, FileMode.Open))
{
using (DeflateAndHashStream defhash = new DeflateAndHashStream(fs))
{
using (SHA256 sha = new SHA256Managed())
{
result[0] = Convert.ToBase64String(sha.ComputeHash(sha.ComputeHash(defhash)));
result[1] = Convert.ToBase64String(defhash.GetCompressedData());
}
}
}
return result;
}
不幸的是,这会创建一个完全不同的散列,如下所示:
string test1;
using (FileStream fs = new FileStream("test.txt", FileMode.Open))
{
using (SHA256Managed sha = new SHA256Managed())
{
test1 = Convert.ToBase64String(sha.ComputeHash(sha.ComputeHash(fs)));
}
}
使用仅包含单个A(0x65)的文件test.txt
我在做什么错了?