我写了以下简单的测试:
/// <summary>
/// Demonstrates what happens when only a prefix of a UTF-8 byte sequence is decoded
/// with <c>Encoding.UTF8.GetChars</c> and the prefix ends in the middle of a
/// multi-byte character.
/// </summary>
[Test]
public void TestUTF8()
{
// Six ASCII letters with one non-ASCII symbol in the middle.
var c = "abc☰def";
var b = Encoding.UTF8.GetBytes(c);
// 9 bytes total: the six ASCII letters take one byte each, so the symbol accounts for the other three.
Assert.That(b.Length, Is.EqualTo(9));
//Assuming, you are reading a byte stream and got partial result with the first 5 bytes
// (i.e. "abc" followed by only the first two bytes of the three-byte symbol).
var p = Encoding.UTF8.GetChars(b, 0, 5);
Trace.WriteLine(new string(p));
// Desired outcome: only the three complete characters are returned.
// NOTE: as written this assertion is reported to fail, because the incomplete
// trailing bytes are decoded to a replacement character and p.Length is 4.
Assert.That(p.Length, Is.EqualTo(3));
}
Trace 输出 abc�,并且最后一个断言失败,因为 p.Length 为 4。
但是,我希望 Trace 输出 abc,并且最后一个断言通过,因为我知道这个流最终包含的都是有效字符;当末尾的几个字节还不能构成一个完整字符时,只需把它们留在原处,等待更多数据到来即可。
那么我怎样才能在C#中实现这一目标?
答案 0 :(得分:4)
Encoding.GetChars 并不是为解码来自流的字节而设计的:在这种场景下,解码过程中需要保留一些状态,因为单个字符可能跨越多个缓冲区段。要完成这项工作,您应该使用通过 Encoding.GetDecoder 获得的 Decoder。不过,Decoder.Convert 是一个相当底层的方法,它允许您分别控制输入和输出缓冲区,因此有些难以使用。Decoder.GetChars 更容易使用,并且同样完成了在多次调用之间保存状态这一重要工作。我们可以很容易地把 Peter Duniho 的回答扩展为支持任意缓冲区大小:
/// <summary>
/// Demo entry point: encodes a sample string as UTF-8, decodes it back through
/// <c>DecodeFromStream</c> using a 3-byte read buffer, then prints the decoded
/// text followed by whether it equals the original string.
/// </summary>
public static void Main(string[] args)
{
    const string original = "abc☰def";
    byte[] encoded = Encoding.UTF8.GetBytes(original);
    Stream source = new MemoryStream(encoded);

    string decoded = DecodeFromStream(source, Encoding.UTF8, 3);

    // First line: the decoded text. Second line: the round-trip check result.
    Console.WriteLine(decoded);
    Console.WriteLine(original == decoded);
}
/// <summary>
/// Reads <paramref name="dataStream"/> to its end in chunks of at most
/// <paramref name="bufferSize"/> bytes and decodes the bytes to a string.
/// A single stateful <see cref="Decoder"/> is used for all chunks, so a character
/// whose bytes are split across two reads is still decoded correctly.
/// </summary>
/// <param name="dataStream">Stream to read from; it is read until <c>Read</c> returns 0. The stream is not disposed.</param>
/// <param name="encoding">Encoding that supplies the decoder.</param>
/// <param name="bufferSize">Size in bytes of the read buffer; must be positive.</param>
/// <returns>
/// The decoded text. The decoder is never flushed, so bytes of an incomplete
/// character at the very end of the stream stay buffered in the decoder and are
/// not emitted (no replacement character is produced for them).
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="dataStream"/> or <paramref name="encoding"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="bufferSize"/> is zero or negative.</exception>
private static string DecodeFromStream(Stream dataStream, Encoding encoding, int bufferSize)
{
    if (dataStream == null)
    {
        throw new ArgumentNullException(nameof(dataStream));
    }
    if (encoding == null)
    {
        throw new ArgumentNullException(nameof(encoding));
    }
    if (bufferSize <= 0)
    {
        // A zero-length buffer would make Read return 0 immediately and silently yield "".
        throw new ArgumentOutOfRangeException(nameof(bufferSize), bufferSize, "Buffer size must be positive.");
    }

    Decoder decoder = encoding.GetDecoder();
    StringBuilder sb = new StringBuilder();
    byte[] inputBuffer = new byte[bufferSize];
    // Sized for the worst case of one full input buffer, so GetChars never runs out of room.
    char[] charBuffer = new char[encoding.GetMaxCharCount(inputBuffer.Length)];

    int inputByteCount;
    while ((inputByteCount = dataStream.Read(inputBuffer, 0, inputBuffer.Length)) > 0)
    {
        // The decoder keeps any trailing partial character internally and completes it
        // on a later call, so a chunk may legitimately produce zero characters.
        int readChars = decoder.GetChars(inputBuffer, 0, inputByteCount, charBuffer, 0);
        if (readChars > 0)
        {
            sb.Append(charBuffer, 0, readChars);
        }
    }

    return sb.ToString();
}