如何使用C/C++读取带有Unicode内容的文件?
我使用ReadFile函数读取含有Unicode内容的文件,但得到的输出不正确。我想要一个包含文件全部内容的缓冲区。
我使用此代码:
#include <Windows.h>
// Question's original attempt: opens "file", asks for its size, allocates a
// zero-filled heap buffer of twice that size and issues a single ReadFile call.
// Always returns 0, whether or not the file could be opened or read.
int main()
{
HANDLE hndlRead;
// Zero-initialised OVERLAPPED, passed to ReadFile below.
OVERLAPPED ol = {0};
// The destination buffer is typed as CHAR (byte-sized elements).
CHAR* szReadBuffer;
INT fileSize;
// Existing file only (OPEN_EXISTING), read access; FILE_FLAG_OVERLAPPED is not requested.
hndlRead = CreateFileW(L"file", GENERIC_READ, 0, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
if (hndlRead != INVALID_HANDLE_VALUE)
{
// The result is stored as-is; it is not compared against INVALID_FILE_SIZE.
fileSize = GetFileSize(hndlRead, NULL);
// HEAP_ZERO_MEMORY: the (fileSize)*2 bytes start out as zeros.
szReadBuffer = (CHAR*) HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, (fileSize)*2);
// nb receives the number of bytes ReadFile actually transferred.
DWORD nb=0;
int nSize=fileSize;
if (szReadBuffer != NULL)
{
// The BOOL returned by ReadFile is discarded, so a failed read goes unnoticed.
ReadFile(hndlRead, szReadBuffer, nSize, &nb, &ol);
}
// NOTE(review): szReadBuffer is never passed to HeapFree and hndlRead is
// never passed to CloseHandle anywhere in this function.
}
return 0;
}
有没有办法正确读取这个文件?
这是nb和szReadBuffer:
这是我的文件在Notepad++中显示的内容:
答案 0 :(得分:0)
您的代码运行正常。它将rdp文件逐字读入内存。
让您困惑的是rdp文件开头的BOM(byte order mark,字节顺序标记)。
如果你用文本编辑器(例如记事本)查看rdp文件,你会看到:
screen mode id:i:2
use multimon:i:0
desktopwidth:i:2560
desktopheight:i:1600
....
如果你用十六进制编辑器查看rdp文件,你会看到:
0000 FFFE 7300 6300 7200 6500 6500 6E00 2000 ..s.c.r.e.e.n. .
0008 6D00 6F00 6400 6500 2000 6900 6400 3A00 m.o.d.e. .i.d...
....
FFFE
是字节顺序标记(BOM),表示该文件是以小端(little-endian)UTF-16编码的文本文件,因此每个字符占用2个字节而不是1个字节。
文件读入内存后,你会得到如下内容(0x00318479是szReadBuffer所指向的地址):
另外还有两点:
- 原代码没有调用CloseHandle(hndlRead)。
- 原代码使用了HeapAlloc,而不是malloc或calloc。

更正后的程序:
#include <Windows.h>
// Corrected sample: loads the whole of "rdp.RDP" into a zero-filled,
// NUL-terminated WCHAR buffer so the text can be inspected in a debugger.
// The file is expected to be UTF-16 little-endian, optionally starting with
// a byte order mark (bytes FF FE, i.e. the WCHAR value 0xFEFF).
//
// Returns 0 when the complete file was read, 1 on any failure (open, size
// query, allocation or read). Every resource acquired here is released
// before returning.
int main()
{
    int exitCode = 1; // cleared only once the whole file is in memory

    HANDLE hndlRead = CreateFileW(L"rdp.RDP", GENERIC_READ, 0, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
    if (hndlRead != INVALID_HANDLE_VALUE)
    {
        // GetFileSize reports failure as INVALID_FILE_SIZE; keep the value
        // unsigned so it can be compared and passed to ReadFile unchanged.
        DWORD fileSize = GetFileSize(hndlRead, NULL);
        if (fileSize != INVALID_FILE_SIZE)
        {
            // Two extra WCHARs: one absorbs a stray trailing byte when the
            // file has an odd length, the other is the NUL string terminator.
            WCHAR* szReadBuffer = (WCHAR*)calloc((size_t)fileSize + 2 * sizeof(WCHAR), 1);
            if (szReadBuffer != NULL)
            {
                DWORD nb = 0;
                // The read counts as successful only if it returned TRUE
                // and delivered every byte of the file.
                if (ReadFile(hndlRead, szReadBuffer, fileSize, &nb, NULL) && nb == fileSize)
                {
                    // Skip the BOM only when it is really there.
                    WCHAR* textwithoutbom = szReadBuffer;
                    if (textwithoutbom[0] == 0xFEFF)
                    {
                        ++textwithoutbom;
                    }
                    // put breakpoint here and inspect textwithoutbom
                    (void)textwithoutbom;
                    exitCode = 0;
                }
                free(szReadBuffer); // free what we have allocated
            }
        }
        CloseHandle(hndlRead); // close what we have opened
    }
    return exitCode;
}
答案 1 :(得分:0)
正如@MickaelWalz所建议的那样,RDP文件的文件格式现在是Unicode。
下面是一种读取并显示该文件内容的方法:
- 使用 wchar_t * 或 BYTE * 缓冲区,而不是 CHAR * 缓冲区。
- 检查 ReadFile() 是否已成功执行(bRet == TRUE 且 nSize == nb)。
- 从第二个WCHAR开始,以跳过0xFFFE Unicode标识符(BOM)。
- 不要忘记关闭文件:CloseHandle(hndlRead);
#include <stdio.h>
#include <iostream>
#include <Windows.h>
// Reads "rdp.RDP" (expected to be UTF-16 little-endian text) into a
// zero-filled heap buffer, prints the first two and the last 16-bit units in
// hex, then prints the text itself, without a leading byte order mark,
// followed by the number of bytes read.
//
// Returns 0 when the file was read and printed, 1 on any failure; a short
// diagnostic goes to stderr in that case. The heap buffer and the file
// handle are always released before returning.
int main()
{
    HANDLE hndlRead = CreateFileW(L"rdp.RDP", GENERIC_READ, 0, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
    if (hndlRead == INVALID_HANDLE_VALUE)
    {
        fprintf(stderr, "cannot open rdp.RDP (error %lu)\n", GetLastError());
        return 1;
    }

    int exitCode = 1;

    // GetFileSize reports failure as INVALID_FILE_SIZE; keep the size
    // unsigned so it compares cleanly with the byte count from ReadFile.
    const DWORD fileSize = GetFileSize(hndlRead, NULL);
    if (fileSize == INVALID_FILE_SIZE)
    {
        fprintf(stderr, "cannot query file size (error %lu)\n", GetLastError());
    }
    else
    {
        // File bytes plus two zeroed wchar_t: one absorbs a stray trailing
        // byte of an odd-length file, the other is the NUL terminator.
        wchar_t* szReadBuffer = (wchar_t*)HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY,
                                                    (SIZE_T)fileSize + 2 * sizeof(wchar_t));
        if (szReadBuffer == NULL)
        {
            fprintf(stderr, "out of memory\n");
        }
        else
        {
            DWORD nb = 0;
            // The handle was opened for synchronous I/O, so no OVERLAPPED is needed.
            const BOOL bRet = ReadFile(hndlRead, szReadBuffer, fileSize, &nb, NULL);

            // nb counts bytes; convert to 16-bit units before indexing the buffer.
            const DWORD cch = (DWORD)(nb / sizeof(wchar_t));

            if (bRet && nb == fileSize && cch > 0)
            {
                printf("%02X,%02X... %02X\n",
                       (unsigned)szReadBuffer[0],
                       (unsigned)szReadBuffer[1],
                       (unsigned)szReadBuffer[cch - 1]);

                // Skip the byte order mark only when it is really present.
                const wchar_t* text = (szReadBuffer[0] == 0xFEFF) ? szReadBuffer + 1 : szReadBuffer;
                std::wcout << L"info " << text << L" " << nb << std::endl;
                exitCode = 0;
            }
            else
            {
                fprintf(stderr, "read failed, was incomplete, or the file is empty\n");
            }

            HeapFree(GetProcessHeap(), 0, szReadBuffer);
        }
    }

    CloseHandle(hndlRead);
    return exitCode;
}