Question

我想在二进制文件中找出指定的序列（例如0001）。文件可以正常打开并读取，文件大小也能正确显示。不幸的是，我写不出检测指定序列的算法，卡在了这里。有人有什么想法吗？文件名和进程名等参数通过命令提示符传入。代码如下。

 /*
  * Opens the file named by box for reading, reads up to BUFFERSIZE bytes
  * from it in a single ReadFile call, and prints how many bytes were read.
  *
  * box: wide-character path of the file to open.
  *
  * Returns 0 after a successful read, or -5 if the file cannot be opened
  * or the read fails.
  *
  * NOTE(review): the search loop below does not actually scan the data;
  * see the comments on the loop.
  */
 int blockchain(wchar_t *box)
 {
     int i=0;
     DWORD dwCounter=0;
     BYTE buffer[BUFFERSIZE]={0};
     /* Open an existing file read-only, allowing other readers. */
     HANDLE hFile = CreateFile(box, GENERIC_READ, FILE_SHARE_READ, NULL,
         OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);

     if(INVALID_HANDLE_VALUE == hFile) {
         wprintf(L"Error! Wrong file name!\n");
         return -5;
     }

     /* dwCounter receives the number of bytes actually placed in buffer. */
     if(FALSE==ReadFile(hFile, &buffer, BUFFERSIZE, &dwCounter, NULL)) {
         wprintf(L"Terminal failure: Unable to read from file.\n");
         CloseHandle(hFile);
         return -5;
     } else {
         /*
          * NOTE(review): i is incremented but never used to offset into
          * buffer, so every iteration compares the same starting bytes.
          * wcscmp also treats the BYTE buffer as a NUL-terminated wide
          * string and compares it against the wide literal L"0001", which
          * is not a comparison of four raw bytes.
          */
         for(i;i<dwCounter;i++) {
             if(wcscmp(buffer,L"0001") == 0)
                 wprintf(L"The data is found");
         }
         /* NOTE(review): dwCounter is a DWORD but is printed with %d - confirm the intended specifier. */
         wprintf(L"\nData read from %s (%d bytes): \n", box, dwCounter);    
     }
     CloseHandle(hFile);
     return 0;
 }

Answer 1

（以答案的形式发布，因为我还没有足够的声望发表评论。）

接着我上面的评论：在我看来，您对memcmp的用法不太正确，原始问题中wcscmp的用法也有问题。您在每次测试后都将i递增，但每次使用的都是未经修改的同一个缓冲区指针——根据我的理解，在这两种情况下，您每次循环测试的都是完全相同的字节（buffer的开头），而不是逐步遍历已加载的数据。另外，在memcmp的情况下，您使用的是wchar_t字符串字面量（L"0001"）；它在内存中占8个字节（不含终止符），而不是您评论里memcmp调用中指定的4个字节，因此比较只会针对前两个“0”字符。

这里的正确方法取决于您要处理的数据的性质。您是要在ASCII编码的文件中查找（用户看到的）字符序列0001，还是在Unicode文件中查找L"0001"，还是查找十六进制字节0x0、0x0、0x0、0x1，或者这种表示法还有其他含义？如果是ASCII文件中的字符序列，那么您可能需要类似memcmp(buffer + i, "0001", 4)这样的代码来逐字节检查字符序列：遍历缓冲区，并且每次迭代比较的字节数不超过缓冲区中剩余的字节数。

但是，如果有合理可行的替代方法，我建议不要使用这样的逐字节比较代码。问题“How to search in a BYTE array for a pattern?”所讨论的内容似乎也涵盖了您想做的事情，我猜那个问题的答案可能比这种逐字节的方法更好。或者，如果您处理的是字符/文本数据，并且目标值可以用正则表达式合理地表达，也可以考虑使用正则表达式实现（如果可用）。

Answer 2

根据 https://pp.userapi.com/c849236/v849236424/46ac8/yy8w6ak1Ddc.jpg 这张图片来看，我假设您要查找的是给定的字节序列。

因此，标准函数strstr在这里行不通，因为它把NUL字节（值为0）当作字符串结束符，无法在包含0字节的数据中搜索。

先从匹配函数开始（您可以把它内联到下一个函数中；为清楚起见，我将其单独列出）：

/* Returns true when the first num_bytes bytes at a and at b are identical. */
bool MatchBytes( const BYTE* a, const BYTE* b, int num_bytes )
{
    const int difference = memcmp( a, b, num_bytes );

    if ( difference != 0 )
        return false;
    return true;
}

编写函数以查找序列：

/*
 * Searches find_in (num_bytes_in_find_in bytes long) for the first
 * occurrence of the num_bytes_to_find bytes at to_find.
 *
 * Returns the zero-based offset of the first match within find_in,
 * or -1 when the sequence does not occur.
 */
int FindBytesIndex(
       const BYTE* find_in,
       int num_bytes_in_find_in,
       const BYTE* to_find,
       int num_bytes_to_find )
{
    /* Last offset at which the whole pattern still fits in find_in. */
    const int last_start = num_bytes_in_find_in - num_bytes_to_find;
    int offset = 0;

    while ( offset <= last_start )
    {
        if ( MatchBytes( find_in + offset, to_find, num_bytes_to_find ) )
            return offset;
        ++offset;
    }
    return -1;
}

或者：

/*
 * Searches find_in (num_bytes_in_find_in bytes long) for the first
 * occurrence of the num_bytes_to_find bytes at to_find.
 *
 * Returns a pointer to the first match inside find_in, or NULL when the
 * sequence does not occur, when the pattern is longer than the data, or
 * when num_bytes_to_find is negative.
 */
const BYTE* FindBytesPtr(
       const BYTE* find_in,
       int num_bytes_in_find_in,
       const BYTE* to_find,
       int num_bytes_to_find )
{
    const BYTE* last_start;

    /* Reject sizes for which no start position exists; this also keeps the
       pointer arithmetic below inside the buffer. */
    if ( num_bytes_to_find < 0 || num_bytes_to_find > num_bytes_in_find_in )
        return NULL;

    /* Last position at which the whole pattern still fits. The bound is
       inclusive, so a match at the very end of the data is also found. */
    last_start = find_in + ( num_bytes_in_find_in - num_bytes_to_find );
    for ( ; find_in <= last_start ; ++find_in )
        if ( MatchBytes( find_in, to_find, num_bytes_to_find ) )
            return find_in;
    return NULL;
}

用法：

/* Pattern to look for: the four raw bytes 00 00 00 01. */
BYTE pattern[] = { 0, 0, 0, 1 };
int pattern_len = 4;
/* Offset of the first match within buffer, or -1 if there is none. */
int found_at = FindBytesIndex( buffer, dwCounter, pattern, pattern_len );
if ( found_at < 0 )
    printf( "Not found\n" );
else
    printf( "Found at index %d\n", found_at );

如果您希望所有出现的情况：

BYTE bytes[] = { 0, 0, 0, 1 };
int num_bytes = 4;
int index = 0;
int count = 0;
/* There is assigment in loop condition. Found index is assigned to "index" variable */
/* then it is checked if >= 0 */
while ( ( index = FindBytesIndex(buffer + index, dwCounter - index, bytes, num_bytes) ) >= 0 )
{
    printf( "Found at index %d\n", index );
    ++count;
}

if ( count > 0 )
    printf( "Found %d occurrences\n", count );
else
    printf( "Not found\n" );

上面显示的算法被称为“朴素模式搜索”。如果速度太慢，可以改用更快的搜索算法，例如“Boyer-Moore”、“Rabin-Karp”或“KMP”（“Knuth-Morris-Pratt”）。

如何使用WinAPI / Win32在二进制文件中找到序列？

2 个答案: