为什么Horspool不能使用二进制文件?

时间:2017-12-08 22:23:14

标签: c algorithm search

我正在尝试在C中制作一个快速简单的签名检测程序。它应该读取二进制文件(.exe,ELF,库等...)并搜索二进制数据(有时是字符串,有时是字节);

我在C中有一个简单的测试程序:

#include <stdio.h>
#include <unistd.h>

const char *str = "TestingOneTwoThree";

int main()
{
    while(1)
    {
        fprintf(stdout, "%s %ld\n", str, (long)getpid());
        sleep(1);
    }

}

这是我正在使用的horspool算法。我直接从这里找到的维基百科伪代码中调整了它:https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore%E2%80%93Horspool_algorithm

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define HORSPOOL_COUNT 256
#define BLOCK_SIZE 1024
#define MAX(a, b) a > b ? a : b

ssize_t horspool_find(const char *buf, size_t buflen, const char *egg, size_t egglen)
{
    int table[HORSPOOL_COUNT];
    ssize_t shift = 0, i, tmp;

    for(i = 0; i < HORSPOOL_COUNT; ++i)
    {
        table[i] = (int)egglen;
    }

    for(i = 0; i < egglen - 1; ++i)
    {
        table[(int)egg[i]] = egglen - i - 1;
    }

    while(shift <= buflen - egglen)
    {
        i = egglen - 1;
        while(buf[shift + i] == egg[i])
        {
            if(i == 0)
            {
                return shift;
            }
            i--;
        }
        shift += MAX(1, table[(int)buf[shift + egglen - 1]]);
    }
    return -1;
}

char *readfile(const char *filename, size_t *size)
{
    int ch;
    size_t used = 0, allocated = 0;
    char *buf = NULL, *tmp = NULL;
    FILE *f;

    if((f = fopen(filename, "rb")) == NULL)
    {
        if(size) *size = 0;
        return perror("fopen"), NULL;
    }

    while((ch=fgetc(f)) != EOF)
    {
        if(used >= allocated)
        {
            allocated += BLOCK_SIZE;
            tmp = realloc(buf, allocated);
            if(tmp == NULL)
            {
                free(buf);
                if(size) *size = 0;
                fclose(f);
                return perror("realloc"), NULL;
            }
            buf = tmp;
        }
        buf[used++] = (char)ch;
    }

    fclose(f);
    if(size) *size = used;
    return realloc(buf, used);
}

ssize_t naivealg_find(const char *buf, size_t buflen, const char *find, size_t findlen)
{
    size_t i, j, diff = buflen - findlen;
    for(i = 0; i < diff; ++i)
    {
        for(j = 0; j < findlen; ++j)
        {
            if(buf[i+j] != find[j])
            {
                break;
            }
        }
        if(j == findlen)
        {
            return (ssize_t)i;
        }
    }
    return -1;
}

int main()
{
    size_t size;
    char *buf = readfile("./a.out", &size);
    char *pat = "TestingOneTwoThree";
    ssize_t pos1 = horspool_find(buf, size, pat, strlen(pat));
    ssize_t pos2 = naivealg_find(buf, size, pat, strlen(pat));
    fprintf(stdout, "Offsets: %zd ~ %zd\n", pos1, pos2);
    return 0;
}

输出类似于:

Offsets: -1 ~ 2052

注意:

  • 相同的缓冲区和“egg”与天真的搜索实现一起工作。
  • horspool实现似乎可以使用普通字符串作为bufegg参数正常工作。

1 个答案:

答案 0 :(得分:2)

代码使用带签名的char并且二进制数据会不时地使用负索引进行错误索引。

// table[(int)buf[shift + egglen - 1]]
table[(unsigned char )buf[shift + egglen - 1]]

此问题也存在于egg模式中。

// table[(int) egg[i]] = egglen - i - 1;
table[(unsigned char) egg[i]] = egglen - i - 1;

buflen < egglen

时会出现其他问题
// while (shift <= buflen - egglen)
// change to avoid underflow
while (shift + egglen <= buflen)

还考虑以二进制文件打开文件:

ssize_t shift,i; --> size_t shift,i;

int table[HORSPOOL_COUNT]; -- > size_t table[HORSPOOL_COUNT];

()添加到#define MAX(a, b) (((a) > (b)) ? (a) : (b))