计算字符串中子字符串的特定匹配项

时间:2014-10-12 07:31:26

标签: c++ algorithm substring query-optimization counting

问题:给定一个长度为n的字符串s,以及大量形如 [L,R] 的查询。
对于每个查询,需要返回s中等于s[L..R]并且起始位置在下标L之前的子字符串的数量。

约束:
 n <= 2 * 10 ^ 6
 查询 <= 10 ^ 5

一种暴力方法是使用后缀数组,我们遍历LCP数组以查找每个查询的答案。

我认为O(q log n)方法对这个问题有好处。请建议任何??

提前感谢..

2 个答案:

答案 0 :(得分:1)

可能的解决方案可能类似于solution that I provided for fast string search with many queries to the same string。该解决方案在C中具有示例实现。

  • 创建指向字符串每个字符的指针数组。排序。
  • 使用二分查找在已排序的后缀数组中查找查询串第一次出现的位置。
  • 查找最后一次出现的位置。可以通过把查询串的最后一个字符加一来实现,例如查找“abd”而不是“abc”,从而找到第一个不再匹配的位置。
  • 统计这两个位置之间所有在L之前开始的出现次数。

但是,此解决方案不是O( q log n ),因为排序已经是O( n log n )和 q 查询查找为O( q log n )。

我已经重新编写了我为您的问题链接的示例。它是C,而不是C ++,它不会用C ++编译器编译,因为使用malloc的方式。在惯用的C ++中重写它应该不会太难。

该解决方案需要大量内存用于后缀数组。它可以在一分多钟的时间内处理大约130,000个查询和1.3兆字节的文件。

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

/*
 *      Report a fatal error and terminate: writes "Fatal: " followed by
 *      the printf-style message and the character with code 10 (newline
 *      in ASCII) to stderr, then calls exit(1). The format must be a
 *      string literal, because it is joined to "Fatal: " by literal
 *      concatenation.
 */
#define die(...) exit((fprintf(stderr, "Fatal: " __VA_ARGS__), \
    putc(10, stderr), 1))



/* A text to search, together with the pointer array built over it. */
typedef struct Haystack {
    size_t size;    /* length of the text in chars */
    char *buf;      /* the text itself, null-terminated */
    char **ptr;     /* pointers to positions inside buf */
} Haystack;

/*
 *      Return how many times the character c appears in the
 *      zero-terminated string p.
 *
 *      c must not be '\0': strchr() would find the terminator itself
 *      and the scan would then continue past the end of the string.
 */
size_t strcount(const char *p, int c)
{
    size_t hits = 0;
    const char *at;

    /* hop from one hit to the next until strchr() finds no more */
    for (at = strchr(p, c); at != NULL; at = strchr(at + 1, c)) {
        hits++;
    }

    return hits;
}

/*
 *      qsort-style comparator: a and b each point to a string pointer;
 *      the two strings are ordered with strcmp.
 */
int pstrcmp(const void *a, const void *b)
{
    const char *lhs = *(const char *const *) a;
    const char *rhs = *(const char *const *) b;

    return strcmp(lhs, rhs);
}

/*
 *      Create and prepare a hayst, i.e. a text file to search.
 *
 *      Reads the file fn into a freshly allocated, null-terminated
 *      buffer and builds an array holding one pointer per character
 *      read, sorted with pstrcmp. Any failure terminates the program
 *      via die(). Release the result with hayst_delete().
 */
Haystack *hayst_new(const char *fn)
{
    Haystack *hayst;
    FILE *f = fopen(fn, "r");
    long fsize;
    size_t i;

    if (f == NULL) die("Couldn't open %s", fn);

    hayst = malloc(sizeof(*hayst));
    if (hayst == NULL) die("Allocation failed");

    /* ftell() reports failure as -1, which must never become a size */
    if (fseek(f, 0, SEEK_END) != 0) die("Couldn't seek in %s", fn);
    fsize = ftell(f);
    if (fsize < 0) die("Couldn't determine size of %s", fn);
    if (fseek(f, 0, SEEK_SET) != 0) die("Couldn't seek in %s", fn);

    /* One spare element each: room for the terminator in buf, and
       never a zero-sized request (which may yield NULL) for ptr. */
    hayst->buf = malloc((size_t) fsize + 1);
    hayst->ptr = malloc(((size_t) fsize + 1) * sizeof(*hayst->ptr));

    if (hayst->buf == NULL) die("Allocation failed");
    if (hayst->ptr == NULL) die("Allocation failed");

    /* Trust the number of chars actually read, not the reported file
       size; the two can differ, e.g. for text-mode translation. */
    hayst->size = fread(hayst->buf, 1, (size_t) fsize, f);
    if (ferror(f)) die("Couldn't read %s", fn);
    hayst->buf[hayst->size] = '\0';
    fclose(f);

    /* Fill by index so that exactly size entries are initialised,
       even if the text contains embedded null characters. */
    for (i = 0; i < hayst->size; i++) hayst->ptr[i] = hayst->buf + i;

    qsort(hayst->ptr, hayst->size, sizeof(*hayst->ptr), pstrcmp);

    return hayst;
}

/*
 *      Clean up hayst: releases the text buffer, the pointer array and
 *      the Haystack itself. A NULL hayst is ignored, mirroring free().
 */
void hayst_delete(Haystack *hayst)
{
    if (hayst == NULL) return;

    free(hayst->buf);
    free(hayst->ptr);
    free(hayst);
}

/*
 *      Lower-bound binary search over an array of string pointers.
 *
 *      arr holds `high` pointers. Returns the address of the first
 *      entry whose leading len characters do not compare less than
 *      key (per strncmp), or arr + high when every entry compares
 *      less. The result is only meaningful if arr is sorted.
 */
static char **pstr_bsearch(const char *key, size_t len,
    char **arr, size_t high)
{
    char **base = arr;      /* start of the still-undecided range */
    size_t span = high;     /* number of undecided entries */

    while (span > 0) {
        size_t half = span / 2;

        if (strncmp(key, base[half], len) > 0) {
            /* key sorts after the probe: discard it and all before */
            base += half + 1;
            span -= half + 1;
        } else {
            /* probe may be the answer: keep only what precedes it */
            span = half;
        }
    }

    return base;
}

/*
 *      Upper-bound counterpart of pstr_bsearch.
 *
 *      arr holds n string pointers in ascending order. Returns the
 *      address of the first entry whose leading len characters compare
 *      greater than key (per strncmp), or arr + n if there is none.
 */
static char **pstr_upper_bound(const char *key, size_t len,
    char **arr, size_t n)
{
    size_t low = 0;
    size_t high = n;

    while (low < high) {
        size_t mid = low + (high - low) / 2;

        if (strncmp(key, arr[mid], len) < 0) high = mid;
        else low = mid + 1;
    }

    return arr + low;
}

/*
 *      Count the occurrences of a substring of the haystack that start
 *      before the substring itself.
 *
 *      The key is the len characters starting at hayst->buf + offset.
 *      Returns the number of positions before offset at which the same
 *      len characters occur. Returns 0 if the key is empty or does not
 *      lie completely inside the haystack.
 */
size_t hayst_find(Haystack *hayst, size_t offset, size_t len)
{
    char *key;
    char **begin, **end;
    size_t n = 0;

    if (len == 0) return 0;

    /* written so that offset + len cannot wrap around */
    if (len > hayst->size || offset > hayst->size - len) return 0;

    key = hayst->buf + offset;

    /*
     * The key lives inside the haystack buffer, so it must not be
     * altered to form a search bound: every suffix overlapping the
     * changed character would then be out of order in the sorted
     * array. Both bounds are found by comparison only.
     */
    begin = pstr_bsearch(key, len, hayst->ptr, hayst->size);
    end = pstr_upper_bound(key, len, begin,
        hayst->size - (size_t) (begin - hayst->ptr));

    /* [begin, end) holds every match; keep those starting before key */
    for (; begin < end; begin++) {
        if (*begin < key) n++;
    }

    return n;
}

/*
 *      Example client code
 */
int main(int argc, char **argv)
{
    Haystack *hayst;
    FILE *f;

    if (argc != 3) die("Usage: %s string queries", *argv);

    hayst = hayst_new(argv[1]);

    f = fopen(argv[2], "r");
    if (f == NULL) die("Can't open %s.", argv[1]);
    for (;;) {
        char str[80];
        size_t p, len;
        size_t n;

        if (fgets(str, sizeof(str), f) == NULL) break;
        if (sscanf(str, "%zu %zu", &p, &len) < 2) continue;
        n = hayst_find(hayst, p, len);
        printf("%8zu %.*s\n", n, (int) len, hayst->buf + p);
    }
    fclose(f);

    hayst_delete(hayst);

    return 0;
}

答案 1 :(得分:1)

我建议构建一个已排序的后缀数组并使用二分查找进行查询。另一种方法是为所有查询构建一棵trie(字典树),然后对字符串做一次遍历来累积计数:

  • 阅读所有查询。查询具有左右索引以及计数。
  • 为所有查询构建一个trie。指向查询的指针用作每个查询的结束标记。
  • 遍历字符串一次。对于每个char,遍历trie并在具有指向查询的指针的节点处累积计数。

这种方法似乎更快。但它无法处理重复的查询,并且只有在事先知道所有查询时才有意义。(不幸的是,我的示例实现表明,我另一个答案中的代码有一个严重的错误,可能出在范围二分查找中;它对某些查询的计数是错误的。)

编辑:我已在C中添加了我的示例实现 - 不是在C ++中,抱歉 - 下面。

实现是浪费的,因为它为除'\0'之外的所有可能字符分配子节点,其中绝大多数将是NULL。更好的实现会将查询的字符映射到更紧凑的字母表。

必须在读完整个查询数组之后,再用一个单独的循环构建trie,这样数组的重新分配(realloc)就不会使trie中指向该数组的指针失效。

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

/*
 *      Report a fatal error and terminate: writes "Fatal: " followed by
 *      the printf-style message and the character with code 10 (newline
 *      in ASCII) to stderr, then calls exit(1). The format must be a
 *      string literal, because it is joined to "Fatal: " by literal
 *      concatenation.
 */
#define die(...) exit((fprintf(stderr, "Fatal: " __VA_ARGS__), \
    putc(10, stderr), 1))



/* One search request: a stretch of the text buffer plus its tally. */
typedef struct Query {
    char *p;                /* where the query text begins in the buffer */
    size_t len;             /* query length in chars */
    size_t count;           /* occurrence tally */
} Query;

typedef struct Trienode Trienode;

/* Trie node with one child slot per non-zero character value. */
struct Trienode {
    Query *query;           /* query that ends at this node, if any */
    Trienode *next[255];    /* children, indexed by character value - 1 */
};

/* Handle for a whole trie. */
typedef struct Trie {
    Trienode *head;         /* root node; NULL while the trie is empty */
} Trie;

/*
 *      Read whole file to buffer and return size in n.
 *
 *      Returns a freshly allocated, null-terminated buffer with the
 *      contents of the file fn; the caller frees it. If n is not NULL,
 *      *n receives the number of chars actually read. Any failure
 *      terminates the program via die().
 */
char *slurp(const char *fn, size_t *n)
{
    FILE *f = fopen(fn, "r");
    long fsize;
    size_t size;
    char *buf;

    if (f == NULL) die("Couldn't open %s", fn);

    /* ftell() reports failure as -1, which must never become a size */
    if (fseek(f, 0, SEEK_END) != 0) die("Couldn't seek in %s", fn);
    fsize = ftell(f);
    if (fsize < 0) die("Couldn't determine size of %s", fn);
    if (fseek(f, 0, SEEK_SET) != 0) die("Couldn't seek in %s", fn);

    buf = malloc((size_t) fsize + 1);

    if (buf == NULL) die("Allocation failed");

    /* Trust the number of chars actually read, not the reported file
       size; the two can differ, e.g. for text-mode translation. */
    size = fread(buf, 1, (size_t) fsize, f);
    if (ferror(f)) die("Couldn't read %s", fn);
    buf[size] = '\0';
    fclose(f);

    if (n) *n = size;
    return buf;
}

/*
 *      Insert query string and reference into trie.
 */
void trie_insert(Trie *trie, Query *q)
{
    char *str = q->p;
    Trienode **n = &trie->head;
    size_t i;

    for (i = 0; i < q->len; i++) {    
        if (*n == NULL) {
            *n = malloc(sizeof(**n));
            if (*n == NULL) die("Coudn't allocate node");
            memset(*n, 0, sizeof(**n));
        }

        n = &(*n)->next[(unsigned char) str[i] - 1];
    }   

    if (*n == NULL) {
        *n = malloc(sizeof(**n));
        if (*n == NULL) die("Coudn't allocate node");
        memset(*n, 0, sizeof(**n));
    }

    (*n)->query = q;
}

/*
 *      Free the node n after recursively freeing every child below it.
 *      n must not be NULL.
 */
static void trie_delete_node(Trienode *n)
{
    Trienode **child = n->next;
    Trienode **stop = n->next + 255;

    while (child != stop) {
        if (*child != NULL) trie_delete_node(*child);
        child++;
    }

    free(n);
}

/*
 *      Destroy trie and its nodes.
 *
 *      The Trie object itself is not freed. Its head is reset to NULL,
 *      so the trie is empty afterwards and calling this function again
 *      is harmless.
 */
void trie_delete(Trie *trie)
{
    if (trie->head) trie_delete_node(trie->head);
    trie->head = NULL;
}

/*
 *      Tally the occurrences of all queries stored in trie.
 *
 *      For every start position in the zero-terminated text buf the
 *      trie is descended character by character. Whenever a node that
 *      carries a query is visited and the start position lies before
 *      that query's own position, the query's count is incremented.
 *      All counts must be 0 before this routine is called.
 *
 *      A node is only inspected while text remains, so a match that
 *      ends exactly at the end of buf is not tallied.
 */
void search(char *buf, Trie *trie)
{
    char *start;

    for (start = buf; *start != '\0'; start++) {
        Trienode *node = trie->head;
        char *cur;

        for (cur = start; node != NULL && *cur != '\0'; cur++) {
            Query *hit = node->query;

            if (hit != NULL && start < hit->p) hit->count++;

            /* slot 0 belongs to character value 1, hence the - 1 */
            node = node->next[(unsigned char) *cur - 1];
        }
    }
}

/*
 *      Example client code
 *
 *      Usage: program string queries
 *
 *      argv[1] names the text file, argv[2] a file with one query per
 *      line, each given as "offset length". All queries are read first,
 *      then inserted into a trie, counted in one pass over the text and
 *      finally printed with their counts. Lines that do not parse are
 *      skipped silently; queries that do not fit inside the text are
 *      reported on stderr and skipped.
 */
int main(int argc, char **argv)
{
    Query *query = NULL;
    size_t nquery = 0;
    size_t squery = 0;

    char *buf;
    size_t nbuf;

    Trie trie = {NULL};
    FILE *f;
    size_t i;

    if (argc != 3) die("Usage: %s string queries", *argv);

    // Read string buffer from file
    buf = slurp(argv[1], &nbuf);

    // Read query array
    f = fopen(argv[2], "r");
    if (f == NULL) die("Can't open %s.", argv[2]);
    for (;;) {
        char str[80];
        size_t p, len;

        if (fgets(str, sizeof(str), f) == NULL) break;
        if (sscanf(str, "%zu %zu", &p, &len) < 2) continue;

        // A query reaching past the text would be read out of bounds
        // when it is inserted into the trie and when it is printed
        if (len > nbuf || p > nbuf - len) {
            fprintf(stderr, "Query out of range: %zu %zu\n", p, len);
            continue;
        }

        if (nquery >= squery) {
            Query *grown;

            squery = (squery == 0) ? 0x400 : squery * 2;
            grown = realloc(query, squery * sizeof(*query));
            if (grown == NULL) die("Reallocation failed.");
            query = grown;
        }

        query[nquery].p = buf + p;
        query[nquery].len = len;
        query[nquery].count = 0;
        nquery++;
    }
    fclose(f);

    // Build tree from queries; done only now because the trie keeps
    // pointers into the query array, which realloc may have moved
    for (i = 0; i < nquery; i++) {
        Query *q = query + i;
        trie_insert(&trie, q);
    }

    // Assign the query counts
    search(buf, &trie);

    // Print results
    for (i = 0; i < nquery; i++) {
        Query *q = query + i;
        printf("%8zu %.*s\n", q->count, (int) q->len, q->p);
    }

    // Clean up
    trie_delete(&trie);
    free(buf);
    free(query);

    return 0;
}