问题:给定一个长度为 n 的字符串 s,以及许多形如 [L, R] 的查询。
对于每个查询,我必须返回s的子字符串的数量,这些子字符串等于s [L..R]并且在s中的索引L之前开始。
约束:
n <= 2 * 10 ^ 6
查询数 q <= 10 ^ 5
一种暴力方法是使用后缀数组,我们遍历LCP数组以查找每个查询的答案。
我认为 O(q log n) 的方法比较适合这个问题。请问有什么建议吗?
提前感谢..
答案 0 :(得分:1)
一种可能的解决方案与我为“fast string search with many queries to the same string”(对同一字符串进行多次查询的快速字符串搜索)提供的解决方案类似。该解决方案附有 C 语言的示例实现。
其思路是:建立一个指向字符串中每个字符的指针数组并对其排序(即后缀数组),用二分查找分别找到查询串的第一个匹配位置和其后第一个不匹配的位置,然后统计这两个位置之间所有在 L 之前开始的出现次数。但是,此解决方案并不是 O( q log n ),因为排序本身已经是 O( n log n ),而 q 次查询的查找是 O( q log n )。
我已经针对您的问题改写了我所链接的那个示例。它是 C 而不是 C++,并且由于 malloc 的使用方式(没有对返回值做强制类型转换),它无法用 C++ 编译器编译。用惯用的 C++ 重写它应该不会太难。
该解决方案需要大量内存用于后缀数组。它可以在一分多钟的时间内处理大约130,000个查询和1.3兆字节的文件。
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
/*
 * Report a fatal error and terminate the program.
 *
 * Writes "Fatal: " followed by the printf-style formatted message and the
 * character with code 10 (line feed in ASCII) to stderr, then calls
 * exit() with status 1.  The first argument must be a string literal,
 * because it is joined to the "Fatal: " prefix by literal concatenation.
 */
#define die(...) exit((fprintf(stderr, "Fatal: " __VA_ARGS__), \
putc(10, stderr), 1))
/*
 * A text to search together with its sorted suffix pointers.
 */
typedef struct Haystack {
    size_t size;    /* Number of chars in string */
    char *buf;      /* Null-terminated char buffer holding the text */
    char **ptr;     /* Pointers into buf, one per suffix start */
} Haystack;
/*
 * Count occurrences of the character c in the zero-terminated string p.
 *
 * c is converted to char before comparing, as strchr() does.  The
 * terminating '\0' is not regarded as part of the text, so asking for
 * c == '\0' yields 0 instead of stepping past the end of the string.
 *
 * Returns the number of matching characters.
 */
size_t strcount(const char *p, int c)
{
    const char ch = (char) c;
    size_t n = 0;

    /* strchr() would find the terminator and p++ would then leave the string. */
    if (ch == '\0') return 0;

    while ((p = strchr(p, ch)) != NULL) {
        n++;
        p++;    /* continue the search behind this hit */
    }
    return n;
}
/*
 * qsort()-style comparator for arrays of string pointers.
 *
 * a and b each point to a (char *) element; the strings those elements
 * point to are compared with strcmp() and its result is returned.
 */
int pstrcmp(const void *a, const void *b)
{
    const char *lhs = *(const char *const *) a;
    const char *rhs = *(const char *const *) b;

    return strcmp(lhs, rhs);
}
/*
 * Create and prepare a haystack, i.e. a text file to search.
 *
 * Reads the file fn into a freshly allocated, null-terminated buffer and
 * builds an array with one pointer per character of the text (one per
 * suffix), sorted with pstrcmp(), i.e. in strcmp() order of the suffixes.
 *
 * The text ends at the first '\0' byte: the suffixes are compared as C
 * strings, so anything behind an embedded '\0' cannot be searched.
 *
 * Terminates the program via die() when the file cannot be opened, sized
 * or read, or when memory runs out.  The result is owned by the caller
 * and must be released with hayst_delete().
 */
Haystack *hayst_new(const char *fn)
{
    Haystack *hayst;
    FILE *f = fopen(fn, "r");
    long end;
    size_t cap;
    size_t i;

    if (f == NULL) die("Couldn't open %s", fn);

    hayst = malloc(sizeof(*hayst));
    if (hayst == NULL) die("Allocation failed");

    /* ftell() reports failure as -1L, which must not be used as a size. */
    if (fseek(f, 0, SEEK_END) != 0) die("Couldn't seek in %s", fn);
    end = ftell(f);
    if (end < 0) die("Couldn't determine size of %s", fn);
    if (fseek(f, 0, SEEK_SET) != 0) die("Couldn't seek in %s", fn);
    cap = (size_t) end;

    hayst->buf = malloc(cap + 1);
    if (hayst->buf == NULL) die("Allocation failed");

    /*
     * A text-mode stream may deliver fewer chars than ftell() announced
     * (e.g. after newline translation), so trust only what fread() returns.
     */
    hayst->size = fread(hayst->buf, 1, cap, f);
    if (ferror(f)) die("Couldn't read %s", fn);
    hayst->buf[hayst->size] = '\0';
    fclose(f);

    /* Stop at an embedded '\0' so that every pointer slot gets filled. */
    hayst->size = strlen(hayst->buf);

    /* malloc(0) may legitimately return NULL; always ask for one slot. */
    hayst->ptr = malloc((hayst->size ? hayst->size : 1) * sizeof(*hayst->ptr));
    if (hayst->ptr == NULL) die("Allocation failed");

    for (i = 0; i < hayst->size; i++) hayst->ptr[i] = hayst->buf + i;

    qsort(hayst->ptr, hayst->size, sizeof(*hayst->ptr), pstrcmp);

    return hayst;
}
/*
 * Clean up hayst: release the suffix pointer array, the text buffer and
 * the Haystack object itself.
 *
 * Passing NULL is allowed and does nothing, mirroring free().
 */
void hayst_delete(Haystack *hayst)
{
    if (hayst == NULL) return;

    free(hayst->ptr);
    free(hayst->buf);
    free(hayst);
}
/*
 * Lower-bound binary search in a sorted array of string pointers.
 *
 * Only the first len chars of key and of each entry are compared, using
 * strncmp().  arr holds high entries.
 *
 * Returns a pointer to the first entry that does not compare less than
 * key, or arr + high when every entry compares less.  The result is
 * never NULL.
 */
static char **pstr_bsearch(const char *key, size_t len,
                           char **arr, size_t high)
{
    char **first = arr;     /* start of the range still under consideration */
    size_t count = high;    /* number of entries in that range */

    while (count > 0) {
        size_t half = count / 2;

        if (strncmp(key, first[half], len) > 0) {
            /* Probe is too small: continue strictly behind it. */
            first += half + 1;
            count -= half + 1;
        } else {
            /* Probe may be the answer: keep it as the upper limit. */
            count = half;
        }
    }
    return first;
}
/*
 * Upper-bound companion of the lower-bound search: returns a pointer to
 * the first entry of arr (high entries) whose first len chars compare
 * greater than key, or arr + high when there is none.
 */
static char **pstr_upper_bound(const char *key, size_t len,
                               char **arr, size_t high)
{
    size_t low = 0;

    while (low < high) {
        size_t mid = low + (high - low) / 2;

        if (strncmp(key, arr[mid], len) < 0) high = mid;
        else low = mid + 1;
    }
    return arr + low;
}

/*
 * Count the occurrences of the substring of length len that starts at
 * offset in the haystack text, considering only occurrences that start
 * before offset.
 *
 * Returns 0 for an empty substring and for a range that does not lie
 * completely inside the text.
 *
 * The text buffer is never modified here.  Temporarily incrementing the
 * last char of the key inside the buffer would also change every suffix
 * running through that position, so the pointer array would no longer be
 * sorted during the second binary search and the count could be wrong.
 */
size_t hayst_find(Haystack *hayst, size_t offset, size_t len)
{
    const char *key;
    char **begin, **end;
    size_t n = 0;

    if (len == 0) return 0;

    /* Written so that offset + len cannot wrap around. */
    if (offset > hayst->size || len > hayst->size - offset) return 0;

    key = hayst->buf + offset;

    /* [begin, end) are exactly the suffixes that start with the key. */
    begin = pstr_bsearch(key, len, hayst->ptr, hayst->size);
    end = pstr_upper_bound(key, len, begin,
                           (size_t) (hayst->ptr + hayst->size - begin));

    for (; begin < end; begin++) {
        /* Both pointers refer to the same buffer: earlier start = lower address. */
        if (*begin < key) n++;
    }
    return n;
}
/*
* Example client code
*/
int main(int argc, char **argv)
{
Haystack *hayst;
FILE *f;
if (argc != 3) die("Usage: %s string queries", *argv);
hayst = hayst_new(argv[1]);
f = fopen(argv[2], "r");
if (f == NULL) die("Can't open %s.", argv[1]);
for (;;) {
char str[80];
size_t p, len;
size_t n;
if (fgets(str, sizeof(str), f) == NULL) break;
if (sscanf(str, "%zu %zu", &p, &len) < 2) continue;
n = hayst_find(hayst, p, len);
printf("%8zu %.*s\n", n, (int) len, hayst->buf + p);
}
fclose(f);
hayst_delete(hayst);
return 0;
}
答案 1 :(得分:1)
我曾建议构建一个已排序的后缀数组并使用二分查找进行查询。另一种方法是用所有查询构建一棵 trie(字典树),然后对字符串进行单次遍历来累积计数:
这种方法似乎更快。它无法处理重复的查询,并且只有在事先知道所有查询时才有意义。(不幸的是,我的示例实现表明我另一个答案中的代码有一个严重的错误,可能出在范围二分查找中;它对某些查询的计数是错误的。)
编辑:我在下面添加了我的示例实现——是 C 而不是 C++,抱歉。
这个实现比较浪费内存,因为它为除 '\0' 之外的所有可能字符都分配了子节点指针,其中绝大多数都是 NULL。更好的实现会把查询中的字符映射到更紧凑的字母表。
必须在读完整个查询数组之后,再用一个额外的循环构建 trie,这样 realloc 重新分配数组时就不会使 trie 中指向该数组元素的指针失效。
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
/*
 * Report a fatal error and terminate the program.
 *
 * Writes "Fatal: " followed by the printf-style formatted message and the
 * character with code 10 (line feed in ASCII) to stderr, then calls
 * exit() with status 1.  The first argument must be a string literal,
 * because it is joined to the "Fatal: " prefix by literal concatenation.
 */
#define die(...) exit((fprintf(stderr, "Fatal: " __VA_ARGS__), \
putc(10, stderr), 1))
typedef struct Trienode Trienode;

/*
 * One query: a substring of the text buffer plus its result counter.
 */
typedef struct Query {
    char *p;        /* starting point in buffer */
    size_t len;     /* length of query in chars */
    size_t count;   /* number of occurrences */
} Query;

/*
 * Node of the query trie.
 */
struct Trienode {
    Query *query;           /* query that ends at this node, or NULL */
    Trienode *next[255];    /* child nodes, indexed by char code minus 1 */
};

/*
 * Trie of queries; head is NULL as long as the trie is empty.
 */
typedef struct Trie {
    Trienode *head;
} Trie;
/*
 * Read whole file to buffer and return size in n.
 *
 * fn is the name of the file to read.  If n is not NULL, *n receives the
 * number of chars actually stored in the buffer, not counting the
 * terminating '\0' that is always appended.
 *
 * Returns the malloc()ed buffer, which the caller must free().
 * Terminates the program via die() when the file cannot be opened, sized
 * or read, or when memory runs out.
 */
char *slurp(const char *fn, size_t *n)
{
    FILE *f = fopen(fn, "r");
    long end;
    size_t size;
    char *buf;

    if (f == NULL) die("Couldn't open %s", fn);

    /* ftell() reports failure as -1L, which must not be used as a size. */
    if (fseek(f, 0, SEEK_END) != 0) die("Couldn't seek in %s", fn);
    end = ftell(f);
    if (end < 0) die("Couldn't determine size of %s", fn);
    if (fseek(f, 0, SEEK_SET) != 0) die("Couldn't seek in %s", fn);

    buf = malloc((size_t) end + 1);
    if (buf == NULL) die("Allocation failed");

    /*
     * A text-mode stream may deliver fewer chars than ftell() announced
     * (e.g. after newline translation); terminate the buffer right behind
     * what was really read instead of leaving uninitialised bytes in it.
     */
    size = fread(buf, 1, (size_t) end, f);
    if (ferror(f)) die("Couldn't read %s", fn);
    buf[size] = '\0';
    fclose(f);

    if (n) *n = size;
    return buf;
}
/*
* Insert query string and reference into trie.
*/
void trie_insert(Trie *trie, Query *q)
{
char *str = q->p;
Trienode **n = &trie->head;
size_t i;
for (i = 0; i < q->len; i++) {
if (*n == NULL) {
*n = malloc(sizeof(**n));
if (*n == NULL) die("Coudn't allocate node");
memset(*n, 0, sizeof(**n));
}
n = &(*n)->next[(unsigned char) str[i] - 1];
}
if (*n == NULL) {
*n = malloc(sizeof(**n));
if (*n == NULL) die("Coudn't allocate node");
memset(*n, 0, sizeof(**n));
}
(*n)->query = q;
}
/*
 * Free the node n and, recursively, every node below it.
 * n must not be NULL.
 */
static void trie_delete_node(Trienode *n)
{
    Trienode **child = n->next;
    Trienode **const stop = n->next + 255;

    for (; child != stop; child++) {
        if (*child != NULL) trie_delete_node(*child);
    }
    free(n);
}
/*
 * Destroy trie and its nodes.
 *
 * The Trie object itself is not freed.  Its head is reset to NULL so that
 * it does not keep pointing at freed memory; calling this function again
 * on the same trie is therefore harmless.
 */
void trie_delete(Trie *trie)
{
    if (trie->head != NULL) {
        trie_delete_node(trie->head);
        trie->head = NULL;
    }
}
/*
 * Find occurrences of all queries. The count member of all
 * queries must be 0 before calling this routine.
 *
 * For every start position in the null-terminated text buf the trie is
 * descended along the following chars.  Whenever a visited node carries a
 * query and the start position lies before that query's own starting
 * point, the query's count is incremented.
 */
void search(char *buf, Trie *trie)
{
    char *start;

    for (start = buf; *start != '\0'; start++) {
        Trienode *node = trie->head;
        char *cur;

        for (cur = start; node != NULL && *cur != '\0'; cur++) {
            Query *hit = node->query;

            if (hit != NULL && start < hit->p) hit->count++;

            /* *cur is not '\0' here, so the index is in 0..254. */
            node = node->next[(unsigned char) *cur - 1];
        }
    }
}
/*
 * Example client code.
 *
 * Usage: <program> string queries
 *
 * argv[1] names the text file to search, argv[2] a file with one query
 * "offset length" per line.  All queries are read first, then inserted
 * into a trie, counted with search() and finally printed to stdout, one
 * line per query with its count and its text.
 *
 * Lines that do not start with two numbers are skipped silently; queries
 * that do not lie inside the text are reported on stderr and skipped, so
 * that no pointer beyond the text buffer is ever created or read.
 */
int main(int argc, char **argv)
{
    Query *query = NULL;
    size_t nquery = 0;
    size_t squery = 0;
    char *buf;
    size_t nbuf;
    Trie trie = {NULL};
    FILE *f;
    size_t i;

    if (argc != 3) die("Usage: %s string queries", *argv);

    // Read string buffer from file
    buf = slurp(argv[1], &nbuf);

    // Read query array
    f = fopen(argv[2], "r");
    // Report the file that actually failed to open: the query file.
    if (f == NULL) die("Can't open %s.", argv[2]);

    for (;;) {
        char str[80];
        size_t p, len;

        if (fgets(str, sizeof(str), f) == NULL) break;
        if (sscanf(str, "%zu %zu", &p, &len) < 2) continue;

        // Written so that p + len cannot wrap around.
        if (p > nbuf || len > nbuf - p) {
            fprintf(stderr, "Ignoring out-of-range query: %zu %zu\n", p, len);
            continue;
        }

        if (nquery >= squery) {
            Query *grown;

            squery = squery ? 2 * squery : 0x400;
            // Keep the old pointer until the reallocation has succeeded.
            grown = realloc(query, squery * sizeof(*query));
            if (grown == NULL) die("Reallocation failed.");
            query = grown;
        }

        query[nquery].p = buf + p;
        query[nquery].len = len;
        query[nquery].count = 0;
        nquery++;
    }

    fclose(f);

    // Build tree from queries; this must happen after the last realloc(),
    // because the trie stores pointers to the elements of the query array.
    for (i = 0; i < nquery; i++) {
        Query *q = query + i;

        trie_insert(&trie, q);
    }

    // Assign the query counts
    search(buf, &trie);

    // Print results
    for (i = 0; i < nquery; i++) {
        Query *q = query + i;

        printf("%8zu %.*s\n", q->count, (int) q->len, q->p);
    }

    // Clean up
    trie_delete(&trie);
    free(buf);
    free(query);

    return 0;
}