Question

我们有一个模式字符串 'foo' 和一个源字符串 'foobaroofzaqofom'。我们需要在源字符串中找出模式字符串的所有出现，其中字母的顺序可以任意（即模式的任意排列）。因此，对于给定的示例，结果如下：['foo', 'oof', 'ofo']。

我有一个解决方案，但我不确定它是最有效的解决方案：

创建模式字符串的字符的hash_map，其中每个字符是一个键，每个值都是模式中字符的计数器。对于给定的示例，它将是{{f：1}，{o：2}}
查看源字符串，如果找到hash_map中的一个元素，则尝试查找pattern的所有其余元素
如果找到了所有元素，那么这就是我们要找的一个解

这是c ++中的一个实现：

/**
 * Collects every distinct substring of `s` that is a permutation (anagram)
 * of the pattern `p`.
 *
 * A fixed-size window of p.size() characters slides over `s`. For each byte
 * value we keep the difference between how often it occurs in the pattern
 * and in the current window; the window is a match exactly when every
 * difference is zero. Each slide updates two counters, so the scan itself is
 * O(s.size()) instead of re-counting the whole window at every position.
 *
 * @param s source string to search in (not modified)
 * @param p pattern whose permutations are searched for (not modified)
 * @return the set of distinct matching substrings; empty when `p` is empty
 *         or longer than `s`
 */
std::set<std::string> FindSubstringPermutations(std::string& s, std::string& p)
{
    std::set<std::string> result;

    const std::string::size_type patternLen = p.size();
    const std::string::size_type sourceLen = s.size();

    // An empty pattern never matches and a pattern longer than the source
    // cannot fit. Checking this up front also keeps the unsigned expression
    // "sourceLen - patternLen" from wrapping around to a huge value.
    if (patternLen == 0 || patternLen > sourceLen)
        return result;

    // balance[c] = (occurrences of c in p) - (occurrences of c in the window).
    // Indexed by unsigned char so that negative char values stay in range
    // (assumes 8-bit bytes).
    int balance[256] = { 0 };
    for (char ch : p)
        ++balance[static_cast<unsigned char>(ch)];

    // Number of byte values whose balance is currently non-zero.
    int unbalanced = 0;
    for (int count : balance)
    {
        if (count != 0)
            ++unbalanced;
    }

    const auto enterWindow = [&](char ch)
    {
        int& count = balance[static_cast<unsigned char>(ch)];
        if (count == 0)
            ++unbalanced;
        if (--count == 0)
            --unbalanced;
    };
    const auto leaveWindow = [&](char ch)
    {
        int& count = balance[static_cast<unsigned char>(ch)];
        if (count == 0)
            ++unbalanced;
        if (++count == 0)
            --unbalanced;
    };

    // Prime the first window.
    for (std::string::size_type i = 0; i < patternLen; ++i)
        enterWindow(s[i]);
    if (unbalanced == 0)
        result.insert(s.substr(0, patternLen));

    // Slide: drop the character left of the window, take in the next one.
    for (std::string::size_type start = 1; start + patternLen <= sourceLen; ++start)
    {
        leaveWindow(s[start - 1]);
        enterWindow(s[start + patternLen - 1]);
        if (unbalanced == 0)
            result.insert(s.substr(start, patternLen));
    }

    return result;
}

复杂性接近O（n），我不知道如何更精确地计算。

所以问题是：是否有更高效的解决方案？使用 hash_map 有点取巧（hack），我认为使用简单数组和“已找到元素”的标志可能会有更高效的解法。

Answer 1

您可以使用与滑动窗口一起使用的顺序不变哈希算法来优化一些事情。

这种哈希算法的一个例子可能是

// Order-invariant demo hash: the sum of the character values of `s`.
// Any permutation of the same characters yields the same value, and the
// empty string hashes to 0.
// The string is taken by const reference to avoid a copy per call, and the
// sum is accumulated in an unsigned integer so that very long inputs wrap
// around instead of triggering signed-overflow undefined behaviour.
int hash(const std::string& s){
    unsigned int sum = 0;

    for (char ch : s)
        sum += static_cast<unsigned int>(ch);

    return static_cast<int>(sum);
}

这个算法有点过于简单，除了性能之外，在其他各方面（即哈希值的分布和可能的哈希值数量）都相当糟糕，但这并不难改进。

这种哈希算法的优点是：

hash("abc") == hash("acb") == hash("bac") == ...

使用这个算法的滑动窗口非常简单：

// Pseudo-code illustration: substring(start, length) is not a std::string member.
// Sliding the window one step to the right only needs the incoming character ('d')
// added and the outgoing character ('a') subtracted; the rest of the sum is reused.
string s = "abcd";

hash(s.substring(0, 3)) + 'd' - 'a' == hash(s.substring(1, 3));

这种散列方法的这两个属性允许我们这样做：

// Order-invariant hash: the sum of all character values in `s`, so every
// permutation of the same characters produces the same result.
// Written as compilable C++ (the original used the pseudo-code expression
// sum(s.chars)); the sum is accumulated unsigned so that long inputs wrap
// around instead of overflowing a signed int.
int hash(const std::string& s){
    unsigned int total = 0;
    for (char ch : s)
        total += static_cast<unsigned int>(ch);
    return static_cast<int>(total);
}

// Moves the window hash one step: takes the previous window's hash, removes
// the contribution of the character that left the window and adds the
// contribution of the character that entered it.
int slideHash(int oldHash, char slideOut, char slideIn){
    const int withoutOutgoing = oldHash - slideOut;
    return withoutOutgoing + slideIn;
}

int findPermuted(string s, string pattern){
    int patternHash = hash(pattern);
    int slidingHash = hash(s.substring(0, pattern.length()));

    if(patternHash == slidingHash && isPermutation(pattern, s.substring(0, pattern.length())
        return 0;

    for(int i = 0; i < s.length() - pattern.length(); i++){
        slidingHash = slideHash(slidingHash, s[i], s[i + pattern.length()]);

        if(patternHash == slidingHash)
            if(isPermutation(pattern, s.substring(i + 1, pattern.length())
                return i + 1;
    }

    return -1;
}

这基本上是适用于置换字符串的Rabin-Karp-algorithm的更改版本。这种方法的主要优点是实际上需要比较较少的字符串，这带来了相当多的优势。这特别适用于此，因为比较（检查字符串是否是另一个字符串的排列）本身已经非常昂贵。

注意：
上面的代码仅用于演示思路。它的目的是易于理解而不是追求性能，不应直接使用。

编辑：
上面这个顺序不变滚动哈希算法的“实现”不应被实际使用，因为它在数据分布方面表现极差。当然，这类哈希显然存在一些限制：生成哈希时唯一可以利用的信息是字符的实际值（不能使用索引！），并且必须使用可逆的运算来累积。

更好的方法是将每个字符映射到一个素数（不要使用 2！！！因为 2 在模 2 的幂下没有乘法逆元）。由于所有运算都是模 2^(8 * sizeof(hashtype)) 进行的（整数溢出），因此我们需要为所有用到的素数生成一张模 2^(8 * sizeof(hashtype)) 的乘法逆元表。我不打算在这里生成这些表，因为关于该主题已经有很多资料。

最终的哈希值如下所示：

// Lookup tables used by the prime-product hash; the generator functions are not shown.
// primes: character -> the prime number assigned to that character.
map<char, int> primes = generatePrimTable();
// inverse: prime -> its multiplicative inverse, so that multiplying by it undoes a
// multiplication by the prime (presumably modulo 2^(8 * sizeof(unsigned int)) — confirm
// against generateMultiplicativeInverses).
map<int, int> inverse = generateMultiplicativeInverses(primes);

// Order-invariant hash of `s`: the product of the primes assigned to its
// characters, computed in unsigned arithmetic (wraps around on overflow).
// The empty string hashes to 1.
unsigned int hash(string s){
    unsigned int product = 1;
    for(char ch : s){
        product *= primes[ch];
    }
    return product;
}

// Slides the prime-product hash by one character: multiplying by the inverse
// of the outgoing character's prime cancels its factor, then the incoming
// character's prime is multiplied in.
unsigned int slideHash(unsigned int oldHash, char slideOut, char slideIn){
    const unsigned int withoutOutgoing = oldHash * inverse[primes[slideOut]];
    return withoutOutgoing * primes[slideIn];
}

请记住，此解决方案适用于无符号整数。

Answer 2

用于变位词（anagram）的典型滚动哈希函数

使用素数的乘积
这只适用于相对较短的模式
所有正常单词的哈希值将适合64位值而不会溢出。
Based on this anagram matcher

/* braek; */
/* 'foobaroofzaqofom' */

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

typedef unsigned long long HashVal;
static HashVal hashchar (unsigned char ch);
static HashVal hashmem (void *ptr, size_t len);

/* The first 26 primes (2..101) in a permuted order, one per letter:
 * index 0 is used for 'a'/'A' and index 25 for 'z'/'Z' (see hashchar). */
unsigned char primes26[] =
{ 5,71,79,19,2,83,31,43,11,53,37,23,41,3,13,73,101,17,29,7,59,47,61,97,89,67, };
/*********************************************/
/* Returns the prime assigned to a letter, ignoring case.
 * Any non-letter maps to 1, the neutral element of the product hash. */
static HashVal hashchar (unsigned char ch)
{
    if (ch >= 'A' && ch <= 'Z') return primes26[ch - 'A'];
    if (ch >= 'a' && ch <= 'z') return primes26[ch - 'a'];

    return 1;
}

/* Hashes `len` bytes starting at `ptr` as the product of their per-character
 * primes (see hashchar). A zero-length buffer hashes to 0. */
static HashVal hashmem (void *ptr, size_t len)
{
    const unsigned char *cursor = (const unsigned char *) ptr;
    const unsigned char *end;
    HashVal product = 1;

    if (len == 0) return 0;

    for (end = cursor + len; cursor != end; cursor++) {
        product *= hashchar(*cursor);
    }

    return product;
}
/*********************************************/


unsigned char buff [4096];
/*
 * Scans stdin for anagrams of the pattern given as argv[1] and prints
 * "Pos=<n>" for every hit, where <n> is the 0-based offset in the input of
 * the LAST character of the matching window.
 *
 * Matching is case-insensitive and letters-only: any non-letter in the input
 * restarts the window. The window hash is a product of per-letter primes, so
 * removing a character by division is only exact as long as the product has
 * not overflowed HashVal, which limits this to fairly short patterns.
 *
 * Returns EXIT_FAILURE when the pattern is missing, empty, or longer than
 * the ring buffer `buff`; returns 0 otherwise.
 */
int main (int argc, char **argv)
{
    size_t patlen,len,pos,rotor;
    int ch;
    HashVal patval;
    HashVal rothash=1;

    /* Without this check strlen() below would dereference a NULL argv[1]. */
    if (argc < 2) {
        fprintf(stderr, "Usage: %s pattern < text\n", argc > 0 ? argv[0] : "anascan");
        return EXIT_FAILURE;
    }

    patlen = strlen(argv[1]);
    /* patlen == 0 would make "% patlen" divide by zero; a pattern longer than
     * the ring buffer would write past the end of buff. */
    if (patlen == 0 || patlen > sizeof buff) {
        fprintf(stderr, "Pattern length must be between 1 and %zu\n", sizeof buff);
        return EXIT_FAILURE;
    }
    patval = hashmem( argv[1], patlen);
    // fprintf(stderr, "Pat=%s, len=%zu, Hash=%llx\n", argv[1], patlen, patval);

    /* rotor: next slot in the ring buffer; pos: letters seen since the last
     * non-letter; len: offset of the current input character. */
    for (rotor=pos=len =0; ; len++) {
        ch=getc(stdin);
        if (ch == EOF) break;

        /* a non-letter ends the current run: start over with an empty window */
        if (ch < 'A' || ch > 'z') { pos = 0; rothash = 1; continue; }
        if (ch > 'Z' && ch < 'a') { pos = 0; rothash = 1; continue; }
        /* remove old char from rolling hash */
        if (pos >= patlen) { rothash /= hashchar(buff[rotor]); }
        /* add new char to rolling hash */
        buff[rotor] = ch;
        rothash *= hashchar(buff[rotor]);

        // fprintf(stderr, "%zu: [rot=%zu]pos=%zu, Hash=%llx\n", len, rotor, pos, rothash);

        rotor = (rotor+1) % patlen;
        /* matched enough characters ? */
        if (++pos < patlen) continue;
        /* correct hash value ? */
        if (rothash != patval) continue;
        fprintf(stdout, "Pos=%zu\n", len);
    }

    return 0;
}

输出/结果：

$ ./a.out foo < anascan.c
Pos=21
Pos=27
Pos=33

更新：对于不喜欢素数乘积的人，这里是一个基于的士数（taxicab number）式立方和（外加直方图检查）的实现。它也应该是 8 位干净（8-bit clean）的。注意立方并不是必需的；用平方效果同样好，或者只用简单的求和也行（只是最后的直方图检查需要多做一些工作）。

/* braek; */
/*  'foobaroofzaqofom' */
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

typedef unsigned long long HashVal;
static HashVal hashchar (unsigned char ch);
static HashVal hashmem (void *ptr, size_t len);

/*********************************************/
static HashVal hashchar (unsigned char ch)
{
HashVal val=1+ch;

return val*val*val;
}

/* Hashes `len` bytes starting at `ptr` as 1 plus the sum of their
 * per-character hashes (see hashchar). A zero-length buffer hashes to 0. */
static HashVal hashmem (void *ptr, size_t len)
{
    const unsigned char *cursor = (const unsigned char *) ptr;
    const unsigned char *end;
    HashVal total = 1;

    if (len == 0) return 0;

    for (end = cursor + len; cursor != end; cursor++) {
        total += hashchar(*cursor);
    }

    return total;
}
/*********************************************/
/*
 * Scans stdin for permutations of the pattern given as argv[1] and prints
 * "Pos=<n>" for every hit, where <n> is the 0-based offset in the input of
 * the FIRST byte of the matching window.
 *
 * A rolling sum-of-cubes hash filters candidates cheaply; a hit is reported
 * only if the byte histogram of the window also equals the pattern's, so
 * hash collisions cannot produce false positives. All 256 byte values are
 * treated alike (no letter/case handling).
 *
 * Returns EXIT_FAILURE when the pattern is missing, empty, or longer than
 * the ring buffer; returns 0 otherwise.
 */
int main (int argc, char **argv)
{
    size_t patlen,len,rotor;
    int ch;
    HashVal patval;
    HashVal rothash=1;      /* same initial value hashmem() starts from */
    unsigned char *patstr;
    unsigned pathist[256] = {0};
    unsigned rothist[256] = {0};
    unsigned char cycbuff[1024];

    /* Without this check strlen() below would dereference a NULL argv[1]. */
    if (argc < 2) {
        fprintf(stderr, "Usage: %s pattern < text\n", argc > 0 ? argv[0] : "anascan");
        return EXIT_FAILURE;
    }

    patstr = (unsigned char*) argv[1];
    patlen = strlen((const char*) patstr);
    /* patlen == 0 would make "% patlen" divide by zero; a pattern longer than
     * the ring buffer would write past the end of cycbuff. */
    if (patlen == 0 || patlen > sizeof cycbuff) {
        fprintf(stderr, "Pattern length must be between 1 and %zu\n", sizeof cycbuff);
        return EXIT_FAILURE;
    }
    patval = hashmem( patstr, patlen);

    for(rotor=0; rotor < patlen; rotor++) {
        pathist [ patstr[rotor] ] += 1;
    }
    fprintf(stderr, "Pat=%s, len=%zu, Hash=%llx\n", argv[1], patlen, patval);

    /* rotor: next slot in the ring buffer; len: offset of the current byte. */
    for (rotor=len =0; ; len++) {
        ch=getc(stdin);
        if (ch == EOF) break;

        /* remove old char from rolling hash */
        if (len >= patlen) {
            rothash -= hashchar(cycbuff[rotor]);
            rothist [ cycbuff[rotor] ] -= 1;
        }
        /* add new char to rolling hash */
        cycbuff[rotor] = ch;
        rothash += hashchar(cycbuff[rotor]);
        rothist [ cycbuff[rotor] ] += 1;

        // fprintf(stderr, "%zu: [rot=%zu], Hash=%llx\n", len, rotor, rothash);

        rotor = (rotor+1) % patlen;
        /* matched enough characters ? The window is full once len+1 bytes
         * have been read; testing "len < patlen" would skip the very first
         * window and miss a match at the start of the input. */
        if (len + 1 < patlen) continue;
        /* correct hash value ? */
        if (rothash != patval) continue;
        /* correct histogram? */
        if (memcmp(rothist,pathist, sizeof pathist)) continue;
        /* the window covers offsets len+1-patlen .. len */
        fprintf(stdout, "Pos=%zu\n", len + 1 - patlen);
    }

    return 0;
}

查找给定源字符串中给定字符串的所有字符串排列

2 个答案: