从给定的单词集合中获取单词以进行校对

时间:2014-11-24 04:19:55

标签: c# regex

我有一个存储在List对象中的单词集合,例如这里的标题集合

Lorem Ipsum
Centuries
Electronic

这是示例段落,我想在其中查找这些词:lorem ipsum只是印刷和排版行业的虚拟文本。自16世纪以来,Loren Ipsum一直是业界标准的虚拟文本,当时一位不知名的印刷工拿了一盘铅字,把它打乱后制作了一本字体样本册。它不仅存续了五个世纪,而且还跨入了电子排版时代,基本保持不变。它在20世纪60年代随着包含LorenIpsum段落的Letraset转印纸的推出而得到普及,最近又随着包含LoremIpsum版本的桌面出版软件(如Aldus PageMaker)而进一步流行。

我的目标是从该段落中提取这些词;即使它们拼写有误也没关系(同样需要提取出来),因为最终目的是纠正大小写不对和拼写错误的单词。

我的预期结果是

lorem ipsum
Loren Ipsum
centuries
electornic
LorenIpsum
LoremIpsum

但不限于这些因为这将涉及整篇文章和文章的内容

抱歉,我还没有写出任何代码,但我打算在这里使用 C# 的正则表达式(RegEx)。

1 个答案:

答案 0 :(得分:0)

互联网上有许多算法可以检查两个单词之间的相似性。下面的 GetEdits(计算两个字符串之间的编辑距离,即 Levenshtein 距离)就是其中之一。

可以使用以下代码。但它可能效率不高。

/// <summary>
/// Computes the case-insensitive edit (Levenshtein) distance between two strings:
/// the minimum number of single-character insertions, deletions and substitutions
/// needed to turn one string into the other.
/// </summary>
/// <param name="answer">The reference string.</param>
/// <param name="guess">The candidate string compared against <paramref name="answer"/>.</param>
/// <returns>The number of edits; 0 when the strings are equal ignoring case.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="answer"/> or <paramref name="guess"/> is null.
/// </exception>
static int GetEdits(string answer, string guess)
{
    if (answer == null)
    {
        throw new ArgumentNullException("answer");
    }
    if (guess == null)
    {
        throw new ArgumentNullException("guess");
    }

    // Fold case with the invariant culture so the result does not depend on the
    // current thread culture (e.g. under tr-TR, "I".ToLower() is a dotless 'ı',
    // which would make "IPSUM" and "ipsum" appear one edit apart).
    guess = guess.ToLowerInvariant();
    answer = answer.ToLowerInvariant();

    // d[i, j] holds the distance between the first i chars of answer
    // and the first j chars of guess.
    int[,] d = new int[answer.Length + 1, guess.Length + 1];

    // Turning a prefix into the empty string takes one deletion per character...
    for (int i = 0; i <= answer.Length; i++)
    {
        d[i, 0] = i;
    }

    // ...and building a prefix from the empty string takes one insertion per character.
    for (int j = 0; j <= guess.Length; j++)
    {
        d[0, j] = j;
    }

    for (int j = 1; j <= guess.Length; j++)
    {
        for (int i = 1; i <= answer.Length; i++)
        {
            if (answer[i - 1] == guess[j - 1])
            {
                // Characters match: no operation needed.
                d[i, j] = d[i - 1, j - 1];
            }
            else
            {
                int deletion = d[i - 1, j] + 1;
                int insertion = d[i, j - 1] + 1;
                int substitution = d[i - 1, j - 1] + 1;
                d[i, j] = Math.Min(Math.Min(deletion, insertion), substitution);
            }
        }
    }

    return d[answer.Length, guess.Length];
}

/// <summary>
/// Demo entry point: scans a sample paragraph and prints every word (or pair of
/// adjacent words) that is within a small edit distance of one of the target phrases.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    const string text = @"lorem ipsum is simply dummy text of the printing and typesetting industry. Loren Ipsum has been the industrys standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing LorenIpsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of LoremIpsum.";

    // Largest edit distance still treated as a match.
    const int allowedEdits = 2;

    string[] targets = { "Lorem Ipsum", "Centuries", "Electronic" };

    // Break the paragraph on spaces, commas and periods. Adjacent separators
    // produce empty entries, which are skipped below.
    string[] words = text.Split(' ', ',', '.');

    int pos = 0;
    while (pos < words.Length)
    {
        string current = words[pos];
        if (current == String.Empty)
        {
            pos++;
            continue;
        }

        foreach (string target in targets)
        {
            // First try the word on its own.
            if (GetEdits(target, current) <= allowedEdits)
            {
                Console.WriteLine(current);
                break;
            }

            // Only multi-word targets are retried against the word joined with its successor.
            bool canPair = target.Contains(' ') && pos + 1 < words.Length;
            if (!canPair)
            {
                continue;
            }

            string pair = current + " " + words[pos + 1];
            if (GetEdits(target, pair) <= allowedEdits)
            {
                Console.WriteLine(pair);
                pos++; // the successor has been consumed as part of the match
                break;
            }
        }

        pos++;
    }
}