我有一个包含多篇文章的 .sgm(SGML)文件。现在,我想使用前缀树(trie)数据结构,为整个文档集建立基线单词计数。该文件的示例如下:
<REUTERS TOPICS="YES" LEWISSPLIT="TRAIN" CGISPLIT="TRAINING-SET"
OLDID="5544" NEWID="1">
<DATE>26-FEB-1987 15:01:01.79</DATE>
<TOPICS><D>cocoa</D></TOPICS>
<PLACES><D>el-salvador</D><D>usa</D><D>uruguay</D></PLACES>
<PEOPLE></PEOPLE>
<ORGS></ORGS>
<EXCHANGES></EXCHANGES>
<COMPANIES></COMPANIES>
<UNKNOWN>
C T
f0704reute
u f BC-BAHIA-COCOA-REVIEW 02-26 0105</UNKNOWN>
<TEXT>
<TITLE>BAHIA COCOA REVIEW</TITLE>
<DATELINE> SALVADOR, Feb 26 - </DATELINE><BODY>
Some text here.
Reuter
</BODY></TEXT>
</REUTERS>
关于如何建立基线单词计数,有什么建议吗?
答案 0 :(得分:0)
可以使用 trie(前缀树)数据结构来更快地加载字符串并检索建议词:
/// <summary>
/// Case-insensitive prefix tree (trie) that stores whole words and returns
/// the stored words that start with a given prefix.
/// </summary>
/// <remarks>
/// Edges are keyed by <see cref="Letter"/>, which only distinguishes the 26
/// letters A-Z. Every other character (digits, punctuation, non-ASCII
/// letters) maps to the same key (Index == -1), so words that differ only in
/// such characters share a path. When several inserted words end on the same
/// node (for example "car" and "CAR"), the first one inserted is kept.
/// </remarks>
public class Trie
{
    /// <summary>
    /// Edge label: the position of an upper-cased character inside
    /// <see cref="Chars"/>, or -1 when the character is not in A-Z.
    /// </summary>
    public struct Letter : System.IEquatable<Letter>
    {
        /// <summary>The alphabet recognised by the trie.</summary>
        public const string Chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";

        /// <summary>
        /// Converts a character to its letter key, ignoring case.
        /// </summary>
        /// <param name="c">Character to convert.</param>
        /// <returns>
        /// A letter whose Index is the position of the upper-cased character
        /// in <see cref="Chars"/>, or -1 if it is not present.
        /// </returns>
        public static implicit operator Letter(char c)
        {
            // Invariant upper-casing keeps the mapping independent of the
            // current culture (e.g. Turkish 'i' must still map to 'I').
            return new Letter() { Index = Chars.IndexOf(char.ToUpperInvariant(c)) };
        }

        /// <summary>Position inside <see cref="Chars"/>, or -1 for a non-letter.</summary>
        public int Index;

        /// <summary>
        /// Returns the character for this letter. Throws
        /// IndexOutOfRangeException when Index is outside <see cref="Chars"/>.
        /// </summary>
        public char ToChar()
        {
            return Chars[Index];
        }

        /// <summary>
        /// Returns the character for this letter as a string. Throws
        /// IndexOutOfRangeException when Index is outside <see cref="Chars"/>.
        /// </summary>
        public override string ToString()
        {
            return Chars[Index].ToString();
        }

        /// <summary>Two letters are equal when their Index values are equal.</summary>
        public bool Equals(Letter other)
        {
            return Index == other.Index;
        }

        /// <summary>Two letters are equal when their Index values are equal.</summary>
        public override bool Equals(object obj)
        {
            return obj is Letter && Equals((Letter)obj);
        }

        /// <summary>Hash code consistent with <see cref="Equals(Letter)"/>.</summary>
        public override int GetHashCode()
        {
            return Index;
        }
    }

    /// <summary>A trie node: outgoing edges plus the word ending here, if any.</summary>
    public class Node
    {
        /// <summary>The word that ends at this node, or null if none does.</summary>
        public string Word;

        /// <summary>True when a stored word ends at this node.</summary>
        public bool IsTerminal { get { return Word != null; } }

        /// <summary>Child nodes keyed by the letter that leads to them.</summary>
        public Dictionary<Letter, Node> Edges = new Dictionary<Letter, Node>();
    }

    /// <summary>Root node; represents the empty prefix.</summary>
    public Node Root = new Node();

    /// <summary>
    /// Builds a trie containing every non-empty word of <paramref name="words"/>.
    /// </summary>
    /// <param name="words">Words to insert. Null or empty entries are skipped.</param>
    /// <exception cref="System.ArgumentNullException">
    /// <paramref name="words"/> is null.
    /// </exception>
    public Trie(string[] words)
    {
        if (words == null)
        {
            throw new System.ArgumentNullException("words");
        }

        foreach (var word in words)
        {
            if (string.IsNullOrEmpty(word))
            {
                continue;
            }

            var node = Root;
            foreach (char c in word)
            {
                Letter letter = c;
                Node next;
                if (!node.Edges.TryGetValue(letter, out next))
                {
                    next = new Node();
                    node.Edges.Add(letter, next);
                }
                node = next;
            }

            // Mark the final node as terminal even when it already existed,
            // so a word that is a prefix of an earlier word ("CAR" after
            // "CART") is not lost. The first word to reach a node wins.
            if (node.Word == null)
            {
                node.Word = word;
            }
        }
    }

    /// <summary>
    /// Returns up to <paramref name="max"/> stored words that start with
    /// <paramref name="word"/> (compared case-insensitively).
    /// </summary>
    /// <param name="word">
    /// Prefix to look up. An empty prefix matches every stored word; a null
    /// prefix yields an empty result.
    /// </param>
    /// <param name="max">Maximum number of words to return.</param>
    /// <returns>
    /// The matching words, or an empty list when the prefix is not in the trie.
    /// </returns>
    public List<string> GetSuggestions(string word, int max)
    {
        List<string> outPut = new List<string>();
        if (word == null)
        {
            return outPut;
        }

        var node = Root;
        foreach (var l in word)
        {
            Node next;
            // Any unmatched letter means no stored word has this prefix;
            // stop instead of silently skipping the letter.
            if (!node.Edges.TryGetValue(l, out next))
            {
                return outPut;
            }
            node = next;
        }

        GetChildWords(node, ref outPut, max);
        return outPut;
    }

    /// <summary>
    /// Appends the words stored at <paramref name="n"/> and below it to
    /// <paramref name="outWords"/>, visiting a node before its children,
    /// until the list holds <paramref name="Max"/> entries.
    /// </summary>
    /// <param name="n">Node to start from; null is treated as an empty subtree.</param>
    /// <param name="outWords">
    /// List that receives the words. A new list is created if it is null.
    /// </param>
    /// <param name="Max">Upper bound on the total size of <paramref name="outWords"/>.</param>
    public void GetChildWords(Node n, ref List<string> outWords, int Max)
    {
        if (outWords == null)
        {
            outWords = new List<string>();
        }

        // Stop as soon as the quota is filled rather than walking the rest
        // of the subtree without being able to add anything.
        if (n == null || outWords.Count >= Max)
        {
            return;
        }

        if (n.IsTerminal)
        {
            outWords.Add(n.Word);
        }

        foreach (var item in n.Edges)
        {
            if (outWords.Count >= Max)
            {
                break;
            }
            GetChildWords(item.Value, ref outWords, Max);
        }
    }
}