我拿到了一份大约 20,000 条纯字母/文本字符串的列表,它以 CSV 文件的形式导出到了 Excel 中,但内容非常杂乱。我想做的是对照一个单独的英语词典单词参考文件进行查询,从而建立一个查找功能:返回字符串中包含的词典单词,并去掉可能被加在字符串前面或后面的大量文本噪音。示例如下。
xyzbuildingcontractor = Building Contractor
upholsteryabcdef = Upholstery
lmnoengineer = Engineer
作为一名相对新手(n00b)的程序员,我只是想征求一下大家对最佳方法的意见,以及 Excel 是否是最合适的平台。
提前感谢任何指导,非常感谢。
吉姆
答案 0 :(得分:2)
好的,这只是一个非常粗略的草稿,您可能需要自行调整,但总体思路是:
clsTrie 类保存字典中的单词
clsTrieIterator 类用于逐字符遍历 Trie
clsWordFinder 在字符串的每个字符位置创建一个 clsTrieIterator
各个 clsTrieIterator 依次消耗后续的每个字符;如果给定字典无法生成当前的字符组合,就停止跟踪该迭代器。以下是一个简短的使用示例:
Public Sub Main()
    ' Usage demo: loads a four-word dictionary into a clsWordFinder and prints,
    ' for each noisy sample string, the dictionary words found inside it to the
    ' Immediate window.
    Dim finder As clsWordFinder
    Dim entry As Variant
    Dim sample As Variant

    Set finder = New clsWordFinder

    ' CStr() turns the Variant loop variable into a String expression, which is
    ' required because the finder's parameters are declared As String (ByRef).
    For Each entry In Array("Building", "Contractor", "Upholstery", "Engineer")
        finder.Add CStr(entry)
    Next entry

    For Each sample In Array("xyzbuildingcontractor", "upholsteryabcdef", "lmnoengineer")
        Debug.Print finder.getWordsFromString(CStr(sample))
    Next sample
End Sub
会在 VBA 的立即窗口中输出以下内容:
Building Contractor
Upholstery
Engineer
……以下是各个类。
clsTrieNode
是树中的单个节点。它代表一个字母,最多可以有 26 个子节点(前提是这些字母组合能在字典中构成有效单词)。如果从根节点逐个节点走到某个节点时,沿途的字符组合构成了一个单词,Trie 就会把该节点的 "isWord" 设置为 True。
' clsTrieNode - a single node of the trie. Each node stands for one letter and
' owns up to 26 child nodes, one slot per lower-case letter a-z.
'
' "Option Compare Database" is only accepted by Microsoft Access and is a
' compile error in Excel. This module performs no string comparisons, so
' Option Compare Binary behaves identically here and compiles in every host.
Option Compare Binary
Option Explicit

Private Const CODE_LOWER_A As Long = 97    ' AscW("a")
Private Const CODE_LOWER_Z As Long = 122   ' AscW("z")

' The letter this node represents (assigned by the code that creates the node).
Public KeyChar As String
' True when the letters on the path from the root to this node spell a word.
Public isWord As Boolean

' Child slots: index 0 = "a" ... index 25 = "z". Empty slots hold Nothing.
Private m_Children(0 To 25) As clsTrieNode

' Returns the child stored for strChar.
' Only the first character of strChar is examined. Returns Nothing when no
' child is stored for it, or when it is not a lower-case letter a-z (including
' an empty string) - such a character can never lead to a child, so a lookup
' simply reports "no match" instead of failing with a run-time error.
Public Property Get Child(strChar As String) As clsTrieNode
    Dim slot As Long

    slot = charToIndex(strChar)
    If slot >= 0 Then Set Child = m_Children(slot)
End Property

' Stores oNode as the child for strChar (first character only).
' Raises run-time error 9 with a descriptive message when strChar is not a
' lower-case letter a-z, because there is no slot to store the node in.
Public Property Set Child(strChar As String, oNode As clsTrieNode)
    Dim slot As Long

    slot = charToIndex(strChar)
    If slot < 0 Then
        Err.Raise 9, "clsTrieNode.Child", _
                  "Trie keys may only contain the letters a-z; got """ & strChar & """."
    End If
    Set m_Children(slot) = oNode
End Property

' Maps the first character of strChar to its slot index (0-25).
' Returns -1 for an empty string or any character outside a-z.
Private Function charToIndex(strChar As String) As Long
    Dim code As Long

    charToIndex = -1
    If Len(strChar) = 0 Then Exit Function

    code = AscW(strChar)
    If code >= CODE_LOWER_A And code <= CODE_LOWER_Z Then
        charToIndex = code - CODE_LOWER_A
    End If
End Function
clsTrie
是对外公开的接口,用于与构成 trie 的节点树交互。它包含一个 Add
方法,用于将单词放入字典中,还包含一个 isWord
方法,可以针对 trie 字典测试某个字符串,以判断它是否为有效单词。Remove
方法如果有的话会更好,但对您的问题来说可能并非必需,所以我还没有实现它。
' clsTrie - public interface to the tree of clsTrieNode objects that stores the
' dictionary. Keys are handled case-insensitively (they are lower-cased first).
'
' "Option Compare Database" is only accepted by Microsoft Access and is a
' compile error in Excel. This module performs no string comparisons, so
' Option Compare Binary behaves identically here and compiles in every host.
Option Compare Binary
Option Explicit

' Root of the tree; it represents the empty prefix and carries no letter.
Private m_Head As clsTrieNode

Private Sub Class_Initialize()
    Set m_Head = New clsTrieNode
End Sub

' Adds strKey to the dictionary.
' An empty key is ignored. A key containing anything other than the letters
' a-z (after lower-casing) raises run-time error 9 before the tree is touched,
' so a bad key can never leave a half-inserted branch behind.
Public Sub Add(strKey As String)
    Dim key As String
    Dim node As clsTrieNode
    Dim nextNode As clsTrieNode
    Dim ch As String
    Dim i As Long

    key = LCase$(strKey)
    If Len(key) = 0 Then Exit Sub
    If Not isAllLowerLetters(key) Then
        Err.Raise 9, "clsTrie.Add", _
                  "Trie keys may only contain the letters a-z; got """ & strKey & """."
    End If

    ' Walk the key one letter at a time, reusing nodes that already exist and
    ' creating the missing ones. Each child is looked up only once.
    Set node = m_Head
    For i = 1 To Len(key)
        ch = Mid$(key, i, 1)
        Set nextNode = node.Child(ch)
        If nextNode Is Nothing Then
            Set nextNode = New clsTrieNode
            nextNode.KeyChar = ch
            Set node.Child(ch) = nextNode
        End If
        Set node = nextNode
    Next i

    node.isWord = True
End Sub

' Not implemented: currently does nothing.
Public Sub Remove(strKey As String)
    'Might be nice to have
End Sub

' Returns True when strKey (case-insensitive) was added as a complete word.
' Prefixes of longer words, unknown strings and strings containing characters
' outside a-z all return False.
Public Function isWord(strKey As String) As Boolean
    Dim node As clsTrieNode

    Set node = findNode(LCase$(strKey))
    If Not node Is Nothing Then isWord = node.isWord
End Function

' Returns a new clsTrieIterator positioned at the root of this trie.
Public Function getIterator() As clsTrieIterator
    Dim oIterator As clsTrieIterator

    Set oIterator = New clsTrieIterator
    oIterator.Init m_Head
    Set getIterator = oIterator
End Function

' Follows strLCaseKey from the root and returns the node it ends on, or
' Nothing when the path does not exist or the key has a non a-z character.
Private Function findNode(strLCaseKey As String) As clsTrieNode
    Dim node As clsTrieNode
    Dim i As Long

    If Not isAllLowerLetters(strLCaseKey) Then Exit Function

    Set node = m_Head
    For i = 1 To Len(strLCaseKey)
        Set node = node.Child(Mid$(strLCaseKey, i, 1))
        If node Is Nothing Then Exit Function
    Next i

    Set findNode = node
End Function

' True when every character of strText is a lower-case letter a-z
' (vacuously True for an empty string).
Private Function isAllLowerLetters(strText As String) As Boolean
    Dim i As Long
    Dim code As Long

    For i = 1 To Len(strText)
        code = AscW(Mid$(strText, i, 1))
        If code < 97 Or code > 122 Then Exit Function   ' 97 = "a", 122 = "z"
    Next i

    isAllLowerLetters = True
End Function
clsTrieIterator
是由 clsTrie
返回的一个特殊类,它允许使用 consumeChar
逐字符地解析字符串,而不是像 clsTrie.isWord
那样一次性处理整个字符串。这样在解析字符串时就更加灵活,无需回溯或多次读取同一个字符,并且在不确定单词有多长的情况下也能查找单词。
' clsTrieIterator - a cursor into the trie that is advanced one character at a
' time with consumeChar, so a string can be matched without knowing in advance
' how long the word will be.
'
' "Option Compare Database" is only accepted by Microsoft Access and is a
' compile error in Excel. This module performs no string comparisons, so
' Option Compare Binary behaves identically here and compiles in every host.
Option Compare Binary
Option Explicit

' Node reached so far; Nothing before Init and after a failed consumeChar.
Private m_currNode As clsTrieNode
' Characters consumed so far, in the caller's original letter case.
Private m_currString As String

' The characters successfully consumed since Init.
Public Property Get getCurrentString() As String
    getCurrentString = m_currString
End Property

' Positions the iterator on oNode and clears the consumed text, so that
' re-initialising a used iterator starts from a clean state.
Public Sub Init(oNode As clsTrieNode)
    Set m_currNode = oNode
    m_currString = vbNullString
End Sub

' Tries to advance along the child for strChar (matched case-insensitively).
' Returns True and appends strChar to the consumed text on success.
' Returns False when there is no such child; the iterator is then dead and
' every later call also returns False instead of raising run-time error 91.
Public Function consumeChar(strChar As String) As Boolean
    Dim nextNode As clsTrieNode

    If m_currNode Is Nothing Then Exit Function

    ' Single lookup; a missing child leaves nextNode = Nothing, which marks
    ' the iterator as dead.
    Set nextNode = m_currNode.Child(LCase$(strChar))
    Set m_currNode = nextNode

    If Not nextNode Is Nothing Then
        m_currString = m_currString & strChar
        consumeChar = True
    End If
End Function

' True when the characters consumed so far form a complete dictionary word.
' A dead or uninitialised iterator reports False.
Public Function isWord() As Boolean
    If Not m_currNode Is Nothing Then isWord = m_currNode.isWord
End Function
clsWordFinder
把所有内容封装成一个简单的 API,专门针对您的具体问题。可能值得再添加一些逻辑来处理不同的行为,比如"贪婪"(greedy)匹配与"懒惰"(lazy)匹配,以及重叠与非重叠的单词解析。
' clsWordFinder - small facade over clsTrie: load dictionary words with Add,
' then extract the dictionary words embedded in a noisy string.
'
' Matching is lazy: an iterator stops as soon as it completes a word, so when
' one dictionary word is a prefix of another only the shorter one is reported.
'
' "Option Compare Database" is only accepted by Microsoft Access and is a
' compile error in Excel. This module performs no string comparisons, so
' Option Compare Binary behaves identically here and compiles in every host.
Option Compare Binary
Option Explicit

Private m_Trie As clsTrie

Private Sub Class_Initialize()
    Set m_Trie = New clsTrie
End Sub

' Adds strWord to the dictionary used for matching.
Public Sub Add(strWord As String)
    m_Trie.Add strWord
End Sub

' Returns the words found in strString joined by single spaces, or an empty
' string when nothing matches. Shares its scan with
' getWordsCollectionFromString so the two can never drift apart.
Public Function getWordsFromString(strString As String) As String
    getWordsFromString = JoinCollection(getWordsCollectionFromString(strString))
End Function

' Returns a Collection of the dictionary words found in strString, in the
' order in which they are completed. Each match keeps the letter case of the
' input except that its first character is upper-cased.
Public Function getWordsCollectionFromString(strString As String) As Collection
    Dim colActive As Collection      ' iterators still following a possible word
    Dim colSurvivors As Collection   ' iterators that stay alive after this char
    Dim colMatches As Collection
    Dim oIterator As clsTrieIterator
    Dim strChar As String
    Dim strMatch As String
    Dim i As Long

    Set colActive = New Collection
    Set colMatches = New Collection

    For i = 1 To Len(strString)
        strChar = Mid$(strString, i, 1)

        If Not isTrieLetter(strChar) Then
            ' A non-letter cannot continue or start any word: drop every
            ' pending iterator without asking the trie about this character.
            Set colActive = New Collection
        Else
            ' A word may start at every position, so add a fresh iterator.
            colActive.Add m_Trie.getIterator

            ' Survivors are gathered into a new Collection rather than removed
            ' from colActive in place: removing items from a Collection while
            ' For Each is enumerating it is unsafe (elements can be skipped).
            Set colSurvivors = New Collection
            For Each oIterator In colActive
                If oIterator.consumeChar(strChar) Then
                    If oIterator.isWord() Then
                        strMatch = oIterator.getCurrentString
                        Mid$(strMatch, 1, 1) = UCase$(Mid$(strMatch, 1, 1))
                        colMatches.Add strMatch
                        ' Lazy match: a finished iterator is not kept.
                    Else
                        colSurvivors.Add oIterator
                    End If
                End If
            Next oIterator
            Set colActive = colSurvivors
        End If
    Next i

    Set getWordsCollectionFromString = colMatches
End Function

' True when strChar, lower-cased, is one of the letters a-z.
Private Function isTrieLetter(strChar As String) As Boolean
    Dim code As Long

    If Len(strChar) = 0 Then Exit Function
    code = AscW(LCase$(strChar))
    isTrieLetter = (code >= 97 And code <= 122)   ' 97 = "a", 122 = "z"
End Function

' Concatenates the items of colStrings separated by strDelimiter.
' Returns an empty string for an empty collection.
Private Function JoinCollection(colStrings As Collection, Optional strDelimiter As String = " ") As String
    Dim parts() As String
    Dim part As Variant
    Dim i As Long

    If colStrings.Count = 0 Then Exit Function

    ReDim parts(1 To colStrings.Count)
    For Each part In colStrings
        i = i + 1
        parts(i) = part
    Next part

    JoinCollection = Join(parts, strDelimiter)
End Function