模式匹配:读取 innerHTML 并即时写入文本文件 - VBScript

时间:2015-03-03 16:59:05

标签: regex vbscript match

作为对此问题的跟进:Pattern matching in VBS

我想把模式匹配直接放进我的第一步。原先的第一步是把大约 1,000 个页面的 innerHTML 全部转储到一个文本文件中,然后用上面链接中的方法在该文本文件里做模式匹配。对少量 innerHTML 转储来说这种做法没有问题,但当我处理完所需的全部页面后,生成的文本文件超过了 1GB!对它做模式匹配时,脚本直接挂起(即使让它运行一整夜也没有结果)。

我接下来的想法是在第一步中直接对 innerHTML 做模式匹配,这样只需要把所需的 URL 写入文本文件。下面是我目前的代码 - 没有报错,但文本文件里没有写入任何内容。我哪里做错了?

我认为问题出在这里:

' Excerpt: extract "LicenseDetail..." fragments from the current page's HTML
' and append them to C:\AgentURLsRaw.txt.
Dim objRegExp
Set objRegExp = New RegExp

' Case-insensitive, find every occurrence (not just the first).
objRegExp.IgnoreCase = True
objRegExp.Global = True
' Matches the text "LicenseDetail" followed by one or more characters that are
' not an apostrophe. The pattern contains no parenthesised capture group.
objRegExp.Pattern = "LicenseDetail[^']+"

' Dynamic array that is grown by one slot per match; iii is the next free index.
dim matches()
dim iii: iii = 0

strHTML = ie.document.body.innerhtml

Dim objMatch
For Each objMatch in objRegExp.Execute(strHTML)
    redim preserve matches(iii)
    ' NOTE(review): the pattern has no capture group, so there is no submatch 0.
    ' This read is expected to fail at run time; where On Error Resume Next is
    ' active the failure is skipped and matches(iii) stays Empty, which would
    ' explain an unpopulated output file. objMatch.Value holds the whole match.
    matches(iii) = objMatch.SubMatches(0)
    iii = (iii + 1)
Next

'read back
' NOTE(review): if Execute found nothing, matches was never ReDim'd and
' UBound(matches) is expected to raise an error - confirm.
for iii = 0 to ubound(matches)

    ' The file is opened (8 = append, true = create if missing), written and
    ' closed once per array element. Write emits no line separator, so
    ' successive entries are written back to back.
    set fso = createobject("scripting.filesystemobject")
    set ts = fso.opentextfile("C:\AgentURLsRaw.txt",8,true) 
    ts.write matches(iii) 
    ts.close 
next

完整代码:

' Drives Internet Explorer through the myfloridalicense.com search form for a
' range of County option values and, for each result page, tries to extract
' "LicenseDetail..." fragments from the page HTML and append them to
' C:\AgentURLsRaw.txt.
Set objWshShell = Wscript.CreateObject("Wscript.Shell")
Set IE = CreateObject("internetexplorer.application")
Set fso = CreateObject("Scripting.FileSystemObject")

' ii is used below as the value assigned to the "County" select.
For ii=12 To 81

    ' The loop is left as soon as ii reaches 81, so 80 is the last value processed.
    If ii=81 Then Exit For

    IE.Visible = True
    IE.Navigate "https://www.myfloridalicense.com/wl11.asp?Mode=1&SID=&brd=&typ="
    ' Poll until the browser reports ReadyState 4.
    Do Until IE.ReadyState = 4: WScript.sleep 15: Loop

    ' Choose Board "25", flag the change in the hidden hDDChange field and
    ' submit the form.
    IE.Document.getElementsByTagName("select")("Board").Value = "25"
    Set Form = IE.Document.Forms("reportForm")
    Form.hDDChange.Value = "Y"
    Form.Submit

    Do Until IE.ReadyState = 4: WScript.sleep 15: Loop
    Do Until IE.Document.ReadyState = "complete": WScript.sleep 10: Loop
    IE.Document.getElementsByTagName("select")("LicenseType").Value = "2501"

    IE.Document.getElementsByTagName("select")("County").Value = ii
    IE.Document.getElementsByTagName("select")("RecsPerPage").Value = "50"

    ' Click every <input type="image"> on the page, waiting for ReadyState 4
    ' after each input element that is visited.
    For Each btn In IE.Document.getElementsByTagName("input")
        If btn.type = "image" Then btn.Click()
        Do Until IE.ReadyState = 4: WScript.sleep 15: Loop
    Next

    Do Until IE.Document.ReadyState = "complete": WScript.sleep 15: Loop
    ' Page count as reported by the first element named "hTotalPages".
    pg = IE.Document.getElementsByName("hTotalPages")(0).Value

    Dim i
    i = 0

    ' i is incremented at the top of the body, so the body runs for
    ' i = 1 .. pg + 1 (assumes pg holds a numeric value - TODO confirm).
    Do while i < pg +1
        i = i + 1

        ' From here on runtime errors are suppressed and execution continues
        ' with the next statement, so failures below are not reported.
        on error resume next

        Dim objRegExp
        Set objRegExp = New RegExp

        ' Case-insensitive, global search for "LicenseDetail" followed by one
        ' or more non-apostrophe characters. The pattern has no capture group.
        objRegExp.IgnoreCase = True
        objRegExp.Global = True
        objRegExp.Pattern = "LicenseDetail[^']+"

        ' matches is not cleared between pages; iii restarts at 0 for each page.
        dim matches()
        dim iii: iii = 0

        strHTML = ie.document.body.innerhtml

        Dim objMatch
        For Each objMatch in objRegExp.Execute(strHTML)
            redim preserve matches(iii)
            ' NOTE(review): with no capture group in the pattern there is no
            ' submatch 0; the resulting error is swallowed by On Error Resume
            ' Next and matches(iii) stays Empty, which would explain the empty
            ' output file. objMatch.Value holds the whole match.
            matches(iii) = objMatch.SubMatches(0)
            iii = (iii + 1)
        Next

        'read back
        for iii = 0 to ubound(matches)

            ' Opens (8 = append, true = create if missing), writes and closes
            ' the file once per element; Write adds no line separator.
            set fso = createobject("scripting.filesystemobject")
            set ts = fso.opentextfile("C:\AgentURLsRaw.txt",8,true) 
            ts.write matches(iii) 
            ts.close 
        next

        ' Wait for the current document, then click the "SearchForward" input
        ' whose value is "Search". There is no wait after the click before the
        ' next iteration reads the page HTML.
        Do Until IE.Document.ReadyState = "complete": WScript.sleep 15: Loop
        For Each btn In IE.Document.getElementsByTagName("input")
            If btn.name = "SearchForward" and btn.value = "Search" Then btn.Click()
        Next
    loop

next
' NOTE(review): DONE is not quoted, so it is read as a variable rather than
' as the text "DONE".
Wscript.echo DONE

谢谢!

1 个答案:

答案 0 :(得分:1)

我自己解决了。下面是我的最终代码,希望对其他人有帮助:

' Collects license-detail URLs from the myfloridalicense.com licensee search.
'
' For each County option value 12..80 the script submits the search form in
' Internet Explorer, walks the result pages, extracts every "LicenseDetail..."
' fragment from the page HTML (via fGetMatches) and appends the resulting
' absolute URLs, one per line, to OUTPUT_PATH.
Const OUTPUT_PATH = "C:\AgentURLsRaw.txt"
Const SEARCH_URL = "https://www.myfloridalicense.com/wl11.asp?Mode=1&SID=&brd=&typ="
' innerHTML reports the "&" in the link as "&amp;"; the relative link is
' rewritten to an absolute URL with a plain "&".
Const RELATIVE_PREFIX = "LicenseDetail.asp?SID=&amp;id="
Const ABSOLUTE_PREFIX = "https://www.myfloridalicense.com/LicenseDetail.asp?SID=&id="
Const FOR_APPENDING = 8

Dim i

Set objWshShell = WScript.CreateObject("WScript.Shell")
Set IE = CreateObject("internetexplorer.application")
Set fso = CreateObject("Scripting.FileSystemObject")

IE.Visible = True

' The loop used to be written "12 To 81" with an immediate Exit For at 81,
' so 80 has always been the last county value processed.
For ii = 12 To 80

    IE.Navigate SEARCH_URL
    Do Until IE.ReadyState = 4: WScript.Sleep 15: Loop

    ' Choose Board "25", flag the change in the hidden field and submit so the
    ' page reloads with the dependent drop-downs.
    IE.Document.getElementsByTagName("select")("Board").Value = "25"
    Set Form = IE.Document.Forms("reportForm")
    Form.hDDChange.Value = "Y"
    Form.Submit

    Do Until IE.ReadyState = 4: WScript.Sleep 15: Loop
    Do Until IE.Document.ReadyState = "complete": WScript.Sleep 10: Loop

    IE.Document.getElementsByTagName("select")("LicenseType").Value = "2501"
    IE.Document.getElementsByTagName("select")("County").Value = ii
    IE.Document.getElementsByTagName("select")("RecsPerPage").Value = "50"

    ' Start the search by clicking the image-type input(s).
    For Each btn In IE.Document.getElementsByTagName("input")
        If btn.type = "image" Then btn.Click()
        Do Until IE.ReadyState = 4: WScript.Sleep 15: Loop
    Next

    Do Until IE.Document.ReadyState = "complete": WScript.Sleep 15: Loop
    pg = IE.Document.getElementsByName("hTotalPages")(0).Value

    ' The body runs for i = 1 .. pg + 1 (unchanged from the original script).
    i = 0
    Do While i < pg + 1
        i = i + 1

        ' Best-effort scraping: from here on runtime errors are skipped rather
        ' than aborting the whole run (kept from the original script).
        On Error Resume Next

        strPattern = "LicenseDetail[^""]+"
        strTestString = IE.Document.body.innerHTML
        arrAllMatches = fGetMatches(strPattern, strTestString)

        ' fGetMatches reports "no matches" as a one-element array holding "".
        ' Testing UBound alone would also discard a page with exactly one match.
        blnNoMatches = (UBound(arrAllMatches) = 0)
        If blnNoMatches Then blnNoMatches = (arrAllMatches(0) = "")

        If blnNoMatches Then
            WScript.Echo "-- None Found --"
        Else
            ' Fix up the URLs in memory and append once, instead of re-reading
            ' and rewriting the whole (growing) output file after every page.
            strPageUrls = Replace(Join(arrAllMatches, vbCrLf), RELATIVE_PREFIX, ABSOLUTE_PREFIX)

            Set ts = fso.OpenTextFile(OUTPUT_PATH, FOR_APPENDING, True)
            ts.WriteLine strPageUrls
            ts.Close
        End If

        ' Advance to the next result page.
        ' NOTE(review): there is no wait after the click before the next
        ' iteration reads innerHTML; confirm whether a ReadyState wait is
        ' needed here.
        Do Until IE.Document.ReadyState = "complete": WScript.Sleep 15: Loop
        For Each btn In IE.Document.getElementsByTagName("input")
            If btn.name = "SearchForward" And btn.value = "Search" Then btn.Click()
        Next
    Loop

Next

' Quoted: an unquoted DONE is an undefined variable and echoes nothing useful.
WScript.Echo "DONE"

' Returns every match of a regular expression in a string as an array.
'
' Parameters:
'   sPattern - regular-expression pattern text.
'   sStr     - string to search.
'
' The search is global and case-insensitive.
'
' Returns a zero-based array of the matched substrings in order of
' occurrence. When nothing matches, returns a one-element array containing
' "" (kept so that existing callers continue to work).
Function fGetMatches(sPattern, sStr)
    Dim regEx, colMatches, oMatch, arrResult(), idx

    Set regEx = New RegExp
    regEx.Pattern = sPattern
    regEx.IgnoreCase = True
    regEx.Global = True

    Set colMatches = regEx.Execute(sStr)

    If colMatches.Count = 0 Then
        fGetMatches = Array("")
        Exit Function
    End If

    ' Copy the collection straight into an array. Joining on a delimiter
    ' character and splitting again breaks when a match contains that
    ' character and depends on how the script file itself is encoded.
    ReDim arrResult(colMatches.Count - 1)
    idx = 0
    For Each oMatch In colMatches
        arrResult(idx) = oMatch.Value
        idx = idx + 1
    Next

    fGetMatches = arrResult
End Function