目前,我使用以下代码提取了13,000个网址。然而,其中3,000人提出了来自Facebook,彭博等的网址。对于这些URL,我一直在手动搜索那些名称,也许20个中有1个有宏错过的公司URL。所以我的问题是:有没有办法可以编辑宏,这样如果一个URL页面包含一个字符串值,如" facebook"或者" wiki"它会跳过该URL并继续搜索不包含字符串值的URL吗?
我如何提取网址的代码:
Sub XMLHTTP()
Dim url As String, lastRow As Long
Dim XMLHTTP As Object, html As Object, objResultDiv As Object, objH3 As Object, link As Object
Dim start_time As Date
Dim end_time As Date
lastRow = Range("A" & Rows.Count).End(xlUp).Row
Dim cookie As String
Dim result_cookie As String
start_time = Time
Debug.Print "start_time:" & start_time
For i = 2 To lastRow
url = "https://www.google.co.in/search?q=" & Cells(i, 1) & "&rnd=" & WorksheetFunction.RandBetween(1, 10000)
Set XMLHTTP = CreateObject("MSXML2.serverXMLHTTP")
XMLHTTP.Open "GET", url, False
XMLHTTP.setRequestHeader "Content-Type", "text/xml"
XMLHTTP.setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1; rv:25.0) Gecko/20100101 Firefox/25.0"
XMLHTTP.send
Set html = CreateObject("htmlfile")
html.body.innerHTML = XMLHTTP.ResponseText
Set objResultDiv = html.getelementbyid("rso")
Set objH3 = objResultDiv.getelementsbytagname("H3")(0)
Set link = objH3.getelementsbytagname("a")(0)
str_text = Replace(link.innerHTML, "<EM>", "")
str_text = Replace(str_text, "</EM>", "")
Cells(i, 2) = str_text
Cells(i, 3) = link.href
DoEvents
Next
end_time = Time
Debug.Print "end_time:" & end_time
Debug.Print "done" & "Time taken : " & DateDiff("n", start_time, end_time)
MsgBox "done" & "Time taken : " & DateDiff("n", start_time, end_time)
End Sub
这是我用来根据字符串值过滤掉网址的代码:
Sub badURLs()
Dim lr As Long ' Declare the variable
lr = Cells(Rows.Count, 3).End(xlUp).Row ' Set the variable
' lr now contains the last used row in column A
Application.ScreenUpdating = False
For a = lr To 1 Step -1
If InStr(1, Cells(a, 3), "bloomberg", vbTextCompare) > 0 _
Or InStr(1, Cells(a, 3), "manta", vbTextCompare) > 0 _
Or InStr(1, Cells(a, 3), "yellowpages", vbTextCompare) > 0 _
Or InStr(1, Cells(a, 3), "yelp", vbTextCompare) > 0 _
Or InStr(1, Cells(a, 3), "snapshot", vbTextCompare) > 0 _
Or InStr(1, Cells(a, 3), "facebook", vbTextCompare) > 0 _
Or InStr(1, Cells(a, 3), "wiki", vbTextCompare) > 0 _
Or InStr(1, Cells(a, 3), "linkedin", vbTextCompare) > 0 _
Or InStr(1, Cells(a, 3), "hoovers", vbTextCompare) > 0 Then
'Compares for bloomberg, wiki, or hoovers. Enters loop if value is greater than 0
With Cells(a, 3)
.NumberFormat = "General"
.Value = "NA"
End With
End If
Next a
Application.ScreenUpdating = True
End Sub
重申一下:我想知道是否有可能(如果是这样的话)根据第二个中的字符串值过滤掉第一个宏中的URL。我希望这会让我有更准确的URL点击,我不必手动搜索3000公司名称,希望只有少数人会有一个有用的URL。
答案 0 :(得分:1)
我在下面完整地复制了您的XMLHTTP()
代码,然后我在下面添加了用户定义的功能,以演示您的模块的布局方式。我所做的更改实际上只会影响:Cells(i, 3) = href
。在这种情况下,如果href
位于URL的错误列表中,则Cells(i, 3)
中不会放置任何内容。如果您需要更复杂的业务逻辑,请告诉我们,我们会尽力提供帮助。
Sub XMLHTTP()
Dim url As String, lastRow As Long
Dim XMLHTTP As Object, html As Object, objResultDiv As Object, objH3 As Object, link As Object
Dim start_time As Date
Dim end_time As Date
lastRow = Range("A" & Rows.Count).End(xlUp).Row
Dim cookie As String
Dim result_cookie As String
start_time = Time
Debug.Print "start_time:" & start_time
For i = 2 To lastRow
url = "https://www.google.co.in/search?q=" & Cells(i, 1) & "&rnd=" & WorksheetFunction.RandBetween(1, 10000)
Set XMLHTTP = CreateObject("MSXML2.serverXMLHTTP")
XMLHTTP.Open "GET", url, False
XMLHTTP.setRequestHeader "Content-Type", "text/xml"
XMLHTTP.setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1; rv:25.0) Gecko/20100101 Firefox/25.0"
XMLHTTP.send
Set html = CreateObject("htmlfile")
html.body.innerHTML = XMLHTTP.ResponseText
Set objResultDiv = html.getelementbyid("rso")
Set objH3 = objResultDiv.getelementsbytagname("H3")(0)
Set link = objH3.getelementsbytagname("a")(0)
str_text = Replace(link.innerHTML, "<EM>", "")
str_text = Replace(str_text, "</EM>", "")
Cells(i, 2) = str_text
If funcBadUrls(Cells(i, 1)) then
Cells(i, 3) = ""
Else
Cells(i, 3) = link.href
End If
DoEvents
Next
end_time = Time
Debug.Print "end_time:" & end_time
Debug.Print "done" & "Time taken : " & DateDiff("n", start_time, end_time)
MsgBox "done" & "Time taken : " & DateDiff("n", start_time, end_time)
End Sub
Function funcBadURLs(sInput as String) as Boolean
Dim bResult as Boolean
If InStr(1, sInput, "bloomberg", vbTextCompare) > 0 _
Or InStr(1, sInput, "manta", vbTextCompare) > 0 _
Or InStr(1, sInput, "yellowpages", vbTextCompare) > 0 _
Or InStr(1, sInput, "yelp", vbTextCompare) > 0 _
Or InStr(1, sInput, "snapshot", vbTextCompare) > 0 _
Or InStr(1, sInput, "facebook", vbTextCompare) > 0 _
Or InStr(1, sInput, "wiki", vbTextCompare) > 0 _
Or InStr(1, sInput, "linkedin", vbTextCompare) > 0 _
Or InStr(1, sInput, "hoovers", vbTextCompare) > 0 Then
bResult = True
Else
bResult = False
End If
funcBadUrls = bResult
End Sub
如果我理解正确,你想在第一个子程序中忽略BadUrls
。如果是这样,考虑根据第二个例程创建Function
,如果不好则返回true,否则返回false。然后,您可以根据需要构建逻辑。例如:
Function funcBadURLs(sInput as String) as Boolean
Dim bResult as Boolean
If InStr(1, sInput, "bloomberg", vbTextCompare) > 0 _
Or InStr(1, sInput, "manta", vbTextCompare) > 0 _
Or InStr(1, sInput, "yellowpages", vbTextCompare) > 0 _
Or InStr(1, sInput, "yelp", vbTextCompare) > 0 _
Or InStr(1, sInput, "snapshot", vbTextCompare) > 0 _
Or InStr(1, sInput, "facebook", vbTextCompare) > 0 _
Or InStr(1, sInput, "wiki", vbTextCompare) > 0 _
Or InStr(1, sInput, "linkedin", vbTextCompare) > 0 _
Or InStr(1, sInput, "hoovers", vbTextCompare) > 0 Then
bResult = True
Else
bResult = False
End If
funcBadUrls = bResult
End Sub
使用它:
Sub Test()
If funcBadUrls("www.bloomberg.com") then
'Do whatever to skip
Else
MsgBox "Success"
End If
End Sub
如果这有帮助,或者如果我误解了你的问题,请告诉我。