我正在尝试获取与每个专利号对应的数据。我实际上想要每个专利的出版日期。这是excel表:
我使用的数据库是espacenet.com 这是您在excel表中看到的第一项专利的链接: http://worldwide.espacenet.com/searchResults?compact=false&PN=US7055777B2&ST=advanced&locale=en_EP&DB=EPODOC
在此网页中,"发布信息"标题,我需要在专利号与excel表中的专利号成功匹配后得到日期。
以下是代码:
Sub tryextraction()
Dim ie As New InternetExplorer
Dim sdd As String
Dim tdd() As String
Dim num0 As Integer
Dim num1 As Integer
Dim doc As HTMLDocument
Dim i As Integer
Dim j As Integer
ie.Visible = True
num1 = ActiveSheet.UsedRange.Rows.Count
For num0 = 2 To num1
ie.navigate "http://worldwide.espacenet.com/searchResults?compact=false&PN=" & Range("A" & num0) & "&ST=advanced&locale=en_EP&DB=EPODOC"
Do
DoEvents
Loop Until ie.readyState = READYSTATE_COMPLETE
Set doc = ie.document
sdd = Trim(doc.getElementsByTagName("td")(5).innerText)
tdd() = Split(sdd, vbLf)
j = UBound(tdd)
For i = 0 To j
If InStr(tdd(i), "(") <> 0 Then
tdd(i) = Replace(tdd(i), " ", "")
tdd(i) = Replace(tdd(i), "(", "")
tdd(i) = Replace(tdd(i), ")", "")
If tdd(i) = Range("A" & num0).Value Then
Range("B" & num0).Value = tdd(i + 1)
End If
End If
Next i
Next num0
ie.Quit
End Sub
代码可能看起来很幼稚,因为我对编码不太了解。无论如何,这并没有给出任何错误,而是列#34;出版日期&#34;代码完成运行后仍为空白。包含发布信息的html标记已正确使用。请帮忙。
答案 0 :(得分:2)
在您在文档中搜索的ID之后有一些尾随空白字符,因此tdd(i) = Range("A" & num0).Value
永远不会评估为true。它不仅仅是一个空间,所以一个简单的Trim(tdd(i)) = Range("A" & num0).Value
调用无济于事。请改为InStr(tdd(i), Range("A" & num0).Value)
如果这还不够好,那么在进行比较之前,您必须从字符串末尾专门删除CRLF。
答案 1 :(得分:0)
发布信息标题下通常有多个发布日期。
示例:
以下脚本获取所有这些信息和前一行(因此,您具有关联的发布以及日期)。
它从Activesheet
的第2行循环到最后一个填充的行,从A列中提取Publication Numbers
并从B列开始写出结果。具体取决于有多少个日期,数据将跨越B的多个列。
正则表达式:
正则表达式^(.*)\s\d{4}-\d{2}-\d{2}
用于检索日期模式和前一行,即发布标识符和日期。 Try it
示例输出:
VBA:
Option Explicit
Public Sub GetInfo()
Dim IE As New InternetExplorer, html As New HTMLDocument, url As String, pubInfo As Object
Dim loopRange As Range, iRow As Range, counter As Long
'example US7055777B2
Application.ScreenUpdating = False
With ActiveSheet
Set loopRange = Range("A2:A" & .Cells(.Rows.Count, "A").End(xlUp).Row)
End With
With IE
.Visible = True
counter = 2 '<== start from row 2
For Each iRow In loopRange
If Not IsEmpty(iRow) Then
url = "https://worldwide.espacenet.com/searchResults?compact=false&PN=" & iRow.Value & "&ST=advanced&locale=en_EP&DB=EPODOC"
.navigate url
While .Busy Or .readyState < 4: DoEvents: Wend
Set html = .document
Do
DoEvents
On Error Resume Next
Set pubInfo = html.querySelector(".publicationInfoColumn")
On Error GoTo 0
Loop While pubInfo Is Nothing
Dim tempArr()
tempArr = GetDateAndPatent(pubInfo.innerText, "^(.*)\s\d{4}-\d{2}-\d{2}") '"(?m)^(.*)\s\d{4}-\d{2}-\d{2}" '<==This is not supported
With ActiveSheet
.Cells(counter, 2).Resize(1, UBound(tempArr) + 1) = tempArr
End With
End If
counter = counter + 1
Next iRow
.Quit '<== Remember to quit application
End With
Application.ScreenUpdating = True
End Sub
Public Function GetDateAndPatent(ByVal inputString As String, ByVal sPattern As String) As Variant
Dim matches As Object, iMatch As Object, s As String, arrMatches(), i As Long
With CreateObject("vbscript.regexp")
.Global = True
.MultiLine = True
.IgnoreCase = True
.Pattern = sPattern
If .test(inputString) Then
Set matches = .Execute(inputString)
For Each iMatch In matches
ReDim Preserve arrMatches(i)
arrMatches(i) = iMatch.Value
i = i + 1
Next iMatch
End If
End With
GetDateAndPatent = arrMatches
End Function