我正在尝试创建一个Windows桌面应用程序,该应用程序会访问指定的站点并获取该站点的HTML。我找到了很多这样做的示例,但由于某些原因,它们对传统版Google协作平台(Google Sites)的网页不起作用。该程序需要在Google协作平台页面的正文中找到一段简单的文字。但程序获取到的HTML与在Google Chrome中“查看网页源代码”时看到的代码并不相同。这是怎么回事?
' Downloads the document at strURL and publishes it for display.
'
' The raw response text is stored in textScrape. A copy with every "<!...>"
' construct removed is stored in scrapeFormatted, after which Form2 is shown.
' Any exception raised along the way is caught and reported through ErrorMsg
' instead of propagating to the caller.
'
' strURL: address to request; handed unchanged to WebRequest.Create.
Public Sub Scrape(strURL)
    Try
        Dim wrRequest As WebRequest = WebRequest.Create(strURL)
        textScrape = "Extracting..." & Environment.NewLine

        ' Dispose the response as well as the reader so the connection is
        ' released even when reading the body throws. End Using closes the
        ' reader, so no explicit sr.Close() is needed.
        Using wrResponse As WebResponse = wrRequest.GetResponse()
            Using sr As New StreamReader(wrResponse.GetResponseStream())
                strOutput = sr.ReadToEnd()
            End Using
        End Using
        textScrape = strOutput

        ' Formatting techniques (enable the ones you need).

        ' Remove the doctype (HTML 5). The pattern matches any "<!" up to the
        ' first ">", so it also cuts the start of comments and other "<!"
        ' declarations. [^>]* matches the same text as the former (.|\s)*?
        ' without the ambiguous alternation that can backtrack badly.
        strOutput = Regex.Replace(strOutput, "<![^>]*>", "")
        ' Remove HTML Tags
        ' strOutput = Regex.Replace(strOutput, "</?[a-z][a-z0-9]*[^<>]*>", "")
        ' Remove HTML Comments
        ' strOutput = Regex.Replace(strOutput, "<!--(.|\s)*?-->", "")
        ' Remove Script Tags
        ' strOutput = Regex.Replace(strOutput, "<script.*?</script>", "", RegexOptions.Singleline Or RegexOptions.IgnoreCase)
        ' Remove Stylesheets
        ' strOutput = Regex.Replace(strOutput, "<style.*?</style>", "", RegexOptions.Singleline Or RegexOptions.IgnoreCase)

        scrapeFormatted = strOutput ' Write formatted output to the separate text box
        Form2.Show()
    Catch ex As Exception
        ' NOTE(review): the exception details are not forwarded; consider passing
        ' ex.Message once ErrorMsg's parameters are confirmed.
        ErrorMsg("", "")
    End Try
End Sub
答案 0 :(得分:0)
没关系,我已经弄明白了。对于正在寻找相关信息的人来说,上面的代码完全可以正常工作。请确保将正确的链接传入"strURL"变量。这是我犯的一个非常愚蠢的错误。
' Downloads the document at strURL and publishes it for display.
'
' The raw response text is stored in textScrape. A copy with every "<!...>"
' construct removed is stored in scrapeFormatted, after which Form2 is shown.
' Any exception raised along the way is caught and reported through ErrorMsg
' instead of propagating to the caller.
'
' strURL: address to request; handed unchanged to WebRequest.Create.
Public Sub Scrape(strURL)
    Try
        Dim wrRequest As WebRequest = WebRequest.Create(strURL)
        textScrape = "Extracting..." & Environment.NewLine

        ' Dispose the response as well as the reader so the connection is
        ' released even when reading the body throws. End Using closes the
        ' reader, so no explicit sr.Close() is needed.
        Using wrResponse As WebResponse = wrRequest.GetResponse()
            Using sr As New StreamReader(wrResponse.GetResponseStream())
                strOutput = sr.ReadToEnd()
            End Using
        End Using
        textScrape = strOutput

        ' Formatting techniques (enable the ones you need).

        ' Remove the doctype (HTML 5). The pattern matches any "<!" up to the
        ' first ">", so it also cuts the start of comments and other "<!"
        ' declarations. [^>]* matches the same text as the former (.|\s)*?
        ' without the ambiguous alternation that can backtrack badly.
        strOutput = Regex.Replace(strOutput, "<![^>]*>", "")
        ' Remove HTML Tags
        ' strOutput = Regex.Replace(strOutput, "</?[a-z][a-z0-9]*[^<>]*>", "")
        ' Remove HTML Comments
        ' strOutput = Regex.Replace(strOutput, "<!--(.|\s)*?-->", "")
        ' Remove Script Tags
        ' strOutput = Regex.Replace(strOutput, "<script.*?</script>", "", RegexOptions.Singleline Or RegexOptions.IgnoreCase)
        ' Remove Stylesheets
        ' strOutput = Regex.Replace(strOutput, "<style.*?</style>", "", RegexOptions.Singleline Or RegexOptions.IgnoreCase)

        scrapeFormatted = strOutput ' Write formatted output to the separate text box
        Form2.Show()
    Catch ex As Exception
        ' NOTE(review): the exception details are not forwarded; consider passing
        ' ex.Message once ErrorMsg's parameters are confirmed.
        ErrorMsg("", "")
    End Try
End Sub