我继承了一个带有所有源代码的web spider应用程序。看来,对于普通的宣传册式网站(比如15页以下),该软件运行得非常好。
对于其他人(超过20页),软件会将StackOverflowException抛出到下面代码中标记的行上。
它似乎没有使用递归,不幸的是,没有支持使用的LinqToHtml(SuperStarCoders)库。
以下是发生异常时正在运行的代码:
Private Function ExportXml(Optional ByVal _Worker As ComponentModel.BackgroundWorker = Nothing) As Boolean
Dim _L = PopulateSEOList(_Worker)
Try
Dim _TmpStr As New Text.StringBuilder
Dim _X As New XDocument, _ct As Long = 0, _Elements As Typing.SEO.Elements = Nothing
ReportProgress(0, _Worker)
With _TmpStr
.Append("<?xml version=""1.0"" encoding=""UTF-8""?>")
.Append("<o7th.Web.Design.Web.Spider>")
For i As Long = 0 To _L.Count - 1
_ct += 1
.Append(" <Page>")
.Append(" <Link>" & XmlEscape(_L(i).Link) & "</Link>")
.Append(" <Title>" & XmlEscape(_L(i).Title) & "</Title>")
.Append(" <Keywords>" & XmlEscape(_L(i).Keywords) & "</Keywords>")
.Append(" <Description>" & XmlEscape(_L(i).Description) & "</Description>")
.Append(" <Elements>")
_Elements = _L(i).ContentElements
If _Elements IsNot Nothing Then
If _Elements.H1 IsNot Nothing Then
.Append(<H1>
<%= (From n In _Elements.H1.AsParallel()
Select
<Content><%= XmlEscape(n) %></Content>).ToList() %>
</H1>)
End If
If _Elements.H2 IsNot Nothing Then
.Append(<H2>
<%= (From n In _Elements.H2.AsParallel()
Select
<Content><%= XmlEscape(n) %></Content>).ToList() %>
</H2>)
End If
If _Elements.H3 IsNot Nothing Then
.Append(<H3>
<%= (From n In _Elements.H3.AsParallel()
Select
<Content><%= XmlEscape(n) %></Content>).ToList() %>
</H3>)
End If
If _Elements.H4 IsNot Nothing Then
.Append(<H4>
<%= (From n In _Elements.H4.AsParallel()
Select
<Content><%= XmlEscape(n) %></Content>).ToList() %>
</H4>)
End If
If _Elements.H5 IsNot Nothing Then
.Append(<H5>
<%= (From n In _Elements.H5.AsParallel()
Select
<Content><%= XmlEscape(n) %></Content>).ToList() %>
</H5>)
End If
If _Elements.H6 IsNot Nothing Then
.Append(<H6>
<%= (From n In _Elements.H6.AsParallel()
Select
<Content><%= XmlEscape(n) %></Content>).ToList() %>
</H6>)
End If
If _Elements.UL IsNot Nothing Then
.Append(<UL>
<%= (From n In _Elements.UL.AsParallel()
Select
<Content><%= ConvertToCDATA(n) %></Content>).ToList() %>
</UL>)
End If
If _Elements.OL IsNot Nothing Then
.Append(<OL>
<%= (From n In _Elements.OL.AsParallel()
Select
<Content><%= ConvertToCDATA(n) %></Content>).ToList() %>
</OL>)
End If
If _Elements.STRONG IsNot Nothing Then
.Append(<STRONG>
<%= (From n In _Elements.STRONG.AsParallel()
Select
<Content><%= XmlEscape(n) %></Content>).ToList() %>
</STRONG>)
End If
If _Elements.EM IsNot Nothing Then
.Append(<EM>
<%= (From n In _Elements.EM.AsParallel()
Select
<Content><%= XmlEscape(n) %></Content>).ToList() %>
</EM>)
End If
If _Elements.BLOCKQUOTE IsNot Nothing Then
.Append(<BLOCKQUOTE>
<%= (From n In _Elements.BLOCKQUOTE.AsParallel()
Select
<Content><%= ConvertToCDATA(n) %></Content>).ToList() %>
</BLOCKQUOTE>)
End If
If _Elements.A IsNot Nothing Then
.Append(<LINKS>
<%= (From n In _Elements.A.AsParallel()
Select
<Content>
<HREF><%= XmlEscape(n.Href) %></HREF>
<REL><%= XmlEscape(n.Rel) %></REL>
<TITLE><%= XmlEscape(n.Title) %></TITLE>
<TARGET><%= XmlEscape(n.Target) %></TARGET>
<CONTENT><%= XmlEscape(n.Content) %></CONTENT>
</Content>).ToList() %>
</LINKS>)
End If
If _Elements.IMG IsNot Nothing Then
.Append(<IMAGES>
<%= (From n In _Elements.IMG.AsParallel()
Select
<Content>
<SRC><%= XmlEscape(n.Source) %></SRC>
<ALT><%= XmlEscape(n.Alt) %></ALT>
<TITLE><%= XmlEscape(n.Title) %></TITLE>
</Content>).ToList() %>
</IMAGES>)
End If
End If
.Append(" </Elements>")
.Append(" <Content><![CDATA[" & _L(i).Content.ToString() & "]]></Content>")
.Append(" </Page>")
ReportProgress((_ct / _L.Count) * 100, _Worker)
Next
.Append("</o7th.Web.Design.Web.Spider>")
End With
Dim _xStr As String = _TmpStr.ToString()
_X = XDocument.Parse(_xStr)
_X.Save(ExportPath & "site.xml")
_X = Nothing
ReportProgress(100, _Worker)
Return True
Catch ex As Exception
'Put logging in here
Message = ex.Message & ":::Export.ExportXml"
Return False
End Try
End Function
上面的LinkList变量是一个(Typing.Links)列表:
Partial Public Class Links
Public Property SiteUrl As String
Public Property SiteTitle As String
Public Property Site As String
End Class
其他两个名单是:
Imports Superstar.Html.Linq
Public Class Typing
Partial Public Class SEO
Public Property Link As String
Public Property Title As String
Public Property Description As String
Public Property Keywords As String
Public Property Content As HElement
Public Property ContentElements As Elements
Partial Public Class Elements
Public Property H1 As List(Of String)
Public Property H2 As List(Of String)
Public Property H3 As List(Of String)
Public Property H4 As List(Of String)
Public Property H5 As List(Of String)
Public Property H6 As List(Of String)
Public Property UL As List(Of String)
Public Property OL As List(Of String)
Public Property STRONG As List(Of String)
Public Property BLOCKQUOTE As List(Of String)
Public Property EM As List(Of String)
Public Property A As List(Of Links)
Public Property IMG As List(Of Images)
Partial Public Class Images
Public Property Source As String
Public Property Alt As String
Public Property Title As String
End Class
Partial Public Class Links
Public Property Href As String
Public Property Rel As String
Public Property Title As String
Public Property Target As String
Public Property Content As String
End Class
End Class
End Class
End Class
ReportProgress只是报告并更新Xaml窗口的后台工作人员,以了解此特定情况以更新进度条:
Public Sub ReportProgress(ByVal ct As Integer, _Worker As ComponentModel.BackgroundWorker)
If _Worker IsNot Nothing Then
_Worker.ReportProgress(ct)
Threading.Thread.Sleep(500)
End If
End Sub
,Downloader类是:
Imports System.Reflection
Imports System.Net
Imports Superstar.Html.Linq
Public Class Downloader
Implements IDisposable
''' <summary>
''' Get the returned downloaded string
''' </summary>
''' <value></value>
''' <returns></returns>
''' <remarks></remarks>
Public ReadOnly Property ReturnString As String
Get
Return _StrReturn
End Get
End Property
Private Property _StrReturn As String
''' <summary>
''' Get the returned downloaded byte array
''' </summary>
''' <value></value>
''' <returns></returns>
''' <remarks></remarks>
Public ReadOnly Property ReturnBytes As Byte()
Get
Return _FSReturn
End Get
End Property
Private Property _FSReturn As Byte()
Private Property _UserAgent As String = "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13"
Private Property DataReceived As Boolean = False
''' <summary>
''' Download a string, but do not block the calling thread
''' </summary>
''' <param name="_Path"></param>
''' <remarks></remarks>
Public Sub DownloadString(ByVal _Path As String, Optional ByVal _Worker As ComponentModel.BackgroundWorker = Nothing)
SetAllowUnsafeHeaderParsing20()
Using wc As New Net.WebClient()
With wc
Dim _ct As Long = 0
DataReceived = False
.Headers.Add("user-agent", _UserAgent)
.DownloadStringAsync(New System.Uri(_Path))
AddHandler .DownloadStringCompleted, AddressOf StringDownloaded
Do While Not DataReceived
If _Worker IsNot Nothing Then
_ct += 1
ReportProgress(_ct, _Worker)
End If
Loop
End With
End Using
End Sub
''' <summary>
''' Download a file, but do not block the calling thread
''' </summary>
''' <param name="_Path"></param>
''' <remarks></remarks>
Public Sub DownloadFile(ByVal _Path As String, Optional ByVal _Worker As ComponentModel.BackgroundWorker = Nothing)
SetAllowUnsafeHeaderParsing20()
Using wc As New Net.WebClient()
With wc
Dim _ct As Long = 0
DataReceived = False
.Headers.Add("user-agent", _UserAgent)
.DownloadDataAsync(New System.Uri(_Path))
AddHandler .DownloadDataCompleted, AddressOf FileStreamDownload
Do While Not DataReceived
If _Worker IsNot Nothing Then
_ct += 1
ReportProgress(_ct, _Worker)
End If
Loop
End With
End Using
End Sub
''' <summary>
''' Download a parsable HDocument, for using HtmlToLinq
''' </summary>
''' <param name="_Path"></param>
''' <returns></returns>
''' <remarks></remarks>
Public Function DownloadHDoc(ByVal _Path As String, Optional ByVal _Worker As ComponentModel.BackgroundWorker = Nothing) As HDocument
Try
'StackOverFlowException Occurring Here!
DownloadString(_Path, _Worker)
Return HDocument.Parse(_StrReturn)
Catch soex As StackOverflowException
'put some logging in here, with the path attempted
Return Nothing
Catch ex As Exception
SetAllowUnsafeHeaderParsing20()
Return HDocument.Load(_Path)
End Try
End Function
#Region "Internals"
Private Sub SetAllowUnsafeHeaderParsing20()
Dim a As New System.Net.Configuration.SettingsSection
Dim aNetAssembly As System.Reflection.Assembly = Assembly.GetAssembly(a.GetType)
Dim aSettingsType As Type = aNetAssembly.GetType("System.Net.Configuration.SettingsSectionInternal")
Dim args As Object() = Nothing
Dim anInstance As Object = aSettingsType.InvokeMember("Section", BindingFlags.Static Or BindingFlags.GetProperty Or BindingFlags.NonPublic, Nothing, Nothing, args)
Dim aUseUnsafeHeaderParsing As FieldInfo = aSettingsType.GetField("useUnsafeHeaderParsing", BindingFlags.NonPublic Or BindingFlags.Instance)
aUseUnsafeHeaderParsing.SetValue(anInstance, True)
End Sub
Private Sub FileStreamDownload(ByVal sender As Object, ByVal e As DownloadDataCompletedEventArgs)
If e.Cancelled = False AndAlso e.Error Is Nothing Then
DataReceived = True
_FSReturn = DirectCast(e.Result, Byte())
Else
_FSReturn = Nothing
End If
End Sub
Private Sub StringDownloaded(ByVal sender As Object, ByVal e As DownloadStringCompletedEventArgs)
If e.Cancelled = False AndAlso e.Error Is Nothing Then
DataReceived = True
_StrReturn = DirectCast(e.Result, String)
Else
_StrReturn = String.Empty
End If
End Sub
#End Region
#Region "IDisposable Support"
Private disposedValue As Boolean ' To detect redundant calls
' IDisposable
Protected Overridable Sub Dispose(disposing As Boolean)
If Not Me.disposedValue Then
If disposing Then
End If
_StrReturn = Nothing
_FSReturn = Nothing
End If
Me.disposedValue = True
End Sub
Public Sub Dispose() Implements IDisposable.Dispose
Dispose(True)
GC.SuppressFinalize(Me)
End Sub
#End Region
End Class
正如我上面所说的那样,看起来没有任何递归发生。 (至少没有一个真正对我而言),所以我立刻认为它在HDocument.Parse中,它正在发生。
你能告诉我这是错误的,以及如何纠正这个问题?
我做了一些研究,并且明白默认堆栈大小只有1MB,所以我想知道这是否真的是我应该尝试增加这个特殊情况之一......
我经常看了几次跟踪后发现它总是在它碰到特定页面时发生。这个页面恰好超过了500k。
这是调用堆栈:
[External Code]
> o7th.Web.Design.Spider.Worker.dll!o7th.Web.Design.Spider.Worker.Downloader.DownloadHDoc(String _Path, System.ComponentModel.BackgroundWorker _Worker) Line 95 + 0x1e bytes Basic
o7th.Web.Design.Spider.Worker.dll!o7th.Web.Design.Spider.Worker.Export.PopulateSEOList(System.ComponentModel.BackgroundWorker _Worker) Line 513 + 0x65 bytes Basic
o7th.Web.Design.Spider.Worker.dll!o7th.Web.Design.Spider.Worker.Export.ExportXml(System.ComponentModel.BackgroundWorker _Worker) Line 70 + 0x1e bytes Basic
o7th.Web.Design.Spider.Worker.dll!o7th.Web.Design.Spider.Worker.Export.RunExport(System.ComponentModel.BackgroundWorker _Worker) Line 30 + 0x17 bytes Basic
o7th.Web.Design.WebSpider.exe!o7th.Web.Design.WebSpider.ParseLinks.RunExport(Object sender, System.ComponentModel.DoWorkEventArgs e) Line 106 + 0x2c bytes Basic
[External Code]
当地人向我展示了我上面提到的超过500k的页面
答案 0 :(得分:2)
(我需要更多空间,否则我会将此作为对@Jakub Konecki帖子的评论添加。)
多年来我已经构建了多个蜘蛛,并行性能的唯一重要性能就是实际下载URL。您可能会在大型文档上进行几百毫秒的HTML解析,但增益不值得调试。因此,让您的生活更轻松,并消除并行性。
你也遇到了一个奇怪的异步阻塞问题。在DownloadHDoc
中,您同步调用DownloadString
,但随后在DownloadString
内部启动异步方法,然后阻止位标志,从而打败异步的目的。更糟糕的是,你在do-while
循环中阻塞,每小时旋转一百万英里并且每次都调用ReportProgress
。我希望这实际上是给你SOE的。将Thread.Sleep(100)
放在那里可能对初学者有帮助。
<强> [编辑] 强>
位标志上阻塞的代码是:
.DownloadStringAsync(New System.Uri(_Path))
AddHandler .DownloadStringCompleted, AddressOf StringDownloaded
Do While Not DataReceived
If _Worker IsNot Nothing Then
_ct += 1
ReportProgress(_ct, _Worker)
End If
Loop
第1行启动异步方法,第2行为完成添加处理程序并立即返回。第3行一遍又一遍地检查全局变量,等待函数StringDownloaded
进行设置。这种情况每秒发生数百或数千(或更多)次。虽然不是最优的,但是每次都要调用ReportProgress
方法的原因很糟糕。文档越大,对ReportProgress
的调用就越多。你真的只需要每隔100毫秒更新一次用户界面,我通常每隔250毫秒或500毫秒更新一次。
[编辑2]
如果出现上述问题,您应该可以将其更改为:
.DownloadStringAsync(New System.Uri(_Path))
AddHandler .DownloadStringCompleted, AddressOf StringDownloaded
Do While Not DataReceived
If _Worker IsNot Nothing Then
_ct += 1
ReportProgress(_ct, _Worker)
End If
Thread.Sleep(250) ''//Sleep inside of the loop
Loop
答案 1 :(得分:0)
我首先要删除所有的并行性 - 无论如何它可能过度,创建多个线程的开销大于性能增益。
一旦这样做,只需调试代码并等待异常。您可以检查调用堆栈和所有集合。
当您递归调用相同的方法并且由于某种原因结束条件没有启动时,通常会发生堆栈溢出。您将在调用堆栈中清楚地看到它。