我的目标是获取大约5k网址的状态代码。
约束:
1 /如果URL A重定向到URL B,则获取URL B的状态代码
2 /如果超时,则重试3次。
这是我实施的:
// Check every link in parallel (at most 64 workers). Each link gets up to
// three attempts; the last status code obtained is recorded for the link.
// Dedicated lock object: the result collection is written from many worker
// threads, and Dictionary-style Add is not safe for concurrent writers.
object statusCodeGate = new object();
Parallel.ForEach(
    linkList,
    new ParallelOptions() { MaxDegreeOfParallelism = 64 },
    link =>
    {
        HtmlAnalyzor htmlAnalyzor = new HtmlAnalyzor(link.URL);
        int statusCode = -1;
        // retryTime counts down 2, 1, 0 so the callee can tell the last attempt.
        for (int retryTime = 2; retryTime >= 0; retryTime--)
        {
            statusCode = htmlAnalyzor.GetDestinationURLStatusCode(link.URL, link.IdQualityPage, retryTime);
            // -1 and 0 are the failure codes returned by GetDestinationURLStatusCode;
            // anything else is a real HTTP status, so stop retrying.
            if (statusCode != -1 && statusCode != 0) { break; }
        }
        lock (statusCodeGate)
        {
            linkStatusCodeDic.Add(link, statusCode);
        }
    });
/// <summary>
/// Issues a HEAD request to <paramref name="originalURL"/> and returns the HTTP
/// status code of the final response.
/// </summary>
/// <remarks>
/// HttpWebRequest follows redirects automatically by default, so the response
/// obtained here already belongs to the redirect target. No manual second
/// request is made: the previous recursive call discarded its result and
/// overwrote the shared <c>_Response</c> field, which left the first response
/// unclosed and its connection leaked.
/// </remarks>
/// <param name="originalURL">URL to probe.</param>
/// <param name="qPageId">Identifier of the page the link belongs to (not used by the request itself).</param>
/// <param name="retryTime">Remaining retries; the failure message is printed only when this is 0.</param>
/// <returns>
/// The HTTP status code; 0 when a <see cref="WebException"/> carries no HTTP
/// response (for example a timeout); -1 for any other exception.
/// </returns>
public int GetDestinationURLStatusCode(string originalURL,int qPageId, int retryTime)
{
    try
    {
        Console.WriteLine("URL:{0}",originalURL);
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(originalURL);
        request.Method = "HEAD";
        request.Timeout = 10000;
        // GetResponse throws WebException on timeouts and on 4xx/5xx answers.
        // The using block guarantees the response (and its connection) is released.
        using (_Response = (HttpWebResponse)request.GetResponse())
        {
            int statusCode = (int)_Response.StatusCode;
            Console.WriteLine("Normal:{0}", statusCode);
            return statusCode;
        }
    }
    catch (WebException webEx)
    {
        int statusCode = 0;
        // The error response also holds a connection and must be disposed.
        using (WebResponse errorResponse = webEx.Response)
        {
            HttpWebResponse httpError = errorResponse as HttpWebResponse;
            if (webEx.Status == WebExceptionStatus.ProtocolError && httpError != null)
            {
                // The server did answer (e.g. 404 or 500): report its real status
                // instead of collapsing every protocol error to 0.
                statusCode = (int)httpError.StatusCode;
                Console.WriteLine("WebEx:{0}", statusCode);
            }
        }
        return statusCode;
    }
    catch (Exception ex)
    {
        if (retryTime == 0)
        {
            // _URL and pageId are members declared outside this method.
            Console.WriteLine("Failed to get status code for URL['{1}'] on the Page[Code:{2}].{0}ErrorMessage:{3}", Environment.NewLine, _URL, pageId, ex.Message);
        }
        return -1;
    }
}
我的代码结果:有一半时间,它会抛出WebException并给我一个状态代码= 0。
我试图改变这种情况:
1 /我已将MaxDegreeOfParallelism更改为40和20,它不起作用
2 /我已将request.TimeOut更改为20s,30s甚至90s,它不起作用。
答案 0 :(得分:0)
我已经改变了我的代码,现在它正在运行。 我改变的要点是:
delete:new ParallelOptions(){MaxDegreeOfParallelism = 64}
首先并行处理所有链接,然后用传统的 for 循环依次(串行)重试并行阶段中失败的链接。这提高了成功的百分比。
为httpwebrequest修改了一些参数:
request.UserAgent = "html-analyzor";
request.KeepAlive = false;
request.Timeout = 15000;
以下是代码:
// Links whose parallel check did not end with HTTP 200; they are collected
// here so they can be retried afterwards.
List<QualityPageLink> linkListToRetrySync = new List<QualityPageLink>();
// Guards the two shared collections below: they are written from many worker
// threads and List/Dictionary-style Add is not safe for concurrent writers.
object resultGate = new object();
// Raise the cap on simultaneous connections per endpoint so the parallel
// requests are not queued behind the small default limit.
ServicePointManager.DefaultConnectionLimit = 1000;
// No ParallelOptions: the degree of parallelism is left to the runtime.
Parallel.ForEach(
    linkList,
    link =>
    {
        HtmlAnalyzor htmlAnalyzor = new HtmlAnalyzor(link.URL);
        int statusCode = -1;
        // Up to three attempts; retryTime counts down 2, 1, 0.
        for (int retryTime = 2; retryTime >= 0; retryTime--)
        {
            statusCode = htmlAnalyzor.GetDestinationURLStatusCode(link.URL, link.IdQualityPage, retryTime);
            // Any positive value is a real HTTP status: stop retrying.
            if (statusCode > 0) { break; }
        }
        // Recording happens once per link, after the retry loop has finished.
        lock (resultGate)
        {
            if (statusCode != 200) { linkListToRetrySync.Add(link); }
            linkIdStatusCodeDic.Add(link, statusCode);
        }
    });
// Second pass: re-check, one link at a time, every link queued for retry and
// overwrite its recorded status code with the new result.
if (linkListToRetrySync != null)
{
    foreach (var failedLink in linkListToRetrySync)
    {
        int latestCode = -1;
        HtmlAnalyzor analyzer = new HtmlAnalyzor(failedLink.URL);
        // Up to three attempts, counting down 2, 1, 0.
        int attemptsLeft = 2;
        while (attemptsLeft >= 0)
        {
            latestCode = analyzer.GetDestinationURLStatusCode(failedLink.URL, failedLink.IdQualityPage, attemptsLeft);
            if (latestCode > 0)
            {
                // A positive value is a real HTTP status; no further attempt needed.
                break;
            }
            attemptsLeft--;
        }
        linkIdStatusCodeDic[failedLink] = latestCode;
    }
}
/// <summary>
/// Requests <paramref name="originalURL"/> and returns the HTTP status code of
/// the final response.
/// </summary>
/// <remarks>
/// HttpWebRequest follows redirects automatically by default, so the status
/// code read here already belongs to the redirect target; no manual follow-up
/// request is needed.
/// </remarks>
/// <param name="originalURL">URL to probe.</param>
/// <param name="qPageId">Identifier of the page the link belongs to (not used by the request itself).</param>
/// <param name="retryTime">Remaining retries; an unexpected failure is logged only when this is 0.</param>
/// <returns>
/// The HTTP status code; 0 when a <see cref="WebException"/> carries no HTTP
/// response (for example a timeout); -1 for any other exception.
/// </returns>
public int GetDestinationURLStatusCode(string originalURL, int qPageId, int retryTime)
{
    int statusCode = -1;
    try
    {
        Console.WriteLine("URL:{0}", Helper.ToString(originalURL));
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(originalURL);
        request.UserAgent = "html-analyzor";
        request.KeepAlive = false;
        request.Timeout = 15000;
        // The using block releases the response and its connection even if
        // reading the status throws.
        using (this._Response = (HttpWebResponse)request.GetResponse())
        {
            statusCode = (int)_Response.StatusCode;
        }
        Console.WriteLine("Normal:{0}", statusCode);
        return statusCode;
    }
    catch (WebException webEx)
    {
        statusCode = 0;
        // The error response holds a connection of its own; dispose it so
        // failed requests do not exhaust the connection pool.
        using (WebResponse errorResponse = webEx.Response)
        {
            HttpWebResponse httpError = errorResponse as HttpWebResponse;
            if (webEx.Status == WebExceptionStatus.ProtocolError && httpError != null)
            {
                // The server answered with an error status (e.g. 404, 500).
                statusCode = (int)httpError.StatusCode;
                Console.WriteLine("WebEx:{0}", statusCode);
            }
        }
        if (this._Response != null)
        {
            this._Response.Close();
            this._Response = null;
        }
        return statusCode;
    }
    catch (Exception ex)
    {
        if (this._Response != null)
        {
            this._Response.Close();
            this._Response = null;
        }
        if (retryTime == 0)
        {
            // Last attempt: report the failure instead of swallowing it silently.
            Console.WriteLine("Failed. URL:{0} ErrorMessage:{1}", originalURL, ex.Message);
        }
        return -1;
    }
}