使用并行

时间:2015-09-25 13:36:09

标签: c# httpwebrequest parallel.foreach

我的目标是获取大约5k网址的状态代码。

约束:
1. 如果 URL A 重定向到 URL B,则获取 URL B 的状态代码。
2. 如果请求超时,则最多重试 3 次。

这是我实施的:

  // Guards linkStatusCodeDic: the lambda below runs on several threads at once,
  // and an unsynchronized Add on a shared collection can corrupt it or throw.
  // NOTE(review): assumes linkStatusCodeDic is a plain Dictionary<,> — confirm.
  object linkStatusCodeGate = new object();

  // Probe every link, at most 64 at a time, and record one status code per link.
  Parallel.ForEach(
      linkList,
      new ParallelOptions { MaxDegreeOfParallelism = 64 },
      link =>
      {
          HtmlAnalyzor htmlAnalyzor = new HtmlAnalyzor(link.URL);
          int statusCode = -1;

          // Up to three attempts (retryTime counts down 2, 1, 0).
          // 0 and -1 are the failure values returned by GetDestinationURLStatusCode.
          for (int retryTime = 2; retryTime >= 0; retryTime--)
          {
              statusCode = htmlAnalyzor.GetDestinationURLStatusCode(link.URL, link.IdQualityPage, retryTime);
              if (statusCode != -1 && statusCode != 0)
              {
                  break;
              }
          }

          lock (linkStatusCodeGate)
          {
              linkStatusCodeDic.Add(link, statusCode);
          }
      });



/// <summary>
/// Sends a HEAD request to <paramref name="originalURL"/> and returns the HTTP
/// status code of the response.
/// </summary>
/// <param name="originalURL">Absolute URL to probe.</param>
/// <param name="qPageId">Identifier of the page the link belongs to; not read by this method.</param>
/// <param name="retryTime">Remaining retries; the failure message is only printed when it is 0.</param>
/// <returns>
/// The HTTP status code when the server answered (including 4xx/5xx answers),
/// 0 when a <see cref="WebException"/> occurred without an HTTP response
/// (for example a timeout), or -1 for any other exception.
/// </returns>
public int GetDestinationURLStatusCode(string originalURL, int qPageId, int retryTime)
{
    try
    {
        Console.WriteLine("URL:{0}", originalURL);
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(originalURL);
        request.Method = "HEAD";
        request.Timeout = 10000; // milliseconds

        // HttpWebRequest follows redirects by default (AllowAutoRedirect), so the
        // response already belongs to the destination URL. Re-requesting
        // ResponseUri recursively is unnecessary and could recurse forever when
        // the normalized URI string differs from the input string.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            int statusCode = (int)response.StatusCode;
            Console.WriteLine("Normal:{0}", statusCode);
            return statusCode;
        }
    }
    catch (WebException webEx)
    {
        int statusCode = 0;

        // The response attached to the exception holds a connection; dispose it
        // so failed requests do not exhaust the connection pool.
        using (WebResponse errorResponse = webEx.Response)
        {
            HttpWebResponse httpError = errorResponse as HttpWebResponse;
            if (webEx.Status == WebExceptionStatus.ProtocolError && httpError != null)
            {
                // The server did answer, just with an error status (404, 500, ...).
                statusCode = (int)httpError.StatusCode;
                Console.WriteLine("WebEx:{0}", statusCode);
            }
        }

        return statusCode;
    }
    catch (Exception ex)
    {
        if (retryTime == 0)
        {
            // NOTE(review): _URL and pageId are not declared in this method —
            // presumably members of the enclosing class; confirm.
            Console.WriteLine("Failed to get status code for URL['{1}'] on the Page[Code:{2}].{0}ErrorMessage:{3}", Environment.NewLine, _URL, pageId, ex.Message);
        }

        return -1;
    }
}

我的代码结果:有一半时间,它会抛出WebException并给我一个状态代码= 0。
我试图改变这种情况:
1. 我已将 MaxDegreeOfParallelism 改为 40 和 20,但没有效果。
2. 我已将 request.Timeout 改为 20 秒、30 秒甚至 90 秒,也没有效果。

1 个答案:

答案 0 :(得分:0)

我已经改变了我的代码,现在它正在运行。 我改变的要点是:

  1. 删除了:new ParallelOptions() { MaxDegreeOfParallelism = 64 }

  2. 首先使用并行处理,然后使用传统的 for 循环,按顺序重试并行处理中失败的链接。这提高了成功率。

  3. 为httpwebrequest修改了一些参数:

    // Identify the client to the server.
    request.UserAgent = "html-analyzor";
    // Do not keep the connection open after the response.
    request.KeepAlive = false;
    // Timeout is expressed in milliseconds (15 s).
    request.Timeout = 15000;

  4. 以下是代码:

    // Links whose first (parallel) pass did not end with 200; retried sequentially below.
    List<QualityPageLink> linkListToRetrySync = new List<QualityPageLink>();

    // Guards linkListToRetrySync and linkIdStatusCodeDic while the parallel
    // workers write to them; unsynchronized concurrent writes can corrupt them.
    // NOTE(review): assumes linkIdStatusCodeDic is a plain Dictionary<,> — confirm.
    object resultGate = new object();

    ServicePointManager.DefaultConnectionLimit = 1000;

    // First pass: probe all links in parallel, with no explicit MaxDegreeOfParallelism.
    Parallel.ForEach(
        linkList,
        link =>
        {
            HtmlAnalyzor htmlAnalyzor = new HtmlAnalyzor(link.URL);
            int statusCode = -1;

            // Up to three attempts (retryTime counts down 2, 1, 0);
            // stop at the first positive status code.
            for (int retryTime = 2; retryTime >= 0; retryTime--)
            {
                statusCode = htmlAnalyzor.GetDestinationURLStatusCode(link.URL, link.IdQualityPage, retryTime);
                if (statusCode > 0)
                {
                    break;
                }
            }

            // Record the result once per link, after all attempts are done.
            lock (resultGate)
            {
                if (statusCode != 200)
                {
                    linkListToRetrySync.Add(link);
                }

                linkIdStatusCodeDic.Add(link, statusCode);
            }
        });

    // Second pass: retry the non-200 links one at a time and overwrite
    // their first-pass result.
    foreach (QualityPageLink link in linkListToRetrySync)
    {
        HtmlAnalyzor htmlAnalyzor = new HtmlAnalyzor(link.URL);
        int statusCode = -1;

        for (int retryTime = 2; retryTime >= 0; retryTime--)
        {
            statusCode = htmlAnalyzor.GetDestinationURLStatusCode(link.URL, link.IdQualityPage, retryTime);
            if (statusCode > 0)
            {
                break;
            }
        }

        linkIdStatusCodeDic[link] = statusCode;
    }
    
     /// <summary>
     /// Requests <paramref name="originalURL"/> and returns the HTTP status code
     /// of the response.
     /// </summary>
     /// <param name="originalURL">Absolute URL to probe.</param>
     /// <param name="qPageId">Identifier of the page the link belongs to; not read by this method.</param>
     /// <param name="retryTime">Remaining retries; not read by this method.</param>
     /// <returns>
     /// The HTTP status code when the server answered (including 4xx/5xx answers),
     /// 0 when a <see cref="WebException"/> occurred without an HTTP response
     /// (for example a timeout), or -1 for any other exception.
     /// </returns>
     public int GetDestinationURLStatusCode(string originalURL, int qPageId, int retryTime)
     {
         try
         {
             Console.WriteLine("URL:{0}", Helper.ToString(originalURL));

             HttpWebRequest request = (HttpWebRequest)WebRequest.Create(originalURL);
             request.UserAgent = "html-analyzor";
             request.KeepAlive = false;
             request.Timeout = 15000; // milliseconds

             // A local response scoped by using is released on every path, so the
             // catch blocks have nothing left to close.
             using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
             {
                 int statusCode = (int)response.StatusCode;
                 Console.WriteLine("Normal:{0}", statusCode);
                 return statusCode;
             }
         }
         catch (WebException webEx)
         {
             // The response attached to the exception holds a connection; dispose
             // it so error answers do not exhaust the connection pool.
             using (WebResponse errorResponse = webEx.Response)
             {
                 HttpWebResponse httpError = errorResponse as HttpWebResponse;
                 if (webEx.Status != WebExceptionStatus.ProtocolError || httpError == null)
                 {
                     // No HTTP answer at all (timeout, DNS failure, ...).
                     return 0;
                 }

                 // The server did answer, just with an error status (404, 500, ...).
                 int statusCode = (int)httpError.StatusCode;
                 Console.WriteLine("WebEx:{0}", statusCode);
                 return statusCode;
             }
         }
         catch (Exception)
         {
             // Any other failure (for example a malformed URL) is reported as -1
             // so the caller's retry loop can decide what to do.
             return -1;
         }
     }