在c#中实现Web scraper

时间:2013-08-24 18:21:14

标签: c# cookies curl web-scraping httprequest

我正在开发一个Web scraper,需要在多次请求之间保留cookie,就像我在PHP中使用curl时那样。但是,当我在C#中使用CookieContainer对象时,它似乎不会从响应中获取全部cookie,也不会把它们发送到下一个请求。


    /// <summary>
    /// Small page loader built on <see cref="HttpWebRequest"/>. Every request
    /// made through <see cref="Load"/> is given the same
    /// <see cref="Container"/> instance as its cookie container.
    /// </summary>
    public class Scraper
    {
        /// <summary>Not read by any code in this class; available to callers.</summary>
        public string Username { get; set; }

        /// <summary>Not read by any code in this class; available to callers.</summary>
        public string Password { get; set; }

        /// <summary>Value sent as the User-Agent of every request.</summary>
        public string UserAgent { get; set; }

        /// <summary>Value sent as the Content-Type of every request.</summary>
        public string ContentType { get; set; }

        /// <summary>
        /// Overwritten on every <see cref="Load"/> call with the
        /// <c>Cookies</c> collection of the response that call received.
        /// </summary>
        public CookieCollection Cookies { get; set; }

        /// <summary>Cookie container attached to every request.</summary>
        public CookieContainer Container { get; set; }

        /// <summary>
        /// Creates a scraper with a Firefox user agent, a form-urlencoded
        /// content type and empty cookie storage.
        /// </summary>
        public Scraper()
        {
            UserAgent = "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0";
            ContentType = "application/x-www-form-urlencoded";
            Cookies = new CookieCollection();
            Container = new CookieContainer();
        }

        /// <summary>
        /// Requests <paramref name="uri"/> and returns the response text with
        /// line breaks, tabs and whitespace between tags removed.
        /// </summary>
        /// <param name="uri">Address to request.</param>
        /// <param name="postData">When non-empty the request is sent as a POST with this body; otherwise a GET is sent.</param>
        /// <param name="creds">Optional credentials for the request.</param>
        /// <param name="timeout">Request timeout in milliseconds; ignored when not positive.</param>
        /// <param name="host">Optional Host header override.</param>
        /// <param name="referer">Optional Referer header.</param>
        /// <param name="requestedwith">Optional X-Requested-With header value.</param>
        /// <returns>The compacted response body.</returns>
        /// <exception cref="WebException">Propagated from the underlying request.</exception>
        public string Load(string uri, string postData = "", NetworkCredential creds = null, int timeout = 60000, string host = "", string referer = "", string requestedwith = "")
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(uri);
            request.CookieContainer = Container;
            request.UserAgent = UserAgent;
            request.AllowWriteStreamBuffering = true;
            request.ProtocolVersion = HttpVersion.Version11;
            request.AllowAutoRedirect = true;
            request.ContentType = ContentType;
            request.PreAuthenticate = true;

            // IsNullOrEmpty instead of .Length so a null argument means "not supplied"
            // rather than a NullReferenceException.
            if (!string.IsNullOrEmpty(requestedwith))
            {
                request.Headers["X-Requested-With"] = requestedwith;
            }

            if (!string.IsNullOrEmpty(host))
            {
                request.Host = host;
            }

            if (!string.IsNullOrEmpty(referer))
            {
                request.Referer = referer;
            }

            if (timeout > 0)
            {
                request.Timeout = timeout;
            }

            if (creds != null)
            {
                request.Credentials = creds;
            }

            if (!string.IsNullOrEmpty(postData))
            {
                request.Method = "POST";

                // UTF-8 yields the same bytes as ASCII for ASCII input but does not
                // replace other characters with '?'.
                byte[] data = Encoding.UTF8.GetBytes(postData);
                request.ContentLength = data.Length;

                // Dispose the request stream so the body is flushed before the
                // response is requested.
                using (Stream requestStream = request.GetRequestStream())
                {
                    requestStream.Write(data, 0, data.Length);
                }
            }
            else
            {
                // The method may only be set before the request is started, so GET
                // is chosen here instead of after the POST branch.
                request.Method = "GET";
            }

            string raw;
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                Cookies = response.Cookies;
                using (StreamReader reader = new StreamReader(response.GetResponseStream()))
                {
                    raw = reader.ReadToEnd();
                }
            }

            return Compact(raw);
        }

        // Removes CR, LF and TAB characters, then any whitespace left between tags.
        private static string Compact(string text)
        {
            string flattened = text
                .Replace("\r\n", "")
                .Replace("\r", "")
                .Replace("\n", "")
                .Replace("\t", "");

            return Regex.Replace(flattened, @">\s+<", "><");
        }
    }

这是我在cookie jar中加载和维护cookie的PHP代码:

    /**
     * Fetches $url with cURL and returns the response text after removing
     * line breaks, tabs and whitespace between tags.
     *
     * The same file, $this->cookieFile, is passed as both CURLOPT_COOKIEFILE
     * and CURLOPT_COOKIEJAR.
     *
     * @param string $url      Address to request.
     * @param array  $postData When non-empty, CURLOPT_POST is enabled and these fields are sent.
     * @param bool   $headers  When truthy, an "X-Requested-With: XMLHttpRequest" header is added.
     * @return string The compacted response text.
     */
    private function load($url = 'http://www.google.com/', $postData = array(), $headers = FALSE)
        // NOTE(review): this string already begins with "User-Agent: " and is later
        // passed to CURLOPT_USERAGENT; the prefix looks duplicated - confirm it is intended.
        $useragent = "User-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.1; " . $this->locale . "; rv: Gecko/20100914 BRI/1 Firefox/3.6.10 ( .NET CLR 3.5.30729)";

        $curl = curl_init();
        curl_setopt($curl, CURLOPT_URL, $url);
        // Have curl_exec() hand back the response instead of printing it.
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, TRUE);
        curl_setopt($curl, CURLOPT_HEADER, FALSE);
        if($headers) curl_setopt($curl, CURLOPT_HTTPHEADER, array('X-Requested-With: XMLHttpRequest'));
        // NOTE(review): both TLS verification options are switched off here.
        curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, FALSE);
        curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, FALSE);
        // NOTE(review): 'UTF-8' is a charset name; confirm it is a value CURLOPT_ENCODING accepts.
        curl_setopt($curl, CURLOPT_ENCODING, 'UTF-8');
        curl_setopt($curl, CURLOPT_USERAGENT, $useragent);
        curl_setopt($curl, CURLOPT_POST, !empty($postData));
        if(!empty($postData)) curl_setopt($curl, CURLOPT_POSTFIELDS, $postData);
        // One file serves as both the cookie source and the cookie destination.
        curl_setopt($curl, CURLOPT_COOKIEFILE, $this->cookieFile);
        curl_setopt($curl, CURLOPT_COOKIEJAR, $this->cookieFile);
        $page = curl_exec ($curl);
        $page = str_replace(array("\r\n", "\r", "\n", "\t"), "", $page); // strip all new lines and tabs
        $page = preg_replace('~>\s+<~', '><', $page);// strip all whitespace between tags
        curl_close ($curl);

        return $page;


using SeasideResearch.LibCurlNet;
using System;
using System.Collections.Generic;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;

// NOTE(review): this listing contains no braces and several statements appear to be
// missing (see the notes below); compare against the original source before use.
namespace Scraping
    /// <summary>
    /// Page loader that configures a LibCurlNet <c>Easy</c> handle and passes
    /// <see cref="CookieFile"/> as both the cookie file and the cookie jar.
    /// </summary>
    public class LibCurlScraper
        // Source of the text returned by Load.
        StringBuilder sb = new StringBuilder();
        // Stream returned by LoadBinary; replaced at the start of every LoadBinary call.
        MemoryStream ms = new MemoryStream();
        /// <summary>Path passed as CURLOPT_COOKIEFILE and CURLOPT_COOKIEJAR.</summary>
        public string CookieFile { get; set; }
        /// <summary>Set by Load when the response code read back is 302.</summary>
        public string RedirectUrl { get; set; }
        /// <summary>Value passed as CURLOPT_USERAGENT.</summary>
        public string UserAgent { get; set; }
        /// <summary>Assigned in the constructor; not read elsewhere in this listing.</summary>
        public string ContentType { get; set; }
        /// <summary>Value passed as CURLOPT_HEADER by Load.</summary>
        public bool DisplayHeaders { get; set; }
        /// <summary>Value passed as CURLOPT_FOLLOWLOCATION by Load.</summary>
        public bool FollowRedirects { get; set; }

        /// <summary>Sets the default user agent and content type and turns header output off.</summary>
        public LibCurlScraper()
            UserAgent = "useragent";
            ContentType = "application/x-www-form-urlencoded";
            DisplayHeaders = false;

        /// <summary>Write callback used by Load.</summary>
        // NOTE(review): the loop below has no body in this listing; presumably each
        // byte was appended to sb - confirm against the original source.
        private int MyWriteFunction(byte[] buf, int size, int nmemb, Object extraData)
            foreach (byte b in buf)

            return buf.Length;

        /// <summary>Write callback used by LoadBinary.</summary>
        // NOTE(review): the loop below has no body in this listing; presumably each
        // byte was written to ms - confirm against the original source.
        private int MyWriteBinaryFunction(byte[] buf, int size, int nmemb, Object extraData)
            foreach (byte b in buf)

            return buf.Length;

        /// <summary>
        /// Configures a request for <paramref name="uri"/> and returns the
        /// MemoryStream field, which is recreated at the start of the call.
        /// </summary>
        /// <param name="uri">Address passed as CURLOPT_URL.</param>
        /// <param name="method">Not read by this method.</param>
        /// <param name="postData">When non-empty, CURLOPT_POST is enabled and this is sent as CURLOPT_POSTFIELDS.</param>
        /// <param name="headers">Optional extra request headers.</param>
        public MemoryStream LoadBinary(string uri, string method = "GET", string postData = "", List<string> headers = null)
            ms = new MemoryStream();
            Easy easy = new Easy();
            Easy.WriteFunction wf = MyWriteBinaryFunction;
            easy.SetOpt(CURLoption.CURLOPT_URL, uri);
            easy.SetOpt(CURLoption.CURLOPT_HEADER, false);
            easy.SetOpt(CURLoption.CURLOPT_FOLLOWLOCATION, true);

            Slist headerSlist = new Slist();

            // NOTE(review): the loop below has no body in this listing, so nothing is
            // added to headerSlist; presumably each header was appended - confirm.
            if (headers != null)
                foreach (var header in headers)


            easy.SetOpt(CURLoption.CURLOPT_HTTPHEADER, headerSlist);

            // NOTE(review): both TLS verification options are switched off here.
            easy.SetOpt(CURLoption.CURLOPT_SSL_VERIFYPEER, false);
            easy.SetOpt(CURLoption.CURLOPT_SSL_VERIFYHOST, false);
            easy.SetOpt(CURLoption.CURLOPT_USERAGENT, UserAgent);
            easy.SetOpt(CURLoption.CURLOPT_TIMEOUT, 10);
            easy.SetOpt(CURLoption.CURLOPT_CONNECTTIMEOUT, 3);

            if (!string.IsNullOrEmpty(postData))
                easy.SetOpt(CURLoption.CURLOPT_POST, true);
                easy.SetOpt(CURLoption.CURLOPT_POSTFIELDS, postData);

            // The same path is used as both the cookie file and the cookie jar.
            easy.SetOpt(CURLoption.CURLOPT_COOKIEFILE, CookieFile);
            easy.SetOpt(CURLoption.CURLOPT_COOKIEJAR, CookieFile);
            easy.SetOpt(CURLoption.CURLOPT_WRITEFUNCTION, wf);
            // NOTE(review): no statement that performs the transfer is visible before
            // the response code is read; code is not used afterwards in this method.
            int code = 0;
            easy.GetInfo(CURLINFO.CURLINFO_RESPONSE_CODE, ref code);

            return ms;

        /// <summary>
        /// Configures a request for <paramref name="uri"/> and returns the
        /// contents of sb with line breaks, tabs and whitespace between tags removed.
        /// </summary>
        /// <param name="uri">Address passed as CURLOPT_URL.</param>
        /// <param name="method">When equal to "POST", CURLOPT_POST is enabled even without post data.</param>
        /// <param name="postData">When non-empty, CURLOPT_POST is enabled and this is sent as CURLOPT_POSTFIELDS.</param>
        /// <param name="headers">Optional extra request headers.</param>
        // NOTE(review): sb is not cleared in this method, unlike ms in LoadBinary - confirm
        // whether text from earlier calls is meant to carry over.
        public string Load(string uri, string method = "GET", string postData = "", List<string> headers = null)
            Easy easy = new Easy();
            Easy.WriteFunction wf = MyWriteFunction;
            easy.SetOpt(CURLoption.CURLOPT_URL, uri);
            easy.SetOpt(CURLoption.CURLOPT_HEADER, DisplayHeaders);
            easy.SetOpt(CURLoption.CURLOPT_FOLLOWLOCATION, FollowRedirects);

            Slist headerSlist = new Slist();

            // NOTE(review): the loop below has no body in this listing, so nothing is
            // added to headerSlist; presumably each header was appended - confirm.
            if (headers != null)
                foreach (var header in headers)


            easy.SetOpt(CURLoption.CURLOPT_HTTPHEADER, headerSlist);

            // NOTE(review): both TLS verification options are switched off here.
            easy.SetOpt(CURLoption.CURLOPT_SSL_VERIFYPEER, false);
            easy.SetOpt(CURLoption.CURLOPT_SSL_VERIFYHOST, false);
            easy.SetOpt(CURLoption.CURLOPT_USERAGENT, UserAgent);
            easy.SetOpt(CURLoption.CURLOPT_TIMEOUT, 10);
            easy.SetOpt(CURLoption.CURLOPT_CONNECTTIMEOUT, 3);

            if (!string.IsNullOrEmpty(postData))
                easy.SetOpt(CURLoption.CURLOPT_POST, true);
                easy.SetOpt(CURLoption.CURLOPT_POSTFIELDS, postData);

            if (method.Equals("POST"))
                easy.SetOpt(CURLoption.CURLOPT_POST, true);

            // The same path is used as both the cookie file and the cookie jar.
            easy.SetOpt(CURLoption.CURLOPT_COOKIEFILE, CookieFile);
            easy.SetOpt(CURLoption.CURLOPT_COOKIEJAR, CookieFile);
            easy.SetOpt(CURLoption.CURLOPT_WRITEFUNCTION, wf);
            // NOTE(review): no statement that performs the transfer is visible before
            // the response code is read.
            int code = 0;
            easy.GetInfo(CURLINFO.CURLINFO_RESPONSE_CODE, ref code);

            // NOTE(review): FindString is not defined in this listing; presumably it
            // returns the text captured by the pattern - confirm.
            if (code == 302)
                RedirectUrl = FindString(sb.ToString(), "Location:(.*?)\n");

            string page = sb.ToString();
            page = page.Replace("\r\n", ""); // strip all new lines and tabs
            page = page.Replace("\r", ""); // strip all new lines and tabs
            page = page.Replace("\n", ""); // strip all new lines and tabs
            page = page.Replace("\t", ""); // strip all new lines and tabs

            // Remove whitespace that remains between adjacent tags.
            page = Regex.Replace(page, @">\s+<", "><");

            return page;

        /// <summary>Debug callback; opens the verbose log file in append mode.</summary>
        // NOTE(review): the writer is neither written to nor closed in this listing;
        // the statements that used it appear to be missing - confirm.
        public static void OnDebug(CURLINFOTYPE infoType, String msg, Object extraData)
            TextWriter tw = new StreamWriter(@"C:\cookies\verbose.txt", true);



理想情况下,我们应该能够使用内置的托管类来处理所有HTTP Cookie;不过在找到更好的解决方案之前,这个方法是可行的。



这是原始代码: http://snipplr.com/view/4427/


/// <summary>
/// Parses the text of a Set-Cookie header, which may hold several cookies
/// joined by commas, into a <see cref="CookieCollection"/>.
/// </summary>
/// <param name="strHeader">Raw header text; null or empty yields an empty collection.</param>
/// <param name="strHost">Domain assigned to cookies that do not specify one.</param>
/// <returns>The parsed cookies; never null.</returns>
public static CookieCollection GetAllCookiesFromHeader(string strHeader, string strHost)
{
    if (string.IsNullOrEmpty(strHeader))
    {
        return new CookieCollection();
    }

    ArrayList al = ConvertCookieHeaderToArrayList(strHeader);
    return ConvertCookieArraysToCookieCollection(al, strHost);
}

// Splits the header into one string per cookie. Cookies are separated by commas,
// but an expires date such as "Wed, 21 Oct 2015" contains a comma of its own, so
// a fragment that ends inside an expires value is re-joined with the next one.
private static ArrayList ConvertCookieHeaderToArrayList(string strCookHeader)
{
    strCookHeader = strCookHeader.Replace("\r", "").Replace("\n", "");
    string[] fragments = strCookHeader.Split(',');
    ArrayList al = new ArrayList();

    int i = 0;
    while (i < fragments.Length)
    {
        string fragment = fragments[i];
        int expiresAt = fragment.IndexOf("expires=", StringComparison.OrdinalIgnoreCase);

        // The comma belongs to the date only when nothing follows the expires
        // value in this fragment (no ';' after it).
        bool splitInsideDate = expiresAt > 0 && fragment.IndexOf(';', expiresAt) < 0;

        if (splitInsideDate && i + 1 < fragments.Length)
        {
            al.Add(fragment + "," + fragments[i + 1]);
            i += 2;
        }
        else
        {
            // Ordinary cookie, or a trailing fragment with nothing left to join.
            al.Add(fragment);
            i += 1;
        }
    }

    return al;
}

// Turns each "name=value; attr=value; ..." string into a Cookie. Only the path and
// domain attributes are read; missing ones default to "/" and strHost.
private static CookieCollection ConvertCookieArraysToCookieCollection(ArrayList al, string strHost)
{
    CookieCollection cc = new CookieCollection();

    foreach (object entry in al)
    {
        string[] parts = entry.ToString().Split(';');

        // The first part is the cookie itself; trim it because fragments that
        // followed a comma start with a space, which Cookie.Name rejects.
        string nameAndValue = parts[0].Trim();
        int firstEqual = nameAndValue.IndexOf('=');
        if (firstEqual <= 0)
        {
            // No name to store the value under; skip this entry.
            continue;
        }

        Cookie cookie = new Cookie();
        cookie.Name = nameAndValue.Substring(0, firstEqual).Trim();
        cookie.Value = nameAndValue.Substring(firstEqual + 1);

        for (int j = 1; j < parts.Length; j++)
        {
            string attribute = parts[j];
            int eq = attribute.IndexOf('=');
            if (eq < 0)
            {
                // Flag attributes such as Secure or HttpOnly carry no value.
                continue;
            }

            string attributeName = attribute.Substring(0, eq).Trim();
            string attributeValue = attribute.Substring(eq + 1).Trim();
            if (attributeValue.Length == 0)
            {
                continue;
            }

            if (attributeName.Equals("path", StringComparison.OrdinalIgnoreCase))
            {
                cookie.Path = attributeValue;
            }
            else if (attributeName.Equals("domain", StringComparison.OrdinalIgnoreCase))
            {
                cookie.Domain = attributeValue;
            }
        }

        if (string.IsNullOrEmpty(cookie.Path))
        {
            cookie.Path = "/";
        }

        if (string.IsNullOrEmpty(cookie.Domain))
        {
            cookie.Domain = strHost;
        }

        cc.Add(cookie);
    }

    return cc;
}