我正在开发一个Web scraper,但我需要在请求之间保留cookie,就像我在PHP中使用curl一样。但是,似乎如果我尝试在C#中使用CookieContainer
对象,它不会从响应中获取所有cookie并将它们发送到下一个请求。
这是我的C#类:
/// <summary>
/// Simple HTTP scraper that keeps cookies across requests by handing every
/// HttpWebRequest the same shared CookieContainer. The container is the
/// authoritative cookie store; the Cookies property only mirrors cookies
/// observed on responses, for callers that want to inspect them.
/// </summary>
public class Scraper
{
    public string Username { get; set; }
    public string Password { get; set; }
    public string UserAgent { get; set; }
    public string ContentType { get; set; }
    // Cookies seen on responses so far (informational); Container is what
    // actually gets sent with each request.
    public CookieCollection Cookies { get; set; }
    public CookieContainer Container { get; set; }

    public Scraper()
    {
        UserAgent = "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0";
        ContentType = "application/x-www-form-urlencoded";
        Cookies = new CookieCollection();
        Container = new CookieContainer();
    }

    /// <summary>
    /// Loads a page via GET, or via POST when <paramref name="postData"/> is
    /// non-empty, returning the body with newlines/tabs stripped and
    /// inter-tag whitespace collapsed.
    /// </summary>
    /// <param name="uri">Absolute URL to request.</param>
    /// <param name="postData">URL-encoded body; empty string means GET.</param>
    /// <param name="creds">Optional credentials for the request.</param>
    /// <param name="timeout">Request timeout in milliseconds (&lt;= 0 keeps the default).</param>
    /// <param name="host">Optional Host header override.</param>
    /// <param name="referer">Optional Referer header.</param>
    /// <param name="requestedwith">Optional X-Requested-With header value.</param>
    /// <returns>The (whitespace-stripped) response body.</returns>
    public string Load(string uri, string postData = "", NetworkCredential creds = null, int timeout = 60000, string host = "", string referer = "", string requestedwith = "")
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(uri);
        // FIX: only assign the shared container — do NOT re-add the Cookies
        // collection on top of it. The container already persists every
        // cookie (including ones set on redirects); re-adding a stale
        // snapshot could clobber or duplicate entries, which is why cookies
        // appeared to get lost between requests.
        request.CookieContainer = Container;
        request.UserAgent = UserAgent;
        request.AllowWriteStreamBuffering = true;
        request.ProtocolVersion = HttpVersion.Version11;
        request.AllowAutoRedirect = true;
        request.ContentType = ContentType;
        request.PreAuthenticate = true;
        if (requestedwith.Length > 0)
            request.Headers["X-Requested-With"] = requestedwith;
        if (host.Length > 0)
            request.Host = host;
        if (referer.Length > 0)
            request.Referer = referer;
        if (timeout > 0)
            request.Timeout = timeout;
        if (creds != null)
            request.Credentials = creds;

        if (postData.Length > 0)
        {
            request.Method = "POST";
            // FIX: UTF-8 (an ASCII superset) instead of ASCII, so non-ASCII
            // characters in the post body are no longer silently mangled.
            byte[] data = Encoding.UTF8.GetBytes(postData);
            request.ContentLength = data.Length;
            using (Stream newStream = request.GetRequestStream()) // open connection
            {
                newStream.Write(data, 0, data.Length); // send the data
            }
        }
        else
        {
            request.Method = "GET";
        }

        string str;
        // FIX: dispose the response (and its stream) — the original leaked
        // the connection, which can exhaust the per-host connection pool.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            // FIX: accumulate instead of overwrite. response.Cookies holds
            // only the final response's cookies, so assignment discarded
            // everything gathered earlier.
            Cookies.Add(response.Cookies);
            using (StreamReader sr = new StreamReader(response.GetResponseStream()))
            {
                str = sr.ReadToEnd();
            }
        }

        // Strip all new lines and tabs, then collapse whitespace between tags.
        str = str.Replace("\r\n", "").Replace("\r", "").Replace("\n", "").Replace("\t", "");
        str = Regex.Replace(str, @">\s+<", "><");
        return str;
    }
}
这是我在cookie jar中加载和维护cookie的PHP代码:
/**
 * Fetch a URL with cURL, persisting cookies in $this->cookieFile so the
 * session survives across calls (the same file serves as cookie source
 * and cookie jar).
 *
 * @param string $url      Target URL.
 * @param array  $postData POST fields; an empty array means a GET request.
 * @param bool   $headers  When TRUE, send the X-Requested-With AJAX header.
 * @return string Page body with newlines/tabs stripped and inter-tag
 *                whitespace collapsed.
 */
private function load($url = 'http://www.google.com/', $postData = array(), $headers = FALSE)
{
    $useragent = "User-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.1; " . $this->locale . "; rv:1.9.2.10) Gecko/20100914 BRI/1 Firefox/3.6.10 ( .NET CLR 3.5.30729)";

    // Collect every option up front, then apply them in one call.
    $options = array(
        CURLOPT_URL            => $url,
        CURLOPT_RETURNTRANSFER => TRUE,
        CURLOPT_HEADER         => FALSE,
        CURLOPT_SSL_VERIFYPEER => FALSE,
        CURLOPT_SSL_VERIFYHOST => FALSE,
        CURLOPT_ENCODING       => 'UTF-8',
        CURLOPT_USERAGENT      => $useragent,
        CURLOPT_POST           => !empty($postData),
        CURLOPT_COOKIEFILE     => $this->cookieFile, // read cookies from here...
        CURLOPT_COOKIEJAR      => $this->cookieFile, // ...and write them back on close
    );
    if ($headers) {
        $options[CURLOPT_HTTPHEADER] = array('X-Requested-With: XMLHttpRequest');
    }
    if (!empty($postData)) {
        $options[CURLOPT_POSTFIELDS] = $postData;
    }

    $ch = curl_init();
    curl_setopt_array($ch, $options);
    $page = curl_exec($ch);
    curl_close($ch);

    // Strip all new lines and tabs, then all whitespace between tags.
    $page = str_replace(array("\r\n", "\r", "\n", "\t"), "", $page);
    return preg_replace('~>\s+<~', '><', $page);
}
如何在请求之间成功维护cookie?
答案 0（得分：2）
我找到了一个名为LibCurl.NET的libcurl的.NET包装器,并且能够以与C#中的cURL相同的方式处理cookie!以下是我感兴趣的人的代码:
using SeasideResearch.LibCurlNet;
using System;
using System.Collections.Generic;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
namespace Scraping
{
    /// <summary>
    /// Scraper built on LibCurl.NET so cookie persistence is delegated to
    /// libcurl's cookie engine: CookieFile is used as both CURLOPT_COOKIEFILE
    /// and CURLOPT_COOKIEJAR, mirroring the PHP/cURL behaviour.
    /// CookieFile must be set to a writable path before calling Load/LoadBinary.
    /// </summary>
    public class LibCurlScraper
    {
        // Accumulators the libcurl write callbacks append into.
        StringBuilder sb = new StringBuilder();
        MemoryStream ms = new MemoryStream();

        public string CookieFile { get; set; }   // cookie jar path; directory/file must be writable
        public string RedirectUrl { get; set; }  // Location header captured when a 302 is seen
        public string UserAgent { get; set; }
        public string ContentType { get; set; }
        public bool DisplayHeaders { get; set; } // include response headers in Load() output
        public bool FollowRedirects { get; set; }

        public LibCurlScraper()
        {
            UserAgent = "useragent";
            ContentType = "application/x-www-form-urlencoded";
            Curl.GlobalInit((int)CURLinitFlag.CURL_GLOBAL_ALL);
            DisplayHeaders = false;
        }

        // libcurl write callback: append received bytes to sb as chars.
        // NOTE(review): the byte->char cast assumes a single-byte encoding;
        // multi-byte UTF-8 responses will be mangled here — use LoadBinary
        // and decode explicitly if that matters.
        private int MyWriteFunction(byte[] buf, int size, int nmemb, Object extraData)
        {
            foreach (byte b in buf)
            {
                sb.Append((char)b);
            }
            return buf.Length; // tell libcurl the whole buffer was consumed
        }

        // libcurl write callback for binary payloads: copy bytes into ms.
        private int MyWriteBinaryFunction(byte[] buf, int size, int nmemb, Object extraData)
        {
            foreach (byte b in buf)
            {
                ms.WriteByte(b);
            }
            return buf.Length;
        }

        /// <summary>
        /// Performs a request and returns the raw response bytes. Redirects
        /// are always followed; cookies are read from and saved to CookieFile.
        /// </summary>
        /// <param name="uri">Target URL.</param>
        /// <param name="method">Unused here; POST is implied by non-empty postData.</param>
        /// <param name="postData">POST body; empty string means GET.</param>
        /// <param name="headers">Optional extra request headers ("Name: value").</param>
        /// <returns>MemoryStream holding the response body.</returns>
        public MemoryStream LoadBinary(string uri, string method = "GET", string postData = "", List<string> headers = null)
        {
            ms = new MemoryStream(); // fresh buffer per request
            Easy easy = new Easy();
            Easy.WriteFunction wf = MyWriteBinaryFunction;
            easy.SetOpt(CURLoption.CURLOPT_URL, uri);
            easy.SetOpt(CURLoption.CURLOPT_HEADER, false);
            easy.SetOpt(CURLoption.CURLOPT_FOLLOWLOCATION, true);
            Slist headerSlist = new Slist();
            if (headers != null)
            {
                foreach (var header in headers)
                {
                    headerSlist.Append(header);
                }
            }
            easy.SetOpt(CURLoption.CURLOPT_HTTPHEADER, headerSlist);
            easy.SetOpt(CURLoption.CURLOPT_SSL_VERIFYPEER, false);
            easy.SetOpt(CURLoption.CURLOPT_SSL_VERIFYHOST, false);
            easy.SetOpt(CURLoption.CURLOPT_USERAGENT, UserAgent);
            easy.SetOpt(CURLoption.CURLOPT_TIMEOUT, 10);
            easy.SetOpt(CURLoption.CURLOPT_CONNECTTIMEOUT, 3);
            if (!string.IsNullOrEmpty(postData))
            {
                easy.SetOpt(CURLoption.CURLOPT_POST, true);
                easy.SetOpt(CURLoption.CURLOPT_POSTFIELDS, postData);
            }
            // Same file for reading and writing cookies = persistent session.
            easy.SetOpt(CURLoption.CURLOPT_COOKIEFILE, CookieFile);
            easy.SetOpt(CURLoption.CURLOPT_COOKIEJAR, CookieFile);
            easy.SetOpt(CURLoption.CURLOPT_WRITEFUNCTION, wf);
            easy.Perform();
            int code = 0;
            easy.GetInfo(CURLINFO.CURLINFO_RESPONSE_CODE, ref code);
            easy.Cleanup();
            return ms;
        }

        /// <summary>
        /// Performs a request and returns the response body as a string with
        /// newlines/tabs stripped and inter-tag whitespace collapsed. On a
        /// 302 (when redirects are not followed) RedirectUrl is populated
        /// from the Location header.
        /// </summary>
        /// <param name="uri">Target URL.</param>
        /// <param name="method">"POST" forces a POST even with empty postData.</param>
        /// <param name="postData">POST body; empty string means GET (unless method is "POST").</param>
        /// <param name="headers">Optional extra request headers ("Name: value").</param>
        /// <returns>The processed response body.</returns>
        public string Load(string uri, string method = "GET", string postData = "", List<string> headers = null)
        {
            sb.Clear(); // fresh accumulator per request
            Easy easy = new Easy();
            Easy.WriteFunction wf = MyWriteFunction;
            easy.SetOpt(CURLoption.CURLOPT_URL, uri);
            easy.SetOpt(CURLoption.CURLOPT_HEADER, DisplayHeaders);
            easy.SetOpt(CURLoption.CURLOPT_FOLLOWLOCATION, FollowRedirects);
            Slist headerSlist = new Slist();
            if (headers != null)
            {
                foreach (var header in headers)
                {
                    headerSlist.Append(header);
                }
            }
            easy.SetOpt(CURLoption.CURLOPT_HTTPHEADER, headerSlist);
            easy.SetOpt(CURLoption.CURLOPT_SSL_VERIFYPEER, false);
            easy.SetOpt(CURLoption.CURLOPT_SSL_VERIFYHOST, false);
            easy.SetOpt(CURLoption.CURLOPT_USERAGENT, UserAgent);
            easy.SetOpt(CURLoption.CURLOPT_TIMEOUT, 10);
            easy.SetOpt(CURLoption.CURLOPT_CONNECTTIMEOUT, 3);
            if (!string.IsNullOrEmpty(postData))
            {
                easy.SetOpt(CURLoption.CURLOPT_POST, true);
                easy.SetOpt(CURLoption.CURLOPT_POSTFIELDS, postData);
            }
            if (method.Equals("POST"))
            {
                easy.SetOpt(CURLoption.CURLOPT_POST, true);
            }
            // Same file for reading and writing cookies = persistent session.
            easy.SetOpt(CURLoption.CURLOPT_COOKIEFILE, CookieFile);
            easy.SetOpt(CURLoption.CURLOPT_COOKIEJAR, CookieFile);
            easy.SetOpt(CURLoption.CURLOPT_WRITEFUNCTION, wf);
            easy.Perform();
            int code = 0;
            easy.GetInfo(CURLINFO.CURLINFO_RESPONSE_CODE, ref code);
            easy.Cleanup();
            if (code == 302)
            {
                // Requires DisplayHeaders = true, otherwise the Location
                // header is not in sb — TODO confirm callers set it.
                RedirectUrl = FindString(sb.ToString(), "Location:(.*?)\n");
            }
            string page = sb.ToString();
            // Strip all new lines and tabs, then whitespace between tags.
            page = page.Replace("\r\n", "");
            page = page.Replace("\r", "");
            page = page.Replace("\n", "");
            page = page.Replace("\t", "");
            page = Regex.Replace(page, @">\s+<", "><");
            return page;
        }

        /// <summary>
        /// libcurl verbose/debug callback: echo to console and append to a
        /// log file.
        /// </summary>
        public static void OnDebug(CURLINFOTYPE infoType, String msg, Object extraData)
        {
            Console.WriteLine(msg);
            // FIX: 'using' ensures the writer is flushed and closed even if
            // WriteLine throws — the original leaked the file handle.
            using (TextWriter tw = new StreamWriter(@"C:\cookies\verbose.txt", true))
            {
                tw.WriteLine(msg);
            }
        }
    }
}
我有两个方法,一个用于返回字符串,另一个用于返回MemoryStream。在尝试写入文件之前,您需要初始化CookieFile属性并确保目录/文件是可写的。
我注意到如果您的cookie文件包含上一次运行的旧会话数据,则会出现问题。这可以通过在实例化LibCurlScraper的新实例并填充cookie文件之前删除您的cookie文件来解决。
理想情况下,我们可以为所有HTTP Cookie使用内置托管类,但这一直有效,直到找到更好的解决方案。
修改：
我遇到了一些正确解析“Set-Cookie”标头的代码。它处理以逗号分隔的cookie,并提取每个cookie的名称,过期,路径,值和域。这应该是发出HTTP请求而不是LibCurl.NET的首选方式。您也可以将此方法应用于异步请求。
这段代码比微软自己的cookie解析器效果更好,这正是官方cookie解析器应该做的事情。我没有任何线索,为什么微软还没有解决这个问题,因为这是一个非常普遍的问题。
这是原始代码: http://snipplr.com/view/4427/
我在这里发帖,以防链接在某些时候出现故障:
/// <summary>
/// Parses a raw "Set-Cookie" header (which may contain several
/// comma-separated cookies) into a CookieCollection, using
/// <paramref name="strHost"/> as the fallback domain.
/// </summary>
/// <param name="strHeader">Raw Set-Cookie header text; may be null or empty.</param>
/// <param name="strHost">Host used as the domain for cookies that do not specify one.</param>
/// <returns>The parsed cookies; empty when the header is null or empty.</returns>
public static CookieCollection GetAllCookiesFromHeader(string strHeader, string strHost)
{
    CookieCollection cc = new CookieCollection();
    // FIX: IsNullOrEmpty also guards against null — the original
    // "strHeader != string.Empty" let null through, which crashed with a
    // NullReferenceException inside the splitting helper.
    if (!string.IsNullOrEmpty(strHeader))
    {
        ArrayList al = ConvertCookieHeaderToArrayList(strHeader);
        cc = ConvertCookieArraysToCookieCollection(al, strHost);
    }
    return cc;
}
/// <summary>
/// Splits a raw "Set-Cookie" header into one string per cookie.
/// Cookies are separated by commas, but an "expires=" attribute also
/// contains a comma inside its date ("expires=Wed, 09 Jun ..."), so the
/// fragment after such a comma is re-joined onto its cookie.
/// </summary>
/// <param name="strCookHeader">Raw Set-Cookie header text (non-null).</param>
/// <returns>ArrayList of per-cookie strings.</returns>
private static ArrayList ConvertCookieHeaderToArrayList(string strCookHeader)
{
    string header = strCookHeader.Replace("\r", "").Replace("\n", "");
    string[] fragments = header.Split(',');
    ArrayList cookies = new ArrayList();
    for (int idx = 0; idx < fragments.Length; idx++)
    {
        // "expires=" found past position 0 means the comma we split on was
        // part of this cookie's date, not a cookie boundary: re-join the
        // next fragment and skip it.
        if (fragments[idx].IndexOf("expires=", StringComparison.OrdinalIgnoreCase) > 0)
        {
            cookies.Add(fragments[idx] + "," + fragments[idx + 1]);
            idx++;
        }
        else
        {
            cookies.Add(fragments[idx]);
        }
    }
    return cookies;
}
/// <summary>
/// Converts the per-cookie strings produced by ConvertCookieHeaderToArrayList
/// into a CookieCollection, extracting name, value, path and domain.
/// Path defaults to "/" and domain to <paramref name="strHost"/> when a
/// cookie does not specify them.
/// </summary>
/// <param name="al">One "name=value; attr=value; ..." string per cookie.</param>
/// <param name="strHost">Host used as the fallback cookie domain.</param>
/// <returns>The parsed cookies; malformed entries are skipped.</returns>
private static CookieCollection ConvertCookieArraysToCookieCollection(ArrayList al, string strHost)
{
    CookieCollection cc = new CookieCollection();
    for (int i = 0; i < al.Count; i++)
    {
        string strEachCook = al[i].ToString();
        string[] strEachCookParts = strEachCook.Split(';');
        Cookie cookTemp = new Cookie();
        for (int j = 0; j < strEachCookParts.Length; j++)
        {
            string part = strEachCookParts[j];
            if (j == 0)
            {
                // First segment carries "name=value". Split only at the
                // FIRST '=' so values containing '=' stay intact.
                // FIX: guard against a missing '=' — the original called
                // Substring(0, -1) and threw ArgumentOutOfRangeException.
                if (part != string.Empty)
                {
                    int firstEqual = part.IndexOf('=');
                    if (firstEqual < 0)
                    {
                        break; // malformed cookie (no '='): skip this entry
                    }
                    cookTemp.Name = part.Substring(0, firstEqual);
                    cookTemp.Value = part.Substring(firstEqual + 1);
                }
                continue;
            }
            if (part.IndexOf("path", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                // FIX: split into at most two pieces so "path=/a=b" keeps
                // its full value, and guard against a valueless attribute
                // (the original indexed [1] unconditionally and could throw
                // IndexOutOfRangeException).
                string[] pair = part.Split(new[] { '=' }, 2);
                cookTemp.Path = (pair.Length == 2 && pair[1] != string.Empty) ? pair[1] : "/";
                continue;
            }
            if (part.IndexOf("domain", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                string[] pair = part.Split(new[] { '=' }, 2);
                cookTemp.Domain = (pair.Length == 2 && pair[1] != string.Empty) ? pair[1] : strHost;
                continue;
            }
        }
        // FIX: an entry that yielded no name would make CookieCollection.Add
        // throw on a nameless cookie — skip it instead.
        if (cookTemp.Name == string.Empty)
        {
            continue;
        }
        if (cookTemp.Path == string.Empty)
        {
            cookTemp.Path = "/";
        }
        if (cookTemp.Domain == string.Empty)
        {
            cookTemp.Domain = strHost;
        }
        cc.Add(cookTemp);
    }
    return cc;
}