我用 C# 编写了一个网络爬虫。到目前为止，我的程序可以抓取网站的源代码。我要抓取的网站需要先登录才能访问统计页面（userStats.php）。使用下面的代码，我可以正常登录并读取登录响应的源代码，但是当我访问该统计页面时，却收到了一个错误。我认为这是因为我需要以某种方式告诉网站我仍处于登录状态。我该怎么做？
当前代码。
using System;
using System.Net;
using System.IO;
using System.Text;
namespace WebCraler
{
    /// <summary>
    /// Console crawler that logs in to a site on localhost and prints the
    /// HTML of a page that requires an authenticated session.
    /// </summary>
    class MainClass
    {
        // NOTE(review): demo credentials are hard-coded; move them to configuration
        // or prompt for them before using this against a real site.
        static string username = "john";
        static string password = "123";

        // One container shared by every request. HttpWebRequest sends no cookies
        // unless a CookieContainer is assigned, so without this the session cookie
        // issued by the login response would never be sent back to the server.
        private static readonly CookieContainer sessionCookies = new CookieContainer();

        /// <summary>
        /// Fetches the stats page before logging in, logs in, then fetches the
        /// stats page again using the session established by the login.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        public static void Main (string[] args)
        {
            Console.WriteLine ("Test login");
            String Page = GetWebText("http://localhost/PHP/Login/userStats.php");
            Console.WriteLine (Page);

            Console.WriteLine ("Test Login");
            String response = loginResponse();
            Console.WriteLine (response);

            // Added step: request the protected page again now that the shared
            // cookie container holds whatever cookies the login response set.
            Console.WriteLine ("Test stats page after login");
            Console.WriteLine (GetWebText("http://localhost/PHP/Login/userStats.php"));
        }

        /// <summary>
        /// Downloads <paramref name="url"/> with a GET request and returns the body
        /// as text, with every line terminated by a single '\n'.
        /// </summary>
        /// <param name="url">Absolute URL of the page to download.</param>
        /// <returns>The response body, one '\n' appended after each line read.</returns>
        /// <exception cref="WebException">The request failed or the server returned an error status.</exception>
        public static string GetWebText(string url)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.UserAgent = "A .NET Web Crawler";
            request.CookieContainer = sessionCookies;

            StringBuilder htmlText = new StringBuilder();

            // using blocks release the connection even if reading throws.
            using (WebResponse response = request.GetResponse())
            using (Stream stream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(stream))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    htmlText.Append(line).Append('\n');
                }
            }

            return htmlText.ToString();
        }

        /// <summary>
        /// Posts the credentials to the login script and returns the response body.
        /// Cookies set by the response are kept in the shared container so later
        /// calls to <see cref="GetWebText"/> are made within the same session.
        /// </summary>
        /// <returns>
        /// The login response body, or the literal "Catch Error" if the request failed
        /// (the failure details are written to standard error).
        /// </returns>
        private static String loginResponse()
        {
            try
            {
                // Escape the values so characters such as '&' or '=' in a
                // credential cannot corrupt the form body.
                string postData = "myusername=" + Uri.EscapeDataString(username)
                                + "&mypassword=" + Uri.EscapeDataString(password);
                byte[] data = Encoding.ASCII.GetBytes(postData);

                HttpWebRequest request = (HttpWebRequest)WebRequest.Create("http://localhost/PHP/Login/check_login.php");
                request.Method = "POST";
                request.ContentType = "application/x-www-form-urlencoded";
                request.ContentLength = data.Length;
                request.CookieContainer = sessionCookies;

                using (Stream body = request.GetRequestStream())
                {
                    body.Write(data, 0, data.Length);
                }

                using (WebResponse response = request.GetResponse())
                using (Stream stream = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(stream))
                {
                    return reader.ReadToEnd();
                }
            }
            catch (Exception ex)
            {
                // Keep the original best-effort contract (callers get "Catch Error"),
                // but surface the cause instead of discarding it.
                Console.Error.WriteLine ("Login request failed: " + ex.Message);
                return "Catch Error";
            }
        }
    }
}
答案 0 :(得分:0)
当您登录网站时,服务器将发出一个cookie,必须在后续请求中重新发送,以便服务器知道您已登录(否则您将被重定向到登录页面或其他一些错误)。
默认情况下，`HttpWebRequest` 不会在请求之间保留 Cookie，您需要自己通过 `CookieContainer` 来管理：
// Holds the cookies received so far; reused by every request so the server
// sees the same session on each call.
private CookieContainer sessionCookies = new CookieContainer();

/// <summary>
/// Sends a request that carries, and updates, the shared session cookies.
/// </summary>
/// <param name="url">Absolute URL to request.</param>
public void MakeRequest(string url = "http://localhost/PHP/Login/userStats.php") {
    // WebRequest.Create needs the target URL and returns the base type,
    // so cast to reach the HTTP-specific CookieContainer property.
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    request.CookieContainer = this.sessionCookies;
    // your code here
    using (WebResponse response = request.GetResponse()) {
        // Read the response here; disposing it releases the connection.
    }
}
答案 1 :(得分:0)
// Static because GetWebText is static; shared by all requests so cookies
// issued by one response are sent with the next request.
private static CookieContainer sessionCookies = new CookieContainer();

/// <summary>
/// Downloads <paramref name="url"/> using the shared session cookies and returns
/// the body as text, with every line terminated by a single '\n'.
/// </summary>
/// <param name="url">Absolute URL of the page to download.</param>
/// <returns>The response body text.</returns>
public static string GetWebText(string url) {
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    request.CookieContainer = sessionCookies; // loading cookies in

    StringBuilder htmlText = new StringBuilder();

    // No manual copy-back of cookies is needed: cookies set by the response
    // are added to the container assigned to the request.
    using (WebResponse response = request.GetResponse())
    using (Stream stream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(stream)) {
        string line;
        while ((line = reader.ReadLine()) != null) {
            htmlText.Append(line).Append('\n');
        }
    }

    return htmlText.ToString();
}
关于 `// your code here`：它可以是请求与响应之间的任何代码（也可以什么都没有），所以我把它删掉了。