/// <summary>
/// Click handler: requests a Google search page for the text in textBox2, adds the
/// filtered result links to listBox1, and writes the text captured from
/// &lt;p&gt;...&lt;/p&gt; spans to "website.txt".
/// </summary>
private void button2_Click(object sender, EventArgs e)
{
listBox1.Items.Clear();
StringBuilder sb = new StringBuilder();
byte[] ResultsBuffer = new byte[8192];
string SearchResults = "http://google.com/search?q=" + textBox2.Text.Trim();// txtKeyWords? As far as I understand, this is the value typed into the text box
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(SearchResults);
HttpWebResponse response = (HttpWebResponse)request.GetResponse();
Stream resStream = response.GetResponseStream();
string tempString = null;
int count = 0;
// Read the response in chunks of up to 8192 bytes until Read returns 0,
// decoding each chunk as ASCII and accumulating it in sb.
do
{
count = resStream.Read(ResultsBuffer, 0, ResultsBuffer.Length);
if (count != 0)
{
tempString = Encoding.ASCII.GetString(ResultsBuffer, 0, count);
sb.Append(tempString);
}
}
while (count > 0);
string sbb = sb.ToString();
// Parse the downloaded markup with HtmlAgilityPack.
HtmlAgilityPack.HtmlDocument html = new HtmlAgilityPack.HtmlDocument();
html.OptionOutputAsXml = true;
html.LoadHtml(sbb);
HtmlNode doc = html.DocumentNode;
// Opened in append mode; it is closed only by the sw.Close() call at the end of the method.
StreamWriter sw = File.AppendText("website.txt");
// Visit every <a> element that has an href attribute.
foreach (HtmlNode link in doc.SelectNodes("//a[@href]"))
{
// NOTE(review): att is assigned but not read anywhere below.
HtmlAttribute att = link.Attributes["href"];
string hrefValue = link.GetAttributeValue("href", string.Empty);
// Keep only hrefs that contain "/url?q=" and (after upper-casing) "HTTP://",
// and that do not contain "GOOGLE" after upper-casing.
if (!hrefValue.ToString().ToUpper().Contains("GOOGLE") && hrefValue.ToString().Contains("/url?q=") && hrefValue.ToString().ToUpper().Contains("HTTP://"))
{
// Cut the href at the first '&', remove the "/url?q=" text, and list what is left.
int index = hrefValue.IndexOf("&");
if (index > 0)
{
hrefValue = hrefValue.Substring(0, index);
listBox1.Items.Add(hrefValue.Replace("/url?q=", ""));
}
}
// NOTE(review): everything from here to the end of the loop body runs once per
// <a> element, whether or not the filter above accepted it, and it downloads
// SearchResults (the search URL built at the top) rather than hrefValue.
List<string> values = new List<string>();
string SourceCode = worker.GetSourceCode(SearchResults);
// Capture the text between each <p> and </p>; Singleline lets '.' match newlines.
MatchCollection data = Regex.Matches(SourceCode, @"<p>\s*(.+?)\s*</p>", RegexOptions.Singleline);
foreach (Match m in data)
{
string value = m.Groups[1].Value;
// NOTE(review): the last three Replace calls appear to replace a string with itself;
// possibly HTML entity names were lost when this code was transcribed — confirm.
value = value.Replace("’", "'").Replace("<strong>", "").Replace("</strong>", "").Replace("Ouml;z", "Ö").Replace("ö", "ö").Replace("ü", "ü").Replace("ç", "ç");
values.Add(value);
sw.Write(value);
}
}
sw.Close(); ;
}
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its body decoded as UTF-8.
/// </summary>
/// <param name="url">Absolute HTTP address to request.</param>
/// <returns>The complete response body as a string.</returns>
/// <exception cref="System.Net.WebException">The request failed or the server returned an error status.</exception>
public static string GetSourceCode(string url)
{
    HttpWebRequest reg = (HttpWebRequest)WebRequest.Create(url);

    // The using blocks release the connection and the reader even when reading throws.
    using (HttpWebResponse resp = (HttpWebResponse)reg.GetResponse())
    using (StreamReader sr = new StreamReader(resp.GetResponseStream(), System.Text.Encoding.UTF8))
    {
        return sr.ReadToEnd();
    }
}
大家好。我正在尝试编写一个用于网页抓取的 Windows 窗体应用程序。我会在窗体中输入一些关键词,程序自动在 Google 中搜索这些关键词。程序会把找到的链接显示在列表框中,并把每个链接所指向页面的内容(即该页面中的文字)写入文本文件。显示链接的部分工作正常,但程序没有把链接内容写入文本文件。
我在调试模式下运行过,结果发现程序没有进入下面这个代码块:
// Excerpt of the match loop: data, values and sw are declared outside this fragment.
// For each regex match, take capture group 1, apply the text replacements,
// then store the result in values and write it to sw.
foreach(Match m in data)
{
string value = m.Groups[1].Value;
// NOTE(review): the last three Replace calls appear to replace a string with itself — confirm.
value = value.Replace("’", "'").Replace("<strong>", "").Replace("</strong>", "").Replace("Ouml;z", "Ö").Replace("ö", "ö").Replace("ü", "ü").Replace("ç", "ç");
values.Add(value);
sw.Write(value);
}
我分别单独测试了显示链接的代码块和记录链接内容的代码块,它们各自都工作正常。但当我把两者合并在一起时,代码就无法正常工作了:没有报错,但也没有任何效果。请帮帮忙。
答案 0 :(得分:0)
/// <summary>
/// Downloads the page whose address is in textBox1 and appends the text captured
/// from every &lt;p&gt;...&lt;/p&gt; span in it to "website.txt".
/// </summary>
/// <param name="sender">Source of the event (not used).</param>
/// <param name="e">Event data (not used).</param>
private void Clicked(object sender, EventArgs e)
{
    string url = textBox1.Text;
    string SourceCode = worker.GetSourceCode(url);

    // Singleline lets '.' span line breaks, so paragraphs broken over several lines still match.
    MatchCollection data = Regex.Matches(SourceCode, @"<p>\s*(.+?)\s*</p>", RegexOptions.Singleline);
    if (data.Count == 0)
    {
        // Nothing to record, so the output file is neither created nor touched.
        return;
    }

    // Open the file once for all matches; using closes it even if a write throws.
    using (StreamWriter sw = File.AppendText("website.txt"))
    {
        foreach (Match m in data)
        {
            string value = m.Groups[1].Value;
            // NOTE(review): the last three Replace calls appear to replace a string with itself;
            // possibly HTML entity names were lost in transcription — confirm before relying on them.
            value = value.Replace("’", "'").Replace("<strong>", "").Replace("</strong>", "").Replace("Ouml;z", "Ö").Replace("ö", "ö").Replace("ü", "ü").Replace("ç", "ç");
            sw.Write(value);
        }
    }
}
/// <summary>
/// Click handler: searches Google for the text in textBox2, adds each accepted result
/// link to listBox1 and passes it to GetData.
/// </summary>
/// <param name="sender">Source of the event (not used).</param>
/// <param name="e">Event data (not used).</param>
private void button2_Click(object sender, EventArgs e)
{
    listBox1.Items.Clear();

    // Escape the keywords so that spaces, '&', '#', '+' and similar characters
    // cannot truncate or alter the query string.
    string SearchResults = "http://google.com/search?q=" + Uri.EscapeDataString(textBox2.Text.Trim());

    string pageHtml;
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(SearchResults);

    // The using blocks release the connection and streams even when the download fails midway.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream resStream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(resStream, Encoding.ASCII, false))
    {
        // Decoded as ASCII with byte-order-mark detection off; only href values are read below.
        pageHtml = reader.ReadToEnd();
    }

    HtmlAgilityPack.HtmlDocument html = new HtmlAgilityPack.HtmlDocument();
    html.OptionOutputAsXml = true;
    html.LoadHtml(pageHtml);

    // SelectNodes yields null, not an empty collection, when no node matches.
    HtmlNodeCollection links = html.DocumentNode.SelectNodes("//a[@href]");
    if (links == null)
    {
        return;
    }

    foreach (HtmlNode link in links)
    {
        string hrefValue = link.GetAttributeValue("href", string.Empty);

        // Accept only redirect links ("/url?q=") that point to a plain http:// target
        // and do not mention Google anywhere in the href.
        bool mentionsGoogle = hrefValue.IndexOf("GOOGLE", StringComparison.OrdinalIgnoreCase) >= 0;
        bool isRedirect = hrefValue.Contains("/url?q=");
        bool hasHttpTarget = hrefValue.IndexOf("HTTP://", StringComparison.OrdinalIgnoreCase) >= 0;
        if (mentionsGoogle || !isRedirect || !hasHttpTarget)
        {
            continue;
        }

        // Everything from the first '&' onward is dropped; hrefs without one are skipped.
        int index = hrefValue.IndexOf('&');
        if (index <= 0)
        {
            continue;
        }

        hrefValue = hrefValue.Substring(0, index).Replace("/url?q=", "");
        listBox1.Items.Add(hrefValue);
        GetData(hrefValue);
    }
}
/// <summary>
/// Downloads the page at <paramref name="url"/> and appends the text captured from
/// every &lt;p&gt;...&lt;/p&gt; span in it to "website.txt".
/// </summary>
/// <param name="url">Address of the page to download.</param>
private void GetData(string url)
{
    // Download and match before opening the file, so a failed request
    // cannot leave "website.txt" open and locked.
    string SourceCode = worker.GetSourceCode(url);

    // Singleline lets '.' span line breaks, so paragraphs broken over several lines still match.
    MatchCollection data = Regex.Matches(SourceCode, @"<p>\s*(.+?)\s*</p>", RegexOptions.Singleline);

    // using closes the writer even if a write throws.
    using (StreamWriter sw = File.AppendText("website.txt"))
    {
        foreach (Match m in data)
        {
            string value = m.Groups[1].Value;
            // NOTE(review): the last three Replace calls appear to replace a string with itself;
            // possibly HTML entity names were lost in transcription — confirm before relying on them.
            value = value.Replace("’", "'").Replace("<strong>", "").Replace("</strong>", "").Replace("Ouml;z", "Ö").Replace("ö", "ö").Replace("ü", "ü").Replace("ç", "ç");
            sw.Write(value);
        }
    }
}
// Event handler with an empty body: it performs no action when invoked.
// Presumably wired to listBox1's SelectedIndexChanged event — confirm in the designer file.
private void listBox1_SelectedIndexChanged(object sender, EventArgs e)
{
}
// Event handler with an empty body: it performs no action when invoked.
// Presumably wired to label3's Click event — confirm in the designer file.
private void label3_Click(object sender, EventArgs e)
{
}
// Event handler with an empty body: it performs no action when invoked.
// Presumably wired to label2's Click event — confirm in the designer file.
private void label2_Click(object sender, EventArgs e)
{
}
}
}
我终于让它成功运行了,这就是答案。我的答案中还遗留了一些问题,它们都与正则表达式有关:由于各个网站的 HTML 代码没有统一的标准,因此需要调整正则表达式来修正。等我完成项目后,我会分享完整的代码。