Web Scraping-crawling

时间:2017-09-29 05:38:02

标签: regex winforms web-scraping

 /// <summary>
 /// Click handler (version posted in the question): searches Google for the text in
 /// textBox2, adds the filtered result links to listBox1 and appends paragraph text
 /// matched by a regex to website.txt.
 /// </summary>
 private void button2_Click(object sender, EventArgs e)
    {
        listBox1.Items.Clear();
        StringBuilder sb = new StringBuilder();
        byte[] ResultsBuffer = new byte[8192];
        string SearchResults = "http://google.com/search?q=" + textBox2.Text.Trim();//txtKeyWords? As I understand it, the value typed into the text box
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(SearchResults);
        HttpWebResponse response = (HttpWebResponse)request.GetResponse();

        Stream resStream = response.GetResponseStream();
        string tempString = null;
        int count = 0;
        // Read the response in chunks of up to 8192 bytes, decoding each chunk as ASCII,
        // until Read reports 0 bytes.
        do
        {
            count = resStream.Read(ResultsBuffer, 0, ResultsBuffer.Length);
            if (count != 0)
            {
                tempString = Encoding.ASCII.GetString(ResultsBuffer, 0, count);
                sb.Append(tempString);
            }
        }

        while (count > 0);
        string sbb = sb.ToString();

        // Parse the downloaded markup with HtmlAgilityPack.
        HtmlAgilityPack.HtmlDocument html = new HtmlAgilityPack.HtmlDocument();
        html.OptionOutputAsXml = true;
        html.LoadHtml(sbb);
        HtmlNode doc = html.DocumentNode;
        // Opened once for the whole handler; closed after the loop below.
        StreamWriter sw = File.AppendText("website.txt");
        // Visit every <a> element that carries an href attribute.
        foreach (HtmlNode link in doc.SelectNodes("//a[@href]"))
        {
            HtmlAttribute att = link.Attributes["href"];
            string hrefValue = link.GetAttributeValue("href", string.Empty);
            // Keep only hrefs that contain "/url?q=" and "HTTP://" (upper-cased comparison)
            // and do not contain "GOOGLE".
            if (!hrefValue.ToString().ToUpper().Contains("GOOGLE") && hrefValue.ToString().Contains("/url?q=") && hrefValue.ToString().ToUpper().Contains("HTTP://"))
            {
                // Cut the href at the first '&' and strip the "/url?q=" prefix before listing it.
                int index = hrefValue.IndexOf("&");
                if (index > 0)
                {
                    hrefValue = hrefValue.Substring(0, index);
                    listBox1.Items.Add(hrefValue.Replace("/url?q=", ""));
                }

            }
            // NOTE(review): everything below runs once per anchor, outside the filter above,
            // and downloads SearchResults (the Google search URL) rather than hrefValue, so
            // the pages behind the listed links are never fetched here — confirm intent.
            List<string> values = new List<string>();

            string SourceCode = worker.GetSourceCode(SearchResults);

            // Capture the content between <p> and </p>; Singleline lets '.' span line breaks.
            MatchCollection data = Regex.Matches(SourceCode, @"<p>\s*(.+?)\s*</p>", RegexOptions.Singleline);

            foreach (Match m in data)
            {

                string value = m.Groups[1].Value;
                // Replace a handful of HTML entities and drop <strong> tags.
                value = value.Replace("&rsquo;", "'").Replace("<strong>", "").Replace("</strong>", "").Replace("Ouml;z", "Ö").Replace("&ouml;", "ö").Replace("&uuml;", "ü").Replace("&ccedil;", "ç");
                values.Add(value);

                sw.Write(value);
            }
        }
        sw.Close(); ;
    }



       /// <summary>
       /// Downloads the resource at <paramref name="url"/> and returns its body decoded as UTF-8 text.
       /// </summary>
       /// <param name="url">URL to request; it is cast to an HTTP request, so it must be an http/https address.</param>
       /// <returns>The complete response body as a string.</returns>
       public static string GetSourceCode(string url)
    {
        HttpWebRequest reg = (HttpWebRequest)WebRequest.Create(url);

        // The using blocks release the response and the reader even when ReadToEnd throws.
        using (HttpWebResponse resp = (HttpWebResponse)reg.GetResponse())
        using (StreamReader sr = new StreamReader(resp.GetResponseStream(), System.Text.Encoding.UTF8))
        {
            return sr.ReadToEnd();
        }
    }

大家好。我正在尝试编写一个用于网页抓取的 Windows 窗体应用程序。我会在窗体中输入一些关键词,程序自动在 Google 中搜索这些关键词,把找到的链接显示在列表框中,并把每个链接所指向页面的内容(页面中的文字)写入文本文件。显示链接的部分工作正常,但程序没有把链接内容写入文本文件。

我尝试了调试模式。结果程序没有进入该代码块。

// Excerpt of the inner loop from button2_Click above: for each regex match it cleans the
// captured paragraph text, stores it in values and writes it through sw.
foreach(Match m in data)
        {

            string value = m.Groups[1].Value;
            // Replace a handful of HTML entities and drop <strong> tags.
            value = value.Replace("&rsquo;", "'").Replace("<strong>", "").Replace("</strong>", "").Replace("Ouml;z", "Ö").Replace("&ouml;", "ö").Replace("&uuml;", "ü").Replace("&ccedil;", "ç");
            values.Add(value);

            sw.Write(value);
        }

我分别单独测试了显示链接的代码块和记录链接内容的代码块,它们都能正常工作。但当我把两者合并在一起时,代码就无法正常工作了:没有报错,但也没有任何结果。请帮忙。

1 个答案:

答案 0 :(得分:0)

    /// <summary>
    /// Fetches the page whose address is typed into textBox1 and appends the text of every
    /// &lt;p&gt;...&lt;/p&gt; match found in it to website.txt.
    /// </summary>
    /// <param name="sender">Control that raised the event (unused).</param>
    /// <param name="e">Event data (unused).</param>
    private void Clicked(object sender, EventArgs e)
    {
        string url = textBox1.Text;
        // worker.GetSourceCode is defined outside this block; presumably it returns the page's HTML.
        string SourceCode = worker.GetSourceCode(url);

        // Capture the content between <p> and </p>; Singleline lets '.' span line breaks.
        MatchCollection data = Regex.Matches(SourceCode, @"<p>\s*(.+?)\s*</p>", RegexOptions.Singleline);
        if (data.Count == 0)
        {
            // Nothing matched: leave website.txt untouched, as the per-match open did before.
            return;
        }

        // Open the file once for all matches instead of once per match; the using block
        // closes it even if a write fails.
        using (StreamWriter sw = File.AppendText("website.txt"))
        {
            foreach (Match m in data)
            {
                string value = m.Groups[1].Value;
                // Replace a handful of HTML entities and drop <strong> tags.
                // NOTE(review): "Ouml;z" looks like a typo for "&Ouml;" — confirm before changing.
                value = value.Replace("&rsquo;", "'").Replace("<strong>", "").Replace("</strong>", "").Replace("Ouml;z", "Ö").Replace("&ouml;", "ö").Replace("&uuml;", "ü").Replace("&ccedil;", "ç");
                sw.Write(value);
            }
        }
    }

    /// <summary>
    /// Searches Google for the text in textBox2, lists every non-Google result link in
    /// listBox1 and hands each listed link to GetData.
    /// </summary>
    /// <param name="sender">Control that raised the event (unused).</param>
    /// <param name="e">Event data (unused).</param>
    private void button2_Click(object sender, EventArgs e)
    {
        listBox1.Items.Clear();

        // Escape the keywords so spaces, '&', '#', '+' etc. cannot corrupt the query string.
        string SearchResults = "http://google.com/search?q=" + Uri.EscapeDataString(textBox2.Text.Trim());
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(SearchResults);

        // Read the whole body through a StreamReader; the using blocks release the response
        // and its stream even on failure. UTF-8 keeps non-ASCII characters that the previous
        // chunk-wise ASCII decoding replaced with '?'.
        string sbb;
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.UTF8))
        {
            sbb = reader.ReadToEnd();
        }

        HtmlAgilityPack.HtmlDocument html = new HtmlAgilityPack.HtmlDocument();
        html.OptionOutputAsXml = true;
        html.LoadHtml(sbb);
        HtmlNode doc = html.DocumentNode;

        // SelectNodes returns null (not an empty collection) when nothing matches.
        HtmlNodeCollection links = doc.SelectNodes("//a[@href]");
        if (links == null)
        {
            return;
        }

        foreach (HtmlNode link in links)
        {
            string hrefValue = link.GetAttributeValue("href", string.Empty);

            // Keep only redirect links ("/url?q=") that point at an http:// address and do not
            // mention Google. Ordinal, case-insensitive checks avoid culture-dependent ToUpper().
            bool mentionsGoogle = hrefValue.IndexOf("GOOGLE", StringComparison.OrdinalIgnoreCase) >= 0;
            bool isRedirect = hrefValue.Contains("/url?q=");
            bool hasHttp = hrefValue.IndexOf("HTTP://", StringComparison.OrdinalIgnoreCase) >= 0;
            if (mentionsGoogle || !isRedirect || !hasHttp)
            {
                continue;
            }

            // Cut the href at the first '&' and strip the "/url?q=" prefix.
            int index = hrefValue.IndexOf("&", StringComparison.Ordinal);
            if (index > 0)
            {
                hrefValue = hrefValue.Substring(0, index);
                hrefValue = hrefValue.Replace("/url?q=", "");
                listBox1.Items.Add(hrefValue);
                GetData(hrefValue);
            }
        }
    }

    /// <summary>
    /// Fetches <paramref name="url"/> and appends the text of every &lt;p&gt;...&lt;/p&gt;
    /// match found in it to website.txt.
    /// </summary>
    /// <param name="url">Address of the page to fetch.</param>
    private void GetData(string url)
    {
        // Download and match before opening the file: if either step throws, website.txt
        // has not been opened, so no handle is left locked for later calls.
        // worker.GetSourceCode is defined outside this block; presumably it returns the page's HTML.
        string SourceCode = worker.GetSourceCode(url);

        // Capture the content between <p> and </p>; Singleline lets '.' span line breaks.
        MatchCollection data = Regex.Matches(SourceCode, @"<p>\s*(.+?)\s*</p>", RegexOptions.Singleline);

        // The using block closes the writer even if a write fails.
        using (StreamWriter sw = File.AppendText("website.txt"))
        {
            foreach (Match m in data)
            {
                string value = m.Groups[1].Value;
                // Replace a handful of HTML entities and drop <strong> tags.
                // NOTE(review): "Ouml;z" looks like a typo for "&Ouml;" — confirm before changing.
                value = value.Replace("&rsquo;", "'").Replace("<strong>", "").Replace("</strong>", "").Replace("Ouml;z", "Ö").Replace("&ouml;", "ö").Replace("&uuml;", "ü").Replace("&ccedil;", "ç");
                sw.Write(value);
            }
        }
    }

    // Event-handler stub; presumably wired to listBox1.SelectedIndexChanged by the designer
    // (wiring not shown here). The body is empty, so it does nothing.
    private void listBox1_SelectedIndexChanged(object sender, EventArgs e)
    {

    }

    // Event-handler stub; presumably wired to label3.Click by the designer
    // (wiring not shown here). The body is empty, so it does nothing.
    private void label3_Click(object sender, EventArgs e)
    {

    }

    // Event-handler stub; presumably wired to label2.Click by the designer
    // (wiring not shown here). The body is empty, so it does nothing.
    private void label2_Click(object sender, EventArgs e)
    {

    }


}

}

我终于让它成功运行了,以上就是答案。我的答案中仍然留有一些问题,它们都与正则表达式有关:各个网站的 HTML 代码没有统一的标准,因此需要调整正则表达式来修正。等我完成项目后,我会分享完整的代码。