我正在用 C# 编写一个用于网页爬取和抓取的控制台应用程序,仅用于学习目的。显示结果时,某些值会连同 HTML 标签(实际上是 <strong> 标签)一起显示。我找出了这些 <strong> 标签,并把它们全部替换掉了。但是,如果有许多带有不同内联样式值的 <strong> 标签呢?我该如何解决这个问题?
问题出在 GetData() 函数中。
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Web;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
namespace MyCrawler
{
    /// <summary>
    /// Console crawler: downloads a listing page, follows every employer link found on it,
    /// and prints the <c>&lt;dt&gt;</c>/<c>&lt;dd&gt;</c> pairs of each linked page.
    /// </summary>
    /// <remarks>
    /// The page is processed with regular expressions only, so it works solely on markup that
    /// matches the patterns below exactly; a real HTML parser is more robust.
    /// </remarks>
    public class Program
    {
        // Matches any opening or closing <strong> tag, whatever attributes (inline style,
        // class, ...) it carries, so no per-style literal has to be listed.
        private static readonly Regex StrongTagRegex =
            new Regex("</?strong\\b[^>]*>", RegexOptions.IgnoreCase);

        // Group 1: text of a <dt> element; group 2: content of the <dd> that directly follows it.
        private static readonly Regex DefinitionRegex =
            new Regex("<dt>(.*?)</dt><dd>(.*?)</dd>");

        // Group 1: employer heading; group 2: href of the anchor that directly follows it.
        private static readonly Regex EmployerLinkRegex =
            new Regex("<div class=\"employername\"><h2>(.*?)</h2><a href=\"(.*?)\"");

        /// <summary>
        /// Downloads the resource at <paramref name="url"/> and returns it as a single line.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <returns>
        /// The response body with every line trimmed and all lines concatenated without a
        /// separator.
        /// </returns>
        public static string GetContent(string url)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            StringBuilder builder = new StringBuilder();

            // Dispose the response and the reader even when reading throws, so the
            // underlying connection is always released.
            using (WebResponse response = request.GetResponse())
            using (StreamReader reader = new StreamReader(response.GetResponseStream()))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    // Lines are joined with no separator so that tags split across lines
                    // become adjacent, which the patterns in this class rely on.
                    builder.Append(line.Trim());
                }
            }

            // ReadLine already strips line terminators, so no further newline removal is needed.
            return builder.ToString();
        }

        /// <summary>
        /// Writes every <c>&lt;dt&gt;</c>/<c>&lt;dd&gt;</c> pair found in
        /// <paramref name="content"/> to the console, one pair per line, followed by an
        /// empty line.
        /// </summary>
        /// <param name="content">Single-line HTML, as produced by <see cref="GetContent"/>.</param>
        public static void GetData(string content)
        {
            foreach (Match pair in DefinitionRegex.Matches(content))
            {
                Console.Write(pair.Groups[1].Value);
                Console.WriteLine(StripStrongTags(pair.Groups[2].Value));
            }
            Console.WriteLine();
        }

        /// <summary>
        /// Removes all <c>&lt;strong ...&gt;</c> and <c>&lt;/strong&gt;</c> tags from
        /// <paramref name="html"/>, keeping the text between them.
        /// </summary>
        private static string StripStrongTags(string html)
        {
            return StrongTagRegex.Replace(html, "");
        }

        /// <summary>
        /// Entry point: downloads the start page, then downloads and prints every linked
        /// employer page, and finally waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        public static void Main(string[] args)
        {
            string url = "http://www.merojob.com/";
            string listing = GetContent(url);

            foreach (Match employer in EmployerLinkRegex.Matches(listing))
            {
                // Group 2 is not repeated in the pattern, so it holds exactly one capture;
                // reading its Value is equivalent to iterating its Captures.
                string detailPage = GetContent(employer.Groups[2].Value);
                GetData(detailPage);
            }

            Console.ReadKey();
        }
    }
}
答案 0 :(得分:1)
在您的情况下,最好的方法是使用专用库(例如 HtmlAgilityPack),以便检索特定标签并操作 DOM 文档的结构。手动处理是一种痛苦的做法。用正则表达式来做这件事可能会"危及你的心智"(endanger your mind),因此请使用库来处理您的 HTML。
即使这只是出于学习目的,您选择的工具和练习也并不适合入门,因为这是一个非常复杂的主题。