我从以下网址csharp-online中的示例节目中获取灵感 并打算从此页面alexa
中检索所有网址using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net;
using System.Text.RegularExpressions;
namespace ExtractingUrls
{
class Program
{
static void Main(string[] args)
{
WebClient client = new WebClient();
const string url = "http://www.alexa.com/topsites/category/Top/Society/History/By_Topic/Science/Engineering_and_Technology";
string source = client.DownloadString(url);
//Console.WriteLine(Getvals(source));
string matchPattern =
@"<a.rel=""nofollow"".style=""font-size:0.8em;"".href=[""'](?<url>[^""^']+[.]*)[""'].class=""offsite"".*>(?<name>[^<]+[.]*)</a>";
foreach (Hashtable grouping in ExtractGroupings(source, matchPattern, true))
{
foreach (DictionaryEntry DE in grouping)
{
Console.WriteLine("Value = " + DE.Value);
Console.WriteLine("");
}
}
// End.
Console.ReadLine();
}
public static ArrayList ExtractGroupings(string source, string matchPattern, bool wantInitialMatch)
{
ArrayList keyedMatches = new ArrayList();
int startingElement = 1;
if (wantInitialMatch)
{
startingElement = 0;
}
Regex RE = new Regex(matchPattern, RegexOptions.Multiline);
MatchCollection theMatches = RE.Matches(source);
foreach (Match m in theMatches)
{
Hashtable groupings = new Hashtable();
for (int counter = startingElement; counter < m.Groups.Count; counter++)
{
// If we had just returned the MatchCollection directly, the
// GroupNameFromNumber method would not be available to use
groupings.Add(RE.GroupNameFromNumber(counter),
m.Groups[counter]);
}
keyedMatches.Add(groupings);
}
return (keyedMatches);
}
}
}
但是在这里我遇到了一个问题,当我执行每个URL时显示三次,这是首先显示整个锚标记,然后显示两次URL。任何人都可以建议我应该在哪里纠正,以便我可以让每个网址只显示一次。
答案 0 :(得分:3)
使用HTML Agility Pack解析HTML。我认为这会让你的问题更容易解决。
这是一种方法:
WebClient client = new WebClient();
string url = "http://www.alexa.com/topsites/category/Top/Society/History/By_Topic/Science/Engineering_and_Technology";
string source = client.DownloadString(url);
HtmlDocument doc = new HtmlDocument();
doc.LoadHtml(source);
foreach (HtmlNode link in doc.DocumentNode.SelectNodes("//a[@href and @rel='nofollow']"))
{
Console.WriteLine(link.Attributes["href"].Value);
}
答案 1 :(得分:1)
在你的正则表达式中,你有两个分组和整个匹配。如果我正确地阅读它,你应该只想要匹配的URL部分,这是3个分组中的第二个.......
而不是:
for (int counter = startingElement; counter < m.Groups.Count; counter++)
{
// If we had just returned the MatchCollection directly, the
// GroupNameFromNumber method would not be available to use
groupings.Add(RE.GroupNameFromNumber(counter),
m.Groups[counter]);
}
你不想要这个吗?:
groupings.Add(RE.GroupNameFromNumber(1),m.Groups[1]);
答案 2 :(得分:1)
int startingElement = 1;
if (wantInitialMatch)
{
startingElement = 0;
}
...
for (int counter = startingElement; counter < m.Groups.Count; counter++)
{
// If we had just returned the MatchCollection directly, the
// GroupNameFromNumber method would not be available to use
groupings.Add(RE.GroupNameFromNumber(counter),
.Groups[counter]);
}
你传递wantInitialMatch = true
,所以你的for循环正在返回:
.Groups[0] //entire match
.Groups[1] //(?<url>[^""^']+[.]*) href part
.Groups[2] //(?<name>[^<]+[.]*) link text
答案 3 :(得分:0)