有人可以帮我这个代码吗? 我正在尝试下载此html http://mises.org/books/中的所有网址(它们都是pdf的)
我理解基本逻辑,我想我只是弄乱了正则表达式。这就是我到目前为止所做的:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
namespace DownloadPdfs
{
class Program
{
static void Main(string[] args)
{
StringBuilder sb = new StringBuilder();
byte[] buf = new byte[8192];
HttpWebRequest request = (HttpWebRequest)
WebRequest.Create("http://mises.org/books/");
HttpWebResponse response = (HttpWebResponse)
request.GetResponse();
Stream resStream = response.GetResponseStream();
string tempString = null;
int count = 0;
do
{
count = resStream.Read(buf, 0, buf.Length);
if (count != 0)
{
tempString = Encoding.ASCII.GetString(buf, 0, count);
sb.Append(tempString);
}
}
while (count > 0); // any more data to read?
string html = sb.ToString();
List<string> listoflinks = new List<string>();
string input = html;
Regex rx = new Regex(@"(?<="")[^""]+(?="")|[^\s""]\S*");
for (Match match = rx.Match(input); match.Success; match = match.NextMatch())
{
listoflinks.Add(match.ToString());
}
foreach (var v in listoflinks)
{
using (WebClient Client = new WebClient())
{
Client.DownloadFile(v,v);
}
}
}
}
}
答案 0 :(得分:1)
尝试以下代码。该模式将匹配锚点的HREF
属性值。
Regex rx = new Regex(@"href=""(?<Url>[^.""]+\.pdf)""",RegexOptions.IgnoreCase | RegexOptions.Multiline);
for (Match match = rx.Match(input); match.Success; match = match.NextMatch())
{
var link = match.Groups["Url"].Value;
listoflinks.Add(link);
}
答案 1 :(得分:0)
使用库解析html,如HtmlAgilityPack。
public List<string> GetLinks(string html)
{
var htmlDoc = new HtmlDocument();
htmlDoc.LoadHtml(html);
var linkNodes = htmlDoc.DocumentNode.SelectNodes("//a[@href]");
if (linkNodes == null)
{
return new List<string>();
}
var linkNodesWithLink = linkNodes.Where(x => x.Attributes.Contains("href")).ToList();
var links = linkNodesWithLink.Select(x => x.Attributes["href"].Value)
.Where(x => !string.IsNullOrWhiteSpace(x))
.Select(x => x.Trim())
.ToList();
links = links.Distinct().ToList();
return links;
}