我正在尝试使用 `WebBrowser` 类复制网站的文本(URL 由用户输入),但线程似乎根本没有运行。我也尝试过不使用线程、直接使用 `WebBrowser`,但同样不起作用。欢迎任何建议。这是我第一次使用这些库,非常感谢您帮助我实现想要的功能。
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Xml;
using System.Windows.Forms;
using System.Threading;
/// <summary>
/// Web Forms code-behind that loads a user-supplied URL in an off-screen
/// <see cref="WebBrowser"/> control, captures the rendered page text and
/// offers a word-frequency helper.
/// </summary>
public partial class _Default : Page
{
    // Text captured by browser_DocumentCompleted. It is written and read only on the
    // STA browser thread started by runBrowserThread.
    private string _pageText;

    protected void Page_Load(object sender, EventArgs e)
    {
    }

    /// <summary>
    /// Starts an STA thread that navigates a <see cref="WebBrowser"/> to
    /// <paramref name="url"/>, waits for the top-level document to finish loading,
    /// and shows the page's text.
    /// </summary>
    /// <param name="url">Absolute URL to load.</param>
    private void runBrowserThread(Uri url)
    {
        var th = new Thread(() =>
        {
            // WebBrowser wraps an ActiveX control; dispose it so the COM resources
            // are released when the thread finishes.
            using (var br = new WebBrowser())
            {
                br.DocumentCompleted += browser_DocumentCompleted;
                br.Navigate(url);

                // Pumps messages for this thread. This call only returns after
                // browser_DocumentCompleted calls Application.ExitThread(), so the
                // page text must be captured in the handler, not after this line.
                global::System.Windows.Forms.Application.Run();

                br.DocumentCompleted -= browser_DocumentCompleted;
            }

            // NOTE(review): MessageBox is shown on the server's desktop; this only
            // works under an interactive development server. Replace with a page
            // control once one is available.
            MessageBox.Show(_pageText ?? string.Empty, "Text");
        });

        // The WebBrowser control requires a single-threaded apartment.
        th.SetApartmentState(ApartmentState.STA);
        th.Start();
    }

    /// <summary>
    /// Captures the body text once the top-level document has loaded and then
    /// stops the message loop of the browser thread.
    /// </summary>
    void browser_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
    {
        var br = (WebBrowser)sender;

        // DocumentCompleted is raised for every frame; only the event whose URL
        // matches the browser's own URL belongs to the top-level document.
        if (br.Url != e.Url)
        {
            return;
        }

        Console.WriteLine("Navigated to {0}", e.Url);

        // Read the text directly from the DOM instead of SelectAll/Copy, which
        // would overwrite the machine-wide clipboard.
        var body = br.Document == null ? null : br.Document.Body;
        _pageText = body == null ? string.Empty : (body.InnerText ?? string.Empty);

        // Ends Application.Run() in runBrowserThread so the thread can finish.
        global::System.Windows.Forms.Application.ExitThread();
    }

    /// <summary>
    /// Click handler for the URL input: validates the text box content and
    /// starts loading the page.
    /// </summary>
    public void url_input_Click(Object sender, EventArgs e)
    {
        Uri uri;

        // The text comes from the user: reject anything that is not a
        // well-formed absolute URL instead of letting the Uri constructor throw.
        if (!Uri.TryCreate(TextBox1.Text.Trim(), UriKind.Absolute, out uri))
        {
            return;
        }

        // Only web URLs are allowed; schemes such as file:// would make the
        // server open local resources on behalf of the user.
        if (uri.Scheme != Uri.UriSchemeHttp && uri.Scheme != Uri.UriSchemeHttps)
        {
            return;
        }

        runBrowserThread(uri);
    }

    /// <summary>
    /// Counts how often each word occurs in <paramref name="content"/>.
    /// </summary>
    /// <param name="content">Text to analyse; null or empty yields an empty result.</param>
    /// <param name="numWords">Maximum number of distinct words to return, most frequent first.</param>
    /// <returns>Lower-cased words mapped to their number of occurrences.</returns>
    public static Dictionary<string, int> WordCount(string content, int numWords = int.MaxValue)
    {
        if (string.IsNullOrEmpty(content))
        {
            return new Dictionary<string, int>();
        }

        // '\n' is included so that CRLF / LF line breaks separate words instead
        // of producing tokens such as "\nword".
        var delimiterChars = new char[] { ' ', ',', ':', '\t', '\"', '\r', '\n', '{', '}', '[', ']', '=', '/' };

        return content
            .Split(delimiterChars, StringSplitOptions.RemoveEmptyEntries)
            // Invariant casing keeps the grouping independent of the server culture.
            .Select(x => x.ToLowerInvariant())
            .GroupBy(x => x)
            .Select(x => new { Word = x.Key, Count = x.Count() })
            .OrderByDescending(x => x.Count)
            .Take(numWords)
            .ToDictionary(x => x.Word, x => x.Count);
    }
}
答案 0 :(得分:2)
来自注释-如何从页面的HTML中提取实际内容。
修改
与Israel Nehes讨论了该问题之后,看来解决方案是检索特定的标签值。
我已经更新了代码,希望这会有所帮助。
检索 HTML,然后使用 XPath 路径表达式检索您感兴趣的节点,即 `<p>` 和 `<a>` 标记。
/// <summary>Accumulates the text extracted from the downloaded page.</summary>
public static StringBuilder Content { get; set; }
/// <summary>
/// Downloads a fixed news article, strips the markup from every &lt;p&gt; and
/// &lt;a&gt; element and collects the multi-word fragments in <c>Content</c>.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    Content = new StringBuilder();
    string url = @"https://www.msn.com/en-gb/news/uknews/universal-credit-forcing-families-to-wait-months-for-help-to-pay-childcare-bills-mps-warn/ar-BBRjFtR?li=BBoPRmx";

    string html;
    // WebClient is disposable; release it as soon as the download is done.
    using (WebClient wc = new WebClient())
    {
        html = wc.DownloadString(url);
    }

    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(html);

    // Paragraphs first, then links - same order as processing them in two loops.
    // Links nested inside paragraphs are matched by both expressions, so their
    // text is collected twice.
    foreach (string xpath in new[] { "//p", "//a" })
    {
        var nodes = doc.DocumentNode.SelectNodes(xpath);

        // SelectNodes yields null (not an empty collection) when nothing matches.
        if (nodes == null)
        {
            continue;
        }

        foreach (var node in nodes)
        {
            // Remove every tag, keeping only the text between them.
            string text = Regex.Replace(node.OuterHtml, "<[^>]*>", "");

            // Keep only fragments that contain at least one space, i.e. more
            // than one space-separated token.
            if (text.IndexOf(' ') >= 0)
            {
                // The trailing space keeps the last word of this fragment from
                // being glued to the first word of the next one.
                Content.Append(text).Append(' ');
            }
        }
    }
}
`Content` 属性将包含这些标签的文本值。