我需要获取网页中包含的渲染文本(没有html标签的纯文本)。
假设网页中包含如下 HTML:
<p>The <b>Moon</b> is an <a href="/wiki/Astronomical_object" title="Astronomical object">astronomical body</a> that <a href="/wiki/Orbit" title="Orbit">orbits</a> <a href="/wiki/Planet" title="Planet">planet</a> <a href="/wiki/Earth" title="Earth">Earth</a>, being Earth's only <a href="/wiki/Claimed_moons_of_Earth#Temporary_satellites" title="Claimed moons of Earth">permanent</a> <a href="/wiki/Natural_satellite" title="Natural satellite">natural satellite</a>.</p>
我应该得到:
The Moon is an astronomical body that orbits planet Earth, being Earth's only permanent natural satellite.
我尝试了下面的代码,但只能得到 HTML 源码,而不是渲染后的文本:
// Shared browser instance: assigned in Start() and read by browser_FrameLoadEnd.
private static ChromiumWebBrowser wb;
/// <summary>
/// Initializes CEF with a cache directory under the user's local application
/// data folder, creates the browser for the test URL, and subscribes to its
/// <c>FrameLoadEnd</c> event.
/// </summary>
public static void Start()
{
    const string testUrl = "https://en.wikipedia.org/wiki/Moon";

    // Resolve the cache location first so the settings initializer stays short.
    string localAppData = Environment.GetFolderPath(Environment.SpecialFolder.LocalApplicationData);
    string cacheDirectory = Path.Combine(localAppData, "CefSharp\\Cache");

    var settings = new CefSettings { CachePath = cacheDirectory };
    Cef.Initialize(settings, performDependencyCheck: true, browserProcessHandler: null);

    // The browser is created with the target address; the handler below is
    // invoked as frames finish loading.
    wb = new ChromiumWebBrowser(testUrl);
    wb.FrameLoadEnd += browser_FrameLoadEnd;
}
/// <summary>
/// Handles the browser's <c>FrameLoadEnd</c> event. When the main frame has
/// finished loading, requests the page source and writes it to the console.
/// </summary>
/// <param name="sender">The event source (unused).</param>
/// <param name="e">Event data identifying the frame that finished loading.</param>
static void browser_FrameLoadEnd(object sender, FrameLoadEndEventArgs e)
{
    // Only the main frame is of interest; ignore every other frame.
    if (!e.Frame.IsMain)
    {
        return;
    }

    // GetSourceAsync yields the page's HTML source; print it once the task completes.
    wb.GetSourceAsync().ContinueWith(sourceTask => Console.Write(sourceTask.Result));
}