我想从网站上获取HTML代码。在浏览器中,我通常只需单击上下文菜单中的“查看页面源”或类似内容。但是我怎样才能使它自动化呢?我已经尝试过使用WebBrowser类,但有时它不起作用。我不是网络开发者所以我真的不知道我的方法至少是否有意义。我认为主要的问题是我有时会得到html,而不是所有的代码都被执行了。因此它没有完成。我有问题,例如这个网站:http://www.sreality.cz/en/search/for-sale/praha
我的代码(我试图让它变小但可以单独运行):
using System;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using System.Windows.Forms;
namespace WebBrowserForm
{
internal static class Program
{
[STAThread]
private static void Main()
{
Application.EnableVisualStyles();
Application.SetCompatibleTextRenderingDefault(false);
for (int i = 0; i < 10; i++)
{
Form1 f = new Form1();
f.ShowDialog();
}
// Now I can check Form1.List and see that some html is final and some is not
}
}
public class Form1 : Form
{
public static List<string> List = new List<string>();
private const string Url = "http://www.sreality.cz/en/search/for-sale/praha";
private System.Windows.Forms.WebBrowser webBrowser1;
public Form1()
{
this.webBrowser1 = new System.Windows.Forms.WebBrowser();
this.SuspendLayout();
this.webBrowser1.Dock = System.Windows.Forms.DockStyle.Fill;
this.webBrowser1.Name = "webBrowser1";
this.webBrowser1.TabIndex = 0;
this.ResumeLayout(false);
Load += new EventHandler(Form1_Load);
this.webBrowser1.ObjectForScripting = new MyScript();
}
private void Form1_Load(object sender, EventArgs e)
{
webBrowser1.Navigate(Url);
webBrowser1.DocumentCompleted += new WebBrowserDocumentCompletedEventHandler(webBrowser1_DocumentCompleted);
}
private void webBrowser1_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
{
if (webBrowser1.ReadyState == WebBrowserReadyState.Complete)
{
// Final html for 99% of web pages, but unfortunately not for all
string tst = webBrowser1.Document.GetElementsByTagName("HTML")[0].OuterHtml;
webBrowser1.DocumentCompleted -= new WebBrowserDocumentCompletedEventHandler(webBrowser1_DocumentCompleted);
Application.DoEvents();
webBrowser1.Navigate("javascript: window.external.CallServerSideCode();");
Application.DoEvents();
}
}
[ComVisible(true)]
public class MyScript
{
public void CallServerSideCode()
{
HtmlDocument doc = ((Form1)Application.OpenForms[0]).webBrowser1.Document;
string renderedHtml = doc.GetElementsByTagName("HTML")[0].OuterHtml;
// here I sometimes get full html but sometimes the same as in webBrowser1_DocumentCompleted method
List.Add(renderedHtml);
((Form1)Application.OpenForms[0]).Close();
}
}
}
}
我希望在“webBrowser1_DocumentCompleted”方法中,我可以获得最终的html。它通常有效,但有了这个网站,它没有。所以我尝试在自己的代码中获取html,这应该在网站上执行 - &gt;方法'CallServerSideCode'。奇怪的是,有时我会得到最终的html(基本上与我通过浏览器手动完成相同),但有时候不是。我认为这个问题是由于我的脚本在整个网站被渲染之后开始的。但我不确定,因为这种事情远离我的舒适区,我真的不明白我在做什么。我只是想尝试在互联网上找到的东西。
那么,有谁知道代码有什么问题?或者更重要的是如何从网站轻松获取最终的HTML?
任何帮助表示感谢。
答案 0 :(得分:0)
您应该使用WebClient
类来下载HTML页面。无需显示控制。
您需要方法DownloadString
答案 1 :(得分:0)
如果你将外部函数的调用添加到正文的末尾并通过Jquery“ondomready”函数包装它可能会有所帮助。我的意思是这样的:
private void webBrowser1_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
{
if (webBrowser1.ReadyState == WebBrowserReadyState.Complete)
{
// Final html for 99% of web pages, but unfortunately not for all
string tst = webBrowser1.Document.GetElementsByTagName("HTML")[0].OuterHtml;
webBrowser1.DocumentCompleted -= new WebBrowserDocumentCompletedEventHandler(webBrowser1_DocumentCompleted);
HtmlElement body = webBrowser1.Document.GetElementsByTagName("body")[0];
HtmlElement scriptEl = webBrowser1.Document.CreateElement("script");
IHTMLScriptElement element = (IHTMLScriptElement)scriptEl.DomElement;
element.text = "$(function() { window.external.CallServerSideCode(); });";
body.AppendChild(scriptEl);
}
}
[ComVisible(true)]
public class MyScript
{
public void CallServerSideCode()
{
HtmlDocument doc = ((Form1)Application.OpenForms[0]).webBrowser1.Document;
string renderedHtml = doc.GetElementsByTagName("HTML")[0].OuterHtml;
// here I sometimes get full html but sometimes the same as in webBrowser1_DocumentCompleted method
List.Add(renderedHtml);
((Form1)Application.OpenForms[0]).Close();
}
}