获取字符串数据Xpath

时间:2018-09-03 07:51:33

标签: c# xpath get gecko geckofx

我需要帮助,以便从网站获取数据。我在应用程序中使用 GeckoFX,希望它在页面加载完成后,从指定的 XPath 位置检索文本数据。

// Evaluate an absolute XPath against the browser's current document and
// enumerate the nodes it matches.
const string targetXPath = "/html/body/table[3]/tbody/tr[1]/td[2]/a[1]";
var xpathResult = geckoWebBrowser1.Document.EvaluateXPath(targetXPath);
var foundNodes = xpathResult.GetNodes();

如何将数据下载为文本?

2 个答案:

答案 0 :(得分:0)

您似乎在从 GeckoFX 对象中检索文本时遇到了困难。
以下是一些调用示例和操作方法,可以帮助您入门:

// Look nodes up by XPath.
XPathResult xpathResult = _browser.Document.EvaluateXPath("//*[@id]/div/p[2]");
var foundNodes = xpathResult.GetNodes();
foreach (var node in foundNodes)
{
    // Text contained by this node (including its children).
    var text = node.TextContent;

    // InnerHtml/OuterHtml are read through GeckoHtmlElement. "as" yields null when
    // the node is not of that type, so check before dereferencing.
    var element = node as GeckoHtmlElement;
    if (element != null)
    {
        string inner = element.InnerHtml;
        string outer = element.OuterHtml;
    }

    // Iterate the child nodes.
    foreach (var child in node.ChildNodes)
    {

    }
}

// Look an element up by id.
GeckoHtmlElement htmlElementById = _browser.Document.GetHtmlElementById("mw-content-text");

// Look elements up by tag name and read an attribute from each.
GeckoElementCollection byTag = _browser.Document.GetElementsByTagName("input");
foreach (var ele in byTag)
{
    var value = ele.GetAttribute("value");

}

// Look elements up by class name.
var byClass = _browser.Document.GetElementsByClassName("input");
foreach (var node in byClass)
{
    //...
}

// Cast to a more specific element type to reach its members.
// NOTE(review): assumes an element with this id exists and is an input element - confirm before relying on it.
var username = ((GeckoInputElement)_browser.Document.GetHtmlElementById("yourUsername")).Value;

// Create a new typed wrapper from an element's DomObject.
// NOTE(review): assumes an element with id "myBtn" exists - confirm before relying on it.
var button = new GeckoButtonElement(_browser.Document.GetElementById("myBtn").DomObject);

答案 1 :(得分:0)

/// <summary>
/// Locates an element in <c>geckoWebBrowser1</c> via <c>GetElement</c> and returns one piece of data from it.
/// </summary>
/// <param name="xpath">Locator handed to <c>GetElement</c> (an XPath or an element id).</param>
/// <param name="type">
/// "html" returns the outer HTML, "text" returns the trimmed text content (or the value of a
/// text area), "value" returns the value of an input element; any other string is treated as
/// an attribute name and read through <c>extractData</c>.
/// </param>
/// <returns>The extracted string, or an empty string when the browser or element is missing or holds no such data.</returns>
public string extract(string xpath, string type)
    {
        // Alternative source: (GeckoWebBrowser)GetCurrentWB()
        GeckoWebBrowser wb = geckoWebBrowser1;
        if (wb == null)
        {
            return string.Empty;
        }

        GeckoHtmlElement elm = GetElement(wb, xpath);
        if (elm == null)
        {
            return string.Empty;
        }

        switch (type)
        {
            case "html":
                return elm.OuterHtml ?? string.Empty;

            case "text":
                // A text area is read through its Value (untrimmed), not its TextContent.
                var textArea = elm as GeckoTextAreaElement;
                if (textArea != null)
                {
                    return textArea.Value ?? string.Empty;
                }

                string content = elm.TextContent;
                return content == null ? string.Empty : content.Trim();

            case "value":
                var input = elm as GeckoInputElement;
                if (input != null)
                {
                    return input.Value ?? string.Empty;
                }

                // Not an input element: read the "value" attribute instead of
                // failing with an InvalidCastException.
                return extractData(elm, type);

            default:
                return extractData(elm, type);
        }
    }
    /// <summary>
    /// Reads the named attribute from an element and returns it trimmed.
    /// </summary>
    /// <param name="ele">Element to read from; may be null.</param>
    /// <param name="attribute">Name of the attribute to read.</param>
    /// <returns>The trimmed attribute value, or an empty string when the element or the attribute value is null.</returns>
    private string extractData(GeckoHtmlElement ele, string attribute)
    {
        if (ele == null)
        {
            return string.Empty;
        }

        // No fallback to the parent element is attempted when the attribute is missing.
        var raw = ele.GetAttribute(attribute);
        return raw == null ? string.Empty : raw.Trim();
    }
    /// <summary>
    /// Returns the first child control of the tab currently selected in <c>tabControl1</c>.
    /// </summary>
    /// <returns>The control at index 0 of the selected tab, or null when no tab is selected or the tab has no controls.</returns>
    private object GetCurrentWB()
    {
        var selectedTab = tabControl1.SelectedTab;
        if (selectedTab == null || selectedTab.Controls.Count <= 0)
        {
            return null;
        }

        // A null entry is returned as null, same as the "nothing found" result.
        return selectedTab.Controls[0];
    }
    /// <summary>
    /// Finds a single HTML element in the browser's document.
    /// </summary>
    /// <param name="wb">Browser whose document is searched.</param>
    /// <param name="xpath">
    /// A locator: when it starts with "/" it is evaluated as an XPath, otherwise it is used as an element id.
    /// </param>
    /// <returns>
    /// The first matching element, or null when nothing matches, the match is not a
    /// <c>GeckoHtmlElement</c>, or the browser, document or locator is missing.
    /// </returns>
    private GeckoHtmlElement GetElement(GeckoWebBrowser wb, string xpath)
    {
        if (wb == null || string.IsNullOrEmpty(xpath))
        {
            return null;
        }

        var document = wb.Document;
        if (document == null)
        {
            return null;
        }

        // Anything that does not look like a path is treated as an element id.
        if (!xpath.StartsWith("/", System.StringComparison.Ordinal))
        {
            return document.GetElementById(xpath) as GeckoHtmlElement;
        }

        string effectiveXPath = xpath;
        if (xpath.Contains("@class") || xpath.Contains("@data-type"))
        {
            // Expressions filtering on these attributes are first resolved with HtmlAgilityPack
            // against the document's markup; the positional path of the hit is then evaluated in Gecko.
            // NOTE(review): presumably a workaround for such expressions in EvaluateXPath - confirm.
            var html = GetHtmlFromGeckoDocument(document);
            if (string.IsNullOrEmpty(html))
            {
                return null;
            }

            var parsed = new HtmlAgilityPack.HtmlDocument();
            parsed.LoadHtml(html);

            var hit = parsed.DocumentNode.SelectSingleNode(xpath);
            if (hit == null)
            {
                return null;
            }

            effectiveXPath = "/" + hit.XPath;
        }

        // "as" instead of a hard cast: an XPath can select nodes that are not HTML elements,
        // which should yield "not found" rather than an InvalidCastException.
        return document.EvaluateXPath(effectiveXPath).GetNodes().FirstOrDefault() as GeckoHtmlElement;
    }
    /// <summary>
    /// Returns the inner HTML of the document's root element.
    /// </summary>
    /// <param name="doc">Document to read.</param>
    /// <returns>The root element's <c>InnerHtml</c>, or an empty string when the root is not a <c>GeckoHtmlElement</c>.</returns>
    private string GetHtmlFromGeckoDocument(GeckoDocument doc)
    {
        var root = doc.DocumentElement as GeckoHtmlElement;
        if (root == null)
        {
            return string.Empty;
        }

        return root.InnerHtml;
    }

    /// <summary>
    /// Click handler: extracts the text of a fixed XPath location and shows it in a message box.
    /// </summary>
    private void button5_Click(object sender, EventArgs e)
    {
        const string linkXPath = "/html/body/table[3]/tbody/tr[1]/td[2]/a[2]";
        const string dataKind = "text";

        MessageBox.Show(extract(linkXPath, dataKind));
    }

我还附上了我自己使用的代码,它更长一些,但同样可以工作,也许有人会用得上。该代码的作者是 Web 自动化应用程序 Đinh Công Thắng。此致问候。