使用XPath在不同标签之间搜索文本

时间:2014-02-17 21:54:56

标签: c# html xpath html-agility-pack

我正在使用HtmlAgilityPack,我需要在Html文档中找到一个短语。例如,我有一份文件:

<!DOCTYPE html>
<html>
<body>

<h1>aaa Heading ilo araferi</h1>

Thats <p>My <b>first</b> paragraph.</p>
<p>My second paragraph.</p>
<p>My third paragraph.</p>

</body>
</html>

我想将"Thats <p>My <b>first</b> paragraph.</p>"包裹在span内。为此,我需要找到所有出现的文本(没有html标签)。例如

  

Thats My first paragraph.

换句话说,我希望短语"Thats My first paragraph."能够与"Thats <p>My <b>first</b> paragraph.</p>"匹配。

问题是,我不知道如何为这个特定任务执行XPath查询。任何帮助将不胜感激。感谢

1 个答案:

答案 0 :(得分:1)

编辑:已更新,使得在用 span 包裹之后 HTML 仍然有效

using System.Collections.Generic;
using System.IO;
using System.Text;
using HtmlAgilityPack;
using System;

namespace Test {
  /// <summary>
  /// Demo program: locates a phrase whose characters are spread over several
  /// HTML elements and wraps the nodes that contain it in a single span element.
  /// </summary>
  class Program {
    static void Main(string[] args) {
      var markup = @"<!DOCTYPE html>
    <html>
    <body>

    <h1>aaa Heading ilo araferi</h1>

    Thats <p>My <b>first</b> paragraph.</p>
    <p>My second paragraph.</p>
    <p>My third paragraph.</p>

    </body>
    </html>";
      var doc = new HtmlDocument();
      doc.LoadHtml(markup);

      WrapPhraseInSpan(doc, "Thats My first paragraph.");

      // Serialize the (possibly modified) document back to a string.
      using (var writer = new StringWriter()) {
        doc.Save(writer);
        markup = writer.ToString();
      }
    }

    /// <summary>
    /// Searches the concatenated content of all text nodes of <paramref name="doc"/>
    /// for the first occurrence of <paramref name="keyword"/> and wraps the run of
    /// sibling nodes that contains the match in a newly created span element.
    /// </summary>
    /// <remarks>
    /// Only whole nodes are moved: text that shares a node with the start or end
    /// of the match ends up inside the span as well.
    /// </remarks>
    /// <returns>True when a span was inserted; false when nothing was changed.</returns>
    private static bool WrapPhraseInSpan(HtmlDocument doc, string keyword) {
      // An empty keyword would make the "last matched character" index negative.
      if (string.IsNullOrEmpty(keyword)) {
        return false;
      }

      // SelectNodes yields null (not an empty collection) when nothing matches.
      var textNodes = doc.DocumentNode.SelectNodes("//text()");
      if (textNodes == null) {
        return false;
      }

      // map[i] is the text node that contributed character i of the flattened text.
      var map = new List<HtmlNode>();
      var builder = new StringBuilder();
      foreach (var textNode in textNodes) {
        var text = textNode.InnerHtml;
        builder.Append(text);
        for (var i = 0; i < text.Length; i++) {
          map.Add(textNode);
        }
      }

      // Ordinal search: a culture-sensitive match may cover a different number of
      // characters than keyword.Length, which would break the map lookup below.
      int index = builder.ToString().IndexOf(keyword, StringComparison.Ordinal);
      if (index < 0) {
        return false;
      }

      var firstNode = map[index];
      var lastNode = map[index + keyword.Length - 1];

      var ancestor = Ancestor(firstNode, lastNode);
      // Ancestor() treats a node as its own ancestor, so a match that lies inside a
      // single text node returns that text node; the span has to go into its parent.
      if (ancestor != null && (ReferenceEquals(ancestor, firstNode) || ReferenceEquals(ancestor, lastNode))) {
        ancestor = ancestor.ParentNode;
      }
      if (ancestor == null) {
        return false;
      }

      // Climb from each end of the match to the direct child of the common ancestor.
      while (firstNode != null && !ReferenceEquals(firstNode.ParentNode, ancestor)) {
        firstNode = firstNode.ParentNode;
      }
      while (lastNode != null && !ReferenceEquals(lastNode.ParentNode, ancestor)) {
        lastNode = lastNode.ParentNode;
      }
      if (firstNode == null || lastNode == null) {
        return false;
      }

      var siblings = ancestor.ChildNodes;
      int start = siblings.IndexOf(firstNode);
      int end = siblings.IndexOf(lastNode);
      if (start < 0 || end < start) {
        return false;
      }

      var span = doc.CreateElement("span");
      siblings.Insert(start, span);

      // The span now sits at index start, so the nodes to move begin at start + 1.
      // Each removal shifts the next candidate into that same slot.
      int count = end - start + 1;
      for (var moved = 0; moved < count; moved++) {
        var child = siblings[start + 1];
        siblings.Remove(start + 1);
        span.ChildNodes.Append(child);
      }
      return true;
    }

    /// <summary>
    /// Finds the closest node that lies on the parent chain of both
    /// <paramref name="a"/> and <paramref name="b"/>. Each node counts as part of
    /// its own chain, so the result may be <paramref name="a"/> or
    /// <paramref name="b"/> itself.
    /// </summary>
    /// <returns>The shared node, or null when the two chains never meet.</returns>
    /// <exception cref="ArgumentNullException">Either argument is null.</exception>
    public static HtmlNode Ancestor(HtmlNode a, HtmlNode b) {
      if (a == null) {
        throw new ArgumentNullException("a");
      }
      if (b == null) {
        throw new ArgumentNullException("b");
      }

      // Hash set gives constant-time membership tests while walking b's chain.
      var chainOfA = new HashSet<HtmlNode>();
      for (var current = a; current != null; current = current.ParentNode) {
        chainOfA.Add(current);
      }

      for (var current = b; current != null; current = current.ParentNode) {
        if (chainOfA.Contains(current)) {
          return current;
        }
      }
      return null;
    }

    /// <summary>
    /// Counts the nodes on the path from <paramref name="node"/> up to the top of
    /// its tree, including <paramref name="node"/> itself.
    /// </summary>
    /// <returns>The number of nodes on that path; 0 when <paramref name="node"/> is null.</returns>
    public static int Level(HtmlNode node) {
      int level = 0;
      while (node != null) {
        level++;
        node = node.ParentNode;
      }
      return level;
    }
  }
}