在C#或Java中解析自定义HTML列表标记

时间:2014-12-16 14:17:33

标签: c# regex custom-tags

我有一些这样的文字:

This is a simple line
[olist]
    [#]This is line 1
    [#]This is line 2
        [olist]
            [#]This is line 2.1
            [#]This is line 2.2
            [#]This is line 2.3
    and it continues here
        [/olist]
    [#]This is line 3
[/olist]
Another line

如何在C#中将其解析为HTML,如下所示

This is a simple line
<ol>
    <li>This is line 1</li>
    <li>This is line 2
        <ol>
            <li>This is line 2.1</li>
            <li>This is line 2.2</li>
            <li>This is line 2.3
    and it continues here</li>
        </ol>
    </li>
    <li>This is line 3</li>
</ol>
Another line

我目前正在拆分和连接,但子列表未正确处理。

更新: - 示例代码

这就是我目前正在做的事情。

// Rewrite the [olist] markup in customHtml as <ol> lists (customHtml is defined outside this snippet).
var html = ReplaceList(customHtml,"olist","ol");

/// <summary>
/// Replaces every <c>[key]…[/key]</c> section of <paramref name="text"/> with an HTML list
/// element, turning each <c>[#]</c>-prefixed entry into an <c>&lt;li&gt;</c>.
/// </summary>
/// <remarks>
/// Lists are converted innermost-first: the first closing tag is paired with the nearest
/// opening tag that precedes it. By the time an outer list is converted, its nested lists are
/// already plain HTML inside the owning entry, so nesting of any depth is handled.
/// Entry text is trimmed but otherwise copied verbatim (it is not HTML-encoded).
/// </remarks>
/// <param name="text">Text containing the custom list markup.</param>
/// <param name="key">Name of the custom list tag, e.g. <c>olist</c>.</param>
/// <param name="tag">Name of the HTML element to emit, e.g. <c>ol</c>.</param>
/// <returns>The text with all balanced list sections converted; unmatched tags are left as-is.</returns>
private static string ReplaceList(string text, string key, string tag)
{
    const string itemMarker = "[#]";
    var openTag = "[" + key + "]";
    var closeTag = "[/" + key + "]";

    // Everything before this index has been scanned and holds no convertible list.
    var searchFrom = 0;

    while (true)
    {
        var closeAt = text.IndexOf(closeTag, searchFrom, StringComparison.Ordinal);
        if (closeAt < 0)
        {
            break;
        }

        // The nearest opening tag before the first closing tag delimits an innermost list.
        var openAt = text.Substring(0, closeAt).LastIndexOf(openTag, StringComparison.Ordinal);
        if (openAt < 0)
        {
            // Stray closing tag without an opener: leave it in place and keep scanning after it.
            searchFrom = closeAt + closeTag.Length;
            continue;
        }

        var bodyStart = openAt + openTag.Length;
        var body = text.Substring(bodyStart, closeAt - bodyStart);

        var items = new StringBuilder();
        foreach (var entry in body.Split(new[] { itemMarker }, StringSplitOptions.RemoveEmptyEntries))
        {
            // Whitespace-only fragments (e.g. the indentation before the first marker) are not entries.
            if (!string.IsNullOrWhiteSpace(entry))
            {
                items.Append("<li>").Append(entry.Trim()).Append("</li>");
            }
        }

        text = text.Substring(0, openAt)
               + "<" + tag + ">" + items + "</" + tag + ">"
               + text.Substring(closeAt + closeTag.Length);
    }

    return text;
}

/// <summary>
/// Returns the text between the first <c>[key]</c> tag and the first <c>[/key]</c> tag that
/// follows it, or <c>null</c> when no such pair exists.
/// </summary>
/// <param name="text">Text to search.</param>
/// <param name="key">Name of the custom tag, without brackets.</param>
/// <returns>The enclosed text (tags excluded), or <c>null</c> if either tag is missing.</returns>
private static string GetListEntry(string text, string key)
{
    string opening = "[" + key + "]";
    string closing = "[/" + key + "]";

    int openAt = text.IndexOf(opening, StringComparison.Ordinal);
    if (openAt < 0)
    {
        return null;
    }

    // Only a closing tag located after the opening tag counts.
    int closeAt = text.IndexOf(closing, openAt, StringComparison.Ordinal);
    if (closeAt <= openAt)
    {
        return null;
    }

    int bodyStart = openAt + opening.Length;
    return text.Substring(bodyStart, closeAt - bodyStart);
}

请注意:某些列表项会跨越多行,其内容中也可能包含换行符。

2 个答案:

答案 0 :(得分:1)

首先必须将其解析为某个抽象树,然后从抽象树中组合结果。 即:

/// <summary>
/// A node of the list tree built from the custom markup.
/// </summary>
public interface IElement
{
  /// <summary>
  /// Gets the node this element was created under.
  /// </summary>
  IElement Parent { get; }

  /// <summary>
  /// Attaches <paramref name="element"/> to this node as a child.
  /// </summary>
  /// <param name="element">The child node to attach.</param>
  void AddElement(IElement element);
}

/// <summary>
/// An ordered list node; its children are the list items it contains.
/// </summary>
class OlElement : IElement
{
  /// <summary>Items of this list, in insertion order.</summary>
  public IList<LiElement> Elements { get; set; }

  /// <summary>Node this list was created under.</summary>
  public IElement Parent { get; set; }

  /// <summary>Creates an empty list under <paramref name="parent"/>.</summary>
  /// <param name="parent">The owning node.</param>
  public OlElement(IElement parent)
  {
    Parent = parent;
    Elements = new List<LiElement>();
  }

  /// <summary>Appends a list item to this list.</summary>
  /// <param name="element">The item to add; must be a <see cref="LiElement"/>.</param>
  /// <exception cref="InvalidCastException"><paramref name="element"/> is not a <see cref="LiElement"/>.</exception>
  public void AddElement(IElement element)
  {
    Elements.Add((LiElement)element);
  }

  /// <summary>Renders the list as <c>&lt;ol&gt;</c> markup, one item per line.</summary>
  /// <returns>The HTML for this list, ending with a line terminator.</returns>
  public override string ToString()
  {
    var builder = new StringBuilder();
    builder.AppendLine("<ol>");
    foreach (var child in Elements)
    {
      // Strip the terminator the item may already end with, so AppendLine does not
      // leave an empty line after every item.
      builder.AppendLine(child.ToString().TrimEnd('\r', '\n'));
    }
    builder.AppendLine("</ol>");
    return builder.ToString();
  }
}

/// <summary>
/// A list item node: its own text plus any lists nested inside it.
/// </summary>
class LiElement : IElement
{
  /// <summary>Text of the item, rendered before any nested list.</summary>
  public string Text { get; set; }

  /// <summary>Node this item was created under.</summary>
  public IElement Parent { get; set; }

  /// <summary>Lists nested inside this item, in insertion order.</summary>
  public IList<OlElement> Elements { get; set; }

  /// <summary>Creates an item with the given text under <paramref name="parent"/>.</summary>
  /// <param name="parent">The owning node.</param>
  /// <param name="text">Text of the item.</param>
  public LiElement(IElement parent, string text)
  {
    Parent = parent;
    Text = text;
    Elements = new List<OlElement>();
  }

  /// <summary>Nests a list inside this item.</summary>
  /// <param name="element">The list to nest; must be an <see cref="OlElement"/>.</param>
  /// <exception cref="InvalidCastException"><paramref name="element"/> is not an <see cref="OlElement"/>.</exception>
  public void AddElement(IElement element)
  {
    Elements.Add((OlElement)element);
  }

  /// <summary>Renders the item as <c>&lt;li&gt;</c> markup, including nested lists.</summary>
  /// <returns>The HTML for this item, ending with a line terminator.</returns>
  public override string ToString()
  {
    var builder = new StringBuilder();
    builder.Append("<li>");
    builder.Append(Text);
    if (Elements.Count > 0)
    {
      // Start nested lists on their own line instead of gluing "<ol>" to the item text.
      builder.AppendLine();
      foreach (var child in Elements)
      {
        // Strip the terminator the nested list may already end with, so AppendLine
        // does not leave an empty line before "</li>".
        builder.AppendLine(child.ToString().TrimEnd('\r', '\n'));
      }
    }
    builder.AppendLine("</li>");
    return builder.ToString();
  }
}

获得结果:

// Sample input: one ordered list with a nested list inside its second item.
const string text = @"[olist]
[#]This is line 1
[#]This is line 2
    [olist]
        [#]This is line 2.1
        [#]This is line 2.2
        [#]This is line 2.3
    [/olist]
[#]This is line 3
[/olist]";

// A tag line is optional indentation, "[tag]", then the rest of the line as text.
var regex = new Regex(@"^\s*\[(?<tag>[^\]]+)\](?<text>.*)$");
var root = new OlElement(null);

// The node that receives the next tag: the innermost list or item still open.
var currentElement = (IElement)root;
using (var reader = new StringReader(text))
{
  string line;
  while ((line = reader.ReadLine()) != null)
  {
    var match = regex.Match(line);

    // A null tag name (line is not a tag) falls through to the default branch below.
    var tagName = match.Success ? match.Groups["tag"].Value : null;
    switch (tagName)
    {
      case "#":
        if (currentElement is OlElement)
        {
          // First item of the list that is currently open.
          var child = new LiElement(currentElement, match.Groups["text"].Value);
          currentElement.AddElement(child);
          currentElement = child;
          break;
        }
        if (currentElement is LiElement)
        {
          // Sibling item: attach it to the list that owns the current item.
          var child = new LiElement(currentElement.Parent, match.Groups["text"].Value);
          currentElement.Parent.AddElement(child);
          currentElement = child;
        }
        break;
      case "olist":
        if (currentElement == root)
        {
          // The outermost [olist] is represented by the pre-created root.
          break;
        }
        if (currentElement is LiElement)
        {
          var child = new OlElement(currentElement);
          currentElement.AddElement(child);
          currentElement = child;
        }
        break;
      case "/olist":
        if (currentElement is LiElement)
        {
          // Close both the open item and the list that contains it.
          currentElement = currentElement.Parent.Parent;
          break;
        }
        if (currentElement is OlElement)
        {
          currentElement = currentElement.Parent;
        }
        break;
      default:
        {
          // Plain text (or an unknown tag): a continuation of the item that is currently
          // open. Previously such lines were dropped, losing multi-line item text.
          // Note: LiElement renders Text before its nested lists, so text that follows a
          // nested list in the input is emitted ahead of that list.
          var openItem = currentElement as LiElement;
          if (openItem != null && !string.IsNullOrWhiteSpace(line))
          {
            openItem.Text += Environment.NewLine + line;
          }
          break;
        }
    }
  }
}
var result = root.ToString();

答案 1 :(得分:0)

考虑以下方法(注意:它在识别标签时采用的是"快速而粗糙"的做法)。

思路非常简单:逐行读取文本并进行转换(转换时向前查看下一行,并统计嵌套子列表的深度)。

// Sample input with three levels of nesting.
string src = @"[olist]
    [#]This is line 1
    [#]This is line 2
        [olist]
            [#]This is line 2.1
                [olist]
                    [#]This is line 2.1.1
                    [#]This is line 2.1.2
                [/olist]
            [#]This is line 2.2
            [#]This is line 2.3
        [/olist]
    [#]This is line 3
[/olist]";


var sb = new StringBuilder();

// Split on every line-ending style: a verbatim literal carries the line endings of the
// source file, which need not match Environment.NewLine on the machine running the code.
var lines = src.Split(new[] { "\r\n", "\n", "\r" }, StringSplitOptions.RemoveEmptyEntries);
int i = 0;

// Number of <li> elements left open because a nested list follows them.
int innerListsCount = 0;

while (i < lines.Length)
{
    string line = lines[i];
    if (line.EndsWith("[olist]", StringComparison.Ordinal))
    {
        sb.AppendLine(line.Replace("[olist]", "<ol>"));
    }
    else if (line.EndsWith("[/olist]", StringComparison.Ordinal))
    {
        sb.AppendLine(line.Replace("[/olist]", "</ol>"));
        if (innerListsCount > 0)
        {
            // Close the <li> that hosted the list just closed, indented by nesting depth.
            for (int j = 0; j <= innerListsCount; j++)
            {
                sb.Append("    ");
            }

            sb.AppendLine("</li>");

            // Decrement only when an item was actually pending, so the counter never goes
            // negative and a later top-level list is still handled correctly.
            innerListsCount--;
        }
    }
    else if (line.Trim().StartsWith("[#]", StringComparison.Ordinal))
    {
        sb.Append(line.Replace("[#]", "<li>"));

        // Look ahead one line; the bounds check must cover i + 1, otherwise an item on the
        // last line throws IndexOutOfRangeException.
        if (i + 1 < lines.Length && lines[i + 1].EndsWith("[olist]", StringComparison.Ordinal))
        {
            // A nested list follows: leave the <li> open until that list is closed.
            innerListsCount++;
            sb.AppendLine();
        }
        else
        {
            sb.AppendLine("</li>");
        }
    }

    i++;
}

Console.WriteLine(sb.ToString());

输出看起来完全符合您的要求:

<ol>
    <li>This is line 1</li>
    <li>This is line 2
        <ol>
            <li>This is line 2.1
                <ol>
                    <li>This is line 2.1.1</li>
                    <li>This is line 2.1.2</li>
                </ol>
            </li>
            <li>This is line 2.2</li>
            <li>This is line 2.3</li>
        </ol>
        </li>
    <li>This is line 3</li>
</ol>