HTML Diff工具API

时间:2012-04-23 18:54:58

标签: html api diff compare double-byte

我正在寻找一个能直观地显示结构,字符/单词和样式的html差异的api。此工具还必须支持双字节字符,并且足够灵活,我可以将其添加到现有网站,以便轻松显示比较结果。我目前正在使用组件软件COM实现,它不支持双字节字符,并且在大约六年内没有更新。

2 个答案:

答案 0 :(得分:0)

我发现可以执行此类操作的工具只有两个:http://changedetection.com 和 http://imnosy.com 。两者都可以指定一个网址并监视其内容的变化。

答案 1 :(得分:0)

这就是我使用的:

http://code.google.com/p/google-diff-match-patch/

我必须编写自己的方法来进行比较,但经过一些工作后效果看起来很好。此实现比较的是传入的文本,因此如果您只是比较2个文本字符串,它可以正常工作。我的diff_prettyHtml方法已更改为:

/// <summary>
/// Renders a list of diffs as display-only HTML: each diff's text is HTML-escaped,
/// newlines become &lt;br&gt;, insertions are wrapped in &lt;ins&gt;, deletions in
/// &lt;del&gt; and unchanged text in &lt;span&gt;.
/// </summary>
/// <param name="diffs">The diffs to render, in output order.</param>
/// <returns>The concatenated HTML for all diffs.</returns>
public string diff_prettyHtml(List<Diff> diffs)
    {
        StringBuilder output = new StringBuilder();
        foreach (Diff entry in diffs)
        {
            // Escape '&' first so the entities produced below are not escaped again.
            string escaped = entry.text;
            escaped = escaped.Replace("&", "&amp;");
            escaped = escaped.Replace("<", "&lt;");
            escaped = escaped.Replace(">", "&gt;");
            escaped = escaped.Replace("\n", "<br>");

            if (entry.operation == Operation.INSERT)
            {
                output.Append("<ins class='diff'>");
                output.Append(escaped);
                output.Append("</ins>");
            }
            else if (entry.operation == Operation.DELETE)
            {
                output.Append("<del class='diff'>");
                output.Append(escaped);
                output.Append("</del>");
            }
            else if (entry.operation == Operation.EQUAL)
            {
                output.Append("<span>");
                output.Append(escaped);
                output.Append("</span>");
            }
        }
        return output.ToString();
    }

现在,如果您想对2个html字符串进行比较预览,这有点不同。这就是我所做的:

// Diff the two HTML strings and hand the result straight to diff_previewHtml.
DiffMatchPatch.diff_match_patch engine = new DiffMatchPatch.diff_match_patch();
                return engine.diff_previewHtml(engine.diff_main(oldHtml, newHtml));


/// <summary>
/// Renders a list of diffs as HTML without escaping the diff text: insertions are
/// wrapped in &lt;ins&gt;, deletions in &lt;del&gt;, and unchanged text is emitted as-is.
/// </summary>
/// <param name="diffs">The diffs to render, in output order.</param>
/// <returns>The concatenated markup for all diffs.</returns>
public string diff_previewHtml(List<Diff> diffs) {
      StringBuilder output = new StringBuilder();
      foreach (Diff entry in diffs) {
        // The text is appended verbatim, so any markup it contains stays live.
        string raw = entry.text;
        if (entry.operation == Operation.INSERT) {
          output.Append("<ins class='diff'>");
          output.Append(raw);
          output.Append("</ins>");
        } else if (entry.operation == Operation.DELETE) {
          output.Append("<del class='diff'>");
          output.Append(raw);
          output.Append("</del>");
        } else if (entry.operation == Operation.EQUAL) {
          output.Append(raw);
        }
      }
      return output.ToString();
    }

Unicoder 类如下:

using System.Collections;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using System.Linq;

namespace HtmlCompare
{
    /// <summary>
    /// Swaps HTML tags for placeholder characters so markup can be run through a
    /// plain-text diff, then restores the tags and tidies the &lt;ins&gt;/&lt;del&gt;
    /// wrappers so that they do not enclose unbalanced tags.
    /// </summary>
    /// <remarks>
    /// Placeholders are handed out sequentially starting at U+AC00 (44032). Input that
    /// already contains a code point that has been handed out cannot be told apart from
    /// a placeholder when the tags are restored.
    /// </remarks>
    class Unicoder
    {
        // In practice this matches a single "<...>" run: the first alternative accepts
        // everything from '<' up to the next '>', and the trailing ".*?" is lazy.
        private const string _htmlPattern = @"<(S*?)[^>]*>.*?|<.*?\/>";
        private const RegexOptions _tagRegexOptions = RegexOptions.IgnoreCase | RegexOptions.Multiline;

        // char.ConvertFromUtf32 rejects the surrogate range, so the counter jumps over it.
        private const int _firstSurrogate = 0xD800;
        private const int _firstAfterSurrogates = 0xE000;

        // Characters that end a tag name inside "<name ...>", "<name/>" or "</name>".
        private static readonly char[] _tagNameTerminators = { ' ', '\t', '\r', '\n', '/', '>' };

        // tag text -> placeholder, and placeholder code point -> tag text.
        private readonly Dictionary<string, string> _htmlHash = new Dictionary<string, string>();
        private readonly Dictionary<int, string> _tagsByCodePoint = new Dictionary<int, string>();

        // Tags that always stay inside the diff wrapper; compared case-insensitively.
        private readonly HashSet<string> _blockElements =
            new HashSet<string>("img,br".Split(','), System.StringComparer.OrdinalIgnoreCase);

        private int _currentHash = 44032;

        /// <summary>
        /// Returns the placeholder for <paramref name="tag"/>, allocating the next free
        /// code point the first time a given tag text is seen.
        /// </summary>
        /// <param name="tag">The exact tag text, e.g. <c>&lt;b&gt;</c>.</param>
        /// <returns>The placeholder string standing in for the tag.</returns>
        public string pushHash(string tag)
        {
            string placeholder;
            if (!_htmlHash.TryGetValue(tag, out placeholder))
            {
                if (_currentHash == _firstSurrogate)
                {
                    _currentHash = _firstAfterSurrogates;
                }

                placeholder = char.ConvertFromUtf32(_currentHash);
                _htmlHash[tag] = placeholder;
                _tagsByCodePoint[_currentHash] = tag;
                _currentHash++;
            }
            return placeholder;
        }

        // Regex callback: replaces one matched tag with its placeholder.
        private string tagMatch(Match tag)
        {
            return pushHash(tag.Value);
        }

        /// <summary>
        /// Replaces every tag in <paramref name="html"/> with its placeholder.
        /// </summary>
        /// <param name="html">The markup to encode.</param>
        /// <returns>The input with each tag replaced by a placeholder.</returns>
        public string html2plain(string html)
        {
            return Regex.Replace(html, _htmlPattern, tagMatch, _tagRegexOptions);
        }

        /// <summary>
        /// Restores the tags behind the placeholders in <paramref name="plain"/> and then
        /// runs <see cref="CleanUpMisplacedDiffTags"/> on the result.
        /// </summary>
        /// <param name="plain">Text containing placeholders issued by this instance.</param>
        /// <returns>The restored, cleaned-up markup.</returns>
        /// <exception cref="System.ArgumentNullException"><paramref name="plain"/> is null.</exception>
        public string plain2html(string plain)
        {
            if (plain == null)
            {
                throw new System.ArgumentNullException("plain");
            }

            // Single literal pass: tag text is never interpreted as a regex replacement
            // pattern (so "$0" in an attribute survives), and a restored tag is never
            // scanned again for placeholders.
            StringBuilder html = new StringBuilder(plain.Length);
            int position = 0;
            while (position < plain.Length)
            {
                int width = char.IsSurrogatePair(plain, position) ? 2 : 1;
                int codePoint = width == 2 ? char.ConvertToUtf32(plain, position) : plain[position];

                string tag;
                if (_tagsByCodePoint.TryGetValue(codePoint, out tag))
                {
                    html.Append(tag);
                }
                else
                {
                    html.Append(plain, position, width);
                }
                position += width;
            }

            return CleanUpMisplacedDiffTags(html.ToString());
        }

        /// <summary>
        /// Rewrites every &lt;ins&gt;/&lt;del&gt; element in <paramref name="html"/> so that
        /// unmatched tags inside it are moved outside the wrapper.
        /// </summary>
        /// <remarks>
        /// '.' does not match '\n' under the options used here, so a diff element whose
        /// contents span several lines is not matched and is left untouched.
        /// </remarks>
        /// <param name="html">Markup containing diff wrappers.</param>
        /// <returns>The rewritten markup.</returns>
        public string CleanUpMisplacedDiffTags(string html)
        {
            return Regex.Replace(html, @"(\<((ins|del).*?)\>)(.*?)(\<\/((ins|del).*?)\>)", DiffTagMatch, RegexOptions.IgnoreCase | RegexOptions.Multiline);
        }

        // Regex callback for one diff element: group 1 is the opening wrapper, group 4 the
        // contents and group 5 the closing wrapper.
        private string DiffTagMatch(Match tag)
        {
            string tagStart = tag.Groups[1].Value;
            string tagEnd = tag.Groups[5].Value;
            string contents = tag.Groups[4].Value;

            if (string.IsNullOrEmpty(contents))
            {
                return string.Empty; // we don't want the ins/del tag around nothing
            }
            if (string.IsNullOrWhiteSpace(contents))
            {
                // spacing should be kept
                return string.Concat(tagStart, "&nbsp;", contents.Replace("\n", "<br />"), tagEnd);
            }
            return ProcessDiffTag(tagStart, tagEnd, contents);
        }

        // Splits the contents of one diff element into text and tag parts, pairs opening
        // and closing tags, and rebuilds the element around the parts that belong inside.
        private string ProcessDiffTag(string tagStart, string tagEnd, string contents)
        {
            MatchCollection matches = Regex.Matches(contents, _htmlPattern, _tagRegexOptions);
            if (matches.Count == 0)
            {
                return string.Concat(tagStart, contents, tagEnd);
            }

            List<TagDefinition> parts = new List<TagDefinition>();
            int cursor = 0;
            foreach (Match currentMatch in matches)
            {
                // Text between the previous tag (or the start) and this tag.
                if (currentMatch.Index > cursor)
                {
                    AddTagDefinition(parts, CreateTextPart(contents.Substring(cursor, currentMatch.Index - cursor)));
                }

                TagDefinition definition = new TagDefinition();
                definition.Tag = true;
                definition.OpeningTag = !IsClosingTag(currentMatch.Value);
                definition.TagType = GetTagType(currentMatch.Value);
                definition.Text = currentMatch.Value;
                AddTagDefinition(parts, definition);

                cursor = currentMatch.Index + currentMatch.Length;
            }

            // Text after the last tag, kept regardless of what preceded that tag.
            if (cursor < contents.Length)
            {
                AddTagDefinition(parts, CreateTextPart(contents.Substring(cursor)));
            }

            return GoThroughDiffParts(parts, tagStart, tagEnd);
        }

        // Builds a part that represents plain text rather than a tag.
        private static TagDefinition CreateTextPart(string text)
        {
            TagDefinition definition = new TagDefinition();
            definition.Text = text;
            return definition;
        }

        // Extracts the element name from a tag: "<img src=..>" -> "img", "</b>" -> "b",
        // "<br/>" -> "br".
        private string GetTagType(string tag)
        {
            int startIndex = tag.StartsWith("</", System.StringComparison.Ordinal) ? 2 : 1; // skip "<" or "</"
            int endIndex = tag.IndexOfAny(_tagNameTerminators, startIndex);
            if (endIndex == -1)
            {
                endIndex = tag.Length;
            }
            return tag.Substring(startIndex, endIndex - startIndex);
        }

        // Assembles [unmatched opening tags]<ins>[text and balanced tags]</ins>[unmatched
        // closing tags]. The wrapper is omitted when nothing is left to wrap.
        private string GoThroughDiffParts(List<TagDefinition> parts, string startTag, string endTag)
        {
            StringBuilder before = new StringBuilder();
            StringBuilder middle = new StringBuilder();
            StringBuilder after = new StringBuilder();

            foreach (TagDefinition part in parts)
            {
                bool unmatchedTag = part.Tag
                    && part.MatchingIndex == -1
                    && !_blockElements.Contains(part.TagType);

                if (!unmatchedTag)
                {
                    middle.Append(part.Text);
                }
                else if (part.OpeningTag)
                {
                    before.Append(part.Text);
                }
                else
                {
                    after.Append(part.Text);
                }
            }

            string inner = middle.ToString();
            StringBuilder result = new StringBuilder();
            result.Append(before);
            if (inner.Length > 0) // we don't want the ins/del tag around nothing
            {
                if (string.IsNullOrWhiteSpace(inner)) // spacing should be kept
                {
                    inner = "&nbsp;" + inner.Replace("\n", "<br />");
                }
                result.Append(startTag).Append(inner).Append(endTag);
            }
            result.Append(after);

            return result.ToString();
        }

        // A closing tag is one that starts with "</".
        private bool IsClosingTag(string tag)
        {
            return tag.StartsWith("</", System.StringComparison.Ordinal);
        }

        // Appends a part to the list. A closing tag is paired with the nearest preceding
        // opening tag of the same element name that has no partner yet.
        private void AddTagDefinition(List<TagDefinition> list, TagDefinition tag)
        {
            if (tag.Tag && !tag.OpeningTag)
            {
                for (int index = list.Count - 1; index >= 0; index--)
                {
                    TagDefinition candidate = list[index];
                    if (candidate.Tag
                        && candidate.OpeningTag
                        && candidate.MatchingIndex == -1
                        && string.Equals(candidate.TagType, tag.TagType, System.StringComparison.OrdinalIgnoreCase))
                    {
                        tag.MatchingIndex = index;
                        candidate.MatchingIndex = list.Count; // index the new part will get
                        break;
                    }
                }
            }

            list.Add(tag);
        }

        // One piece of a diff element's contents: either a run of text or a single tag.
        private class TagDefinition
        {
            // True for a tag, false for plain text.
            public bool Tag { get; set; }
            // Element name for tags; empty for text.
            public string TagType { get; set; }
            // The exact source text of this part.
            public string Text { get; set; }
            // Index of the paired tag in the parts list, or -1 when unpaired.
            public int MatchingIndex { get; set; }
            // True for tags that do not start with "</".
            public bool OpeningTag { get; set; }

            public TagDefinition()
            {
                this.Tag = false;
                this.Text = string.Empty;
                this.TagType = string.Empty;
                this.MatchingIndex = -1;
                this.OpeningTag = false;
            }
        }
    }
}