Question

这是我一直遇到的一些困难。我有一个本地客户端脚本，需要允许用户获取远程网页并搜索结果页面的表单。为了做到这一点（没有正则表达式），我需要将文档解析为完全可遍历的DOM对象。

我想强调的一些限制：

我不想使用库（比如jQuery）。对于我在这里要做的事情来说，它们太臃肿了。
在任何情况下都不应执行远程页面中的脚本（出于安全原因）。
需要提供诸如getElementsByTagName之类的DOM API。
它只需要在Internet Explorer中工作，但至少要支持IE 7。
请假设我无法访问服务器。我其实可以访问，但不能把它用于这个目的。

我尝试了什么

假设我在变量html中有一个完整的HTML文档字符串（包括DOCTYPE声明），这是我到目前为止所尝试的内容：

// Shared setup for the first three attempts: a detached DocumentFragment that
// holds a single <div>; `html` is the full document string described above.
var frag = document.createDocumentFragment(),
div  = frag.appendChild(document.createElement("div"));

// Attempt 1: replace the <div> itself with the markup.
div.outerHTML = html;
//-> results in an empty fragment

// Attempt 2: insert the markup as a sibling right after the <div>.
div.insertAdjacentHTML("afterEnd", html);
//-> HTML is not added to the fragment

// Attempt 3: use the markup as the content of the <div>.
div.innerHTML = html;
//-> Error (expected, but I tried it anyway)

// Attempt 4: write the markup into an "htmlfile" ActiveX document instead.
var doc = new ActiveXObject("htmlfile");
doc.write(html);
doc.close();
//-> JavaScript executes

我还尝试从HTML中提取<head>和<body>节点，并将它们添加到片段内的<HTML>元素中，但仍然没有成功。

有没有人有任何想法？

Answer 1

小提琴：http://jsfiddle.net/JFSKe/6/

DocumentFragment 没有实现所需的DOM方法。将document.createElement与innerHTML结合使用会删除<head>和<body>标记（即使创建的元素是根元素<html>）。因此，应该另寻解决方案。我创建了一个跨浏览器的“字符串转DOM”函数，它使用了一个不可见的内联框架（iframe）。

将禁用所有外部资源和脚本。有关详细信息，请参阅代码说明。

代码

/*
 Converts an HTML string into a traversable DOM by writing it into a hidden
 <iframe> that is appended to the current document's body.

 @param String html    The string with HTML which has to be converted to a DOM
                        object. It is passed through sanitiseHTML() first.
 @param func callback  (optional) Callback(HTMLDocument doc, function destroy)
 @returns              undefined if callback exists, else: Object
                        HTMLDocument doc  DOM fetched from Parameter:html
                        function destroy  Removes the iframe holding doc from
                                          the page. Safe to call more than once. */
function string2dom(html, callback){
    /* Sanitise the string */
    html = sanitiseHTML(html); /*Defined below*/

    /* Create an invisible IFrame and attach it to the page */
    var iframe = document.createElement("iframe");
    iframe.style.display = "none";
    document.body.appendChild(iframe);

    /* contentDocument where available, contentWindow.document otherwise */
    var doc = iframe.contentDocument || iframe.contentWindow.document;
    /* open() / close() around write() so the frame holds only the given HTML */
    doc.open();
    doc.write(html);
    doc.close();

    function destroy(){
        /* parentNode is null once the frame has been detached, so a repeated
           call is a no-op instead of a TypeError */
        var parent = iframe.parentNode;
        if(parent) parent.removeChild(iframe);
    }
    if(callback) callback(doc, destroy);
    else return {"doc": doc, "destroy": destroy};
}

/* @name sanitiseHTML
   @param String html  A string representing HTML code
   @return String      A new string, stripped of scripts and external resources.
                       <script> elements and <meta http-equiv=refresh> tags are
                       replaced by HTML comments. Event handlers (on*), all
                       "external" attributes (href, src, ...) and CSS
                       url() / expression() are prefixed by data-.
                       Every tag matched by one of the selectors is preceded by
                       the marker <!--"'--> */

function sanitiseHTML(html){
    /* Adds a <!--"'--> before every matched tag, so that unterminated quotes
        aren't preventing the browser from splitting a tag. Test case:
       '<input style="foo;b:url(0);><input onclick="<input type=button onclick="too() href=;>">' */
    var prefix = "<!--\"'-->";
    /*Attributes should not be prefixed by these characters. This list is not
     complete, but will be sufficient for this function.
      (see http://www.w3.org/TR/REC-xml/#NT-NameChar) */
    var att = "[^-a-z0-9:._]";
    /* Start of an opening tag */
    var tag = "<[a-z]";
    /* Any run of tag content: quoted strings are skipped as a whole, then the
       remainder up to the next < or > */
    var any = "(?:[^<>\"']*(?:\"[^\"]*\"|'[^']*'))*?[^<>]*";
    /* End of a tag: either > or the start of the next tag (look-ahead) */
    var etag = "(?:>|(?=<))";

    /* An entity ends at ";" or where no further decimal digit follows */
    var entityEnd = "(?:;|(?!\\d))";
    /* Pre-built patterns for characters that need special treatment; ae()
       also uses this object as its cache for whole strings */
    var ents = {" ":"(?:\\s|&nbsp;?|&#0*32"+entityEnd+"|&#x0*20"+entityEnd+")",
                "(":"(?:\\(|&#0*40"+entityEnd+"|&#x0*28"+entityEnd+")",
                ")":"(?:\\)|&#0*41"+entityEnd+"|&#x0*29"+entityEnd+")",
                ".":"(?:\\.|&#0*46"+entityEnd+"|&#x0*2e"+entityEnd+")"};
                /*Placeholder to avoid tricky filter-circumventing methods*/
    /* Cache of per-character patterns built by ae() */
    var charMap = {};
    var s = ents[" "]+"*"; /* Short-hand space */
    /*
      @name ae
      @description          Converts a given string in a sequence of the
                             original input and the HTML entity: the returned
                             RegExp source matches every character literally
                             (lower case) or as a decimal / hexadecimal
                             character reference of its lower or upper case form
      @param String string  String to convert
      @return String        RegExp source; cached in ents for later calls
      */
    function ae(string){
        var all_chars_lowercase = string.toLowerCase();
        if(ents[string]) return ents[string];
        var all_chars_uppercase = string.toUpperCase();
        var RE_res = "";
        for(var i=0; i<string.length; i++){
            var char_lowercase = all_chars_lowercase.charAt(i);
            if(charMap[char_lowercase]){
                RE_res += charMap[char_lowercase];
                continue;
            }
            var char_uppercase = all_chars_uppercase.charAt(i);
            var RE_sub = [char_lowercase];
            RE_sub.push("&#0*" + char_lowercase.charCodeAt(0) + entityEnd);
            RE_sub.push("&#x0*" + char_lowercase.charCodeAt(0).toString(16) + entityEnd);
            if(char_lowercase != char_uppercase){
                RE_sub.push("&#0*" + char_uppercase.charCodeAt(0) + entityEnd);
                RE_sub.push("&#x0*" + char_uppercase.charCodeAt(0).toString(16) + entityEnd);
            }
            RE_sub = "(?:" + RE_sub.join("|") + ")";
            RE_res += (charMap[char_lowercase] = RE_sub);
        }
        return(ents[string] = RE_res);
    }
    /*
      @name by
      @description  second argument for the replace function.
      @param String match   Whole match (unused)
      @param String group1  Text to keep in front of the inserted data-
      @param String group2  The matched attribute / value that gets prefixed
      */
    function by(match, group1, group2){
        /* Adds a data-prefix before every external pointer */
        return group1 + "data-" + group2;
    }
    /*
      @name cr
      @description            Selects a HTML element and performs a
                                  search-and-replace on attributes. Updates the
                                  html variable of sanitiseHTML in place.
      @param String selector  HTML substring to match. Important: Must be
                                  pre- and postfixed by < and >. RE matches a
                                  whole tag! (a RegExp object is used as-is)
      @param String attribute RegExp-escaped; HTML element attribute to match
      @param String marker    Optional RegExp-escaped; marks the prefix
                                  (default: "\\s*=")
      @param String delimiter Optional RegExp escaped; non-quote delimiters
      @param String end       Optional RegExp-escaped; forces the match to
                                  end before an occurence of <end> when
                                  quotes are missing
     */
    function cr(selector, attribute, marker, delimiter, end){
        if(typeof selector == "string") selector = new RegExp(selector, "gi");
        marker = typeof marker == "string" ? marker : "\\s*=";
        delimiter = typeof delimiter == "string" ? delimiter : "";
        end = typeof end == "string" ? end : "";
        /* Makes the unquoted-value branch lazy when an end pattern is given */
        var is_end = end && "?";
        var re1 = new RegExp("("+att+")("+attribute+marker+"(?:\\s*\"[^\""+delimiter+"]*\"|\\s*'[^'"+delimiter+"]*'|[^\\s"+delimiter+"]+"+is_end+")"+end+")", "gi");
        html = html.replace(selector, function(match){
            return prefix + match.replace(re1, by);
        });
    }
    /*
      @name cri
      @description            Selects an attribute of a HTML element, and
                               performs a search-and-replace on certain values.
                               Updates the html variable of sanitiseHTML in place.
      @param String selector  HTML element to match
      @param String attribute RegExp-escaped; HTML element attribute to match
      @param String front     RegExp-escaped; attribute value, prefix to match
      @param String flags     Optional RegExp flags, default "gi" (any
                                  non-string value selects the default)
      @param String delimiter Optional RegExp-escaped; non-quote delimiters
      @param String end       Optional RegExp-escaped; forces the match to
                                  end before an occurence of <end> when
                                  quotes are missing
     */
    function cri(selector, attribute, front, flags, delimiter, end){
        if(typeof selector == "string") selector = new RegExp(selector, "gi");
        flags = typeof flags == "string" ? flags : "gi";
         var re1 = new RegExp("("+att+attribute+"\\s*=)((?:\\s*\"[^\"]*\"|\\s*'[^']*'|[^\\s>]+))", "gi");

        end = typeof end == "string" ? end + ")" : ")";
        /* One value pattern per quoting style of the attribute value */
        var at1 = new RegExp('(")('+front+'[^"]+")', flags);
        var at2 = new RegExp("(')("+front+"[^']+')", flags);
        var at3 = new RegExp("()("+front+'(?:"[^"]+"|\'[^\']+\'|(?:(?!'+delimiter+').)+)'+end, flags);

        var handleAttr = function(match, g1, g2){
            if(g2.charAt(0) == '"') return g1+g2.replace(at1, by);
            if(g2.charAt(0) == "'") return g1+g2.replace(at2, by);
            return g1+g2.replace(at3, by);
        };
        html = html.replace(selector, function(match){
             return prefix + match.replace(re1, handleAttr);
        });
    }

    /* <meta http-equiv=refresh content="  ; url= " > */
    html = html.replace(new RegExp("<meta"+any+att+"http-equiv\\s*=\\s*(?:\""+ae("refresh")+"\""+any+etag+"|'"+ae("refresh")+"'"+any+etag+"|"+ae("refresh")+"(?:"+ae(" ")+any+etag+"|"+etag+"))", "gi"), "<!-- meta http-equiv=refresh stripped-->");

    /* Stripping all scripts */
    /* CDATA-wrapped scripts first: they may contain a literal </script> */
    html = html.replace(new RegExp("<script"+any+">\\s*//\\s*<\\[CDATA\\[[\\S\\s]*?]]>\\s*</script[^>]*>", "gi"), "<!--CDATA script-->");
    /* The end tag may carry whitespace, a slash or junk attributes
       (</script foo>, </script/>); browsers still treat it as closing the
       script, so it has to be accepted here as well */
    html = html.replace(/<script[\S\s]+?<\/script(?:[\s\/][^>]*)?>/gi, "<!--Non-CDATA script-->");
    /* Whitespace is allowed between the handler name and "=", exactly as for
       href / src below; otherwise onclick = "..." would slip through */
    cr(tag+any+att+"on[-a-z0-9:_.]+\\s*="+any+etag, "on[-a-z0-9:_.]+"); /* Event listeners */

    cr(tag+any+att+"href\\s*="+any+etag, "href"); /* Linked elements */
    cr(tag+any+att+"src\\s*="+any+etag, "src"); /* Embedded elements */

    cr("<object"+any+att+"data\\s*="+any+etag, "data"); /* <object data= > */
    cr("<applet"+any+att+"codebase\\s*="+any+etag, "codebase"); /* <applet codebase= > */

    /* <param name=movie value= >*/
    cr("<param"+any+att+"name\\s*=\\s*(?:\""+ae("movie")+"\""+any+etag+"|'"+ae("movie")+"'"+any+etag+"|"+ae("movie")+"(?:"+ae(" ")+any+etag+"|"+etag+"))", "value");

    /* <style> and < style=  > url()*/
    cr(/<style[^>]*>(?:[^"']*(?:"[^"]*"|'[^']*'))*?[^'"]*(?:<\/style|$)/gi, "url", "\\s*\\(\\s*", "", "\\s*\\)");
    cri(tag+any+att+"style\\s*="+any+etag, "style", ae("url")+s+ae("(")+s, 0, s+ae(")"), ae(")"));

    /* IE7- CSS expression() */
    cr(/<style[^>]*>(?:[^"']*(?:"[^"]*"|'[^']*'))*?[^'"]*(?:<\/style|$)/gi, "expression", "\\s*\\(\\s*", "", "\\s*\\)");
    cri(tag+any+att+"style\\s*="+any+etag, "style", ae("expression")+s+ae("(")+s, 0, s+ae(")"), ae(")"));
    /* Collapse runs of markers (a tag may have been matched several times) */
    return html.replace(new RegExp("(?:"+prefix+")+", "g"), prefix);
}

代码说明

sanitiseHTML函数基于我的replace_all_rel_by_abs函数（请参阅this answer）。 sanitiseHTML函数完全被重写，以实现最高效率和可靠性。

此外，还添加了一组新的RegExp，用于删除所有脚本和事件处理程序（包括CSS expression()，IE7-）。为确保按预期解析所有标记，调整后的标记以 <!--"'--> 为前缀。此前缀对于正确解析嵌套的“事件处理程序”以及未终止的引号是必需的：<a id="><input onclick="<div onmousemove=evil()>">。

这些RegExp是使用内部函数cr / cri动态创建的（Create Replace [Inline]）。这些函数接受一组参数，并创建和执行高级的RE替换。为了确保HTML实体不会破坏RegExp（<meta http-equiv=refresh>中的refresh可以用各种方式书写），动态创建的RegExp部分由函数ae构建（Any Entity）。
实际的替换是由函数by完成的（replace by，即“替换为”）。在此实现中，by在所有匹配的属性之前添加data-。

所有出现的<script>//<[CDATA[ .. //]]></script>都会被剥离。此步骤是必需的，因为CDATA部分允许代码中出现</script>字符串。完成这次替换后，就可以安全地进行下一步替换：

删除剩余的<script>...</script>代码。

<meta http-equiv=refresh .. >标记已移除

所有事件侦听器和外部指针/属性（href，src，url()）都以data-为前缀，如前所述。

创建一个IFrame对象。IFrame不太可能泄漏内存（与htmlfile ActiveXObject相反）。IFrame被设置为不可见，并附加到文档中，以便可以访问其DOM。document.write()用于将HTML写入IFrame。document.open()和document.close()用于清空文档的先前内容，以便生成的文档是给定html字符串的精确副本。

如果指定了回调函数，则将使用两个参数调用该函数。 第一个参数是对生成的document对象的引用。 第二个参数是一个函数，它在被调用时会破坏生成的DOM树。当你不再需要树时，应该调用这个函数。
如果没有指定回调函数，函数返回一个由两个属性组成的对象（doc和destroy），其行为与前面提到的两个参数相同。

附加说明

将designMode属性设置为“开”将停止执行脚本（Chrome不支持）。如果您出于特定原因必须保留<script>代码，则可以使用iframe.designMode = "On"代替脚本剥离功能。

我无法找到关于htmlfile ActiveXObject的可靠来源。根据此来源（见下文“值得注意的参考”中的最后一项），htmlfile比IFrame慢，并且更容易受到内存泄漏的影响。

所有受影响的属性（href，src，...）都以data-为前缀。获取/更改这些属性的示例显示为data-href：
elem.getAttribute("data-href")和elem.setAttribute("data-href", "...")
elem.dataset.href和elem.dataset.href = "..."。

已禁用外部资源。因此，页面可能看起来完全不同：
~~<link rel="stylesheet" href="main.css" />~~ 无外部样式
~~<script>document.body.bgColor="red";</script>~~ 没有脚本样式
<img src="128x128.png" /> 没有图片：元素的大小可能完全不同。

实施例

示例 - sanitiseHTML(html)：
将此小书签（bookmarklet）粘贴到地址栏中。它会提供一个注入textarea的选项，用来显示已清理的HTML字符串。

javascript:void(function(){var s=document.createElement("script");s.src="http://rob.lekensteyn.nl/html-sanitizer.js";document.body.appendChild(s)})();

代码示例 - string2dom(html) ：

/* Callback form: the document and its clean-up function arrive as arguments */
string2dom("<html><head><title>Test</title></head></html>", function(doc, destroy){
    alert(doc.title); /* Alert: "Test" */
    destroy();
});

/* Return-value form: the same two members come back as object properties */
var test = string2dom("<div id='secret'></div>");
alert(test.doc.getElementById("secret").tagName); /* Alert: "DIV" */
test.destroy();

值得注意的参考

SO: JS RE to change all relative to absolute URLs - 函数sanitiseHTML(html)基于我之前创建的replace_all_rel_by_abs(html)函数。

Elements - Embedded content - 标准嵌入元素的完整列表

Elements - Previous HTML elements - （已弃用）元素的附加列表（例如<applet>）

this source（关于htmlfile ActiveXObject）- “比iframe沙箱慢。如果没有管理，会泄漏内存”

Answer 2

不确定为什么要搞乱documentFragments，你可以将HTML文本设置为新div元素的innerHTML。然后，您可以将该div元素用于getElementsByTagName等，而无需将div添加到DOM：

// Sample markup: a complete document containing two <div> elements.
var htmlText= '<html><head><title>Test</title></head><body><div id="test_ele1">this is test_ele1 content</div><div id="test_ele2">this is test_ele content2</div></body></html>';

// Parse the markup by assigning it to a <div> that is never added to the page.
var d = document.createElement('div');
d.innerHTML = htmlText;

// The detached <div> can be queried like any other element.
console.log(d.getElementsByTagName('div'));

如果你真的执着于documentFragment的想法，可以使用下面这段代码，但你仍然需要将它包装在div中才能获得你想要的DOM函数：

/**
 * Parses htmlText into a fragment via a Range and hands it back wrapped in a
 * fresh, detached <div>, so that element-level DOM methods are available.
 *
 * @param {String} htmlText  Markup to parse
 * @returns the <div> element that contains the parsed fragment
 */
function makeDocumentFragment(htmlText) {
    var fragment = document.createRange().createContextualFragment(htmlText);
    var wrapper = document.createElement('div');

    wrapper.appendChild(fragment);
    return wrapper;
}

Answer 3

我不确定IE是否支持document.implementation.createHTMLDocument，但如果支持，请使用此算法（改编自我的DOMParser HTML extension）。请注意，不会保留DOCTYPE。：

// Create an empty HTML document (not the page's own document) and keep a
// handle on its root element.
var
      doc = document.implementation.createHTMLDocument("")
    , doc_elt = doc.documentElement
    , first_elt
;
// your_html_here is a placeholder for the HTML string to parse.
doc_elt.innerHTML = your_html_here;
first_elt = doc_elt.firstElementChild;
if ( // are we dealing with an entire document or a fragment?
       doc_elt.childElementCount === 1
    && first_elt.tagName.toLowerCase() === "html"
) {
    // The markup carried its own <html> element: make it the document's root
    // in place of the element created by createHTMLDocument.
    doc.replaceChild(first_elt, doc_elt);
}

// doc is an HTML document
// you can now reference stuff like doc.title, etc.

Answer 4

假设HTML也是有效的XML，您可以使用loadXML()

Answer 5

DocumentFragment不支持getElementsByTagName - 仅Document支持。

您可能需要使用像jsdom这样的库，它提供了DOM的实现，您可以使用getElementsByTagName和其他DOM API进行搜索。您可以将其设置为不执行脚本。是的，它很“重”，我不知道它是否适用于IE 7。

Answer 6

我只是偶然逛到这个页面，现在回答可能有点晚，帮不上什么忙了 :) 但下面的内容应该能帮助以后遇到类似问题的人……不过，现在IE7/8真的应该被忽略了，更现代的浏览器支持好得多的方法。

以下方法在我测试过的几乎所有环境中都有效——仅有的两个缺点是：

我把定制的getElementById和getElementsByName函数添加到了根div元素上，因此这些函数不会出现在树中更深层的元素上（除非修改代码来处理这种情况）。
doctype将被忽略 - 但我不认为这会产生太大的影响，因为我的经验是doctype不会影响dom的结构，只是如何渲染（显然不会用这种方法发生）。

基本上，这套方法依赖于这样一个事实：用户代理（浏览器）对<tag>和<namespace:tag>的处理方式不同。人们发现某些特殊标签不能存在于div元素中，因此它们会被移除；而带命名空间的元素可以放在任何位置（除非DTD另有规定）。虽然这些命名空间标签的行为实际上并不像真正的对应标签，但考虑到我们只是利用它们在文档中的结构位置，这并不会真正造成问题。

标记和代码如下：

<!DOCTYPE html>
<html>
<head>
<script>

  /// function for parsing HTML source to a dom structure
  /// The original author reported testing in Mac OSX, Win 7, Win XP with
  /// FF, IE 7/8/9, Chrome, Safari & Opera.
  ///
  /// src      - String of HTML source (a fragment or a whole document)
  /// returns  - a detached div holding the parsed nodes. The tags html, head,
  ///            body and title are renamed to faux:html etc. so that they
  ///            survive inside a div. The div's getElementsByTagName maps
  ///            those four names to their namespaced form, and it carries
  ///            custom getElementsByName / getElementById functions, which
  ///            return false when nothing matches.
  function parseHTML(src){

    /// create a random div, this will be our root
    var div = document.createElement('div'),
        /// specificy our namespace prefix
        ns = 'faux:',
        /// state which tags we will treat as "special"
        stn = ['html','head','body','title'],
        /// the reg exp for replacing the special tags; the look-ahead makes
        /// sure the name ends there, so <header> is not mistaken for <head>
        re = new RegExp('<(/?)('+stn.join('|')+')(?=[\\s/>])([^>]*)>','gi'),
        /// remember the getElementsByTagName function before we override it
        gtn = div.getElementsByTagName;

    /// a quick function to namespace certain tag names; the comparison is
    /// done on the lower-cased name, other names are passed through untouched
    var nspace = function(tn){
      var key = String(tn).toLowerCase(), special;
      if ( stn.indexOf ) {
        special = stn.indexOf(key) != -1;
      }
      else {
        /// no Array indexOf: search the joined list, delimiters included so
        /// that only whole names match
        special = ('|'+stn.join('|')+'|').indexOf('|'+key+'|') != -1;
      }
      return special ? ns + key : tn;
    };

    /// search and replace our source so that special tags are namespaced
    /// &nbsp; required for IE7/8 to render tags before first text found
    /// <faux:check /> tag added so we can test how namespaces work
    src = '&nbsp;<'+ns+'check />' + src.replace(re,'<$1'+ns+'$2$3>');
    /// inject to the div
    div.innerHTML = src;
    /// quick test to see how we support namespaces in TagName searches
    if ( !div.getElementsByTagName(ns+'check').length ) {
      ns = '';
    }

    /// create our replacement getByName and getById functions
    /// attr    - name of the attribute to compare
    /// collect - true: gather every match in an array; false: first match
    var createGetElementByAttr = function(attr, collect){
      /// a - attribute value to look for, w - matches gathered so far
      var func = function(a,w){
        var i,c,e,f,l,o; w = w||[];
        if ( this.nodeType == 1 ) {
          if ( this.getAttribute(attr) == a ) {
            if ( collect ) {
              w.push(this);
            }
            else {
              return this;
            }
          }
        }
        else {
          return false;
        }
        /// walk the element children recursively
        if ( (c = this.childNodes) && (l = c.length) ) {
          for( i=0; i<l; i++ ){
            if( (e = c[i]) && (e.nodeType == 1) ) {
              if ( (f = func.call( e, a, w )) && !collect ) {
                return f;
              }
            }
          }
        }
        return (w.length?w:false);
      };
      return func;
    };

    /// apply these replacement functions to the div container, obviously 
    /// you could add these to prototypes for browsers the support element 
    /// constructors. For other browsers you could step each element and 
    /// apply the functions through-out the node tree... however this would  
    /// be quite messy, far better just to always call from the root node - 
    /// or use div.getElementsByTagName.call( localElement, 'tag' );
    div.getElementsByTagName = function(t){return gtn.call(this,nspace(t));};
    div.getElementsByName    = createGetElementByAttr('name', true);
    div.getElementById       = createGetElementByAttr('id', false);

    /// return the final element
    return div;
  }

  /// once the page has loaded, parse the markup kept in #source and show
  /// a few look-ups against the resulting tree
  window.onload = function(){

    /// build a node tree from the HTML held by the source element
    var sourceMarkup = document.getElementById('source').innerHTML;
    var tree = parseHTML( sourceMarkup );

    /// run the look-ups first ...
    var headList   = tree.getElementsByTagName('head');
    var titleList  = tree.getElementsByTagName('title');
    var scriptList = tree.getElementsByTagName('script');
    var bodyNode   = tree.getElementById('body');

    /// ... then alert what each of them found
    alert(headList[0].innerHTML);
    alert(titleList[0].innerHTML);
    alert(scriptList[0].innerHTML);
    alert(bodyNode.innerHTML);

  };
</script>
</head>
<body>
  <xmp id="source">
    <!DOCTYPE html>
    <html>
    <head>
      <!-- Comment //-->
      <meta charset="utf-8">
      <meta name="robots" content="index, follow">
      <title>An example</title>
      <link href="test.css" />
      <script>alert('of parsing..');</script>
    </head>
    <body id="body">
      <b>in a similar way to createDocumentFragment</b>
    </body>
    </html>
  </xmp>
</body>
</html>

Answer 7

要在不触发请求的情况下使用完整的HTML DOM功能，而不必处理不兼容问题：

// Clone the current document node; no "deep" argument is passed.
var doc = document.cloneNode();
// If the clone has no root element, build the html > head + body skeleton by hand.
if (!doc.documentElement) {
    doc.appendChild(doc.createElement('html'));
    doc.documentElement.appendChild(doc.createElement('head'));
    doc.documentElement.appendChild(doc.createElement('body'));
}

全部就绪！doc是一个HTML文档，但它不是“在线”的。

我可以将整个HTML文档加载到Internet Explorer中的文档片段中吗？

我尝试了什么

7 个答案:

代码

代码说明

附加说明

实施例

值得注意的参考