Question

我需要一个可靠的Javascript库/函数来检查HTML代码段是否有效，我可以从我的代码中调用。例如，它应该检查打开的标签和引号是否已关闭，嵌套是否正确等等。

我不希望验证失败，因为某些东西不是100％标准（但无论如何都会有效）。

Answer 1

更新：此答案有限 - 请参阅下面的编辑。

在@kolink的答案基础上扩展，我使用：

/**
 * Round-trips an HTML string through a detached <div> and reports whether the
 * serialized result is character-for-character identical to the input.
 *
 * @param {string} html - Markup to check.
 * @returns {boolean} true when the re-serialized markup equals `html`.
 */
var checkHTML = function (html) {
  const scratch = document.createElement('div');
  scratch.innerHTML = html;
  // Any difference means the parser altered the markup while building the DOM.
  const roundTripped = scratch.innerHTML;
  return roundTripped === html;
};

即，我们使用HTML创建一个临时div。为此，浏览器将根据HTML字符串创建一个DOM树，这可能涉及关闭标签等。

将div的HTML内容与原始HTML进行比较会告诉我们浏览器是否需要更改任何内容。

checkHTML('<a>hell<b>o</b>')

返回false。

checkHTML('<a>hell<b>o</b></a>')

返回true。

编辑：正如下面的@Quentin所述，由于各种原因，这种方法过于严格：浏览器通常会补全被省略的结束标记，即使该标签的结束标记本来就是可选的。例如：

<p>one para
<p>second para

...被认为是有效的（因为 <p> 允许省略结束标记），但checkHTML将返回false。浏览器还会规范化标签名的大小写，并改变空白字符。在决定使用此方法时，您应该了解这些限制。

Answer 2

嗯，这段代码：

/**
 * Parses `html` into a detached <div> and returns the markup serialized back
 * from the resulting DOM.
 *
 * @param {string} html - Markup to run through the parser.
 * @returns {string} The container's serialized innerHTML.
 */
function tidy(html) {
  const holder = document.createElement('div');
  holder.innerHTML = html;
  const { innerHTML: repaired } = holder;
  return repaired;
}

这将“纠正”格式错误的HTML，以充分利用浏览器的能力。如果这对您有帮助，那么比尝试验证HTML要容易得多。

Answer 3

到目前为止，所提出的解决方案都没有很好地回答原始问题，特别是在涉及下面这条要求时：

我不希望验证失败，因为某些内容不是100％标准（但无论如何都会有效）。

tl;dr >> 请查看 JSFiddle

所以我使用了关于这个主题的答案和评论的输入，并创建了一个执行以下操作的方法：

检查html字符串标记（如果有效）
尝试渲染html字符串
从理论上比较要创建的标签计数与实际渲染的html dom标签计数
如果选中“严格”（strict）模式，则 <br/> 的转换和空属性的规范化（=""）不会被忽略
将呈现的innerHTML与给定的html字符串进行比较（忽略空格和引号）

返回

true 如果呈现的html与给定的html字符串相同
false
规范化的html字符串如果呈现的html似乎有效但不等于给定的html字符串

规范化意味着，在渲染时，浏览器会忽略或修复输入的某些特定部分（例如为<p>添加缺少的结束标记），并转换其他部分（例如把单引号转换为双引号，或者对 & 符号进行编码）。区分“失败”和“已规范化”，就可以把内容标记出来并提示用户：“这不会像您期望的那样呈现”。

大多数时候规范化会返回原始html字符串的略微改变的版本 - 但有时结果却完全不同。所以这应该用于，例如标记用户输入以进一步查看，然后将其保存到数据库或盲目呈现。（有关规范化的示例，请参阅JSFiddle）

检查考虑以下例外

忽略将单引号规范化为双引号
图像以及其他带有src属性的标签会在渲染期间被“解除武装”（disarmed）
（如果不严格）忽略 <br/> >> <br> 的转换
（如果不严格）忽略空属性的规范化（<p disabled> >> <p disabled="">）
在读取 .innerHTML 时，最初未编码的 & 符号会被编码，例如在属性值中

/**
 * Loosely validates an HTML fragment by checking its start tags, letting the
 * DOM parse it, and comparing the parsed result with the input.
 *
 * NOTE(review): the fragment is assigned to `innerHTML` of a detached element.
 * Only `src` attributes are neutralised here; other active content (for
 * example inline event-handler attributes) is not — confirm this is acceptable
 * before feeding untrusted input.
 *
 * @param {*} htmlStr - Markup to validate; anything that is not a string is rejected.
 * @param {boolean} [strictBoolean] - When truthy, `<br/>` -> `<br>` conversions and
 *   empty-attribute (`=""`) normalizations count as differences.
 * @returns {boolean|string} `true` when the serialized DOM matches the input
 *   (ignoring case, quotes and whitespace runs), `false` when the input is
 *   rejected, or the normalized HTML string when the DOM accepted the markup
 *   but serialized it differently.
 */
function simpleValidateHtmlStr(htmlStr, strictBoolean) {
  if (typeof htmlStr !== "string") {
    return false;
  }

  // A start tag is accepted when every quote inside it is balanced. Written as a
  // literal (so escapes survive), anchored (so the whole tag must match) and with
  // mutually exclusive alternatives (so matching stays linear).
  const validateHtmlTag = /^<[a-z]+(?:"[^"]*"|'[^']*'|[^'">])*>$/i;
  const sdom = document.createElement("div");

  let noSrcNoAmpHtmlStr = htmlStr
    .replace(/(\s)src=/gi, "$1svhs___src=") // disarm every src attribute
    .replace(/&amp;/gi, "#svhs#amp##"); // 'save' already-encoded ampersands

  // Script bodies are dropped before counting start tags, so that text inside
  // a script which merely looks like a tag is not counted.
  const noScriptContentHtmlStr = noSrcNoAmpHtmlStr.replace(
    /(<script[^>]*>)[\s\S]*?(<\/script>)/gi,
    "$1$2"
  );
  const htmlTags = noScriptContentHtmlStr.match(/<[a-z]+[^>]*>/gi) || [];
  const htmlTagsCount = htmlTags.length;

  if (!strictBoolean) {
    // ignore <br/> conversions
    noSrcNoAmpHtmlStr = noSrcNoAmpHtmlStr.replace(/<br\s*\/>/gi, "<br>");
  }

  if (!htmlTags.every((tagStr) => validateHtmlTag.test(tagStr))) {
    return false;
  }

  try {
    sdom.innerHTML = noSrcNoAmpHtmlStr;
  } catch (err) {
    // A parser that throws is treated the same as invalid markup.
    return false;
  }

  // compare rendered tag-count with expected tag-count
  if (sdom.querySelectorAll("*").length !== htmlTagsCount) {
    return false;
  }

  let resHtmlStr = sdom.innerHTML.replace(/&amp;/gi, "&"); // undo '&' encoding

  if (!strictBoolean) {
    // ignore empty attribute normalizations
    resHtmlStr = resHtmlStr.replace(/=""/g, "");
  }

  // compare html strings while ignoring case, quote-changes, whitespace runs
  const simplify = (str) =>
    str.replace(/["']/g, "").replace(/\s+/g, " ").toLowerCase().trim();

  if (simplify(noSrcNoAmpHtmlStr) === simplify(resHtmlStr)) {
    return true;
  }

  // Restore everything that was masked before handing back the normalized markup.
  return resHtmlStr
    .replace(/svhs___src=/gi, "src=")
    .replace(/#svhs#amp##/g, "&amp;");
}

在这里，您可以在JSFiddle https://jsfiddle.net/abernh/twgj8bev/中找到它，以及不同的测试用例，包括

"<a href='blue.html id='green'>missing attribute quotes</a>" // FAIL
"<a>hell<B>o</B></a>"                                        // PASS
'<a href="test.html">hell<b>o</b></a>'                       // PASS
'<a href=test.html>hell<b>o</b></a>',                        // PASS
"<a href='test.html'>hell<b>o</b></a>",                      // PASS
'<ul><li>hell</li><li>hell</li></ul>',                       // PASS
'<ul><li>hell<li>hell</ul>',                                 // PASS
'<div ng-if="true && valid">ampersands in attributes</div>'  // PASS

Answer 4

/**
 * Cheap sanity check that an HTML snippet has as many opening tags as closing
 * tags, after discarding comments, declarations, self-closing tags and void
 * elements.
 *
 * This only compares counts; it does not verify nesting order, so
 * `<a><b></a></b>` is still reported as valid. A string without any tags is
 * reported as valid.
 *
 * @param {string} html - Markup to check.
 * @returns {boolean} true when opening and closing tag counts match.
 */
function validHTML(html) {
  const remaining = html
    .replace(/<!--[\s\S]*?-->/g, '') // comments are not tags
    .replace(/<![^>]*>/g, '') // doctype and other declarations
    .replace(/<[^>]*\/\s?>/g, '') // self-closing tags
    // Void elements never take a closing tag; the lookahead keeps e.g. <colgroup> intact.
    .replace(
      /<(?:area|base|br|col|embed|hr|img|input|link|meta|param|source|track|wbr)(?=[\s\/>])[^>]*>/gi,
      ''
    );

  // [^>]* (rather than .*?) lets a tag span several lines.
  const openingTags = remaining.match(/<[^\/>][^>]*>/g) || [];
  const closingTags = remaining.match(/<\/[^>]+>/g) || [];

  return openingTags.length === closingTags.length;
}

// Sample input. Note: a string without any HTML tag is considered a valid HTML
// snippet; if that is not acceptable in your case, check the opening-tag count first.
var htmlContent = "<p>your html content goes here</p>";

// Report the verdict to the user.
alert(validHTML(htmlContent) ? 'Valid HTML' : 'Invalid HTML');

Answer 5

9 年后，如何使用 DOMParser？

它接受字符串作为参数并返回 Document 类型，就像 HTML 一样。因此，当它出现错误时，返回的文档对象中包含 <parsererror> 元素。

如果您将 html 解析为 xml，至少您可以检查您的 html 是否符合 xhtml。

示例

> const parser = new DOMParser();
> const doc = parser.parseFromString('<div>Input: <input /></div>', 'text/xml');
> (doc.documentElement.querySelector('parsererror') || {}).innerText; // undefined

把它包装成一个函数

/**
 * Checks whether a markup string parses as well-formed XML (i.e. is
 * XHTML-compatible) using DOMParser.
 *
 * @param {string} html - Markup to parse as `text/xml`.
 * @returns {true|string} `true` when the parser reported no error, otherwise
 *   the text of the `<parsererror>` node.
 */
function isValidHTML(html) {
  const parser = new DOMParser();
  const doc = parser.parseFromString(html, 'text/xml');

  // Query the document rather than documentElement: querySelector on an element
  // only searches its descendants, so an error node that is itself the root
  // element would otherwise be missed.
  const errorNode = doc.querySelector('parsererror');
  if (!errorNode) {
    return true;
  }

  // textContent is available on every node type; innerText is only defined for
  // HTML elements, which the error node is not guaranteed to be.
  return errorNode.textContent;
}

测试上述功能

// Example calls; the trailing comment on each line is the result reported above.
isValidHTML('<a>hell<B>o</B></a>'); // true
isValidHTML('<a href="test.html">hell</a>'); // true
isValidHTML("<a href='test.html'>hell</a>"); // true
isValidHTML("<a href=test.html>hell</a>"); // This page contains the following err..
isValidHTML('<ul><li>a</li><li>b</li></ul>'); // true
isValidHTML('<ul><li>a<li>b</ul>'); // This page contains the following err..
isValidHTML('<div><input /></div>'); // true
isValidHTML('<div><input></div>'); // This page contains the following err..

以上适用于非常简单的 html。但是，如果您的 html 中包含类似代码的文本（<script>、<style> 等），那么尽管它是有效的 HTML，您也需要先对其进行处理，才能通过 XML 验证。

以下将类似代码的 html 更新为有效的 XML 语法。

/**
 * Parses an HTML snippet as XML and returns the parser's error node, if any.
 *
 * Before parsing, the snippet is wrapped in a single `<xml>` root and content
 * that is legal HTML but not well-formed XML is masked: `src`/`href` values
 * containing `&`, the bodies of `<script>`, `<style>` and `<pre>`, and
 * `&nbsp;` entities.
 *
 * On error, the masked markup is written to `console.error` with line numbers.
 *
 * @param {string} html - Markup to check.
 * @returns {Element|undefined} The `<parsererror>` node, or `undefined` when
 *   the snippet parsed cleanly.
 */
export function getHtmlError(html) {
  const parser = new DOMParser();
  const htmlForParser = `<xml>${html}</xml>`
    // [^"]* keeps the match inside one attribute value; `.*?` could run on into
    // later attributes or tags and swallow valid markup.
    .replace(/(src|href)="[^"]*&[^"]*"/g, '$1="OMITTED"')
    .replace(/<script[\s\S]+?<\/script>/gm, '<script>OMITTED</script>')
    .replace(/<style[\s\S]+?<\/style>/gm, '<style>OMITTED</style>')
    .replace(/<pre[\s\S]+?<\/pre>/gm, '<pre>OMITTED</pre>')
    .replace(/&nbsp;/g, '&#160;');

  const doc = parser.parseFromString(htmlForParser, 'text/xml');

  // Query the document rather than documentElement so that an error node which
  // is itself the root element is still found.
  const errorNode = doc.querySelector('parsererror');
  if (errorNode) {
    const numbered = htmlForParser
      .split(/\n/)
      .map((el, ndx) => `${ndx + 1}: ${el}`)
      .join('\n');
    console.error(numbered);
    return errorNode;
  }
  return undefined;
}

Answer 6

使用纯JavaScript，您可以使用以下函数检查元素是否存在：

books

使用以下代码，我们可以对此进行测试：

**JavaScript：**

if (typeof(element) != 'undefined' && element != null)

**HTML：**

<!-- Two buttons whose inline onclick handlers call toggleNotUndefined() and checkNotUndefined();
     neither function is defined in this snippet. -->
<input type="button" value="Toggle .not-undefined" onclick="toggleNotUndefined()">
<input type="button" value="Check if .not-undefined exists" onclick="checkNotUndefined()">
<!-- NOTE(review): the class value includes a leading dot, so a class selector written as
     .not-undefined will not match this element - confirm whether "not-undefined" was intended. -->
<p class=".not-undefined"></p>

**CSS：**

/* Default label: generated text appended after every <p>, in blue. */
p:after {
    content: "Is 'undefined'";
    color: blue;
}
/* Overrides the label text and colour for <p> elements carrying the class "not-undefined". */
p.not-undefined:after {
    content: "Is not 'undefined'";
    color: red;
}

可以在JSFiddle找到。

Answer 7

/**
 * Reports whether a string produces at least one element node when parsed as
 * HTML inside a detached <div>.
 *
 * NOTE(review): `str` is assigned to `innerHTML`; confirm callers do not pass
 * untrusted markup, or sanitize it first.
 *
 * @param {string} str - Text to test.
 * @returns {boolean} true when any direct child of the parsed container is an element.
 */
function isHTML(str) {
  const ELEMENT_NODE = 1; // value of Node.ELEMENT_NODE
  const container = document.createElement('div');
  container.innerHTML = str;
  return Array.from(container.childNodes).some(
    (node) => node.nodeType === ELEMENT_NODE
  );
}

祝你好运！

Answer 8

这取决于您使用的js-library。

node.js https://www.npmjs.com/package/html-validator

适用于 node.js 的 HTML 验证器（html-validator）

jQuery https://api.jquery.com/jquery.parsehtml/

适用于 jQuery 的 HTML 验证程序

但是，如前所述，使用浏览器验证损坏的HTML是一个好主意：

/**
 * Lets the DOM parser repair a markup string: the string is assigned to the
 * innerHTML of a freshly created <div>, and the div's innerHTML is read back.
 *
 * @param {string} html - Possibly malformed markup.
 * @returns {string} The markup as re-serialized from the parsed container.
 */
function tidy(html) {
  // Object.assign performs an ordinary property write, so the innerHTML setter runs.
  return Object.assign(document.createElement('div'), { innerHTML: html }).innerHTML;
}

Answer 9

从上面扩展@Tarun 的回答：

/**
 * Checks that the tags of an HTML snippet are well-formed and properly nested,
 * using string processing only (no DOM required).
 *
 * Tag and attribute names may only use letters, digits, hyphens and
 * underscores. A `<` or `>` inside a text node leaves an artefact behind and
 * makes the snippet invalid. Input without any tags (including the empty
 * string) is reported as valid. The reason for a failure is written to
 * `console.log`.
 *
 * @param {string} html - Markup to check.
 * @returns {boolean} true when every opened tag is closed in the right order.
 */
function validHTML(html) {
  let markup = html
    .toLowerCase()
    // remove all angle brackets from quoted attribute values
    .replace(/(?<=<[^>]+?=\s*"[^"]*)[<>]/g, "")
    .replace(/(?<=<[^>]+?=\s*'[^']*)[<>]/g, "");

  markup = markup
    .replace(/<script[\s\S]*?<\/script>/g, "") // script elements, including multi-line ones
    .replace(/<style[\s\S]*?<\/style>/g, "") // style elements, including multi-line ones
    .replace(/<!--[\s\S]*?-->/g, "") // comments
    .replace(/<![^>]*>/g, "") // doctype and other declarations
    .replace(/<[^>]*\/\s?>/g, "") // self-closing tags
    // Void elements never take a closing tag; the lookahead keeps e.g. <colgroup> intact.
    .replace(
      /<(?:area|base|br|col|embed|hr|img|input|link|meta|param|source|track|wbr)(?=[\s\/>])[^>]*>/g,
      ""
    );

  // Remove all clean text nodes; a < or > inside a text node survives as an
  // artefact that the checks below reject.
  markup = markup.replace(/^[^<>]+|[^<>]+$|(?<=>)[^<>]+(?=<)/gs, "");

  // "".split(...) would yield [""], so handle tag-free input explicitly.
  const tags = markup === "" ? [] : markup.split(/(?<=>)(?=<)/);
  if (tags.length % 2 === 1) {
    console.log("uneven number of tags in " + markup);
    return false;
  }

  // `tags` doubles as the stack: entries before `tagno` are the names of the
  // currently open elements, entries from `tagno` on are still-unprocessed tags.
  let tagno = 0;
  while (tags.length > 0) {
    if (tagno === tags.length) {
      console.log("these tags are not closed: " + tags.slice(0, tagno).join());
      return false;
    }

    const current = tags[tagno];
    if (current.startsWith("</")) {
      if (tagno === 0) {
        console.log("this tag has not been opened: " + tags[0]);
        return false;
      }
      const closing = current.match(/<\/\s*([\w\-\_]+)\s*>/);
      if (closing === null) {
        console.log("could not identify closing tag " + current + " after " + tags.slice(0, tagno).join());
        return false;
      }
      const closingName = closing[1];
      if (closingName !== tags[tagno - 1]) {
        console.log("tag '" + closingName + "' trying to close these tags: " + tags.slice(0, tagno).join());
        return false;
      }
      // Matching pair: drop the open element and this closing tag.
      tags.splice(tagno - 1, 2);
      tagno--;
    } else {
      // Remove all correctly formed properties; anything left over besides
      // "<name>" means the tag is malformed.
      const bare = current.replace(/(?<=<\s*[\w_\-]+)(\s+[\w\_\-]+(\s*=\s*(".*?"|'.*?'|[^\s\="'<>`]+))?)*/g, "");
      const opening = bare.match(/<(\s*[\w\-\_]+)/);
      if (opening === null || bare !== "<" + opening[1] + ">") {
        console.log("fragmented tag with the following remains: " + bare);
        return false;
      }
      tags[tagno] = opening[1].trim();
      tagno++;
    }
  }
  return true;
}

这会执行一些额外的检查，例如测试标签是否匹配以及属性是否会解析。由于它不依赖于现有的 DOM，因此可以在服务器环境中使用，但要注意：它很慢。此外，理论上，标签可以是更宽松的名称，因为您基本上可以在标签和属性名称中使用任何 unicode（除了少数例外）。然而，这不会通过我自己的健全性检查。

检查HTML代码段是否对Javascript有效

9 个答案: