以下是我的HTML示例(实际的HTML非常庞大且复杂,为简单起见,这里没有贴出):
<!DOCTYPE html>
<html>
<head>
</head>
<body style="background-color: #000000;font-family:'Open Sans'">
<div class:'abc' id="cde"></div>
<div class:"abc" id="fed"></div>
<div class:abc id="ce"></div>
<div class:"abc"><p class="content" autocomplete> I am some text which might contain attribute:"invalid value" and I must not be removed</p></div>
</body>
</html>
此处的目标是从HTML中删除无效属性,同时不影响HTML的其余部分。有效属性只有以下几种形式:attribute="value"
,attribute=value
,attribute='value'
,以及不带值的attribute
(例如<input id="abc" type="text" value="test" disabled>
中的disabled);除此之外的任何写法都属于无效属性,正则表达式应将其删除。这些内容无法加载到DOM中,因此请仅提供基于正则表达式的解决方案。
作为起点,我正在尝试/[a-zA-Z]+:"?'?[a-zA-Z]+"?'?/gi
,但我知道这离正确答案还差得很远!
这里有一个fiddle可供您试验。
预期输出:
<!DOCTYPE html>
<html>
<head>
</head>
<body style="background-color: #000000;font-family:'Open Sans'">
<div id="cde"></div>
<div id="fed"></div>
<div id="ce"></div>
<div ><p class="content" autocomplete> I am some text which might contain attribute:"invalid value" and I must not be removed</p></div>
</body>
</html>
更新1:无效属性可能具有以下形式:
attribute:value
/ attribute;value
/ attribute:"value"
(基本上是受支持的有效属性以外的任何内容)等,并且仅当它们位于<element>
内时才应将其删除。答案 0 :(得分:2)
我会使用两次.replace
函数:
// Sample document: the <div> tags carry malformed `class:...` attributes that must be
// stripped, while the look-alike text inside the <p> element must survive untouched.
const html = `<!DOCTYPE html>
<html>
<head>
</head>
<body style="background-color: #000000;font-family:'Open Sans'">
<div class:'abc' id="cde"></div>
<div class:"abc" id="fed"></div>
<div class:abc id="ce"></div>
<div class:"abc"><p class="content" autocomplete required blah=blah> I am some text which might contain attribute:"invalid value" and I must not be removed</p></div>
</body>
</html>`;

/**
 * Matches an opening tag that has an attribute area.
 *   $1 - "<" plus the tag name
 *   $2 - the attribute area (starts with whitespace, never contains ">")
 *   $3 - the "/" of a self-closing tag, or "" when there is none
 * The trailing ">+" also swallows a run of extra ">" characters ("<a b>>" -> "<a b>").
 * Tags without any whitespace after the name (e.g. "<html>", "</p>") are not matched,
 * so they are left untouched.
 */
const TAG_WITH_ATTRIBUTES = /(<\w+)(\s[^>]*?)(\/?)>+/g;

/**
 * Scans an attribute area chunk by chunk.
 *   First alternative (captured as $1): a well-formed attribute -
 *     name, name='...', name="..." (backslash escapes honoured) or name=unquoted,
 *     where names may contain hyphens (data-*, aria-*) and an unquoted value is any run
 *     without whitespace, quotes, "=", "<", ">" or a backtick.
 *     At most one leading whitespace character is kept with it.
 *   Second alternative (nothing captured): any other chunk. Quoted runs are consumed whole
 *     so an invalid attribute such as class:'a b c' is removed in one piece.
 * Replacing every match with "$1" therefore keeps valid attributes and deletes the rest.
 */
const ATTRIBUTE_OR_JUNK =
  /\s*?(\s?\w[\w-]*(?:=(?:'[^'\\]*(?:\\.[^'\\]*)*'|"[^"\\]*(?:\\.[^"\\]*)*"|[^\s"'=<>`]+(?!\S))|(?!\S)))|\s*(?:'[^'\\]*(?:\\.[^'\\]*)*'|"[^"\\]*(?:\\.[^"\\]*)*"|\S)+/g;

/**
 * Removes malformed attributes (class:'abc', attr;value, ...) from every opening tag while
 * leaving text content, closing tags, comments and the doctype untouched.
 *
 * Limitations: a ">" inside a quoted attribute value ends the tag early, and namespaced
 * names such as xlink:href are treated as invalid because of the colon.
 *
 * @param {string} markup - HTML source to clean.
 * @returns {string} The markup with invalid attributes removed.
 */
function removeInvalidAttributes(markup) {
  return markup.replace(
    TAG_WITH_ATTRIBUTES,
    (tag, opening, attributes, selfClosing) =>
      `${opening}${attributes.replace(ATTRIBUTE_OR_JUNK, '$1')}${selfClosing}>`
  );
}

const htmlCleaned = removeInvalidAttributes(html);
console.log(htmlCleaned);
答案 1 :(得分:1)
这是一项值得使用专用库来完成的任务。为了识别无效属性,您首先需要找出有效的标签,而这并不容易。例如,某些标签没有闭合时该怎么办?像input
这样不需要闭合的标签是否应该被闭合?href应该是div的属性吗?等等。
仅用正则表达式几乎不可能做到。即使不要求覆盖所有情况,它也很可能过于复杂,难以维护。
直接把它交给能为您完成这项工作的库即可,例如这个:https://github.com/dave-kennedy/clean-html
答案 2 :(得分:1)
此代码解析您的HTML,拆分出相关部分,并检查各个属性是否有效。
您可以让它更高效,因为它循环了多次,但这种写法更容易看清它的各个组成部分。
话虽如此,请不要使用此代码。如果您无法把元素解析到DOM中,而您使用的是Node,可以将其解析为XML,并通过节点来确保一切正常。
我的小型控制台应用不会显示autocomplete属性,但它确实存在于输出字符串中。
此代码可能会在生产环境中失败!
// Raw markup to clean, read from the element with id "input".
const html = document.querySelector('#input').innerHTML
/**
 * Tells whether a chunk of markup looks like a tag: it starts with "<" and ends with ">".
 * [\s\S] is used instead of "." so that a tag whose attributes are spread over several
 * lines is still recognised ("." does not match line terminators).
 *
 * @param {string} x - One chunk of the split markup.
 * @returns {boolean} true when the chunk is delimited by "<" and ">".
 */
const isElement = x =>
  /^<[\s\S]*>$/.test(x);
/**
 * Tells whether a single attribute token is well formed. Accepted shapes:
 *   name            (boolean attribute)
 *   name="value"    (value may contain single quotes)
 *   name='value'    (value may contain double quotes)
 *   name=value      (unquoted: no whitespace, quotes, "=", "<", ">" or backtick)
 * Names consist of word characters and hyphens. Anything else - a missing "=",
 * an empty unquoted value, mismatched quotes, or separators such as ":" / ";" - is rejected.
 *
 * @param {string} x - One attribute token, without surrounding whitespace.
 * @returns {boolean} true when the token is a valid attribute.
 */
const isValidAttribute = x =>
  /^[\w-]+(?:=(?:"[^"]*"|'[^']*'|[^\s"'=<>`]+))?$/.test(x);
/**
 * Loose check for "something that resembles an attribute assignment": an "=" that is
 * followed, later on the same line, by two quote characters (single or double, not
 * necessarily of the same kind).
 *
 * @param {string} x - Text to inspect.
 * @returns {boolean} true when the pattern is found anywhere in the text.
 */
const similarToAttribute = (x) => {
  const quotedRunAfterEquals = /=.*((?:\"|\').*(?:\"|\'))/;
  return quotedRunAfterEquals.test(x);
};
/**
 * Tells whether a token is one of the bracket pieces of a tag: it either begins
 * with "<" (e.g. "<div", "</div") or ends with ">".
 *
 * @param {string} x - One token of a split tag.
 * @returns {boolean} true when the token starts with "<" or ends with ">".
 */
const isOpeningOrClosingBracket = (x) => {
  const bracketAtEitherEnd = /(^<|>$)/;
  return bracketAtEitherEnd.test(x);
};
// Build the cleaned markup: split the input into tags and text, drop every attribute token
// that isValidAttribute rejects, and stitch everything back together.
const output =
  html
    // .replace(/(\n|\r)+/gm, '') // uncomment to remove new lines
    .split(/(<[^>]+>)/) // the capture group keeps each <...> tag as its own array entry
    .filter(x => x !== "") // split() leaves empty strings around adjacent tags
    .map(x => !isElement(x)
      ? x // text between tags passes through untouched
      // Tokenise the tag into its "<name" opener, its attribute tokens and the closing ">".
      // Quoted runs are consumed whole, so a value containing whitespace
      // (title="hello world") stays one token instead of being cut at the space and then
      // rejected piece by piece. Whitespace between tokens is skipped.
      // match() cannot return null here: isElement guarantees a leading "<", which the
      // first alternative always matches.
      : x.match(/^<[^\s>]*|>$|(?:"[^"]*"|'[^']*'|[^\s>])+/g)
        .filter(part => isOpeningOrClosingBracket(part) || isValidAttribute(part)) // keep brackets and valid attributes only
    )
    .map(x => Array.isArray(x) // arrays are tokenised tags
      ? x.slice(0, x.length - 1).join(' ') + x[x.length - 1] // space-separate everything except the closing ">"
      : x // plain text is returned as is
    )
    .join('') // back to a single string
// Parse the cleaned markup into a newly created <section> element so the result can be inspected.
// NOTE(review): innerHTML parses whatever `output` contains; fine for this sample, not for untrusted input.
const div = document.createElement('section')
div.innerHTML = output
console.log(output) // the cleaned markup as a string
console.log(div) // the element holding the parsed result
/* UNIT TESTS */

// Runs predicate(input) for each [input, expected] pair, in order, inside one expect() group.
const runCases = (title, predicate, cases) =>
  expect(title, () => {
    cases.forEach(([input, expected]) => {
      assert(predicate(input)).equal(expected)
    })
  })

runCases('string is valid element format', isElement, [
  ['<div>', true],
  ['</div>', true],
  ['not an element', false],
])

runCases('string is valid attribute format', isValidAttribute, [
  ['class="thing"', true],
  ['class:\'abc\'="thing"', false],
  // NOTE(review): exact repeat of the previous case; possibly meant to cover the
  // double-quoted form - confirm.
  ['class:\'abc\'="thing"', false],
  ['autocomplete', true],
])

runCases('string has similar properties to an attribute', similarToAttribute, [
  ['this is not an attribute', false],
  ['class:\'abc\'="thing"', true],
  // NOTE(review): exact repeat of the previous case - confirm intent.
  ['class:\'abc\'="thing"', true],
])

runCases('string is opening or closing tag', isOpeningOrClosingBracket, [
  ['<div', true],
  ['>', true],
  ['class="thing"', false],
])
<!-- NOTE(review): this external script presumably provides the expect()/assert() helpers used by the unit tests; confirm. -->
<script src="https://codepen.io/synthet1c/pen/KyQQmL.js"></script>
<!-- Input for the snippet: the script reads this element's innerHTML via document.querySelector('#input'). -->
<pre id="input">
<div class:'abc' id="cde"></div>
<div class:"abc" id="fed"></div>
<div class:abc id="ce"></div>
<div class:"abc"><p class="content" autocomplete> I am some text which might contain attribute:"invalid value" and I must not be removed</p></div>
</pre>
答案 3 :(得分:1)
虽然通常不建议用正则表达式处理DOM
,但您可以使用两个表达式:一个用于筛选出可能相关的元素,另一个用于真正去除有问题的属性:
// Sample document: several <div> tags carry a malformed `class:...` attribute, while the
// text inside the <p> element contains a look-alike that must stay in place.
const html = `<!DOCTYPE html>
<html>
<head>
</head>
<body style="background-color: #000000;font-family:'Open Sans'">
<div class:'abc' id="cde"></div>
<div class:"abc" id="fed"></div>
<div class:abc id="ce"></div>
<div class:"abc"><p class="content" autocomplete> I am some text which might contain attribute:"invalid value" and I must not be removed</p></div>
<!-- another one here -->
<div class:'abc defg' id="ce"></div>
</body>
</html>`;

/**
 * Second pass: inside one matched tag, delete every whitespace-prefixed `name:value` pair,
 * where the value is '...', "..." or a run of word characters.
 *
 * NOTE(review): the pattern is not quote-aware, so a ` name:value` sequence inside a valid
 * quoted value is deleted as well (style="margin:0; color:red" loses " color:red").
 *
 * @param {string} match - The full text of a tag selected by the first pass.
 * @returns {string} The tag without its `name:value` pairs.
 */
const stripColonAttributes = (match) =>
  match.replace(/\s+\w+:(?:(?:'[^']*')|(?:"[^"]*")|\w+)\s*(?!\w)/g, '');

// First pass: select only "<...>" runs (on a single line) that contain a `name:value` pair
// before their first ">", and hand each one to the second pass.
const cleaned = html.replace(/<(?:(?!>).)*\b\w+:['"]?\w+['"]?.*?>/g, stripColonAttributes);

console.log(cleaned);
< # <
(?:(?!>).)* # anything where > is not immediately ahead
\b\w+: # a word boundary, 1+ word characters and :
['"]? # quotes, optional
\w+ # another 1+ word characters
['"]? # as above
.*? # anything else lazily afterwards
> # >
……而第二个(内层的)表达式:
\s+\w+: # 1+ whitespaces, 1+ word characters and :
(?: # non-capturing group
(?:'[^']*') # '...'
| # or
(?:"[^"]*") # "..."
| # or
\w+ # 1+ word characters
)
\s*(?!\w) # 0+ whitespaces, make sure there's no
# word character ahead
<hr/> 请注意,这没有考虑到如下这类情况,例如
data-attribute='some weird <> characters here: """'>
或data-key="hey, i'am \"escaped, yippeh!">
,它们都是完全有效的。
如果您预计会遇到这样的输入,请改用解析器。
答案 4 :(得分:0)