免责声明:我知道用正则表达式解析HTML并不是正确的方法。我实际上只是想在HTML中解析文本。
我正在解析几个页面,我正在寻找价格。以下是我到目前为止的情况:
// Scan every non-script element and alert once for each text node whose
// content contains a dollar price such as "$60.00" or "$1,200".
// Each text node is tested on its own, so a price whose "$" and digits sit
// in different elements (e.g. <sup>$</sup><span>80.00</span>) is not found.
var all = document.body.querySelectorAll(":not(script)");
var regex = /\$[0-9,]+(\.[0-9]{2})?/g;
for (var i = 0; i < all.length; i++) {
    for (var j = 0; j < all[i].childNodes.length; j++) {
        // Only text nodes (nodeType 3) carry visible text; comment nodes also
        // have a non-null nodeValue and would otherwise give false matches.
        if (all[i].childNodes[j].nodeType !== 3) {
            continue;
        }
        var node_value = all[i].childNodes[j].nodeValue;
        var matches = node_value.match(regex);
        // match() returns null when nothing matches, never an empty array.
        if (matches !== null) {
            alert("that's a match");
        }
    }
}
这段代码可以找到如下形式的价格:
<div>This is the current price: <span class="current">$60.00</span></div>
但是,有些价格具有以下结构:
<div>This is the current price: <sup>$</sup><span>80.00</span></div>
我如何改进算法才能找到这些价格?我是否应该在第一个for循环中用正则表达式查找<sup>symbol</sup><span>price</span>这种结构?
重要:一旦匹配,我需要找出是哪个DOM元素包含该价格,即包含价格的最内层元素。例如:
<div><span>$80.00</span></div>
我需要得出的结论是:包含价格的元素是span,而不是div。
答案 0 :(得分:1)
试试这个:
// Search the text of the whole page instead of individual nodes, so a price
// whose "$" and digits live in different elements is still read as one string.
var text = document.body.textContent || document.body.innerText;
// Whitespace is tolerated after "$" and around the decimal point.
var regex = /\$\s*[0-9,]+(?:\s*\.\s*\d{2})?/g;
var match = text.match(regex);
if (match !== null) {
    // Report only the first price, with the tolerated whitespace stripped.
    match = match[0].replace(/\s/g, "");
    alert("Match found: " + match);
}
使用递归搜索:
/**
 * Finds the innermost element whose text contains a dollar price.
 *
 * A price may be split across sibling elements (for example
 * <sup>$</sup><span>80.00</span>). In that case no single child holds the
 * whole price, so the closest element containing all of it is returned.
 *
 * @param {Node} [node] - Element to search; defaults to document.body.
 * @returns {Node|false} The narrowest element containing a price, or false
 *     when the text under node holds no price.
 */
function findPrice(node) {
    node = node || document.body;
    // Fall back to "" so text.match does not throw when textContent is empty
    // and innerText is unavailable.
    var text = node.textContent || node.innerText || "",
        regex = /\$\s*[0-9,]+(?:\s*\.\s*\d{2})?/,
        match = text.match(regex);
    if (!match) {
        return false;
    }
    var children = node.children, l = children.length, i, found;
    for (i = 0; i < l; i++) {
        // Return what the recursive call found (the deepest holder), not the
        // direct child it was called on.
        found = findPrice(children[i]);
        if (found) {
            return found;
        }
    }
    // if no children matched, then this is the narrowest container
    return node;
}
// No argument is passed, so the search starts at document.body.
// result is a DOM node when a price is found, or false otherwise.
var result = findPrice();
答案 1 :(得分:0)
如果您可以自行选择浏览器,则可以使用XPath预先筛选候选节点。以下代码用于查找候选节点。我在Firefox 25中测试过。您可能还想查看"What browsers support Xpath 2.0?"和http://www.yaldex.com/ajax-tutorial-4/BBL0029.html,以了解跨浏览器的方法。
<html><head><script type="text/javascript">
/**
 * Alerts the text content of every <span> under document.body that looks
 * like a price, using the union of two XPath expressions.
 */
function func() {
    // span whose text contains a digit, "." or "," and that has a preceding
    // sibling <sup> whose text is exactly "$"
    var xpathExpr1 = "//span[translate(text(),'0123456789.,','')!=text()][preceding-sibling::sup[text()='$']]";
    // span whose text contains a digit, "." or "," and also a dollar sign
    var xpathExpr2 = "//span[translate(text(),'0123456789.,','')!=text() and contains(text(),'$')]";
    var xpathExpr3 = xpathExpr1 + "|" + xpathExpr2; // union
    var contextNode = document.body;
    var resultType = XPathResult.UNORDERED_NODE_ITERATOR_TYPE;
    // Evaluate the union so both kinds of price are found. The expressions
    // use no namespace prefixes, so no resolver is needed.
    var xpathResult = document.evaluate(xpathExpr3, contextNode, null, resultType, null);
    var node;
    while ((node = xpathResult.iterateNext()) != null) {
        alert(node.textContent);
    }
}
</script></head>
<body onload="func()"> aaa
<sup>$</sup><span>80.00</span> bbb
<span>$129</span> ccc
<sup>$</sup><span>ABC</span> ddd
</body></html>