Question

我用cheerio解析HTML文件（稍后用Mocha测试），这些文件中的HTML元素可以有很多属性，我想检查属性是否在同一个元素中重复：

示例文件片段，其中的元素包含重复的 "class" 属性：

<div class="logo-center" data-something-very-long="something long" ... class="logo" data-more-stuff>

以下是加载文件的代码：

// Read the whole file into memory as a UTF-8 string.
// NOTE(review): `fs`, `cheerio` and `file` are not defined in this snippet;
// they are assumed to be provided by the surrounding script.
var fileContents = fs.readFileSync(file, "utf8");
// Parse the markup with cheerio; `$` is the handle used to query the document.
var $ = cheerio.load(fileContents);

注意：它不必是一个类属性，它可以是重复的任何其他属性。

Answer 1

对被测元素的源码再解析一次。要做到这一点，您需要深入到cheerio / htmlparser2生成的原始DOM对象。下面的代码用到了domhandler文档中记录、但cheerio文档中没有记录的属性，因此可能需要留意版本兼容性。我使用以下版本进行了测试：

└─┬ cheerio@1.0.0-rc.1 
  ├─┬ htmlparser2@3.9.2 
  │ ├── domhandler@2.4.1

这段代码是用ES6风格编写的，但你也可以很容易地改用更传统的旧式写法来实现。

RegExp可能需要一些改进，这取决于您对正在测试的文件的期望。

// Keep the raw file text around: getDuplicateAttributes() slices the
// original tag source out of this string.
const fileContents = fs.readFileSync(file, "utf8");
// Parse with the htmlparser2 backend and request source positions.
// getDuplicateAttributes() reads `startIndex` / `endIndex` from the DOM nodes;
// presumably these two index options are what make the parser record them
// (verify against the cheerio / htmlparser2 version in use).
const $ = cheerio.load(fileContents, {
  useHtmlParser2: true,
  withStartIndices: true,
  withEndIndices: true
});

/**
 * Find attributes that are written more than once in an element's opening tag.
 *
 * The parsed DOM keeps each attribute name only once, so the original tag text
 * is sliced out of `fileContents` (using the node's source indices) and
 * tokenized again.
 *
 * @param {object} $elem - cheerio selection; only its first element is inspected.
 * @returns {Map<string, string[]>} attribute name (as it appears in
 *   `dom.attribs`) mapped to every value found in the source, in source order.
 *   Only names that occur at least twice are included. Valueless attributes
 *   contribute an empty string. An empty selection yields an empty Map.
 */
function getDuplicateAttributes ($elem) {
    const dom = $elem.get(0);

    // empty selection, or a node that carries no attributes
    if (!dom || !dom.attribs) {
        return new Map();
    }

    // identify tag text position in string
    // NOTE(review): relies on startIndex/endIndex being present on the node
    const start = dom.startIndex;
    const end = dom.children.length ? dom.children[0].startIndex : dom.endIndex + 1;
    // extract
    const html = fileContents.slice(start, end);

    // generator function walks the opening tag once and yields [name, value]
    // for every attribute as it is written in the source
    function* sourceAttributes () {
        // Either the closing ">" of the opening tag, or one attribute with an
        // optional double-quoted, single-quoted or unquoted value. Quoted
        // values are consumed whole, so text inside them is never mistaken
        // for an attribute, and no attribute name is ever spliced into a
        // pattern (names may contain regex metacharacters).
        const token = />|([^\s"'>\/=]+)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]*)))?/g;

        // start scanning after "<tagname"
        const tagName = /^<[^\s\/>]+/.exec(html);
        token.lastIndex = tagName ? tagName[0].length : 0;

        let match;
        while ((match = token.exec(html)) !== null) {
            // without children the slice also holds the closing tag: stop here
            if (match[0] === '>') {
                return;
            }
            const value = [match[2], match[3], match[4]].find((v) => v !== undefined);
            yield [match[1], value === undefined ? '' : value];
        }
    }

    // group source values under the name the parser stored them with
    const valuesByName = new Map();
    for (const [rawName, value] of sourceAttributes()) {
        // if the exact spelling is not a parsed key, the parser is assumed to
        // have lower-cased the name
        const isParsedName = Object.prototype.hasOwnProperty.call(dom.attribs, rawName);
        const name = isParsedName ? rawName : rawName.toLowerCase();
        if (!valuesByName.has(name)) {
            valuesByName.set(name, []);
        }
        valuesByName.get(name).push(value);
    }

    // the DOM will contain all attribute names once
    const doubleAttributeList = Object.keys(dom.attribs)
        // compound attribute names with all found values
        .map((attr) => [attr, valuesByName.get(attr) || []])
        // filter for doubles
        .filter((entry) => entry[1].length > 1);

    return new Map(doubleAttributeList);
}

你没有说明找到重复属性之后想做什么，所以这里只是把它们返回。

Answer 2

@ccprog的回答是有效的，下面是一个小的ES5重构版本：

// Placeholder path of the HTML file to inspect.
var file = 'some file';
// Keep the raw file text: getDuplicateAttributes() slices the original tag
// source out of this string.
var fileContents = fs.readFileSync(file, 'utf8');
// Parse with the htmlparser2 backend and request source positions.
// getDuplicateAttributes() reads `startIndex` / `endIndex` from the DOM nodes;
// presumably these two index options are what make the parser record them
// (verify against the cheerio / htmlparser2 version in use).
var $ = cheerio.load(fileContents, {
  useHtmlParser2: true,
  withStartIndices: true,
  withEndIndices: true
});

/**
 * List the attributes that are written more than once in an element's
 * opening tag.
 *
 * The parsed DOM keeps each attribute name only once, so the original tag text
 * is sliced out of `fileContents` (using the node's source indices) and
 * tokenized again.
 *
 * @param {object} $elem - cheerio selection; only its first element is inspected.
 * @returns {string[]} names (as they appear in `dom.attribs`) of the attributes
 *   that occur at least twice in the source; empty for an empty selection.
 */
function getDuplicateAttributes ($elem) {
  var dom = $elem.get(0);

  // empty selection, or a node that carries no attributes
  if (!dom || !dom.attribs) {
    return [];
  }

  // identify tag text position in fileContents
  // NOTE(review): relies on startIndex/endIndex being present on the node
  var start = dom.startIndex;
  var end = dom.children.length ? dom.children[0].startIndex : dom.endIndex + 1;

  // extract
  var html = fileContents.slice(start, end);

  // Split the tag (minus "<tagname") into tokens: either the closing ">" or
  // one attribute with its optional quoted/unquoted value. Quoted values are
  // consumed whole, so text inside them is never counted as an attribute, and
  // no attribute name is spliced into a pattern. match() returns null when
  // nothing matches, hence the fallback to an empty list.
  var tokens = html
    .replace(/^<[^\s\/>]+/, '')
    .match(/>|[^\s"'>\/=]+(?:\s*=\s*(?:"[^"]*"|'[^']*'|[^\s>]*))?/g) || [];

  // without children the slice also holds the closing tag: keep only the
  // tokens of the opening tag
  var tagEnd = tokens.indexOf('>');
  if (tagEnd !== -1) {
    tokens = tokens.slice(0, tagEnd);
  }

  // count occurrences under the name the parser stored the attribute with
  var counts = Object.create(null);
  tokens.forEach(function (token) {
    var name = token.split(/[\s=]/)[0];
    // if the exact spelling is not a parsed key, the parser is assumed to
    // have lower-cased the name
    if (!Object.prototype.hasOwnProperty.call(dom.attribs, name)) {
      name = name.toLowerCase();
    }
    counts[name] = (counts[name] || 0) + 1;
  });

  // the DOM will contain all attribute names once; filter for doubles
  return Object.keys(dom.attribs).filter(function (attr) {
    return counts[attr] > 1;
  });
}

// Example call. getDuplicateAttributes() uses $elem.get(0), so only the first
// element matched by the selector is inspected; the result is the list of
// attribute names that are duplicated on that element.
var duplicatedAttrs = getDuplicateAttributes($(".some-elem"));

这段代码所做的改动：

- 删除了生成器
- 从ES6改写为ES5
- 改进了RegExp
- 使用string.match()代替regexp.exec()

如何检查元素是否具有cheerio js的重复属性

2 个答案: