将HTML精简为仅保留允许的标签

时间:2017-04-29 20:09:11

标签: javascript node.js regex

我想删除所有HTML标记(但保留标记中的内容),只有列表中允许的标记除外。

我想使用Node。

以下正则表达式可以匹配标记 /<[a|br].+?>|<\/[a]>/g ,但是如何继续删除除匹配标记之外的所有标记?

2 个答案:

答案 0 :(得分:3)

仅替换 a 和 br 标记:

  1. 字符集[]中没有“或”运算符|。字符集[a|br]匹配的是a、|、b、r中的任意单个字符。请改用非捕获组(?:a|br)。
  2. 将.+?替换为.*?,这样才能匹配<a>和<br>。
  3. 字符集[a]可以直接写成a。
  4. 试试这个:

    /<(?:a|br).*?>|<\/a>/g
    

    https://regex101.com/r/KWJi01/2

    要替换除 a 和 br 以外的所有标记:

    使用此正则表达式:

    /<(?:(?!\/?a|br).*?)>/g
    

    https://regex101.com/r/KWJi01/3

答案 1 :(得分:0)

针对该问题,另一种不使用正则表达式的解决方案如下:

/**
 * Strips every element whose tag name is not whitelisted out of a string
 * of HTML, leaving the contents of each stripped element where it was.
 *
 * Depends on the browser DOM through the global `document`.
 *
 * @param {string} htmlString - the HTML markup to process.
 * @param {string[]} permitted - tag names to keep, in any letter case and
 *   without angle brackets or attributes.
 * @returns {string} the markup serialised again, with only permitted
 *   elements remaining.
 */
function filterHTML(htmlString, permitted) {
  // Tag names are compared in lowercase, so normalise the whitelist once.
  const keep = new Set(permitted.map((tagName) => tagName.toLowerCase()));

  // Let the DOM parse the markup inside a detached container element.
  const container = document.createElement('div');
  container.innerHTML = htmlString;

  // Snapshot the descendants up front: the loop below restructures the
  // tree while walking this list.
  const descendants = [...container.querySelectorAll('*')];

  for (const node of descendants) {
    if (keep.has(node.tagName.toLowerCase())) {
      continue;
    }

    // Unwrap the unpermitted element: hoist each child so that it becomes
    // a preceding sibling, then drop the emptied element itself.
    const parent = node.parentNode;
    for (let child = node.firstChild; child; child = node.firstChild) {
      parent.insertBefore(child, node);
    }
    parent.removeChild(node);
  }

  return container.innerHTML;
}

// Whitelist handed to filterHTML(): bare tag names only, without the
// '<' and '>' delimiters and without attributes.
let allowedTags = ['br', 'div'];

// Purely for display on the page: show the raw innerHTML of "#test", the
// filtered markup as text, and finally the filtered markup rendered as HTML.
{
  const sourceEl = document.querySelector('#test');
  const inputEl = document.querySelector('#input');
  const outputEl = document.querySelector('#output');
  const resultEl = document.querySelector('#result');

  inputEl.textContent = sourceEl.innerHTML;

  outputEl.textContent = filterHTML(sourceEl.innerHTML, allowedTags).trim();

  // Feed the displayed text back in as markup so the result gets rendered.
  resultEl.innerHTML = outputEl.textContent;
}

&#13;
&#13;
/**
 * Removes every element whose tag name is not in `permitted` from a string
 * of HTML, keeping the contents of each removed element in its place.
 *
 * Depends on the browser DOM through the global `document`.
 *
 * @param {string} htmlString - the HTML markup to process.
 * @param {string[]} permitted - tag names to keep, in any letter case and
 *   without angle brackets or attributes.
 * @returns {string} the markup serialised again, with only permitted
 *   elements remaining.
 */
function filterHTML(htmlString, permitted) {
  // Element tag names are lowercased before the lookup below, so the
  // whitelist has to be lowercased too; otherwise an entry such as 'BR'
  // could never match and the permitted element would be stripped.
  const permittedTags = new Set(permitted.map((tagName) => tagName.toLowerCase()));

  // Let the DOM parse the markup inside a detached container element.
  const temp = document.createElement('div');
  temp.innerHTML = htmlString;

  // Copy the descendants into an array first: the loop below restructures
  // the tree while walking this list.
  const allElements = Array.from(temp.querySelectorAll('*'));

  for (const element of allElements) {
    if (permittedTags.has(element.tagName.toLowerCase())) {
      continue;
    }

    // Move each child out so that it becomes a preceding sibling of the
    // unpermitted element, then remove the emptied element.
    const parent = element.parentNode;
    while (element.firstChild) {
      parent.insertBefore(element.firstChild, element);
    }
    parent.removeChild(element);
  }

  return temp.innerHTML;
}

// Tag names that filterHTML() is asked to keep.
let allowedTags = ['br', 'div'];

// Show the original markup of "#test", the filtered markup as plain text,
// and the filtered markup rendered as HTML.
{
  const testElement = document.querySelector('#test');
  const inputElement = document.querySelector('#input');
  const outputElement = document.querySelector('#output');
  const resultElement = document.querySelector('#result');

  inputElement.textContent = testElement.innerHTML;

  const filtered = filterHTML(testElement.innerHTML, allowedTags);
  outputElement.textContent = filtered.trim();

  // Read the text back and assign it as markup so that it is rendered.
  resultElement.innerHTML = outputElement.textContent;
}
&#13;
/* Presentation for the demo page: every <div> is drawn as a rounded,
   bordered box so the nesting of the markup is visible. */
div {
  border: 2px solid #000;
  margin: 0 0 1em 0;
  padding: 0.5em;
  box-sizing: border-box;
  border-radius: 1em;
}

/* Matches <div> elements whose id ends in "put" (here "#input" and
   "#output"); pre-wrap keeps the whitespace and line breaks of the
   markup that is shown as text. */
div[id$=put] {
  white-space: pre-wrap;
}

/* Red border for the box showing the unprocessed markup. */
#input {
  border-color: #f00;
}

/* Green border for the box showing the filtered markup. */
#output {
  border-color: limegreen;
}

/* Shared look of the caption pseudo-elements; a caption is only rendered
   where one of the rules below supplies a 'content' value. */
::before {
  color: #666;
  display: block;
  border-bottom: 1px solid #666;
  margin-bottom: 0.5em;
}

/* Caption texts for the three display boxes. */
#input::before {
  content: 'The original innerHTML of the "#test" element:';
}

#output::before {
  content: 'The filtered innerHTML, with only permitted elements remaining:'
}

#result::before {
  content: 'This element shows the filtered HTML of the "test" element:'
}
&#13;
<!-- Sample markup for the demo: the script reads the innerHTML of "#test"
     and passes it to filterHTML(). No comments are placed inside this
     element because they would become part of that innerHTML. -->
<div id="test">
  <div><span>
      <a href="#">link text!</a>
      <br />
      <hr />
      <div>div text!</div>
    </span></div>
</div>

<!-- Display boxes populated by the script: "#input" receives the original
     markup as text, "#output" the filtered markup as text, and "#result"
     the filtered markup rendered as HTML. -->
<div id="input"></div>
<div id="output"></div>
<div id="result"></div>
&#13;
&#13;
&#13;

JS Fiddle demo

参考文献: