从文本中删除所有标记/脚本/函数/ html / css代码

时间:2017-11-07 13:30:06

标签: python regex

我正在尝试从文本中删除所有标记,脚本,html额外代码。

我尝试了几种方式,但最终文本仍包含HTML代码。我已将所有正则表达式和tecniques一起添加,以便如果一个错过了可以被另一个

捕获
def removeHTML(self, text):
    soup = BeautifulSoup(text)
    text = ''.join(soup.findAll(text=True))
    # return text
    parsedText = re.compile(r'<.*?>')
    scripts = re.compile(r'<(script).*?</\1>(?s)')
    css = re.compile(r'<style.*?/style>')
    tags = re.compile(r'<.*?>')
    text = parsedText.sub('', text)
    text = scripts.sub('', text)
    text = css.sub('', text)
    text = tags.sub('', text)       
    text = re.sub('{{.*?}}', '', text)
    return text

许多脚本和功能都有文字。

对此存在更好的解决方案吗?

未捕获的文本之一是 -

    function startupApp(stencilBootstrap) {
      stencilBootstrap("pages/home", "{\"themeImageSizes\":{\"200\":{\"width\":200,\"height\":200},\"500\":{\"width\":500,\"height\":500},\"1024\":{\"width\":1024,\"height\":1024},\"account-product-thumbnail\":{\"width\":500,\"height\":500},\"core-swatch\":{\"width\":150,\"height\":150},\"core-product-picklist\":{\"width\":80,\"height\":200},\"account-logo\":{\"width\":500,\"height\":200},\"logo\":{\"width\":500,\"height\":200},\"thumbnail\":{\"width\":100,\"height\":100},\"512x1024\":{\"width\":512,\"height\":1024},\"1280x650\":{\"width\":1280,\"height\":650},\"1600x700\":{\"width\":1600,\"height\":700},\"account-product-thumb\":{\"width\":440,\"height\":600}},\"headerNavMenu\":\"Menu\",\"headerNavClose\":\"Close\",\"headerNavSearch\":\"Search\",\"required\":\"*\",\"selectState\":\"Choose a State/Province\",\"validationRequired\":\"This field is required.\",\"validationEmail\":\"Your E-mail address appears to be invalid.\",\"validationNumber\":\"You can enter only numbers in this field.\",\"validationNumMax\":\"Please enter a number less than undefined.\",\"validationNumMin\":\"Please enter a number greater than undefined.\",\"validationNumRange\":\"Please enter a number greater than undefined and less than undefined.\",\"validationMaxLength\":\"Maximum undefined characters allowed.\",\"validationMinLength\":\"Minimum undefined characters allowed.\",\"validationMaxChecked\":\"Maximum undefined options allowed.\",\"validationMinChecked\":\"Please select minimum undefined options.\",\"validationMaxSelected\":\"Maximum undefined selection allowed.\",\"validationMinSelected\":\"Minimum undefined selection allowed.\",\"validationNotEqual\":\"Fields do not match.\",\"validationDifferent\":\"Fields cannot be the same as each other.\",\"addSuccess\":\"*product* has been successfully added to your cart. View your *cart_link*.\",\"addToCart\":\"Add To Cart\",\"productYouSave\":\"You save\",\"productIncludingTax\":\"(Inc Tax)\",\"productExcludingTax\":\"(Exc Tax)\",\"cartLink\":\"cart\",\"checkoutLink\":\"product.checkout_link\",\"homeLink\":\"product.home_link\",\"preOrder\":\"Pre-Order\",\"soldOut\":\"Sold Out\",\"urlsCart\":\"/cart.php\",\"urlsCheckout\":\"/checkout.php\",\"productsPerPage\":21,\"productImageZoom\":true,\"disableProductAjax\":false,\"compareProducts\":\"Compare Products\",\"compareItems\":\"Comparing *num* Products\",\"compareOpen\":\"Show\",\"compareClose\":\"Hide\",\"compareAdd\":\"Add a product\",\"compareRemove\":\"Remove product\",\"searchHideForm\":\"Hide search form\",\"searchShowForm\":\"Show search form\",\"carouselDelay\":5000}").load();
    }



    // Exported in app.js
    window.stencilBootstrap("pages/home", "{\"themeImageSizes\":{\"200\":{\"width\":200,\"height\":200},\"500\":{\"width\":500,\"height\":500},\"1024\":{\"width\":1024,\"height\":1024},\"account-product-thumbnail\":{\"width\":500,\"height\":500},\"core-swatch\":{\"width\":150,\"height\":150},\"core-product-picklist\":{\"width\":80,\"height\":200},\"account-logo\":{\"width\":500,\"height\":200},\"logo\":{\"width\":500,\"height\":200},\"thumbnail\":{\"width\":100,\"height\":100},\"512x1024\":{\"width\":512,\"height\":1024},\"1280x650\":{\"width\":1280,\"height\":650},\"1600x700\":{\"width\":1600,\"height\":700},\"account-product-thumb\":{\"width\":440,\"height\":600}},\"headerNavMenu\":\"Menu\",\"headerNavClose\":\"Close\",\"headerNavSearch\":\"Search\",\"required\":\"*\",\"selectState\":\"Choose a State/Province\",\"validationRequired\":\"This field is required.\",\"validationEmail\":\"Your E-mail address appears to be invalid.\",\"validationNumber\":\"You can enter only numbers in this field.\",\"validationNumMax\":\"Please enter a number less than undefined.\",\"validationNumMin\":\"Please enter a number greater than undefined.\",\"validationNumRange\":\"Please enter a number greater than undefined and less than undefined.\",\"validationMaxLength\":\"Maximum undefined characters allowed.\",\"validationMinLength\":\"Minimum undefined characters allowed.\",\"validationMaxChecked\":\"Maximum undefined options allowed.\",\"validationMinChecked\":\"Please select minimum undefined options.\",\"validationMaxSelected\":\"Maximum undefined selection allowed.\",\"validationMinSelected\":\"Minimum undefined selection allowed.\",\"validationNotEqual\":\"Fields do not match.\",\"validationDifferent\":\"Fields cannot be the same as each other.\",\"addSuccess\":\"*product* has been successfully added to your cart. View your *cart_link*.\",\"addToCart\":\"Add To Cart\",\"productYouSave\":\"You save\",\"productIncludingTax\":\"(Inc Tax)\",\"productExcludingTax\":\"(Exc Tax)\",\"cartLink\":\"cart\",\"checkoutLink\":\"product.checkout_link\",\"homeLink\":\"product.home_link\",\"preOrder\":\"Pre-Order\",\"soldOut\":\"Sold Out\",\"urlsCart\":\"/cart.php\",\"urlsCheckout\":\"/checkout.php\",\"productsPerPage\":21,\"productImageZoom\":true,\"disableProductAjax\":false,\"compareProducts\":\"Compare Products\",\"compareItems\":\"Comparing *num* Products\",\"compareOpen\":\"Show\",\"compareClose\":\"Hide\",\"compareAdd\":\"Add a product\",\"compareRemove\":\"Remove product\",\"searchHideForm\":\"Hide search form\",\"searchShowForm\":\"Show search form\",\"carouselDelay\":5000}").load();

 snippet location footer 

  {
    "@context": "http://schema.org",
    "@type": "WebSite",
    "name": "Cherrico Pottery, LLC",
    "url": "http://store.cherricopottery.com/"
  }


  var schema = document.createElement('script');
  schema.type = 'application/ld+json';
  schema.text = JSON.stringify({
    "@context": "http://schema.org",
    "@type": "BreadcrumbList",
    "itemListElement": [
        {
          "@type": "ListItem",
          "position": 0 + 1,
          "item": {
            "@id": window.location.href,
            "name": "Home"
          }
        }
    ]
  });

  document.querySelector('body').appendChild(schema);

  var script = document.getElementById('schema-breadcrumbs');
  script.parentElement.removeChild(script);

0 个答案:

没有答案