我正在尝试从文本中删除所有标记,脚本,html额外代码。
我尝试了几种方式,但最终文本仍包含HTML代码。我已将所有正则表达式和tecniques一起添加,以便如果一个错过了可以被另一个
捕获def removeHTML(self, text):
soup = BeautifulSoup(text)
text = ''.join(soup.findAll(text=True))
# return text
parsedText = re.compile(r'<.*?>')
scripts = re.compile(r'<(script).*?</\1>(?s)')
css = re.compile(r'<style.*?/style>')
tags = re.compile(r'<.*?>')
text = parsedText.sub('', text)
text = scripts.sub('', text)
text = css.sub('', text)
text = tags.sub('', text)
text = re.sub('{{.*?}}', '', text)
return text
许多脚本和功能都有文字。
对此存在更好的解决方案吗?
未捕获的文本之一是 -
function startupApp(stencilBootstrap) {
stencilBootstrap("pages/home", "{\"themeImageSizes\":{\"200\":{\"width\":200,\"height\":200},\"500\":{\"width\":500,\"height\":500},\"1024\":{\"width\":1024,\"height\":1024},\"account-product-thumbnail\":{\"width\":500,\"height\":500},\"core-swatch\":{\"width\":150,\"height\":150},\"core-product-picklist\":{\"width\":80,\"height\":200},\"account-logo\":{\"width\":500,\"height\":200},\"logo\":{\"width\":500,\"height\":200},\"thumbnail\":{\"width\":100,\"height\":100},\"512x1024\":{\"width\":512,\"height\":1024},\"1280x650\":{\"width\":1280,\"height\":650},\"1600x700\":{\"width\":1600,\"height\":700},\"account-product-thumb\":{\"width\":440,\"height\":600}},\"headerNavMenu\":\"Menu\",\"headerNavClose\":\"Close\",\"headerNavSearch\":\"Search\",\"required\":\"*\",\"selectState\":\"Choose a State/Province\",\"validationRequired\":\"This field is required.\",\"validationEmail\":\"Your E-mail address appears to be invalid.\",\"validationNumber\":\"You can enter only numbers in this field.\",\"validationNumMax\":\"Please enter a number less than undefined.\",\"validationNumMin\":\"Please enter a number greater than undefined.\",\"validationNumRange\":\"Please enter a number greater than undefined and less than undefined.\",\"validationMaxLength\":\"Maximum undefined characters allowed.\",\"validationMinLength\":\"Minimum undefined characters allowed.\",\"validationMaxChecked\":\"Maximum undefined options allowed.\",\"validationMinChecked\":\"Please select minimum undefined options.\",\"validationMaxSelected\":\"Maximum undefined selection allowed.\",\"validationMinSelected\":\"Minimum undefined selection allowed.\",\"validationNotEqual\":\"Fields do not match.\",\"validationDifferent\":\"Fields cannot be the same as each other.\",\"addSuccess\":\"*product* has been successfully added to your cart. View your *cart_link*.\",\"addToCart\":\"Add To Cart\",\"productYouSave\":\"You save\",\"productIncludingTax\":\"(Inc Tax)\",\"productExcludingTax\":\"(Exc Tax)\",\"cartLink\":\"cart\",\"checkoutLink\":\"product.checkout_link\",\"homeLink\":\"product.home_link\",\"preOrder\":\"Pre-Order\",\"soldOut\":\"Sold Out\",\"urlsCart\":\"/cart.php\",\"urlsCheckout\":\"/checkout.php\",\"productsPerPage\":21,\"productImageZoom\":true,\"disableProductAjax\":false,\"compareProducts\":\"Compare Products\",\"compareItems\":\"Comparing *num* Products\",\"compareOpen\":\"Show\",\"compareClose\":\"Hide\",\"compareAdd\":\"Add a product\",\"compareRemove\":\"Remove product\",\"searchHideForm\":\"Hide search form\",\"searchShowForm\":\"Show search form\",\"carouselDelay\":5000}").load();
}
// Exported in app.js
window.stencilBootstrap("pages/home", "{\"themeImageSizes\":{\"200\":{\"width\":200,\"height\":200},\"500\":{\"width\":500,\"height\":500},\"1024\":{\"width\":1024,\"height\":1024},\"account-product-thumbnail\":{\"width\":500,\"height\":500},\"core-swatch\":{\"width\":150,\"height\":150},\"core-product-picklist\":{\"width\":80,\"height\":200},\"account-logo\":{\"width\":500,\"height\":200},\"logo\":{\"width\":500,\"height\":200},\"thumbnail\":{\"width\":100,\"height\":100},\"512x1024\":{\"width\":512,\"height\":1024},\"1280x650\":{\"width\":1280,\"height\":650},\"1600x700\":{\"width\":1600,\"height\":700},\"account-product-thumb\":{\"width\":440,\"height\":600}},\"headerNavMenu\":\"Menu\",\"headerNavClose\":\"Close\",\"headerNavSearch\":\"Search\",\"required\":\"*\",\"selectState\":\"Choose a State/Province\",\"validationRequired\":\"This field is required.\",\"validationEmail\":\"Your E-mail address appears to be invalid.\",\"validationNumber\":\"You can enter only numbers in this field.\",\"validationNumMax\":\"Please enter a number less than undefined.\",\"validationNumMin\":\"Please enter a number greater than undefined.\",\"validationNumRange\":\"Please enter a number greater than undefined and less than undefined.\",\"validationMaxLength\":\"Maximum undefined characters allowed.\",\"validationMinLength\":\"Minimum undefined characters allowed.\",\"validationMaxChecked\":\"Maximum undefined options allowed.\",\"validationMinChecked\":\"Please select minimum undefined options.\",\"validationMaxSelected\":\"Maximum undefined selection allowed.\",\"validationMinSelected\":\"Minimum undefined selection allowed.\",\"validationNotEqual\":\"Fields do not match.\",\"validationDifferent\":\"Fields cannot be the same as each other.\",\"addSuccess\":\"*product* has been successfully added to your cart. View your *cart_link*.\",\"addToCart\":\"Add To Cart\",\"productYouSave\":\"You save\",\"productIncludingTax\":\"(Inc Tax)\",\"productExcludingTax\":\"(Exc Tax)\",\"cartLink\":\"cart\",\"checkoutLink\":\"product.checkout_link\",\"homeLink\":\"product.home_link\",\"preOrder\":\"Pre-Order\",\"soldOut\":\"Sold Out\",\"urlsCart\":\"/cart.php\",\"urlsCheckout\":\"/checkout.php\",\"productsPerPage\":21,\"productImageZoom\":true,\"disableProductAjax\":false,\"compareProducts\":\"Compare Products\",\"compareItems\":\"Comparing *num* Products\",\"compareOpen\":\"Show\",\"compareClose\":\"Hide\",\"compareAdd\":\"Add a product\",\"compareRemove\":\"Remove product\",\"searchHideForm\":\"Hide search form\",\"searchShowForm\":\"Show search form\",\"carouselDelay\":5000}").load();
snippet location footer
{
"@context": "http://schema.org",
"@type": "WebSite",
"name": "Cherrico Pottery, LLC",
"url": "http://store.cherricopottery.com/"
}
var schema = document.createElement('script');
schema.type = 'application/ld+json';
schema.text = JSON.stringify({
"@context": "http://schema.org",
"@type": "BreadcrumbList",
"itemListElement": [
{
"@type": "ListItem",
"position": 0 + 1,
"item": {
"@id": window.location.href,
"name": "Home"
}
}
]
});
document.querySelector('body').appendChild(schema);
var script = document.getElementById('schema-breadcrumbs');
script.parentElement.removeChild(script);