Python:清理从网页抓取的字符串

时间:2016-09-19 02:12:12

标签: javascript python

我使用 Python 和 requests 库抓取了一个网站的内容,想清除其中所有的 HTML 和 JavaScript。

from lxml.html.clean import Cleaner
import lxml.html as html
# Parse the fetched page (r is the HTTP response obtained earlier) and
# immediately flatten it to plain text.
# NOTE(review): text_content() runs before the Cleaner, so the Cleaner below
# receives already-flattened text rather than markup -- this appears to be why
# the contents of <script>/<style> elements survive in the output.
text = html.document_fromstring(r.text).text_content()
# kill_tags lists elements to drop together with their content; remove_tags
# lists elements whose tag is stripped; style=True asks for styling removal.
cleaner = Cleaner(kill_tags=['noscript', 'img', 'a', 'h1'], remove_tags=['p'], style=True)
text = cleaner.clean_html(text)
# Collapse every run of whitespace into a single space.
text = ' '.join(text.split())

但结果中仍然残留着很多类似下面这样的内容(识别信息已用 *** 代替):

var STYLEID = \'9\', STATICURL = \'static/\', IMGDIR = \'comiis_xzs19lou\', VERHASH = \'wRx\', charset = \'gbk\',  cookiepre = \'bdRb_a91d_\', cookiedomain = \'.***\', cookiepath = \'/\', showusercard = \'1\', attackevasive = \'0\', disallowfloat = \'login|newthread\', creditnotice = \'****\', defaultstyle = \'\', REPORTURL = \'***==\', SITEURL = \'http://****/\', JSPATH = \'data/cache/\', CSSPATH = \'data/cache/style_\', DYNAMICURL = \'\'; body{background:#EBDFC5;}

var fid = parseInt(\'12\'), tid = parseInt(\'2474591\'); zoomstatus = parseInt(1);var imagemaxwidth = \'600\';var aimgcount = new Array(); #framesbO97X { margin:0px !important;border:0px !important;}#portal_block_1251 { margin:0px !important;border:0px !important;}#portal_block_1251 .dxb_bc { margin:0px !important;}#frameJo3fOn { margin:0px !important;border:0px !important;}#portal_block_1252 { margin:0px !important;border:0px !important;}#portal_block_1252 .dxb_bc { margin:0px !important;}#framerU9V5m { margin:0px !important;border:0px !important;}#portal_block_1253 { margin:0px !important;border:0px !important;}#portal_block_1253 .dxb_bc { margin:0px !important;}#frameRCK99J { margin:10px 0px 5px !important;}#frameJ7YGtB { margin:0px !important;border:0px !important;}#portal_block_1255 { margin:0px !important;border:0px !important;}#portal_block_1255 .dxb_bc { margin:0px !important;}#framerVqc7m { margin:0px !important;border:0px !important;}#portal_block_1256 { margin:0px !important;border:0px !important;}#portal_block_1256 .dxb_bc { margin:0px !important;}#frameFW5eQe { margin:0px !important;border:0px !important;}#frameKBtrA1 { margin:0px !important;border:0px !important;}#portal_block_1257 { margin:0px !important;border:0px !important;}#portal_block_1257 .dxb_bc { margin:0px !important;}#portal_block_1258 { margin:0px !important;border:0px !important;}#portal_block_1258 .dxb_bc { margin:0px !important;}

function succeedhandle_followmod(url, msg, values) { var fObj = $(\'followmod_\'+values[\'fuid\']); if(values[\'type\'] == \'add\') { fObj.innerHTML = \***\'; fObj.href = \'home.php?mod=spacecp&ac=follow&op=del&fuid=\'+values[\'fuid\']; } else if(values[\'type\'] == \'del\') { fObj.innerHTML = \***\'; fObj.href = \'home.php?mod=spacecp&ac=follow&op=add&hash=6a62d013&fuid=\'+values[\'fuid\']; } } _***(null, $C("t_f", null, "td"), "", "***", "***"); var rel_tid = "2474591"; var rel_title = "%C9%BD%B6%AB%CA%AF%BB%AF%BE%AD%C0%ED%B7%EB%B6%AB%C7%E0%B4%FE%B2%B6%EE%BF%D1%BA%C6%DA%BC%E4%CB%C0%CD%F6%A1%AA%A1%AA%CA%C7%B7%F1%B1%BB%C9%BD%B6%AB%CA%A1%BC%EC%B2%EC%D4%BA%BC%EC%B2%EC%B3%A4%CE%E2%C5%F4%B7%C9%C3%F0%BF%DA"; var rel_reltid = "0"; var rel_prepos = ""; var my_siteid = "7149150"; var rel_uid = "0"; var rel_views = "3909"; var rel_replies = "11"; var rel_page = "1"; var rel_show = "0"; _attachEvent(window, \'load\', getForbiddenFormula, document); function getForbiddenFormula() { var toGetForbiddenFormulaFIds = function () { ajaxget(\'plugin.php?id=cloudsearch&formhash=6a62d013\'); }; var a = document.body.getElementsByTagName(\'a\'); for(var i = 0;i document.documentElement.clientWidth) { $(\'***').style.cssFloat = \'right\'; $(\'***').style.left = \'auto\'; $(\'***\').style.right = 0; } else { $(\'***\').style.cssFloat = \'left\'; $(\'***\').style.left = (qrleft) + \'px\'; $(\'***\').style.right = \'auto\'; } } _attachEvent(window, \'scroll\', function () { ***; }) _attachEvent(window, \'load\', function() { ***; }, document); #scrolltop { display: none; } ul#navmenu ul { display: none; position: absolute; left: -233px; bottom: 5px; } ul#navmenu li:hover ul ul, ul#navmenu li.iehover ul ul, { display: none; } ul#navmenu li:hover ul, ul#navmenu ul li:hover ul, ul#navmenu ul ul li:hover ul, ul#navmenu li.iehover ul, ul#navmenu ul li.iehover ul, ul#navmenu ul ul li.iehover ul { display: block; } #jz52top a {margin: 6px 0;} #jz52top { visibility: visible; right: 10px; } 
#jz52topa { visibility: hidden;} #jz52top, #jz52top a { border: none;} #jz52top { position: fixed; bottom: 40px; display: block; width: 40px; background: none repeat scroll 0% 0% transparent; border: 0px #cdcdcd solid; border-radius: 3px; border-top: 0; cursor: pointer; } #jz52top:hover { text-decoration: none; } #jz52top a { display: block; width: 40px; height: 40px; padding: 0; line-height: 12px; text-align: center; color: #787878; text-decoration: none; background: #00a398 url(\'source/plugin/jz52_top/template/jz52top.png\') no-repeat 0 0; border-top: 0px #cdcdcd solid; } a.jz52topa:hover { background-position: -40px 0px !important;} a.replyfast { background-position: 0 -40px !important; } a.replyfast:hover { background-position: -40px -40px !important;} a.returnlist { background-position: 0 -80px !important; } a.returnlist:hover { background-position: -40px -80px !important;} a.returnboard { background-position: -80px -240px !important; } a.returnboard:hover { background-position: -120px -240px !important;} a.jzqr { background-position: 0 -120px !important; } a.jzqr:hover { background-position: -40px -120px !important;} a.jzwx { background-position: 0 -320px !important; } a.jzwx:hover { background-position: -40px -320px !important;} a.jzkf { background-position: -80px 0px !important; } a.jzkf:hover { background-position: -120px -0px !important;} a.jzfx { background-position: -80px -40px !important; } a.jzfx:hover { background-position: -120px -40px !important;} .jzfxn { background: #fff !important; width: 231px !important; height: 260px !important; } a.jzlast { background-position: -80px -80px !important; } a.jzlast:hover { background-position: -120px -80px !important;} a.jznext { background-position: -80px -120px !important; } a.jznext:hover { background-position: -120px -120px !important;} a.jzsct { background-position: 0px -160px !important; } a.jzsct:hover { background-position: -40px -160px !important;} a.jzscb { background-position: -80px -160px 
!important; } a.jzscb:hover { background-position: -120px -160px !important;} a.jzqqq { background-position: 0px -200px !important; } a.jzqqq:hover { background-position: -40px -200px !important;} a.jzwo { background-position: -80px -200px !important; } a.jzwo:hover { background-position: -120px -200px !important;} a.jzzdy { background-position: 0px -240px !important; } a.jzzdy:hover { background-position: -40px -240px !important;} a.jzfbzt { background-position: 0px -280px !important; } a.jzfbzt:hover { background-position: -40px -280px !important;} #jzqrn { background: #fff !important; width: 231px !important; height: 260px !important; } #jzqrn { border: 1px solid rgb(210, 210, 210); } #jzqrn p { font-size: 15px; padding-bottom: 15px; text-align: center; color: #999; font-family: Microsoft YaHei; } #jzwon { background: #fff !important; width: 231px !important; height: 260px !important; } #jzwon { border: 1px solid rgb(210, 210, 210); } #jzfxn { border: 1px solid rgb(210, 210, 210); } #jzfxn h3 { height: 23px; background: none repeat scroll 0% 0% rgb(250, 250, 250); border-bottom: 1px solid rgb(236, 236, 236); padding: 10px 0px 0px 10px; } #jzfxn .bdsharebuttonbox { padding: 13px 0px 0px 20px; } #jzfxn .bdsharebuttonbox a, #jzfxn .bdsharebuttonbox .bds_more { float: left; font-size: 12px; padding-left: 25px; line-height: 16px; text-align: left; height: 16px; background: url("***") no-repeat scroll 0px 0px ; background-repeat: no-repeat; cursor: pointer; margin: 6px 6px 6px 0px; text-indent: 0; overflow: hidden; width: 68px; } #jzfxn .bdsharebuttonbox .bds_qzone { background-position: 0px -52px !important; } #jzfxn .bdsharebuttonbox .bds_tsina { background-position: 0px -104px !important; } #jzfxn .bdsharebuttonbox .bds_tqq { background-position: 0px -260px !important; } #jzfxn .bdsharebuttonbox .bds_renren { background-position: 0px -208px !important; } #jzfxn .bdsharebuttonbox .bds_tqf { background-position: 0px -364px !important; } #jzfxn .bdsharebuttonbox 
.bds_tieba { background-position: 0px -728px !important; } #jzfxn .bdsharebuttonbox .bds_sqq { background-position: 0px -2652px !important; } #jzfxn .bdsharebuttonbox .bds_hi { background-position: 0px -416px !important; } #jzfxn .bdsharebuttonbox .bds_isohu { background-position: 0px -3016px !important; } #jzfxn .bdsharebuttonbox .bds_weixin { background-position: 0px -1612px !important; } #jzfxn .bdsharebuttonbox .bds_t163 { background-position: 0px -832px !important; } #jzfxn .bdsharebuttonbox .bds_tsohu { background-position: 0px -520px !important; } #jzfxn .bdsharebuttonbox .bds_baidu { background-position: 0px -2600px !important; } #jzfxn .bdsharebuttonbox .bds_qq { background-position: 0px -624px !important; } #jz52top a b { visibility: hidden; font-weight: normal; } // JavaScript Document function goTopEx(){ var obj=document.getElementById("goTopBtn"); function getScrollTop(){ return document.documentElement.scrollTop || document.body.scrollTop; } function setScrollTop(value){ if(document.documentElement.scrollTop){ document.documentElement.scrollTop=value; }else{ document.body.scrollTop=value; } } window.onscroll=function(){getScrollTop()>0?obj.style.display="":obj.style.display="none"; var h=document.body.scrollHeight - getScrollTop() - obj.offsetTop - obj.offsetHeight; obj.style.bottom=0+"px";

有什么办法可以把这些内容全部去掉吗?

2 个答案:

答案 0 :(得分:1)

如果想去掉 JavaScript,需要清除 <script> 标签。<noscript> 标签的唯一用途是在浏览器禁用脚本时显示替代的图像或文本。

# Build a Cleaner that also drops <script> elements (with their content),
# in addition to the tags the question already listed.
cleaner = Cleaner(
    kill_tags=['script', 'noscript', 'img', 'a', 'h1'],
    remove_tags=['p'],
    style=True,
)

答案 1 :(得分:0)

我的错误,这是我的代码中的拼写错误,但感谢提示!正确的代码:

from lxml.html.clean import Cleaner
import lxml.html as html
# Parse the response body into an element tree first, so that the Cleaner
# operates on real markup instead of already-flattened text.
document = html.document_fromstring(r.text)

# kill_tags: drop these elements together with their content.
# remove_tags: strip the <p> tag itself.
# style=True: strip styling.
cleaner = Cleaner(
    kill_tags=['script', 'noscript', 'img', 'a', 'h1'],
    remove_tags=['p'],
    style=True,
)

# Calling the Cleaner on a tree cleans it in place; the result is not needed.
cleaner(document)

# Flatten the cleaned tree to text and collapse whitespace runs to one space.
text = ' '.join(document.text_content().split())