import urllib2
import urllib
from lxml.html import fromstring
from lxml.html.clean import Cleaner
from formatter import NullFormatter
import cookielib
import urllib,time
import urlparse
import datetime
import new
from htmllib import HTMLParser
from lxml.html import fromstring
from lxml.html.clean import Cleaner
import urllib2
import sys,popen2,os
import urlparse
def tagclean(url, Data=None):
    """Fetch *url* and return the visible text of its <body>.

    Scripts, styles and link tags are dropped, and the listed structural
    tags are removed from the tree entirely; the remaining text is
    returned as an ASCII byte string (non-ASCII characters ignored).
    """
    # Tags stripped wholesale from the parsed document.
    strip_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6',
                  'div', 'span', 'img', 'area', 'map']
    cleaner = Cleaner(meta=False, safe_attrs_only=False, page_structure=False,
                      scripts=True, style=True, links=True,
                      remove_tags=strip_tags)
    markup = urllib2.urlopen(url).read()
    document = fromstring(markup)
    body = document.xpath('/html/body')[0]
    return cleaner.clean_html(body).text_content().encode('ascii', 'ignore')
def writetofile(text, filename):
    """Append str(text) to *filename*, creating the file if needed.

    The original opened the file without a context manager (leaking the
    handle if the write raised) and built the name with a pointless
    ``"" + filename + ""`` concatenation; both are fixed here.
    """
    with open(filename, "a") as writefile:
        writefile.write(str(text))
if __name__ == "__main__":
    url = raw_input("Enter url:")
    # Build a filesystem-safe name from the part after "http://".
    # NOTE(review): assumes the URL starts with "http://" — the split
    # raises IndexError otherwise; confirm that is acceptable for this tool.
    spliturl = url.split("http://")[1].replace(".", "_")
    metin = str(tagclean(url))
    # BUG FIX: the original passed the undefined name `text` (NameError);
    # the cleaned text is stored in `metin`.
    writetofile(metin, spliturl + ".txt")
在我的代码中还有一个 url scanner，如下：
下def scanurl(url):
print "saving: ",url,datetime.datetime.now().strftime("%H:%M:%S")
tmp=urllib.urlretrieve(url)
print "saving finished",datetime.datetime.now().strftime("%H:%M:%S")
parser= HTMLParser(NullFormatter( ))
parser.feed( open(tmp[0]).read( ) )
urls=[]
for a in parser.anchorlist:
urls.append(urlparse.urljoin( url, a ))
return urls
我想将 tagclean 与此结合起来……
答案 0（得分：0）：
我在这里疯狂猜测,因为我真的不知道你想要达到的目的,但你不是要取代
parser.feed( open(tmp[0]).read( ) )
与
parser.feed( tagclean(url) )
如果没有,你真的需要详细说明你的问题。