用 Python 抓取指定网址的页面并清理标签，把正文直接保存到以网址命名的文件中

时间:2011-11-02 10:12:14

标签: python

import urllib2
import urllib
from lxml.html import fromstring
from lxml.html.clean import Cleaner
from formatter import NullFormatter
import cookielib
import urllib,time
import urlparse
import datetime
import new
from htmllib import HTMLParser
from lxml.html import fromstring
from lxml.html.clean import Cleaner
import urllib2
import sys,popen2,os
import urlparse

def tagclean(url, Data=None):
    """Fetch *url*, strip unwanted tags from the page body, and return
    its text content as an ASCII-only byte string (non-ASCII dropped).

    Data is accepted for backward compatibility but is not used.
    """
    # Close the response explicitly so the underlying socket is not
    # leaked (the original never closed it).
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        response.close()

    doc = fromstring(html)
    # remove_tags drops only the markup of these tags; their text
    # content is kept by lxml's Cleaner.
    tags = ['h1','h2','h3','h4','h5','h6', 'div', 'span', 'img', 'area', 'map']
    args = {'meta':False, 'safe_attrs_only':False, 'page_structure':False,
            'scripts':True, 'style':True, 'links':True, 'remove_tags':tags}
    cleaner = Cleaner(**args)

    # Work only on the document body.
    path = '/html/body'
    body = doc.xpath(path)[0]
    return cleaner.clean_html(body).text_content().encode('ascii', 'ignore')

def writetofile(text, filename):
    """Append str(text) to *filename*, creating the file if needed."""
    # "with" guarantees the handle is closed even if write() raises;
    # the original's ""+filename+"" concatenation was a no-op and is gone.
    with open(filename, "a") as writefile:
        writefile.write(str(text))

if __name__ == "__main__":
    url = raw_input("Enter url:")
    # Derive a filesystem-safe name from the part after "http://"
    # (NOTE(review): this raises IndexError for https:// or bare URLs —
    # confirm inputs are always http:// before hardening).
    spliturl = url.split("http://")[1].replace(".", "_")
    metin = str(tagclean(url))
    # Bug fix: the original passed the undefined name `text` here,
    # which raised a NameError; the cleaned text is in `metin`.
    writetofile(metin, spliturl + ".txt")

下面是我的 URL 扫描（url scanner）代码：

def scanurl(url):   
    print "saving: ",url,datetime.datetime.now().strftime("%H:%M:%S")
    tmp=urllib.urlretrieve(url)
    print "saving finished",datetime.datetime.now().strftime("%H:%M:%S")
    parser= HTMLParser(NullFormatter( ))
    parser.feed( open(tmp[0]).read( ) )
    urls=[]
    for a in parser.anchorlist:
        urls.append(urlparse.urljoin( url, a ))
return urls

我想把 tagclean 和这个 scanurl 结合起来……

1 个答案:

答案 0（得分：0）

我在这里疯狂猜测,因为我真的不知道你想要达到的目的,但你不是要取代

parser.feed( open(tmp[0]).read( ) )

parser.feed( tagclean(url) )

如果没有,你真的需要详细说明你的问题。