下面的代码会提取 HTML 文件中的所有链接并将其写入文本文件,但它也会把重复的行(链接)写进去。有没有办法确保不会写入文件中已经存在的链接?有没有现成的方法,这样我就不必手动编写去重的代码?
from HTMLParser import HTMLParser
class MyHTMLParser(HTMLParser):
    """Print every ``<a href>`` value and append it to ``index_link.txt``.

    Links are written in the order they are encountered. No de-duplication
    is performed: a link that occurs several times is written several times.
    """

    def handle_starttag(self, tag, attrs):
        """Handle one start tag reported by ``HTMLParser``.

        Args:
            tag: Tag name as passed in by the parser.
            attrs: List of ``(name, value)`` pairs; ``value`` is ``None``
                for an attribute written without a value (``<a href>``).
        """
        if tag != "a":
            return
        for name, value in attrs:
            # A valueless href has nothing to record, and concatenating
            # None with "\n" would raise TypeError.
            if name != "href" or value is None:
                continue
            print(value)
            # Close the handle after each write so that a page with many
            # links does not accumulate open file descriptors.
            with open("index_link.txt", "a+") as f:
                f.write(value + "\n")
def main():
    """Parse ``index.html`` for links and copy the page to ``textfile.html``.

    The page is read in full, fed to ``MyHTMLParser`` and then written
    unchanged to ``textfile.html``. If ``index.html`` cannot be opened,
    ``No file found`` is printed and nothing else is done.
    """
    parser = MyHTMLParser()
    try:
        with open("index.html") as source:
            contents = source.read()
    except IOError:
        # open() raises when the file is missing, so checking the mode of
        # the returned handle can never detect that case.
        print("No file found")
        return
    parser.feed(contents)
    with open("textfile.html", "w+") as target:
        target.write(contents)


if __name__ == "__main__":
    main()
答案 0 :(得分:2)
您需要自己记录已经找到的链接,例如使用 set:
class MyHTMLParser(HTMLParser):
    """Print each distinct ``<a href>`` value and append it to ``index_link.txt``.

    Duplicates are suppressed per parser instance through ``links_found``.
    The output file is opened in append mode, so whatever it already
    contains is left in place.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # href values this instance has already written
        self.links_found = set()

    def handle_starttag(self, tag, attrs):
        """Record the not-yet-seen ``href`` values of an ``<a>`` start tag.

        Args:
            tag: Tag name as passed in by the parser.
            attrs: List of ``(name, value)`` pairs; ``value`` is ``None``
                for an attribute written without a value (``<a href>``).
        """
        if tag != "a":
            return
        for name, value in attrs:
            # Skip valueless hrefs: None is not a link, and None + "\n"
            # would raise TypeError after it had been added to the set.
            if name != "href" or value is None:
                continue
            if value in self.links_found:
                continue
            self.links_found.add(value)
            print(value)
            with open("index_link.txt", "a+") as f:
                f.write(value + "\n")
如果您想按顺序保留链接,也可以使用简单列表,而不是直接将它们写入文件:
from HTMLParser import HTMLParser
class MyHTMLParser(HTMLParser):
    """Collect the distinct ``<a href>`` values of a document in first-seen order.

    Nothing is written to disk; the result is available afterwards as the
    list ``found_links``.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # distinct href values, in the order they were first encountered
        self.found_links = []
        # same values as found_links, kept as a set so the duplicate check
        # does not have to scan the whole list for every tag
        self._seen_links = set()

    def handle_starttag(self, tag, attrs):
        """Append the ``href`` of an ``<a>`` start tag if it is new.

        Args:
            tag: Tag name as passed in by the parser.
            attrs: List of ``(name, value)`` pairs; ``value`` is ``None``
                for an attribute written without a value (``<a href>``).
        """
        if tag != "a":
            return
        # dict() keeps the last value when an attribute is repeated
        href = dict(attrs).get("href")
        # None covers both "no href attribute" and "href without a value";
        # storing None would break a later '\n'.join over found_links.
        if href is None or href in self._seen_links:
            return
        self._seen_links.add(href)
        self.found_links.append(href)
def main():
    """Extract the links of ``index.html`` and save them, then copy the page.

    The distinct links collected by ``MyHTMLParser`` are written one per
    line to ``index_link.txt`` (the file is overwritten), and the page text
    is written unchanged to ``textfile.html``.
    """
    link_parser = MyHTMLParser()

    with open("index.html") as page:
        contents = page.read()
    link_parser.feed(contents)

    # One link per line, with a newline after the last one as well.
    link_text = "\n".join(link_parser.found_links)
    with open("index_link.txt", "w") as link_file:
        link_file.write(link_text + "\n")

    with open("textfile.html", "w") as page_copy:
        page_copy.write(contents)


if __name__ == "__main__":
    main()
答案 1 :(得分:1)
这很简单:只需用一个列表来保存已经找到的链接即可,这里我使用 html_links 变量。
from HTMLParser import HTMLParser
class MyHTMLParser(HTMLParser):
    """Print each distinct ``<a href>`` value and append it to ``index_link.txt``.

    Links already seen by this instance are remembered in the list
    ``html_links`` and are not written again.
    """

    def __init__(self):
        # Call the base initializer explicitly so the parser's internal
        # state exists before feed() is used.
        HTMLParser.__init__(self)
        # distinct href values in first-seen order
        self.html_links = []

    def handle_starttag(self, tag, attrs):
        """Record the not-yet-seen ``href`` values of an ``<a>`` start tag.

        Args:
            tag: Tag name as passed in by the parser.
            attrs: List of ``(name, value)`` pairs; ``value`` is ``None``
                for an attribute written without a value (``<a href>``).
        """
        if tag != "a":
            return
        for name, value in attrs:
            # Skip valueless hrefs: None + "\n" would raise TypeError.
            if name != "href" or value is None:
                continue
            if value in self.html_links:
                continue
            print(value)
            self.html_links.append(value)
            # Close the handle after each write instead of leaking one
            # open file per link.
            with open("index_link.txt", "a+") as f:
                f.write(value + "\n")
def main():
    """Parse ``index.html`` for links and copy the page to ``textfile.html``.

    The page is read in full, fed to ``MyHTMLParser`` and then written
    unchanged to ``textfile.html``. If ``index.html`` cannot be opened,
    ``No file found`` is printed and nothing else is done.
    """
    parser = MyHTMLParser()
    try:
        with open("index.html") as page:
            contents = page.read()
    except IOError:
        # A missing file makes open() raise; it never returns a handle
        # whose mode could be inspected, so the failure is handled here.
        print("No file found")
        return
    parser.feed(contents)
    with open("textfile.html", "w+") as page_copy:
        page_copy.write(contents)


if __name__ == "__main__":
    main()
答案 2 :(得分:-1)
使用 set()。不要把链接直接写入文件(这样做本来就很低效),试试这个:
class MyHTMLParser(HTMLParser):
    """Collect the distinct ``<a href>`` values of a document in ``my_links``.

    Nothing is printed or written to disk; ``my_links`` is a set, so the
    order in which links were encountered is not kept.
    """

    def __init__(self):
        # Call HTMLParser's own initializer so the parser's internal state
        # exists before feed() is used.
        HTMLParser.__init__(self)
        self.my_links = set()

    def handle_starttag(self, tag, attrs):
        """Add the ``href`` values of an ``<a>`` start tag to ``my_links``.

        Args:
            tag: Tag name as passed in by the parser.
            attrs: List of ``(name, value)`` pairs; ``value`` is ``None``
                for an attribute written without a value (``<a href>``).
        """
        if tag != "a" or not attrs:
            return None
        for name, value in attrs:
            # A valueless href is not a link and could not be written to a
            # text file later on.
            if name == "href" and value is not None:
                self.my_links.add(value)
然后检索链接:
# Instantiate the parser class defined above.
parser = MyHTMLParser()
# ... do your parsing here
links = parser.my_links
with open('path/to/file', 'w') as f:
    for link in links:
        # One link per line; without the separator all links would run
        # together into a single string.
        f.write(link + "\n")