如何从此Python代码中删除重复的链接?
import csv
import requests
from bs4 import BeautifulSoup, SoupStrainer
import bs4

# Fetch the page and collect every <a> tag in document order.
search_link = "https://www.census.gov/data/tables/2016/demo/popest/state-total.html"
r = requests.get(search_link)
soup = BeautifulSoup(r.text, 'html.parser')
results = soup.find_all('a')

# Dump every href to a text file, one per line. Duplicates are NOT filtered
# here. The context manager closes the file even if a write fails.
with open('testfile.txt', 'w') as file:
    for link in results:
        S = link.get('href')
        file.write("%s \n" % S)
print(len(results))

# Append the same hrefs to data.csv, one single-column row per link.
# The file is opened once instead of once per row, and newline='' lets the
# csv module control line endings itself (avoids blank rows on Windows).
csvfile = "data.csv"
with open(csvfile, "a", newline='') as fp:
    wr = csv.writer(fp, dialect='excel')
    for link in results:
        S = link.get('href')
        csvRow = [S]
        wr.writerow(csvRow)
答案 0 :(得分:0)
在您的代码中替换:
# Original loop: writes the href of every link, duplicates included.
for link in results:
    S = link.get('href')
    file.write("%s \n" % S)
使用:
# Hrefs already written; a set gives O(1) membership tests and, because the
# loop follows `results`, the first-seen order of the links is preserved.
existing_links = set()
for link in results:
    S = link.get('href')
    # Write the link only if it has not been written before.
    if S not in existing_links:
        file.write("%s \n" % S)
        # Remember the link so later duplicates are skipped.
        existing_links.add(S)
答案 1 :(得分:0)
对检索到的 href 值使用 set:
# Building a set keeps one copy of each href. Note that set iteration order
# is arbitrary, so the links are not written in page order.
for s in set(map(lambda x: x.get('href'), results)):
    file.write("{}\n".format(s))