我正在尝试从一个网站上爬取不重复的 Web 链接:仅在末尾多出一个“/”或“#”的相同链接也应视为重复。我从另一个 Stack Overflow 帖子中得到了下面的代码,但运行时报错:TypeError: Cannot mix str and non-str arguments。
import bs4 as bs
from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
# URL of the page to scrape. It is kept as a str (not the downloaded bytes)
# because it is later used as the base for urllib.parse.urljoin(), which
# raises "TypeError: Cannot mix str and non-str arguments" when given a
# bytes base together with str hrefs.
BASE_url = "https://www.census.gov/programs-surveys/popest.html"

# Download the page body; the context manager closes the HTTP response.
with urllib.request.urlopen(BASE_url) as response:
    page_html = response.read()

soup = bs.BeautifulSoup(page_html, "html.parser")

# Output CSV with a single "WebLinks" column. The handle stays open here
# because the link-writing loop at the end of the script writes to `f`.
filename = "C742TaskMisajon.csv"
f = open(filename, "w")
headers = "WebLinks\n"
f.write(headers)

# Every <a> tag on the page; their href attributes are the candidate links.
all_links = soup.find_all('a')
url_set = set()
def clean_links(tags, base_url: str) -> set:
    """Return the unique, normalized absolute URLs referenced by *tags*.

    Each tag's ``href`` is resolved against *base_url* and then any trailing
    ``/`` or ``#`` characters are removed, so variants of the same link
    (``/a``, ``/a/``, ``/a#``) collapse into a single entry.

    Args:
        tags: Iterable of objects exposing ``get('href')``, e.g. the result
            of ``soup.find_all('a')``.
        base_url: URL of the page the tags were taken from, as a ``str``.

    Returns:
        A set of absolute URL strings without trailing ``/`` or ``#``.

    Raises:
        TypeError: Propagated from ``urljoin`` when *base_url* is ``bytes``
            (for example the downloaded page body) and an href is ``str``.
    """
    cleaned_links = set()
    for tag in tags:
        link = tag.get('href')
        if link is None:
            # Anchor without an href attribute: nothing to record.
            continue
        # Resolve first, normalize second: stripping before the join would
        # turn "/" (site root) into "" and resolve to the current page.
        full_url = urllib.parse.urljoin(base_url, link)
        cleaned_links.add(full_url.rstrip('/#'))
    return cleaned_links
# Collect the de-duplicated links and write one per line under the CSV
# header. The try/finally guarantees the output file is flushed and closed
# even if link extraction or writing fails.
try:
    cleaned_links = clean_links(all_links, BASE_url)
    # Sets have no defined order; sort so repeated runs give identical files.
    for link in sorted(cleaned_links):
        f.write(str(link) + '\n')
finally:
    f.close()