I am trying to make a web crawler that recursively finds the external hyperlinks of a web page.
With the code below the crawler works, but it keeps finding and inserting links that are already saved in the database.
I added a SELECT query that counts the rows holding the same link, but nothing changed.
What is wrong?
Code:
import re
from urllib.parse import urlparse
from urllib.request import urlopen

from bs4 import BeautifulSoup

# cur and conn are assumed to be a module-level MySQL cursor and connection

def add_external_links(bs_obj, scheme, exclude_url, title):
    for link in bs_obj.find_all("a", href=re.compile(r"^(https|http|www|//)((?!" + exclude_url + r").)*$")):
        if link.attrs["href"].endswith("/"):
            link.attrs["href"] = link.attrs["href"][:-1]
        # Get matching rows
        select_in_return = cur.execute("SELECT * FROM external_links WHERE href=%s;", (link.attrs["href"],))
        if select_in_return == 0:
            if link.attrs["href"].startswith("//"):
                cur.execute("INSERT INTO external_links (idx, href, title, contents) VALUES (%s, %s, %s, %s);",
                            (0, scheme + "://" + link.attrs["href"][2:], title, "Temp contents",))
                conn.commit()
            else:
                cur.execute("INSERT INTO internal_links (idx, href, title, contents) VALUES (%s, %s, %s, %s);",
                            (0, link.attrs["href"], title, "Temp contents",))
                conn.commit()
def split_address(addr):
    address_parts = None
    if "https" in addr:
        address_parts = addr.replace("https://", "").split("/")
        if "www" in address_parts[0]:
            # Strip "www." in place so address_parts stays a list
            address_parts[0] = address_parts[0].replace("www.", "")
    elif "http" in addr:
        address_parts = addr.replace("http://", "").split("/")
        if "www" in address_parts[0]:
            address_parts[0] = address_parts[0].replace("www.", "")
    return address_parts
def get_random_external_links(starting_page):
    html = urlopen(starting_page)
    try:
        bs_obj = BeautifulSoup(html, "html.parser")
    except AttributeError as e:
        return -1
    title = bs_obj.find("title")
    # Get scheme, netloc and title of the URI and pass them to add_external_links()
    add_external_links(bs_obj, urlparse(starting_page).scheme, split_address(starting_page)[0], title.get_text())
    cur.execute("SELECT href FROM external_links ORDER BY RAND() LIMIT 1;")
    fetch = cur.fetchall()
    # href is stored as a BLOB, so strip the b'...' wrapper from its string form
    selected_tuple = str(fetch[0][0])
    if selected_tuple.startswith("b'"):
        selected_tuple = selected_tuple[2:]
    if selected_tuple.endswith("'"):
        selected_tuple = selected_tuple[:-1]
    return selected_tuple
def find_random_link(url):
    get_link = get_random_external_links(url)
    if get_link == -1:
        return -1
    else:
        return find_random_link(get_link)
The "external_links" table:
+----------+--------------+------+-----+---------+----------------+
| Field | Type | Null | Key | Default | Extra |
+----------+--------------+------+-----+---------+----------------+
| idx | int(11) | NO | PRI | <null> | auto_increment |
| href | blob | NO | | <null> | |
| title | varchar(255) | NO | | <null> | |
| contents | blob | NO | | <null> | |
+----------+--------------+------+-----+---------+----------------+
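For reference, that layout corresponds to roughly the following DDL (a reconstruction from the DESCRIBE output above; the engine and charset are not given in the question):

cur.execute("""
    CREATE TABLE external_links (
        idx      INT(11)      NOT NULL AUTO_INCREMENT PRIMARY KEY,
        href     BLOB         NOT NULL,
        title    VARCHAR(255) NOT NULL,
        contents BLOB         NOT NULL
    );
""")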
Answer 0 (score: 1)
This is caused by the different URI styles:
- https://www.google.com
- https://google.com
- http://www.google.com
- http://google.com
- //www.google.com
- //google.com
- www.google.com
These seven links are all the same address, but nothing normalizes them before the "if select_in_return == 0:" check runs. The INSERT INTO query rewrites some of them (the scheme gets prepended to //-style links), so after insertion they end up as the same address, yet at the moment the SELECT query runs the raw hrefs still differ and are treated as different links. That is why duplicate links get stored.
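To see this concretely, here is a minimal sketch (my illustration, not part of the original answer) that canonicalizes the seven spellings with the standard library's urlparse, assuming https as the fallback scheme; all of them collapse to a single key:

from urllib.parse import urlparse

def canonical(uri, default_scheme="https"):
    # urlparse only recognizes the host when a scheme is present, so add one
    # to scheme-relative ("//host") and bare ("www.host") forms first
    if uri.startswith("//"):
        uri = default_scheme + ":" + uri
    elif "://" not in uri:
        uri = default_scheme + "://" + uri
    parts = urlparse(uri)
    host = parts.netloc[4:] if parts.netloc.startswith("www.") else parts.netloc
    return host + parts.path.rstrip("/")

variants = ["https://www.google.com", "https://google.com",
            "http://www.google.com", "http://google.com",
            "//www.google.com", "//google.com", "www.google.com"]
print({canonical(v) for v in variants})  # {'google.com'} - seven spellings, one address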
Solution:
def add_external_links(bs_obj, scheme, exclude_url, title):
    for link in bs_obj.find_all("a", href=re.compile(r"^(https|http|www|//)((?!" + exclude_url + r").)*$")):
        # Remove protocol (https:// or http:// or //) and host (www.) from URI
        if link.attrs["href"].startswith("//"):
            link.attrs["href"] = link.attrs["href"][2:]
            if "www" in link.attrs["href"]:
                link.attrs["href"] = link.attrs["href"].replace("www.", "")
        elif "https" in link.attrs["href"]:
            link.attrs["href"] = link.attrs["href"].replace("https://", "")
            if "www" in link.attrs["href"]:
                link.attrs["href"] = link.attrs["href"].replace("www.", "")
        elif "http" in link.attrs["href"]:
            link.attrs["href"] = link.attrs["href"].replace("http://", "")
            if "www" in link.attrs["href"]:
                link.attrs["href"] = link.attrs["href"].replace("www.", "")
        elif "www" in link.attrs["href"]:
            link.attrs["href"] = link.attrs["href"].replace("www.", "")
        # Remove trailing slash
        if link.attrs["href"].endswith("/"):
            link.attrs["href"] = link.attrs["href"][:-1]
        # Reassemble URI
        link.attrs["href"] = scheme + "://" + link.attrs["href"]
        # Get rows matching the URI
        select_in_return = cur.execute("SELECT * FROM external_links WHERE href=%s;", (link.attrs["href"],))
        # Add URI to the database only if it is not already there
        if select_in_return == 0:
            cur.execute("INSERT INTO external_links (idx, href, title, contents) VALUES (%s, %s, %s, %s);",
                        (0, link.attrs["href"], title, "Temp contents",))
            conn.commit()
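As an extra safeguard (my addition, not part of the answer above; it assumes MySQL and that a duplicate href should simply be skipped), a unique index on href lets the database itself reject duplicates, which also closes the race between the SELECT and the INSERT. The index name uniq_href is hypothetical:

# One-time schema change: BLOB columns need an index prefix length in MySQL
cur.execute("ALTER TABLE external_links ADD UNIQUE INDEX uniq_href (href(255));")
conn.commit()

# INSERT IGNORE then skips rows whose href already exists instead of raising an error
cur.execute("INSERT IGNORE INTO external_links (idx, href, title, contents) VALUES (%s, %s, %s, %s);",
            (0, link.attrs["href"], title, "Temp contents",))
conn.commit()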