I want to use Python to get all of the links in a domain, given a "root" URL (in a list). Suppose I'm given the URL http://www.example.com; this should return all of the links on that page that are on the same domain as the root URL, then recursively visit each of those links and extract all links on the same domain, and so on. What I mean by the same domain is that, given the root http://www.example.com, the only links I want back are ones like http://www.example.com/something, http://www.example.com/somethingelse ...; anything external should be discarded. How can I do this with Python?
EDIT: I tried using lxml. I don't think this fully works, and I'm not sure how to account for links to pages that have already been processed (this causes an infinite loop).
import urllib
import lxml.html

#given a url returns list of all sublinks within the same domain
def getLinks(url):
    urlList = []
    urlList.append(url)
    sublinks = getSubLinks(url)
    for link in sublinks:
        absolute = url+'/'+link
        urlList.extend(getLinks(absolute))
    return urlList

#determine whether two links are within the same domain
def sameDomain(url, dom):
    return url.startswith(dom)

#get tree of sublinks in same domain, url is root
def getSubLinks(url):
    sublinks = []
    connection = urllib.urlopen(url)
    dom = lxml.html.fromstring(connection.read())
    for link in dom.xpath('//a/@href'):
        if not (link.startswith('#') or link.startswith('http') or link.startswith('mailto:')):
            sublinks.append(link)
    return sublinks
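For reference, here is a minimal sketch (assuming Python 3, so urllib.request and urllib.parse from the standard library, plus lxml) of how the already-processed-pages problem above could be handled with a visited set; the function name crawl and its structure are illustrative, not taken from the original post:

import lxml.html
from urllib.parse import urljoin, urlparse
from urllib.request import urlopen

def crawl(root, visited=None):
    # Collect all same-domain links reachable from root, skipping pages already seen.
    if visited is None:
        visited = set()
    if root in visited:
        return visited
    visited.add(root)
    dom = lxml.html.fromstring(urlopen(root).read())
    for href in dom.xpath('//a/@href'):
        # Resolve relative links against the current page and drop any #fragment part.
        absolute = urljoin(root, href).split('#')[0]
        # Follow the link only if it stays on the same host as the root.
        if urlparse(absolute).netloc == urlparse(root).netloc:
            crawl(absolute, visited)
    return visited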
Answer 0 (score: 2)
import sys
import requests
import hashlib
from bs4 import BeautifulSoup
from datetime import datetime

def get_soup(link):
    """
    Return the BeautifulSoup object for input link
    """
    request_object = requests.get(link, auth=('user', 'pass'))
    soup = BeautifulSoup(request_object.content)
    return soup

def get_status_code(link):
    """
    Return the error code for any url
    param: link
    """
    try:
        error_code = requests.get(link).status_code
    except requests.exceptions.ConnectionError:
        error_code =
    return error_code

def find_internal_urls(lufthansa_url, depth=0, max_depth=2):
    all_urls_info = []
    status_dict = {}
    soup = get_soup(lufthansa_url)
    a_tags = soup.findAll("a", href=True)

    if depth > max_depth:
        return {}
    else:
        for a_tag in a_tags:
            if "http" not in a_tag["href"] and "/" in a_tag["href"]:
                url = "http://www.lufthansa.com" + a_tag['href']
            elif "http" in a_tag["href"]:
                url = a_tag["href"]
            else:
                continue
            status_dict["url"] = url
            status_dict["status_code"] = get_status_code(url)
            status_dict["timestamp"] = datetime.now()
            status_dict["depth"] = depth + 1
            all_urls_info.append(status_dict)
    return all_urls_info

if __name__ == "__main__":
    depth = 2 # suppose
    all_page_urls = find_internal_urls("someurl", 2, 2)
    if depth > 1:
        for status_dict in all_page_urls:
            find_internal_urls(status_dict['url'])
The above snippet contains the modules needed to scrape URLs from the Lufthansa website. The only addition here is that you can specify the depth to which you want to crawl recursively.
Answer 1 (score: 0)
Here is what I did, only following full URLs like http://domain[xxx]. Quick, but a bit dirty.
import requests
import re

domain = u"stackoverflow.com"
http_re = re.compile(u"(http:\/\/" + domain + "[\/\w \.-]*\/?)")

visited = set([])

def visit (url):
    visited.add (url)
    extracted_body = requests.get (url).text
    matches = re.findall (http_re, extracted_body)
    for match in matches:
        if match not in visited :
            visit (match)

visit(u"http://" + domain)
print (visited)
Answer 2 (score: 0)
There were some errors in @namita's code. I modified it and now it works well.
import sys
import requests
import hashlib
from bs4 import BeautifulSoup
from datetime import datetime

def get_soup(link):
    """
    Return the BeautifulSoup object for input link
    """
    request_object = requests.get(link, auth=('user', 'pass'))
    soup = BeautifulSoup(request_object.content, "lxml")
    return soup

def get_status_code(link):
    """
    Return the error code for any url
    param: link
    """
    try:
        error_code = requests.get(link).status_code
    except requests.exceptions.ConnectionError:
        error_code = -1
    return error_code

def find_internal_urls(main_url, depth=0, max_depth=2):
    all_urls_info = []
    soup = get_soup(main_url)
    a_tags = soup.findAll("a", href=True)

    if main_url.endswith("/"):
        domain = main_url
    else:
        domain = "/".join(main_url.split("/")[:-1])
    print(domain)
    if depth > max_depth:
        return {}
    else:
        for a_tag in a_tags:
            if "http://" not in a_tag["href"] and "https://" not in a_tag["href"] and "/" in a_tag["href"]:
                url = domain + a_tag['href']
            elif "http://" in a_tag["href"] or "https://" in a_tag["href"]:
                url = a_tag["href"]
            else:
                continue
            # print(url)
            status_dict = {}
            status_dict["url"] = url
            status_dict["status_code"] = get_status_code(url)
            status_dict["timestamp"] = datetime.now()
            status_dict["depth"] = depth + 1
            all_urls_info.append(status_dict)
    return all_urls_info

if __name__ == "__main__":
    url = # your domain here
    depth = 1
    all_page_urls = find_internal_urls(url, 0, 2)
    # print("\n\n",all_page_urls)
    if depth > 1:
        for status_dict in all_page_urls:
            find_internal_urls(status_dict['url'])
Answer 3 (score: -1)
From the tags on your question, I assume you are using Beautiful Soup. First, you obviously need to download the web page, for example with urllib.request. Once you've done that and have the contents in a string, pass it to Beautiful Soup. After that, you can find all links with soup.find_all('a'), assuming soup is your Beautiful Soup object. Then you simply need to check the hrefs:
The simplest version would be to just check whether "http://www.example.com" is in the href, but that won't catch relative links. I guess some wild regular expression would do (find everything containing "www.example.com", or everything starting with "/" or with "?" (php)), or you could look for everything that contains www but is not www.example.com and discard it, and so on. The right strategy probably depends on the website you are scraping and its coding style.
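A minimal sketch of the approach described above, assuming the bs4 package and Python 3's urllib.request; the domain check shown here (absolute URLs containing www.example.com, plus relative hrefs starting with "/") is just one of the strategies mentioned, not a definitive implementation:

import urllib.request
from bs4 import BeautifulSoup

def same_domain_hrefs(url):
    # Download the page and parse it with Beautiful Soup.
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, "html.parser")
    links = []
    for a in soup.find_all('a', href=True):
        href = a['href']
        # Keep absolute links on www.example.com and relative links starting with "/".
        if "http://www.example.com" in href or href.startswith('/'):
            links.append(href)
    return links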
Answer 4 (score: -1)
You can use a regular expression to filter out such links.
For example:
<a\shref\=\"(http\:\/\/example\.com[^\"]*)\"
Take the above regular expression as a reference and start writing a script based on it.
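A small sketch of how that regular expression might be used, assuming the requests library; the pattern is taken verbatim from this answer, so it only matches double-quoted, absolute http:// links on example.com:

import re
import requests

# Pattern from the answer above: captures href values of the form "http://example.com...".
link_re = re.compile(r'<a\shref\=\"(http\:\/\/example\.com[^\"]*)\"')

def extract_links(url):
    # Fetch the page and return every link matched by the pattern.
    html = requests.get(url).text
    return link_re.findall(html)

print(extract_links("http://example.com"))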