我正在尝试编写一个只处理网站内部链接的爬虫。我使用的是 Python 2.7、Beautiful Soup 和 requests,我需要抓取所有内部链接(绝对链接和相对链接)。
我的客户要求我提供网站的抓取工具,但我希望它仅抓取内部链接。我需要它忽略jpg / png / gif和其他类型的网址,因此它仅处理页面。
import re
import requests  # original had "import request" (typo) — crashes at import
from bs4 import BeautifulSoup

# urljoin/urlparse live in different modules on Python 2 vs 3.
try:
    from urllib.parse import urljoin, urlparse
except ImportError:  # Python 2
    from urlparse import urljoin, urlparse

# Visited set, defined BEFORE use; the original defined it after the
# function and misspelled it ("checekdUrls"), so every append raised a
# NameError that the bare except silently swallowed.
checkedUrls = []


def processUrl(url):
    """Recursively crawl *url*, printing only same-domain HTML pages.

    Relative hrefs are resolved against the current page with urljoin,
    already-visited URLs are skipped, and non-HTML resources
    (jpg/png/gif, ...) are filtered out via the Content-Type header of
    a HEAD request so they are never downloaded in full.
    """
    if url in checkedUrls:
        return
    # Only links on this host count as "internal".
    domain = urlparse(url).netloc
    try:
        # HEAD first: cheap content-type probe before a full GET.
        if 'text/html' not in requests.head(url).headers.get('Content-Type', ''):
            return
        req = requests.get(url)
        if req.status_code != 200:
            return
        print(url)
        checkedUrls.append(url)
        html = BeautifulSoup(req.text, 'html.parser')
        for page in html.find_all('a'):
            href = page.get('href')
            if not href:
                continue  # <a> without href
            # Resolve relative links, then keep only internal ones.
            link = urljoin(url, href)
            if urlparse(link).netloc == domain:
                processUrl(link)
    except requests.RequestException:
        # One bad page must not kill the whole crawl; other errors
        # (e.g. programming bugs) are no longer hidden by a bare except.
        pass


url = 'http://sampleurl.com'
processUrl(url)
答案 0(得分:0):
这是您的代码,加上上面我注释的逻辑。
import re
import requests  # original had "import request" (typo) — crashes at import
from bs4 import BeautifulSoup

# urljoin lives in different modules on Python 2 vs 3.
try:
    from urllib.parse import urljoin
except ImportError:  # Python 2
    from urlparse import urljoin


def processUrl(url, domain, checkedUrls=None):
    """Recursively crawl *url*, collecting same-domain HTML pages.

    Parameters:
        url         -- page to visit (absolute URL).
        domain      -- substring that every internal URL must contain.
        checkedUrls -- accumulator list of visited URLs; a fresh list is
                       created per top-level call (the original used a
                       mutable default, shared across calls).

    Returns the (possibly grown) checkedUrls list.
    """
    if checkedUrls is None:
        checkedUrls = []
    # Skip external links entirely.
    if domain not in url:
        return checkedUrls
    if url not in checkedUrls:
        try:
            # HEAD probe filters out jpg/png/gif and other non-HTML
            # resources without downloading them.
            if 'text/html' in requests.head(url).headers.get('Content-Type', ''):
                req = requests.get(url)
                if req.status_code == 200:
                    print(url)
                    checkedUrls.append(url)
                    html = BeautifulSoup(req.text, 'html.parser')
                    for page in html.find_all('a'):
                        href = page.get('href')
                        if not href:
                            continue  # <a> without href
                        # Resolve relative hrefs against the current page.
                        link = urljoin(url, href)
                        # The original called processUrl(url) here, which
                        # omitted the required `domain` argument — the
                        # resulting TypeError was swallowed by the bare
                        # except, so the crawl never recursed.
                        processUrl(link, domain, checkedUrls)
        except requests.RequestException:
            # A failing page should not abort the crawl; other
            # exceptions (bugs) are no longer silently hidden.
            pass
    return checkedUrls


# Original defined "checekdUrls" (typo) but passed "checkedUrls",
# raising NameError before the crawl even started.
checkedUrls = []
domain = 'sampleurl.com'
url = 'http://sampleurl.com'
checkedUrls = processUrl(url, domain, checkedUrls)