I'm building a web scraper and I've run into a problem. What my script does is scrape the first page for links, then scrape the pages those links point to. The problem seems to start after the first batch of links has been scraped: when it crawls the pages found in the first pass, there seems to be an issue with links that already include the scheme and domain (http://example.com vs /about, for example). I've tried splitting the links apart and adding some filtering to catch and log them, and I've tried writing my own check to filter out the full URLs, but nothing seems to make a difference. Any help would be appreciated.
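Here's a minimal repro of what I believe is going wrong (the values are made up, but they mirror what I see when it breaks):

url = 'example.com'
relative_link = '/about'
absolute_link = 'http://example.com/about'
print('http://' + url + relative_link)  # http://example.com/about -> fine
print('http://' + url + absolute_link)  # http://example.comhttp://example.com/about -> broken

Here is the full script: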
import sys
import requests
from bs4 import BeautifulSoup
# Build the start URL from user input (Python 2: raw_input)
url = raw_input("Put in your domain! Without the 'http://www' part : ")

def makeRequest(url):
    print("Trying...", url)
    r = requests.get('http://' + url)
    # Adding in BS4 for finding <a> tags in the HTML
    soup = BeautifulSoup(r.content, 'html.parser')
    # Collect every <a> tag found on the page
    output = soup.find_all('a')
    return output

def makeFilter(link):
    # Collect the links we want to keep
    found_link = []
    for a in link:
        a = a.get('href')
        # Skip <a> tags that have no href at all
        if not a:
            continue
        a_string = str(a)
        # if statements to filter our links
        if a_string.startswith('/'):
            # Relative links
            found_link.append(a_string)
        if 'http://' + url in a_string:
            # Links from the same site
            found_link.append(a_string)
        if 'https://' + url in a_string:
            # Links from the same site with SSL
            found_link.append(a_string)
        if 'http://www.' + url in a_string:
            # Links from the same site with www
            found_link.append(a_string)
        if 'https://www.' + url in a_string:
            # Links from the same site with www and SSL
            found_link.append(a_string)
    # print a_string
    return found_link

# Function for removing duplicates while preserving order
def remove_duplicates(values):
    output = []
    seen = set()
    for value in values:
        if value not in seen:
            output.append(value)
            seen.add(value)
    return output

# Makes the request -> filters the links -> removes duplicates
def createURLList(values):
    global filtered_list
    try:
        tags = makeRequest(values)  # renamed from "requests" so it doesn't shadow the module
    except:
        print("Try again.")
    try:
        new_list = makeFilter(tags)
    except:
        print("I can't make a list out of this.")
    try:
        filtered_list = remove_duplicates(new_list)
    except:
        print("I can't deduplicate this list.")
    return filtered_list

result = createURLList(url)
# print result
crawled_urls = open('crawled_urls.txt', 'w')

# For verifying and crawling the resulting pages
for b in result:
    # Issue is right here: when b is already a full URL
    # (e.g. 'http://example.com/about'), this builds
    # 'example.comhttp://example.com/about' instead of a valid address
    sub_directories = createURLList(url + b)
    # remove_duplicates(sub_directories)
    for z in sub_directories:  # goes through the crawled links
        crawled_urls.write(z + '\n')
crawled_urls.close()
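
I'm guessing the fix is to normalize every href against the page it came from before recursing, instead of blindly prepending the domain. Below is a rough sketch of what I mean (urlparse.urljoin is from the Python 2 standard library; normalize is just an illustrative name, not something already in my script):

from urlparse import urljoin  # urllib.parse in Python 3

def normalize(base, href):
    # urljoin resolves relative hrefs against the page URL
    # and leaves absolute hrefs untouched
    return urljoin(base, href)

print(normalize('http://example.com/', '/about'))
# -> http://example.com/about
print(normalize('http://example.com/', 'http://example.com/contact'))
# -> http://example.com/contact

Is something along these lines the right approach, or is there a better way to tell absolute links apart from relative ones before crawling them?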