I'm running into problems writing a web crawler for Wikipedia. The crawler needs to print the links in the "See also" section of a given page, and then, for each link it encounters for the first time, print that page's "See also" section as well. For example, the "See also" section of https://en.wikipedia.org/wiki/Internet contains https://en.wikipedia.org/wiki/Crowdfunding, and the Crowdfunding page's "See also" section in turn contains https://en.wikipedia.org/wiki/Angel_investor.
That example follows a single chain of links, but each "See also" section has 10+ links and I need to follow all of them. I also have to be careful not to hammer the site. Here is what my draft looks like, but it throws errors and doesn't work properly (it isn't even recursive yet) :D
#Import Libraries
import time #For Delay
import urllib.request #Extracting web pages
import re
#Defining pages
starting_page = "https://en.wikipedia.org/wiki/Spacetime"
seed_page = "https://en.wikipedia.org" #Crawling the English Wikipedia
#Downloading entire Web Document (Raw Page Content)
def download_page(url):
    try:
        headers = {}
        headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
        req = urllib.request.Request(url, headers=headers)
        resp = urllib.request.urlopen(req)
        respData = resp.read().decode('utf-8', errors='ignore') #Decode the bytes; str(resp.read()) would keep the b'...' wrapper and escape real newlines
        return respData
    except Exception as e:
        print(str(e))
        return "" #Return an empty page on failure so callers never receive None
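#Illustrative usage (not part of the draft): download_page("https://en.wikipedia.org/wiki/Internet")
#should return the raw HTML of that page as a str, or "" if the request fails.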
#Extract the "See also" section elements
def extract_see_also(page):
    if 'id="See_also">' in page:
        start_see_also = page.find('id="See_also">')
        start_list_items = page.find('<li>', start_see_also + 1)
        end_see_also = page.find('<h2>', start_list_items + 1)
        see_also_section = page[start_list_items:end_see_also]
        pure_item_raw = (re.sub(r'<.+?>', '', see_also_section)).replace('\n', ',') #Strip tags, turn line breaks into commas
        pure_item = pure_item_raw.replace(',,', ',').replace(',,', ',') #Collapse runs of empty entries
        flag = 0
    else:
        pure_item = "No Related Links"
        flag = 1
    return pure_item, flag
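#get_next_link() is called below but was missing from my draft (one source of the errors).
#This is a guessed minimal version, assuming every link appears as href="...":
#it returns the next href value and the offset just past it, or ("no_links", 0) when none remain.
def get_next_link(page):
    start_link = page.find('href="')
    if start_link == -1:
        return "no_links", 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    link = page[start_quote + 1:end_quote]
    return link, end_quote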
#Getting all links with the help of 'get_next_link'
def get_all_links(page):
    links = []
    while True:
        link, end_link = get_next_link(page)
        if link == "no_links":
            break
        else:
            links.append(link) #Append each link to the list named 'links'
            #time.sleep(0.1)
            page = page[end_link:]
    return links
#Crawl initiation
#Check the file type in the URL so the crawler does not crawl images and text files
def extension_scan(url):
    a = ['.png', '.jpg', '.jpeg', '.gif', '.tif', '.txt']
    j = 0
    while j < len(a):
        if a[j] in url:
            #print("There!")
            flag2 = 1
            break
        else:
            #print("Not There!")
            flag2 = 0
            j = j + 1
    #print(flag2)
    return flag2
#URL parsing for incomplete or duplicate URLs
def url_parse(url):
    try:
        from urllib.parse import urlparse
    except ImportError:
        from urlparse import urlparse
    url = url #.lower() #Make it lower case
    s = urlparse(url) #Parse the given URL
    seed_page_n = seed_page #.lower() #Make it lower case
    #t = urlparse(seed_page_n) #Parse the seed page (reference page)
    i = 0
    flag = 0
    while i <= 9:
        if url == "/":
            url = seed_page_n
            flag = 0
        elif not s.scheme:
            url = "http://" + url
            flag = 0
        elif "#" in url:
            url = url[:url.find("#")]
            flag = 0
        elif "?" in url:
            url = url[:url.find("?")]
            flag = 0
        elif s.netloc == "":
            url = seed_page + s.path
            flag = 0
        #elif "www" not in url:
        #    url = "www."[:7] + url[7:]
        #    flag = 0
        elif url[len(url)-1] == "/":
            url = url[:-1]
            flag = 0
        #elif s.netloc != t.netloc:
        #    url = url
        #    flag = 1
        #    break
        else:
            url = url
            flag = 0
            break
        i = i + 1
        s = urlparse(url) #Re-parse after every loop to update the URL components
    return (url, flag)
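#Illustrative check (not part of the draft): a relative link such as "/wiki/Crowdfunding"
#should come back as ("https://en.wikipedia.org/wiki/Crowdfunding", 0).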
t0 = time.time()
database = {} #Create a dictionary
#Main crawl function that calls the functions above and crawls the site sequentially
def web_crawl():
    to_crawl = [starting_page] #Frontier: pages still to visit
    #print(to_crawl)
    crawled = [] #Pages already visited
    #database = {} #Create a dictionary
    for k in range(0, 3):
        i = 0 #Iteration counter
        while i < 3 and to_crawl: #Also stop when 'to_crawl' is empty, otherwise pop(0) raises IndexError
            urll = to_crawl.pop(0) #Pop the first element of to_crawl
            urll, flag = url_parse(urll)
            #print(urll)
            flag2 = extension_scan(urll)
            time.sleep(3)
            #If flag = 1, the URL is outside the seed domain
            if flag == 1 or flag2 == 1:
                pass #Do nothing
            else:
                if urll in crawled: #Check whether the URL was already crawled
                    pass #Do nothing
                else: #If not, crawl it and extract the "See also" section
                    print("Link = " + urll)
                    raw_html = download_page(urll)
                    #print(raw_html)
                    see_also, flag2 = extract_see_also(raw_html)
                    print("Related Links = " + see_also)
                    crawled.append(urll)
                    #NOTE: nothing is ever appended to to_crawl here, which is
                    #why the crawl never recurses into the "See also" links
            #Remove duplicates from to_crawl (keep the last occurrence)
            j = 0
            while j < len(to_crawl):
                if to_crawl[j] in to_crawl[j+1:]:
                    to_crawl.pop(j)
                else:
                    j = j + 1
            i = i + 1
            #print(to_crawl)
            #print("Iteration No. = " + str(i))
            #print("To Crawl = " + str(len(to_crawl)))
            #print("Crawled = " + str(len(crawled)))
    return ""
print(web_crawl())
t1 = time.time()
total_time = t1 - t0
print("Total time = " + str(total_time) + " seconds") #total_time was computed but never shown
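For reference, here is a minimal sketch of the recursive structure I'm aiming for, written from scratch rather than patched into the draft above. It is an untested-beyond-a-few-pages assumption, not a known-good solution: it assumes the "See also" block can be located by the id="See_also" anchor, that the section ends at the next <h2>, and that every internal link in it looks like href="/wiki/...".

import re
import time
import urllib.request

SEED = "https://en.wikipedia.org"
HEADERS = {'User-Agent': "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"}

def fetch(url):
    #Download one page; return "" on any error so parsing never sees None
    try:
        req = urllib.request.Request(url, headers=HEADERS)
        return urllib.request.urlopen(req).read().decode('utf-8', errors='ignore')
    except Exception as e:
        print(str(e))
        return ""

def see_also_links(html):
    #Slice out the HTML between the "See also" heading and the next <h2>,
    #then collect every internal /wiki/ href inside that slice
    start = html.find('id="See_also"')
    if start == -1:
        return []
    end = html.find('<h2', start + 1)
    section = html[start:end] if end != -1 else html[start:]
    return [SEED + h for h in re.findall(r'href="(/wiki/[^"#:]+)"', section)]

def crawl(url, depth, seen):
    #Depth-limited recursion: print this page's "See also" links,
    #then recurse into each one that has not been visited yet
    if depth == 0 or url in seen:
        return
    seen.add(url)
    print("Link = " + url)
    links = see_also_links(fetch(url))
    for link in links:
        print("    Related = " + link)
    time.sleep(1) #Be polite between requests
    for link in links:
        crawl(link, depth - 1, seen)

crawl("https://en.wikipedia.org/wiki/Internet", 2, set())

A depth of 2 reproduces the Internet -> Crowdfunding -> Angel_investor chain from my example. Parsing the HTML with a real parser (e.g. BeautifulSoup) or querying the MediaWiki API would probably be more robust than string slicing, but this sketch keeps to the plain urllib + re approach of my draft.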