我个人突发奇想,我已经编写了一些代码来搜索任何两篇维基百科文章之间的最短链接。事实证明这是一种非常强大的力量,并且需要很长时间才能找到目标,如果它不仅仅是一个或两个深的链接,但它有效!我最终将跟踪并利用链接路径和内容,但我希望首先使搜索工作最佳。有没有更快的方法来做这个或者在这里削减一些主要角落的好方法?
import urllib2
from bs4 import BeautifulSoup
Start = 'http://en.wikipedia.org/wiki/Alan_Reid_%28politician%29'
End = 'http://en.wikipedia.org/wiki/Ayr'
#Using BeautifulSoup, this grabs the page
def soup_request(target):
request = urllib2.Request(target)
request.add_header("User-Agent", "Mozilla/5.0")
page = urllib2.urlopen(target)
soup = BeautifulSoup(page)
return soup
#This will grab all Wiki links off a given page
def get_links(Start):
soup = soup_request(Start)
Wiki_links = []
#Finds all links
for url in soup.findAll('a'):
result = url.get('href')
try:
if str(result)[:5] == '/wiki':
Wiki_links.append(result)
except:
pass
for q in range(len(Wiki_links)):
Wiki_links[q] = 'http://en.wikipedia.org'+str(Wiki_links[q])
print "Got new links from",Start
return Wiki_links
#This will check all the given links to see if the title matches the goal webpage
def check_links(Links,End):
goalsoup = soup_request(End)
goaltitle = goalsoup.html.title
Found = False
count = 0
for q in Links:
if Found:
break
length = len(Links)
#Runs through all the given links and checks their titles for correct one
if q is not None:
count += 1
soup = soup_request(q)
print "Checked",count,"links out of",length
try:
title = soup.html.head.title
if title == goaltitle:
Found = True
print "Found it!"
break
except:
print 'doh'
pass
return Found
#Top function to do all the stuff in the right order, applying a maximum depth of how deep into the links
def wiki_crawl(Start, End, depth):
Old_Links = [Start]
count = depth
while count > 0:
New_Links = []
for q in range(len(Old_Links)):
New_Links.extend(get_links(Old_Links[q]))
Found = check_links(New_Links,End)
if Found:
print "All done."
break
Old_Links = New_Links
count -= 1
print "_______________________________________________________________ROUND DONE"
if not Found:
print "Did not find the page, you must go deeper!"
wiki_crawl(Start, End, 2)
答案 0 :(得分:0)
以下是从wiki获取信息的一些功能。唯一的问题是,有时它会从网页上的信息中取出一个空格。
def take_out_parenthesis(st):
string = list(st)
for a in string:
if a == '(':
del string[st.find(a)]
if a == ')':
del string[st.find(a) - 1]
return ''.join(string)
def take_out_tags(string):
st = list(string)
odd = ['<', '>']
times = 0
for a in string:
if a in odd:
times += 1
times /= 2
for b in range(times):
start = string.find('<') - 1
end = string.find('>')
bet = end - start + 1
for a in range(bet):
del st[start]
string = ''.join(st)
return string
def take_out_brackets(string):
st = list(string)
odd = ['[', ']']
times = 0
for a in string:
if a in odd:
times += 1
times /= 2
for b in range(times):
start = string.find('[') - 1
end = string.find(']')
bet = end - start + 1
for a in range(bet):
del st[start]
string = ''.join(st)
return string
def take_from_web_page(text):
n = 0
url = text.replace(" ", "_")
search = "http://en.wikipedia.org/wiki/%s" % url
page = urllib2.urlopen(search).read()
start = page.find('<p><b>') + 6
end = page.find('</a>.', start) + 5
new_page = page[start:end]
for a in new_page:
if a == '<':
if new_page[n - 1] != ' ':
lst = list(new_page)
lst.insert(n, ' ')
new_page = ''.join(lst)
n += 1
n += 1
return take_out_parenthesis(take_out_brackets(take_out_tags(new_page)))