Python Wiki Path Search

Date: 2015-02-06 22:58:58

Tags: python performance url beautifulsoup

On a personal whim, I have written some code to search for the shortest series of links between any two Wikipedia articles. It turns out to be very brute-force and takes a long time to find the goal if it is more than a link or two deep, but it works! I will eventually track and make use of the link paths and page content, but I would like to get the search itself working optimally first. Is there a faster way to do this, or a good way to cut some major corners here?

import urllib2
from bs4 import BeautifulSoup
Start = 'http://en.wikipedia.org/wiki/Alan_Reid_%28politician%29'
End = 'http://en.wikipedia.org/wiki/Ayr'

#Using BeautifulSoup, this grabs the page
def soup_request(target):
    request = urllib2.Request(target)
    request.add_header("User-Agent", "Mozilla/5.0")
    #Open the Request object so the User-Agent header is actually sent
    page = urllib2.urlopen(request)
    soup = BeautifulSoup(page)
    return soup

#This will grab all Wiki links off a given page
def get_links(Start):
    soup = soup_request(Start)
    Wiki_links = []
    #Finds all internal /wiki/ links and turns them into absolute URLs
    for url in soup.findAll('a'):
        result = url.get('href')
        if result is not None and result.startswith('/wiki/'):
            Wiki_links.append('http://en.wikipedia.org' + result)
    print "Got new links from", Start
    return Wiki_links

#This will check all the given links to see if the title matches the goal webpage
def check_links(Links, End):
    goalsoup = soup_request(End)
    goaltitle = goalsoup.html.title
    Found = False
    count = 0
    length = len(Links)
    #Runs through all the given links and checks their titles for the correct one
    for q in Links:
        if q is not None:
            count += 1
            soup = soup_request(q)
            print "Checked", count, "links out of", length
            try:
                title = soup.html.title
                if title == goaltitle:
                    Found = True
                    print "Found it!"
                    break
            except AttributeError:
                print 'doh'
    return Found

#Top function to do all the stuff in the right order, applying a maximum depth of how deep into the links to go
def wiki_crawl(Start, End, depth):
    Old_Links = [Start]
    count = depth
    Found = False
    while count > 0:
        New_Links = []
        for link in Old_Links:
            New_Links.extend(get_links(link))
        Found = check_links(New_Links, End)
        if Found:
            print "All done."
            break
        Old_Links = New_Links
        count -= 1
        print "_______________________________________________________________ROUND DONE"
    if not Found:
        print "Did not find the page, you must go deeper!"
wiki_crawl(Start, End, 2)
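
A minimal sketch of one possible shortcut, assuming the get_links function above: since the goal URL is already known, each candidate link can be compared against End directly instead of downloading every page to read its title, and a visited set avoids fetching the same article twice across rounds. Plain URL comparison will miss matches that only resolve through a redirect, so this is an approximation rather than a drop-in replacement.

#Sketch (not from the original post): breadth-first variant that compares URLs
#instead of fetching every candidate page, and skips links it has already seen.
#Assumes the get_links() function defined above.
def wiki_crawl_bfs(Start, End, depth):
    visited = set([Start])
    frontier = [Start]
    for level in range(depth):
        next_frontier = []
        for page in frontier:
            for link in get_links(page):
                if link == End:
                    print "Found it at depth", level + 1
                    return True
                if link not in visited:
                    visited.add(link)
                    next_frontier.append(link)
        frontier = next_frontier
        print "Finished depth", level + 1
    print "Did not find the page within depth", depth
    return False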

1 Answer:

Answer 0 (score: 0)

Here are some functions for pulling information off a wiki page. The only problem is that sometimes they take a space out of the information on the web page.

#Removes the '(' and ')' characters from a string
def take_out_parenthesis(st):
    string = list(st)
    for a in string:
        if a == '(':
            del string[st.find(a)]
        if a == ')':
            del string[st.find(a) - 1]
    return ''.join(string)


#Removes each <...> tag, plus the character immediately before it, from a string
def take_out_tags(string):
    st = list(string)
    odd = ['<', '>']
    times = 0
    for a in string:
        if a in odd:
            times += 1
    times /= 2
    for b in range(times):
        start = string.find('<') - 1
        end = string.find('>')
        bet = end - start + 1
        for a in range(bet):
            del st[start]
        string = ''.join(st)
    return string


#Removes each [...] span (e.g. citation markers), plus the character immediately before it
def take_out_brackets(string):
    st = list(string)
    odd = ['[', ']']
    times = 0
    for a in string:
        if a in odd:
            times += 1
    times /= 2
    for b in range(times):
        start = string.find('[') - 1
        end = string.find(']')
        bet = end - start + 1
        for a in range(bet):
            del st[start]
        string = ''.join(st)
    return string


#Fetches a Wikipedia article by title and returns its opening text with tags, brackets and parentheses stripped
def take_from_web_page(text):
    n = 0
    url = text.replace(" ", "_")
    search = "http://en.wikipedia.org/wiki/%s" % url
    page = urllib2.urlopen(search).read()
    start = page.find('<p><b>') + 6
    end = page.find('</a>.', start) + 5
    new_page = page[start:end]
    for a in new_page:
        if a == '<':
            if new_page[n - 1] != ' ':
                lst = list(new_page)
                lst.insert(n, ' ')
                new_page = ''.join(lst)
                n += 1
        n += 1
    return take_out_parenthesis(take_out_brackets(take_out_tags(new_page)))
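
Presumably it would be used along these lines (a hypothetical example; it assumes urllib2 is already imported, as in the question's code, and that the article title exists):

#Hypothetical usage: fetch and clean the opening text of an article by title
print take_from_web_page("Ayr")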