PageRank玩具示例无法收敛

时间:2015-05-03 18:45:02

标签: python web-crawler graph-algorithm pagerank

我正在编写一个玩具版的 PageRank,其中包含一个爬虫。结果看起来有点奇怪:我的代码无法让 PR 值收敛。我还注意到,每次迭代之间的差值始终为 0。部分输出如下:

url: http://en.m.wikipedia.org/wiki/Israel_State_Cup
links_to_node: set(['http://en.m.wikipedia.org/wiki/Association_football', 'http://en.m.wikipedia.org/wiki/Wikipedia:General_disclaimer'])
links_from_node: set(['http://en.m.wikipedia.org/wiki/Israel_State_Cup'])
PR_score: 2.41759524248e+38
ttl_time: 1
last_delta: 0

代码如下:

import requests 
import lxml.html
import random

class pr_node:
    """One crawled page together with its PageRank bookkeeping.

    Attributes:
        url: address of the page.
        links_to_node: set of URLs (the "out links" shown by
            print_node_out_links).
        links_from_node: second set of URLs kept for the node.
        PR_score: current PageRank value (0.1 for a new instance).
        ttl_time: TTL value the node was created with.
        last_delta: numeric field shown by print_all; 0 unless assigned.
    """

    # Class-level fallbacks; __init__ overrides all of them but last_delta.
    url = ""
    links_to_node = set()
    links_from_node = set()
    PR_score = 0.0001
    ttl_time = 0
    last_delta = 0

    def __init__(self, url, ttl_time):
        """Bind the node to url and give it its own, empty link sets."""
        self.url, self.ttl_time = url, ttl_time
        self.PR_score = 0.1
        self.links_to_node, self.links_from_node = set(), set()

    def print_node_out_links(self):
        """Print the URL, its TTL and the contents of links_to_node."""
        print("\n\n" + self.url + " with ttl " + str(self.ttl_time) + " = ")
        # separator is a literal backslash, comma and space
        joined = "\\, ".join([str(link) for link in self.links_to_node])
        print("{" + joined + "}")

    def print_node_pr(self):
        """Print the URL followed by its current PR_score."""
        print("\n\n" + self.url + " PR is: " + str(self.PR_score))

    def print_all(self):
        """Print every field of the node, one "name: value" line each."""
        fields = (
            ("url", self.url),
            ("links_to_node", repr(self.links_to_node)),
            ("links_from_node", repr(self.links_from_node)),
            ("PR_score", str(self.PR_score)),
            ("ttl_time", str(self.ttl_time)),
            ("last_delta", str(self.last_delta)),
        )
        for label, text in fields:
            print(label + ": " + text)



def crawl(url, url_ttl):
        """Fetch url, register a pr_node for it and recurse into a random sample of its wiki links.

        Only prints a message when url_ttl <= 0 or url is already in the
        global visited_urls set.  Otherwise the page is downloaded and parsed,
        a pr_node is stored in the global pr_nodes dict under url, and links
        drawn at random from the page are either crawled with url_ttl - 1 or
        merely recorded in the node's links_to_node set.

        Args:
            url: address of the page to fetch.
            url_ttl: remaining depth; passed on as url_ttl - 1 when recursing.
        """
        if url_ttl > 0 and (url not in visited_urls):

            # create new node p from parsed page
            print "crawling to " + url + "...\n"
            res = requests.get(url)
            doc = lxml.html.fromstring(res.content)
            p = pr_node(url, url_ttl)

            # add new PR node
            global pr_nodes
            pr_nodes[url] = p

            # get all wiki links: site-relative "/wiki/..." hrefs are prefixed
            # with the mobile Wikipedia host, absolute "http://" hrefs are kept
            # unchanged, anything else is ignored
            all_links_to_node = set([])
            for t in doc.xpath("//a[contains(@href, '/wiki/')]"):
                add_val = ""
                if not t.attrib['href'].startswith("http://") and t.attrib['href'].startswith("/wiki/"):
                    add_val = "http://en.m.wikipedia.org" + t.attrib['href']
                    all_links_to_node.add(add_val)
                elif t.attrib['href'].startswith("http://"):
                    add_val = t.attrib['href']
                    all_links_to_node.add(add_val)
                else:
                    pass

            # select random 10 of them and crawl to them
            # NOTE(review): iter_count is reassigned to +1 on every pass below,
            # so the first clause of the loop condition always holds; confirm
            # whether "+= 1" was intended.
            iter_count = 0
            iter_stop_lim = 10
            while iter_count < iter_stop_lim and len(p.links_to_node) < iter_stop_lim and len(all_links_to_node) > 0:
                    # draw one candidate link at random
                    current_url = random.sample(all_links_to_node, 1)[0]

                    all_links_to_node.remove(current_url)  # don't do it twice...
                    iter_count = + 1
                    # unvisited link and depth left: record the edge and descend
                    if not (current_url in visited_urls) and url_ttl > 1:
                        p.links_to_node.add(current_url)
                        crawl(current_url, url_ttl - 1)
                        # the page being processed (url, not current_url) is
                        # marked visited only here, after a child crawl returns
                        visited_urls.add(url)
                    # last level: only record edges to pages already visited
                    elif current_url in visited_urls and url_ttl == 1:
                        p.links_to_node.add(current_url)

        else:
            print "max depth reached or you've already been here"
        return


def calc_graph_pr(pr_iter_count, damp_factor):
    """Run pr_iter_count PageRank-style update rounds over the global graph.

    Every round rewrites each node's PR_score as
    damp_factor + (1 - damp_factor) * s, where s sums, over the URLs in the
    node's links_to_node, the linked node's PR_score divided by the size of
    that node's links_from_node.  Afterwards the global pr_nodes is rebound
    to a dict keyed by node.url, every node is dumped with print_all() and
    the result of comparing the two lookup tables is printed.

    Args:
        pr_iter_count: number of update rounds.
        damp_factor: constant term of the update formula.
    """
    global pr_nodes

    # Two url -> node tables; both reference the very same node objects.
    g1 = dict((node.url, node) for node in pr_nodes.values())
    g2 = dict((node.url, node) for node in pr_nodes.values())
    g = [g1, g2]

    for round_no in range(pr_iter_count):
        print("PageRank iteration #" + str(round_no))
        current, previous = g[round_no % 2], g[(round_no - 1) % 2]
        for node in current.values():
            acc = 0
            for linked_url in node.links_to_node:
                linked = previous[linked_url]
                # stored on an attribute named "delta"
                linked.delta = linked.PR_score - current[linked_url].PR_score
                acc += linked.PR_score / len(linked.links_from_node)
            node.PR_score = damp_factor + (1 - damp_factor) * acc

    # either table would do, they hold the same nodes
    pr_nodes = g[0]

    for node in pr_nodes.values():
        node.print_all()

    print("check bool:")
    print(g1 == g2)
    return


def update_graph_links():
    """Fill links_from_node for every link that targets a known node.

    For each node in the global pr_nodes and each URL u in its links_to_node
    that is also a key of pr_nodes, u is added to pr_nodes[u].links_from_node.
    """
    global pr_nodes
    for node in pr_nodes.values():
        # keep only the link targets that were actually crawled
        for u in node.links_to_node.intersection(pr_nodes):
            pr_nodes[u].links_from_node.add(u)
    return

# Module-level state shared by crawl(), update_graph_links() and calc_graph_pr():
# visited_urls holds URLs marked as visited, pr_nodes maps URL -> pr_node.
visited_urls = set([])
pr_nodes = {}

# PageRank parameters: number of iterations and the damping constant
glob_pr_iter_count = 50
glob_damp_factor = 0.2

# crawl from the start page with a TTL of 3
crawl("http://en.m.wikipedia.org/wiki/Nikola_Mitrovic", 3)

# fill links_from_node on the crawled nodes, then rank them
update_graph_links()
calc_graph_pr(glob_pr_iter_count, glob_damp_factor)

1 个答案:

答案 0 :(得分:1)

问题出在添加边的函数上:它把被指向节点自身的 URL 加入了该节点的 links_from_node,而不是指向它的那个节点的 URL。将其修复为:

def update_graph_links():
    """Record, on every known link target, the URL of the node linking to it.

    Walks the global pr_nodes dict; link targets that are not keys of
    pr_nodes are ignored.
    """
    global pr_nodes
    for source in pr_nodes.values():
        # only targets that exist as nodes can receive a back-reference
        for target_url in source.links_to_node.intersection(pr_nodes):
            pr_nodes[target_url].links_from_node.add(source.url)
    return

经过一些调整和重构,并添加了适当的注释之后,完整的代码如下:

import requests 
import lxml.html
import random
import sys

class pr_node:
    """WDM PageRank node: one page of the crawled graph.

    Attributes:
        url: address of the page.
        links_to_node: set of URLs (the "out links" shown by
            print_node_out_links).
        links_from_node: second set of URLs kept for the node.
        PR_score: current PageRank value (0.01 for a new instance).
        ttl_time: TTL value the node was created with.
        last_delta: debug-only numeric field, 0 unless assigned.
    """

    # Defaults at class level; every instance gets its own url, sets,
    # PR_score and ttl_time in __init__.
    url = ""
    links_to_node = set()
    links_from_node = set()
    PR_score = 0.01
    ttl_time = 0
    last_delta = 0  # used for debug only

    def __init__(self, url, ttl_time):
        """Create a node for url with empty link sets and the initial score."""
        self.url = url
        self.ttl_time = ttl_time
        self.links_to_node = set()
        self.links_from_node = set()
        self.PR_score = 0.01

    def print_node_out_links(self):
        """Print the URL, its TTL and the links_to_node set (output for q1a)."""
        header = "\n\n" + self.url + " with ttl " + str(self.ttl_time) + " = "
        print(header)
        # entries are separated by a literal backslash, comma and space
        body = "\\, ".join(str(entry) for entry in self.links_to_node)
        print("{" + body + "}")

    def print_node_pr(self):
        """Print the URL with its PR_score (output for q1b)."""
        print("\n\n" + self.url + " PR is: " + str(self.PR_score))

    def print_all(self):
        """Print every field on its own line (q1b and debugging)."""
        lines = [
            "url: " + self.url,
            "links_to_node: " + repr(self.links_to_node),
            "links_from_node: " + repr(self.links_from_node),
            "PR_score: " + str(self.PR_score),
            "ttl_time: " + str(self.ttl_time),
            "last_delta: " + str(self.last_delta),
        ]
        print("\n".join(lines))


def crawl(url, url_ttl):
    """Crawl url and, recursively, a random sample of the wiki pages it links to.

    A pr_node is created for the page and stored in the global pr_nodes dict.
    At most 10 of the page's wiki links, chosen at random, are then examined:
    a link to a page that was already visited is recorded as an edge, and a
    link to a new page is recorded and crawled with url_ttl - 1 as long as
    url_ttl > 1.

    Args:
        url: absolute URL of the page to fetch.
        url_ttl: remaining crawl depth; nothing is fetched when it is <= 0.

    Side effects:
        Adds url to the global visited_urls set, stores the new node in the
        global pr_nodes dict, performs an HTTP request and prints progress.
    """
    global pr_nodes

    if url_ttl <= 0 or url in visited_urls:
        print("max depth reached or you've already been here")
        return

    # Mark the page before descending: marking it only after a child crawl
    # returned let a link cycle (A -> B -> A) crawl A a second time and
    # replace its node in pr_nodes, and never marked leaf pages at all.
    visited_urls.add(url)

    # create new node p from parsed page
    print("crawling to " + url + "...\n")
    res = requests.get(url)
    doc = lxml.html.fromstring(res.content)
    p = pr_node(url, url_ttl)
    pr_nodes[url] = p

    # get all wiki links, format to legit URL
    all_links_to_node = set()
    for t in doc.xpath("//a[contains(@href, '/wiki/')]"):
        href = t.attrib['href']
        if href.startswith("http://"):
            all_links_to_node.add(href)
        elif href.startswith("/wiki/"):
            all_links_to_node.add("http://en.m.wikipedia.org" + href)

    # Examine at most iter_stop_lim links in random order.  The original
    # counter was updated with "iter_count = + 1", which never counted past 1,
    # so the intended cap on attempts was not enforced.
    iter_stop_lim = 10
    candidates = list(all_links_to_node)
    random.shuffle(candidates)
    for current_url in candidates[:iter_stop_lim]:
        if current_url in visited_urls:
            # known page: keep the edge whatever the remaining TTL is, so
            # links between crawled pages are not lost
            p.links_to_node.add(current_url)
        elif url_ttl > 1:
            # new page and depth left: record the edge and descend
            p.links_to_node.add(current_url)
            crawl(current_url, url_ttl - 1)
    return


def calc_graph_pr(pr_nodes, pr_iter_count, damp_factor):
    """Calculate and print the PageRank of every node in pr_nodes.

    Each iteration recomputes, for every node p,

        PR(p) = damp_factor + (1 - damp_factor) * sum(PR(q) / out_degree(q))

    where q ranges over the nodes listed in p.links_from_node (the pages
    pointing at p) and out_degree(q) is len(q.links_to_node).  All scores of
    an iteration are derived from the scores of the previous iteration.

    Args:
        pr_nodes: dict mapping URL -> node; the nodes' links_from_node sets
            must already be filled in (see update_graph_links).
        pr_iter_count: number of iterations to run.
        damp_factor: constant added to every score; the summed contribution
            of the in-links is weighted by (1 - damp_factor).

    Side effects:
        Updates PR_score and last_delta (the change made by the most recent
        iteration) on every node, then calls print_node_pr() on each node.
    """
    by_url = dict((node.url, node) for node in pr_nodes.values())

    for _ in range(pr_iter_count):
        # Snapshot the scores: the node objects are shared, so updating them
        # in place would leak this iteration's values into later nodes.
        previous = dict((url, node.PR_score) for url, node in by_url.items())

        for url, node in by_url.items():
            in_links_addition = 0.0
            # iterate over all pointing nodes and sum their PR/out_link_count
            for src_url in node.links_from_node:
                src = by_url.get(src_url)
                if src is None or not src.links_to_node:
                    # unknown source or no out-links: nothing to distribute
                    continue
                in_links_addition += previous[src_url] / float(len(src.links_to_node))

            # update w.r.t the computed sum and damp_factor
            new_score = damp_factor + (1 - damp_factor) * in_links_addition
            node.last_delta = new_score - previous[url]
            node.PR_score = new_score

    for node in by_url.values():
        node.print_node_pr()

    return


def update_graph_links():
    """Register, on each linked node, the URL of every node pointing at it.

    Iterates over the global pr_nodes dict; a link whose target URL is not a
    key of pr_nodes is skipped.
    """
    global pr_nodes
    for node in pr_nodes.values():
        for u in node.links_to_node:
            if u not in pr_nodes:
                # target was never crawled, so there is no node to update
                continue
            pr_nodes[u].links_from_node.add(node.url)
    return


if __name__ == '__main__':

    # at most one command-line argument (the start URL) is accepted
    if len(sys.argv) > 2:
        sys.exit("Unexpected input")

    # fall back to the default page when no URL is given
    urlToCrawl = (
        "http://en.m.wikipedia.org/wiki/Nikola_Mitrovic"
        if len(sys.argv) == 1
        else sys.argv[1]
    )

    # which of the two outputs to produce (q1a takes precedence)
    print_q1a, print_q1b = False, True

    # set global data structures for crawling and ranking
    visited_urls, pr_nodes = set(), {}

    # parameters for PageRank
    glob_pr_iter_count, glob_damp_factor = 100, 0.2

    # perform crawl in depth 3
    crawl(urlToCrawl, 3)

    if print_q1a:
        for p in pr_nodes.values():
            p.print_node_out_links()
    elif print_q1b:
        # the back-links must be in place before ranking starts
        update_graph_links()
        calc_graph_pr(pr_nodes, glob_pr_iter_count, glob_damp_factor)