I am writing a toy PageRank, including a crawler. Something looks off: my code never converges on the PR values. Note also that the delta between iterations stays 0. Part of the output:
url: http://en.m.wikipedia.org/wiki/Israel_State_Cup
links_to_node: set(['http://en.m.wikipedia.org/wiki/Association_football', 'http://en.m.wikipedia.org/wiki/Wikipedia:General_disclaimer'])
links_from_node: set(['http://en.m.wikipedia.org/wiki/Israel_State_Cup'])
PR_score: 2.41759524248e+38
ttl_time: 1
last_delta: 0
The code:
import requests
import lxml.html
import random


class pr_node:
    """WDM PR node"""
    url = ""
    links_to_node = set([])
    links_from_node = set([])
    PR_score = 0.0001
    ttl_time = 0
    last_delta = 0

    def __init__(self, url, ttl_time):
        self.url = url
        self.links_to_node = set([])
        self.links_from_node = set([])
        self.PR_score = 0.1
        self.ttl_time = ttl_time

    def print_node_out_links(self):
        print "\n\n" + self.url + " with ttl " + str(self.ttl_time) + " = "
        s = self.links_to_node
        print "{" + "\, ".join(str(e) for e in s) + "}"

    def print_node_pr(self):
        print "\n\n" + self.url + " PR is: " + str(self.PR_score)

    def print_all(self):
        print "url: " + self.url
        print "links_to_node: " + repr(self.links_to_node)
        print "links_from_node: " + repr(self.links_from_node)
        print "PR_score: " + str(self.PR_score)
        print "ttl_time: " + str(self.ttl_time)
        print "last_delta: " + str(self.last_delta)


def crawl(url, url_ttl):
    """crawl to new url, if ttl == 0 max depth reached, don't visit same url twice"""
    if url_ttl > 0 and (url not in visited_urls):
        # create new node p from parsed page
        print "crawling to " + url + "...\n"
        res = requests.get(url)
        doc = lxml.html.fromstring(res.content)
        p = pr_node(url, url_ttl)
        # add new PR node
        global pr_nodes
        pr_nodes[url] = p
        # get all wiki links
        all_links_to_node = set([])
        for t in doc.xpath("//a[contains(@href, '/wiki/')]"):
            add_val = ""
            if not t.attrib['href'].startswith("http://") and t.attrib['href'].startswith("/wiki/"):
                add_val = "http://en.m.wikipedia.org" + t.attrib['href']
                all_links_to_node.add(add_val)
            elif t.attrib['href'].startswith("http://"):
                add_val = t.attrib['href']
                all_links_to_node.add(add_val)
            else:
                pass
        # select random 10 of them and crawl to them
        iter_count = 0
        iter_stop_lim = 10
        while iter_count < iter_stop_lim and len(p.links_to_node) < iter_stop_lim and len(all_links_to_node) > 0:
            current_url = random.sample(all_links_to_node, 1)[0]
            all_links_to_node.remove(current_url)  # don't do it twice...
            iter_count = + 1
            if not (current_url in visited_urls) and url_ttl > 1:
                p.links_to_node.add(current_url)
                crawl(current_url, url_ttl - 1)
                visited_urls.add(url)
            elif current_url in visited_urls and url_ttl == 1:
                p.links_to_node.add(current_url)
            else:
                print "max depth reached or you've already been here"
    return


def calc_graph_pr(pr_iter_count, damp_factor):
    "print calculating PageRank"
    current_iter = 0
    global pr_nodes
    g1 = {}
    g2 = {}
    for node in pr_nodes.itervalues():
        g1[node.url] = node
        g2[node.url] = node
    g = [g1, g2]
    while current_iter < pr_iter_count:
        print "PageRank iteration #" + str(current_iter)
        for p in g[current_iter % 2].itervalues():
            in_links_addition = 0
            for l in p.links_to_node:
                l_val = g[(current_iter - 1) % 2][l]
                l_val.delta = l_val.PR_score - g[current_iter % 2][l].PR_score
                in_links_addition += l_val.PR_score/len(l_val.links_from_node)
            p.PR_score = damp_factor + (1 - damp_factor) * in_links_addition
        current_iter += 1
    pr_nodes = g[0]  # WLOG could be also g[1]...
    for p in pr_nodes.itervalues():
        p.print_all()
    print "check bool:"
    print g1 == g2
    return


def update_graph_links():
    global pr_nodes
    for node in pr_nodes.itervalues():
        for u in node.links_to_node:
            if u in pr_nodes:
                pr_nodes[u].links_from_node.add(u)
    return


visited_urls = set([])
pr_nodes = {}
glob_pr_iter_count = 50
glob_damp_factor = 0.2

crawl("http://en.m.wikipedia.org/wiki/Nikola_Mitrovic", 3)
update_graph_links()
calc_graph_pr(glob_pr_iter_count, glob_damp_factor)
Answer (score: 1)
The edge-adding function is what breaks it. Fix it to:
def update_graph_links():
    """register each node with neighbours pointing at it"""
    global pr_nodes
    for node in pr_nodes.itervalues():
        for u in node.links_to_node:
            if u in pr_nodes:
                pr_nodes[u].links_from_node.add(node.url)
    return
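
To see why that single argument matters, look back at the output in the question: the Israel_State_Cup node lists itself as its only in-neighbour (links_from_node: set(['http://en.m.wikipedia.org/wiki/Israel_State_Cup'])). With add(u), every linked-to page registers itself as its own backlink instead of the page that points at it. Here is a minimal sketch with two made-up pages (the example.org URLs and the stripped-down Node class are purely illustrative):

class Node(object):
    def __init__(self, url):
        self.url = url
        self.links_to_node = set()     # pages this page links to
        self.links_from_node = set()   # pages that link to this page

a = Node("http://example.org/A")
b = Node("http://example.org/B")
a.links_to_node.add(b.url)             # A -> B
nodes = {a.url: a, b.url: b}

# buggy version: add(u) registers the target page as its own backlink
for node in nodes.itervalues():
    for u in node.links_to_node:
        if u in nodes:
            nodes[u].links_from_node.add(u)
print b.links_from_node                # set(['http://example.org/B'])

# fixed version: add(node.url) registers the source page
b.links_from_node.clear()
for node in nodes.itervalues():
    for u in node.links_to_node:
        if u in nodes:
            nodes[u].links_from_node.add(node.url)
print b.links_from_node                # set(['http://example.org/A'])

After the fix, every node's links_from_node holds the pages that actually point at it, which is the set size calc_graph_pr divides by.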
After a few tweaks, some refactoring, and proper comments, it comes out as the following code:
import requests
import lxml.html
import random
import sys


class pr_node:
    """WDM PR node"""
    url = ""
    links_to_node = set([])
    links_from_node = set([])
    PR_score = 0.01
    ttl_time = 0
    last_delta = 0  # used for debug only

    def __init__(self, url, ttl_time):
        """CTOR"""
        self.url = url
        self.links_to_node = set([])
        self.links_from_node = set([])
        self.PR_score = 0.01
        self.ttl_time = ttl_time

    def print_node_out_links(self):
        """print for q1a"""
        print "\n\n" + self.url + " with ttl " + str(self.ttl_time) + " = "
        s = self.links_to_node
        print "{" + "\, ".join(str(e) for e in s) + "}"

    def print_node_pr(self):
        """print for q1b"""
        print "\n\n" + self.url + " PR is: " + str(self.PR_score)

    def print_all(self):
        """print for q1b and debug"""
        print "url: " + self.url
        print "links_to_node: " + repr(self.links_to_node)
        print "links_from_node: " + repr(self.links_from_node)
        print "PR_score: " + str(self.PR_score)
        print "ttl_time: " + str(self.ttl_time)
        print "last_delta: " + str(self.last_delta)


def crawl(url, url_ttl):
    """crawl to new url, if ttl == 0 max depth reached, don't visit same url twice"""
    if url_ttl > 0 and (url not in visited_urls):
        # create new node p from parsed page
        print "crawling to " + url + "...\n"
        res = requests.get(url)
        doc = lxml.html.fromstring(res.content)
        p = pr_node(url, url_ttl)
        # add new PR node
        global pr_nodes
        pr_nodes[url] = p
        # get all wiki links, format to legit URL
        all_links_to_node = set([])
        for t in doc.xpath("//a[contains(@href, '/wiki/')]"):
            add_val = ""
            if not t.attrib['href'].startswith("http://") and t.attrib['href'].startswith("/wiki/"):
                add_val = "http://en.m.wikipedia.org" + t.attrib['href']
                all_links_to_node.add(add_val)
            elif t.attrib['href'].startswith("http://"):
                add_val = t.attrib['href']
                all_links_to_node.add(add_val)
            else:
                pass
        # select random 10 of them and crawl to them
        iter_count = 0
        iter_stop_lim = 10
        while iter_count < iter_stop_lim and len(p.links_to_node) < iter_stop_lim and len(all_links_to_node) > 0:
            # sample random site of linked sites
            current_url = random.sample(all_links_to_node, 1)[0]
            # don't sample it twice...
            all_links_to_node.remove(current_url)
            iter_count = + 1
            # crawl if hav'nt been there and TTL enables you to check it
            if not (current_url in visited_urls) and url_ttl > 1:
                p.links_to_node.add(current_url)
                crawl(current_url, url_ttl - 1)
                visited_urls.add(url)
            # if reached with TTL == 1 just check links to existing nodes
            elif current_url in visited_urls and url_ttl == 1:
                p.links_to_node.add(current_url)
            else:
                print "max depth reached or you've already been here"
    return


def calc_graph_pr(pr_nodes, pr_iter_count, damp_factor):
    """calculate and print the graph's PageRank"""
    current_iter = 0
    # use two graph copies to prevent auto-interference
    g1 = {}
    g2 = {}
    for node in pr_nodes.itervalues():
        g1[node.url] = node
        g2[node.url] = node
    g = [g1, g2]
    # do actual page rank here
    while current_iter < pr_iter_count:
        for p in g[current_iter % 2].itervalues():
            in_links_addition = 0
            # iterate over all pointing nodes and sum their PR/out_link_count
            for l in p.links_to_node:
                l_val = g[(current_iter - 1) % 2][l]
                l_val.delta = l_val.PR_score - g[current_iter % 2][l].PR_score
                in_links_addition += l_val.PR_score/len(l_val.links_from_node)
            # update w.r.t the computed sum and damp_factor
            p.PR_score = damp_factor + (1 - damp_factor) * in_links_addition
        current_iter += 1
    # WLOG could be also g[1]...
    pr_nodes = g[0]
    for p in pr_nodes.itervalues():
        p.print_node_pr()
    return


def update_graph_links():
    """register each node with neighbours pointing at him"""
    global pr_nodes
    for node in pr_nodes.itervalues():
        for u in node.links_to_node:
            if u in pr_nodes:
                pr_nodes[u].links_from_node.add(node.url)
    return


if __name__ == '__main__':
    urlToCrawl = "http://en.m.wikipedia.org/wiki/Nikola_Mitrovic"
    # crawl to the requested site as default
    if len(sys.argv) > 2:
        sys.exit("Unexpected input")
    elif len(sys.argv) == 1:
        pass
    else:
        urlToCrawl = sys.argv[1]
    print_q1a = False
    print_q1b = True
    # set global data structures for crawling and ranking
    visited_urls = set([])
    pr_nodes = {}
    # parameters for PageRank
    glob_pr_iter_count = 100
    glob_damp_factor = 0.2
    # perform crawl in depth 3
    crawl(urlToCrawl, 3)
    if print_q1a:
        for p in pr_nodes.itervalues():
            p.print_node_out_links()
    elif print_q1b:
        # first update the backlinks then start ranking
        update_graph_links()
        calc_graph_pr(pr_nodes, glob_pr_iter_count, glob_damp_factor)
    else:
        pass
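
The update in calc_graph_pr, PR_score = damp_factor + (1 - damp_factor) * in_links_addition, has the shape of the textbook (unnormalised) PageRank rule PR(p) = (1 - d) + d * sum, over pages q linking to p, of PR(q)/outdeg(q), with d playing the role of (1 - damp_factor); glob_damp_factor = 0.2 therefore corresponds to d = 0.8. Below is a minimal, self-contained sketch of that textbook rule on a made-up three-page graph, with a per-iteration delta check of the kind last_delta appears to be intended for (the page names, d and the tolerance are purely illustrative):

# a made-up three-page graph: out_links[p] lists the pages p links to
out_links = {
    "A": ["B", "C"],
    "B": ["C"],
    "C": ["A"],
}
# invert it: in_links[p] lists the pages that link to p
in_links = dict((p, [q for q in out_links if p in out_links[q]]) for p in out_links)

d = 0.8                                # plays the role of (1 - damp_factor) above
pr = dict.fromkeys(out_links, 1.0)     # initial scores

for i in range(100):
    new_pr = {}
    for p in out_links:
        new_pr[p] = (1 - d) + d * sum(pr[q] / len(out_links[q]) for q in in_links[p])
    delta = max(abs(new_pr[p] - pr[p]) for p in pr)
    pr = new_pr
    if delta < 1e-8:                   # scores stopped moving: converged
        break

print pr

With the backlinks registered correctly the scores stop blowing up, and a delta check like the one above is the usual way to decide when to stop iterating.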