在书Programming Collective Intelligence中,我找到了以下函数来计算PageRank:
def calculatepagerank(self, iterations=20):
    """Compute PageRank scores, reading everything from the database.

    Rebuilds the ``pagerank`` table, gives every row of ``urllist`` a
    starting score of 1.0 and then refines the scores ``iterations`` times.
    On every pass each linker's score and outbound link count are fetched
    with their own SQL query, and each new score is written back at once.

    Args:
        iterations: Number of refinement passes to run.
    """
    # clear out the current PageRank tables
    self.con.execute("drop table if exists pagerank")
    self.con.execute("create table pagerank(urlid primary key,score)")
    self.con.execute("create index prankidx on pagerank(urlid)")

    # initialize every url with a PageRank of 1.0
    self.con.execute("insert into pagerank select rowid,1.0 from urllist")
    self.dbcommit()

    for i in range(iterations):
        # The parenthesized form is valid on both Python 2 and Python 3.
        print("Iteration %d" % i)
        for (urlid,) in self.con.execute("select rowid from urllist"):
            pr = 0.15

            # Loop through all the pages that link to this one
            for (linker,) in self.con.execute(
                    "select distinct fromid from link where toid=?",
                    (urlid,)):
                # Get the PageRank of the linker
                linkingpr = self.con.execute(
                    "select score from pagerank where urlid=?",
                    (linker,)).fetchone()[0]

                # Get the total number of links from the linker
                linkingcount = self.con.execute(
                    "select count(*) from link where fromid=?",
                    (linker,)).fetchone()[0]
                pr += 0.85 * (linkingpr / linkingcount)

            # Bind the score as a parameter: formatting it with %f would
            # round it to six decimal places before it is stored.
            self.con.execute(
                "update pagerank set score=? where urlid=?", (pr, urlid))
        self.dbcommit()
但是,由于每次迭代中都要执行大量SQL查询,此函数非常慢:
>>> import cProfile
>>> cProfile.run("crawler.calculatepagerank()")
2262510 function calls in 136.006 CPU seconds
Ordered by: standard name
ncalls tottime percall cumtime percall filename:lineno(function)
1 0.000 0.000 136.006 136.006 <string>:1(<module>)
1 20.826 20.826 136.006 136.006 searchengine.py:179(calculatepagerank)
21 0.000 0.000 0.528 0.025 searchengine.py:27(dbcommit)
21 0.528 0.025 0.528 0.025 {method 'commit' of 'sqlite3.Connecti
1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler
1339864 112.602 0.000 112.602 0.000 {method 'execute' of 'sqlite3.Connec
922600 2.050 0.000 2.050 0.000 {method 'fetchone' of 'sqlite3.Cursor'
1 0.000 0.000 0.000 0.000 {range}
所以我优化了这个功能并提出了这个:
def calculatepagerank2(self, iterations=20):
    """Compute PageRank scores from an in-memory copy of the link graph.

    Rebuilds the ``pagerank`` table, then reads the inbound linkers and the
    outbound link count of every URL into dictionaries (two queries per
    URL). The ``iterations`` refinement passes run purely on those
    dictionaries, and the final scores are written back once at the end.

    Args:
        iterations: Number of refinement passes to run.
    """
    # clear out the current PageRank tables
    self.con.execute("drop table if exists pagerank")
    self.con.execute("create table pagerank(urlid primary key,score)")
    self.con.execute("create index prankidx on pagerank(urlid)")

    # initialize every url with a PageRank of 1.0
    self.con.execute("insert into pagerank select rowid,1.0 from urllist")
    self.dbcommit()

    inlinks = {}
    numoutlinks = {}
    pagerank = {}
    for (urlid,) in self.con.execute("select rowid from urllist"):
        inlinks[urlid] = []
        numoutlinks[urlid] = 0
        # Initialize pagerank vector with 1.0
        pagerank[urlid] = 1.0

        # Loop through all the pages that link to this one
        for (inlink,) in self.con.execute(
                "select distinct fromid from link where toid=?", (urlid,)):
            inlinks[urlid].append(inlink)

        # get number of outgoing links from a page
        numoutlinks[urlid] = self.con.execute(
            "select count(*) from link where fromid=?",
            (urlid,)).fetchone()[0]

    for i in range(iterations):
        # The parenthesized form is valid on both Python 2 and Python 3.
        print("Iteration %d" % i)
        for urlid in pagerank:
            pr = 0.15
            for link in inlinks[urlid]:
                linkpr = pagerank[link]
                linkcount = numoutlinks[link]
                pr += 0.85 * (linkpr / linkcount)
            # Stored immediately, so later URLs in this pass see the new value.
            pagerank[urlid] = pr

    for urlid in pagerank:
        # Bind the score as a parameter: formatting it with %f would round
        # it to six decimal places before it is stored.
        self.con.execute(
            "update pagerank set score=? where urlid=?",
            (pagerank[urlid], urlid))
    self.dbcommit()
此函数的速度要快很多倍(但对所有临时字典使用的内存要多得多),因为它避免了每次迭代中不必要的SQL查询:
>>> cProfile.run("crawler.calculatepagerank2()")
90070 function calls in 3.527 CPU seconds
Ordered by: standard name
ncalls tottime percall cumtime percall filename:lineno(function)
1 0.004 0.004 3.527 3.527 <string>:1(<module>)
1 1.154 1.154 3.523 3.523 searchengine.py:207(calculatepagerank2
2 0.000 0.000 0.058 0.029 searchengine.py:27(dbcommit)
23065 0.013 0.000 0.013 0.000 {method 'append' of 'list' objects}
2 0.058 0.029 0.058 0.029 {method 'commit' of 'sqlite3.Connectio
1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler
43932 2.261 0.000 2.261 0.000 {method 'execute' of 'sqlite3.Connecti
23065 0.037 0.000 0.037 0.000 {method 'fetchone' of 'sqlite3.Cursor'
1 0.000 0.000 0.000 0.000 {range}
但是是否有可能进一步减少SQL查询的数量以进一步加快功能? 更新:修复了calculatepagerank2()中的缩进。
答案 0 :(得分:2)
如果你有一个非常大的数据库(例如记录数与整个WWW的网页数相当),那么按照与本书建议类似的方式使用数据库是合理的,因为你无法把所有这些数据都保存在内存中。
如果您的数据集足够小,您可以(可能)通过不执行这么多查询来改进您的第二个版本。尝试使用以下内容替换第一个循环:
# Give every known URL an empty linker list, a zero outbound count and the
# starting score of 1.0.
for urlid, in self.con.execute('select rowid from urllist'):
    inlinks[urlid] = []
    numoutlinks[urlid] = 0
    pagerank[urlid] = 1.0

# One pass over the link table fills in both the inbound linkers and the
# outbound counts, instead of querying once per URL.
for src, dest in self.con.execute('select fromid, toid from link'):
    inlinks[dest].append(src)
    numoutlinks[src] += 1
此版本恰好执行2次查询,而不是对每个URL各执行两次查询(共O(n)次)。
答案 1 :(得分:1)
我相信大部分时间都花在了这些SQL查询上:
# Excerpt of the set-up loop: besides the outer query it issues two more
# queries for every URL.
for (urlid,) in self.con.execute("select rowid from urllist"):
    ...
    # one query per URL to find the pages linking to it
    for (inlink,) in self.con.execute("select distinct fromid from link where toid=%d" % urlid):
        ...
    # one more query per URL to count its outgoing links
    numoutlinks[urlid]=self.con.execute("select count(*) from link where fromid=%d" % urlid).fetchone()[0]
假设您有足够的内存,您可以将其减少到只有两个查询:
SELECT fromid,toid FROM link WHERE toid IN (SELECT rowid FROM urllist)
SELECT fromid,count(*) FROM link WHERE fromid IN (SELECT rowid FROM urllist) GROUP BY fromid
然后,您可以遍历结果并构建inlinks、numoutlinks和pagerank。
您也可以使用collections.defaultdict:
import collections
import itertools
def constant_factory(value):
    """Return a zero-argument callable that always returns ``value``.

    The result is meant to be used as the ``default_factory`` of a
    ``collections.defaultdict``.
    """
    # A closure behaves the same on Python 2 and Python 3, whereas the bound
    # method ``itertools.repeat(value).next`` only exists on Python 2.
    return lambda: value
这样,inlinks就成为一个以集合(set)为值的字典。由于你只需要不重复的网址,使用集合很合适:
# Missing keys start out as an empty set, so distinct linkers can be added
# without initializing the entry first.
inlinks=collections.defaultdict(set)
这会使pagerank成为默认值为1.0的字典:
# Any key that has not been stored yet reads as 1.0.
pagerank=collections.defaultdict(constant_factory(1.0))
使用collections.defaultdict的好处就在于你 不需要预先初始化dicts。
所以,放在一起,我建议看起来像这样:
import collections
def constant_factory(value):
    """Return a zero-argument callable that always returns ``value``.

    The result is meant to be used as the ``default_factory`` of a
    ``collections.defaultdict``.
    """
    # A closure needs no itertools import and behaves the same on Python 2
    # and Python 3 (``itertools.repeat(value).next`` is Python 2 only).
    return lambda: value


def calculatepagerank2(self, iterations=20):
    """Compute PageRank scores with a fixed, small number of SQL queries.

    Rebuilds the ``pagerank`` table, loads the URL ids, the inbound linkers
    and the outbound link counts with one query each, runs ``iterations``
    refinement passes in memory and writes all scores back in one batch.

    Args:
        iterations: Number of refinement passes to run.
    """
    # clear out the current PageRank tables
    self.con.execute("DROP TABLE IF EXISTS pagerank")
    self.con.execute("CREATE TABLE pagerank(urlid primary key,score)")
    self.con.execute("CREATE INDEX prankidx ON pagerank(urlid)")

    # initialize every url with a PageRank of 1.0
    self.con.execute("INSERT INTO pagerank SELECT rowid,1.0 FROM urllist")
    self.dbcommit()

    # Every URL has to be rescored, including pages nothing links to.
    # Looping only over the keys of inlinks would leave such pages at 1.0
    # instead of 0.15 and inflate the score of every page they link to.
    urlids = [urlid for (urlid,) in
              self.con.execute("SELECT rowid FROM urllist")]

    # A set per target keeps only the distinct linkers.
    inlinks = collections.defaultdict(set)
    sql = '''SELECT fromid,toid FROM link WHERE toid IN (SELECT rowid FROM urllist)'''
    for f, t in self.con.execute(sql):
        inlinks[t].add(f)

    numoutlinks = {}
    sql = '''SELECT fromid,count(*) FROM link WHERE fromid IN (SELECT rowid FROM urllist) GROUP BY fromid'''
    for f, c in self.con.execute(sql):
        numoutlinks[f] = c

    pagerank = collections.defaultdict(constant_factory(1.0))
    for i in range(iterations):
        # The parenthesized form is valid on both Python 2 and Python 3.
        print("Iteration %d" % i)
        for urlid in urlids:
            pr = 0.15
            # .get() avoids creating an empty set for pages without linkers.
            for link in inlinks.get(urlid, ()):
                linkpr = pagerank[link]
                linkcount = numoutlinks[link]
                pr += 0.85 * (linkpr / linkcount)
            pagerank[urlid] = pr

    sql = "UPDATE pagerank SET score=? WHERE urlid=?"
    args = ((pagerank[urlid], urlid) for urlid in urlids)
    self.con.executemany(sql, args)
    self.dbcommit()
答案 2 :(得分:0)
你有足够的RAM以某种形式保存稀疏矩阵(fromid, toid)吗?这将允许大幅度的优化(同时需要较大的算法改动)。至少,把你现在在最内层循环中通过select count(*)得到的(fromid, numlinks)缓存在内存中应该会有所帮助(我猜想,如果你要处理N个网址,这个缓存的空间复杂度是O(N),因此更有可能放得进内存)。
答案 3 :(得分:0)
我正在回答我自己的问题,因为最终我发现所有答案的混合对我来说效果最好:
def calculatepagerank4(self, iterations=20):
    """Compute PageRank scores using two bulk reads and one batched write.

    Rebuilds the ``pagerank`` table, loads all URL ids and all distinct
    links with one query each, runs ``iterations`` refinement passes on
    in-memory dictionaries and stores the result with ``executemany``.

    Args:
        iterations: Number of refinement passes to run.
    """
    # clear out the current PageRank tables
    self.con.execute("drop table if exists pagerank")
    self.con.execute("create table pagerank(urlid primary key,score)")
    self.con.execute("create index prankidx on pagerank(urlid)")

    # initialize every url with a PageRank of 1.0
    self.con.execute("insert into pagerank select rowid,1.0 from urllist")
    self.dbcommit()

    inlinks = {}
    numoutlinks = {}
    pagerank = {}
    for (urlid,) in self.con.execute("select rowid from urllist"):
        inlinks[urlid] = []
        numoutlinks[urlid] = 0
        # Initialize pagerank vector with 1.0
        pagerank[urlid] = 1.0

    # One pass over the distinct links fills in both the inbound linkers
    # and the outbound link counts.
    for src, dest in self.con.execute("select distinct fromid, toid from link"):
        inlinks[dest].append(src)
        numoutlinks[src] += 1

    for i in range(iterations):
        # The parenthesized form is valid on both Python 2 and Python 3.
        print("Iteration %d" % i)
        for urlid in pagerank:
            pr = 0.15
            for link in inlinks[urlid]:
                linkpr = pagerank[link]
                linkcount = numoutlinks[link]
                pr += 0.85 * (linkpr / linkcount)
            # Stored immediately, so later URLs in this pass see the new value.
            pagerank[urlid] = pr

    # A generator feeds executemany without building a list of all rows;
    # the ? placeholders keep the full float precision of each score.
    args = ((pagerank[urlid], urlid) for urlid in pagerank)
    self.con.executemany("update pagerank set score=? where urlid=?", args)
    self.dbcommit()
所以我按照allyourcode的建议替换了前两个循环,另外还使用了˜unutbu解决方案中的executemany()。但与˜unutbu不同,我为args使用了生成器表达式,这样不会浪费太多内存,尽管使用列表推导会更快一些。最后,这个例程比书中给出的例程快100倍:
>>> cProfile.run("crawler.calculatepagerank4()")
33512 function calls in 1.377 CPU seconds
Ordered by: standard name
ncalls tottime percall cumtime percall filename:lineno(function)
1 0.004 0.004 1.377 1.377 <string>:1(<module>)
2 0.000 0.000 0.073 0.036 searchengine.py:27(dbcommit)
1 0.693 0.693 1.373 1.373 searchengine.py:286(calculatepagerank4
10432 0.011 0.000 0.011 0.000 searchengine.py:321(<genexpr>)
23065 0.009 0.000 0.009 0.000 {method 'append' of 'list' objects}
2 0.073 0.036 0.073 0.036 {method 'commit' of 'sqlite3.Connectio
1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler
6 0.379 0.063 0.379 0.063 {method 'execute' of 'sqlite3.Connecti
1 0.209 0.209 0.220 0.220 {method 'executemany' of 'sqlite3.Conn
1 0.000 0.000 0.000 0.000 {range}
还应该注意以下问题:如果使用 %f 而不是占位符 ? 来构造SQL语句,则会丢失精度(例如,使用 ? 得到 2.9796095721920315,而使用 %f 得到 2.9796100000000001)。