So for one of my projects I have to create an AI, similar to the PageRank algorithm, that ranks the importance of HTML files. The code is below, and when I run it I get this error. I have no idea what's wrong. I've been staring at this code for the past 3 hours and googling for a fix. I know that when you call a dict you have to pass it as an array for it to work, and in the code I am passing the page as [page]. I need another pair of eyes to look at this.
Traceback (most recent call last):
  File "pagerank.py", line 208, in <module>
    main()
  File "pagerank.py", line 14, in main
    ranks = sample_pagerank(corpus, DAMPING, SAMPLES)
  File "pagerank.py", line 119, in sample_pagerank
    new_samp_choice = transition_model(corpus, sample, damping_factor)
  File "pagerank.py", line 64, in transition_model
    num_links = len(corpus([page]))
TypeError: 'dict' object is not callable
So I looked on Stack Overflow, and it said that calling a dict means printing it out as an array. But I've looked over the code a hundred times and I can't figure out what's going on.
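To make sure I understood the error message itself, here is a minimal standalone snippet (separate from the project code, with made-up filenames) showing the difference between indexing a dict with square brackets and calling it with parentheses:

d = {"1.html": {"2.html"}}
links = d["1.html"]    # square brackets index the dict -> {'2.html'}
links = d(["1.html"])  # parentheses call the dict -> TypeError: 'dict' object is not callable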
import os
import random
import re
import sys
from collections import Counter
DAMPING = 0.85
SAMPLES = 10000
def main():
    if len(sys.argv) != 2:
        sys.exit("Usage: python pagerank.py corpus")
    corpus = crawl(sys.argv[1])
    ranks = sample_pagerank(corpus, DAMPING, SAMPLES)
    print(f"PageRank Results from Sampling (n = {SAMPLES})")
    for page in sorted(ranks):
        print(f" {page}: {ranks[page]:.4f}")
    ranks = iterate_pagerank(corpus, DAMPING)
    print(f"PageRank Results from Iteration")
    for page in sorted(ranks):
        print(f" {page}: {ranks[page]:.4f}")
def crawl(directory):
    """
    Parse a directory of HTML pages and check for links to other pages.
    Return a dictionary where each key is a page, and values are
    a list of all other pages in the corpus that are linked to by the page.
    """
    pages = dict()
    # Extract all links from HTML files
    for filename in os.listdir(directory):
        if not filename.endswith(".html"):
            continue
        with open(os.path.join(directory, filename)) as f:
            contents = f.read()
            links = re.findall(r"<a\s+(?:[^>]*?)href=\"([^\"]*)\"", contents)
            pages[filename] = set(links) - {filename}
    # Only include links to other pages in the corpus
    for filename in pages:
        pages[filename] = set(
            link for link in pages[filename]
            if link in pages
        )
    return pages
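# For illustration (hypothetical filenames, not part of the assignment code):
# on a small three-page corpus, crawl() returns a dict mapping each page to
# the set of corpus pages it links to, e.g.
#   {"1.html": {"2.html"}, "2.html": {"1.html", "3.html"}, "3.html": set()}
# so corpus[page] (square brackets) looks up that page's outgoing links.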
def transition_model(corpus, page, damping_factor):
    """
    Return a probability distribution over which page to visit next,
    given a current page.
    With probability `damping_factor`, choose a link at random
    linked to by `page`. With probability `1 - damping_factor`, choose
    a link at random chosen from all pages in the corpus.
    """
    page_mod = {}
    # Number of files in the corpus
    num_files = len(corpus)
    # Number of links on the page that was picked at random
    num_links = len(corpus([page]))
    if num_links != 0:
        # Probability of jumping to any page in the corpus
        random_set = (1 - damping_factor) / num_files
        # Probability of following one of this page's specific links
        specific_set = damping_factor / num_links
    else:
        # Page has no outgoing links: spread the probability over all pages
        random_set = 1 / num_files
        specific_set = 0
    # Iterate over the files
    for file in corpus:
        # Check whether the current page has any outgoing links
        if len(corpus[page]) == 0:
            page_mod[file] = 1 / num_files
        else:
            if file not in corpus[page]:
                page_mod[file] = random_set
            else:
                page_mod[file] = specific_set + random_set
    if round(sum(page_mod.values()), 5) != 1:
        print(f'ERROR! The probabilities add up to {sum(page_mod.values())}')
    return page_mod
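# A worked example (hypothetical corpus, not part of the project code): with
# damping_factor = 0.85 and
#   corpus = {"1.html": {"2.html"}, "2.html": {"1.html", "3.html"}, "3.html": set()}
# transition_model(corpus, "2.html", 0.85) should give every page a base
# probability of (1 - 0.85) / 3 = 0.05, plus 0.85 / 2 = 0.425 for each of the
# two linked pages:
#   {"1.html": 0.475, "2.html": 0.05, "3.html": 0.475}
# which sums to 1, as the check above expects.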
def sample_pagerank(corpus, damping_factor, n):
    """
    Return PageRank values for each page by sampling `n` pages
    according to transition model, starting with a page at random.
    Return a dictionary where keys are page names, and values are
    their estimated PageRank value (a value between 0 and 1). All
    PageRank values should sum to 1.
    """
    sample_PR = {}
    # Map each page to a sample count, starting at 0
    for page in corpus:
        sample_PR[page] = 0
    sample = None
    for iteration in range(n):
        if sample is None:
            # List of all the choices
            choices = list(corpus.keys())
            # Choose a starting page at random
            sample = random.choice(choices)
            sample_PR[sample] += 1
        else:
            # Get the probability distribution based on the current sample
            new_samp_choice = transition_model(corpus, sample, damping_factor)
            # List of all choices
            choices = list(new_samp_choice.keys())
            # Weights for the distribution, ranking each page's importance
            weights = [new_samp_choice[key] for key in choices]
            # random.choices returns a list, so take its single element
            sample = random.choices(choices, weights).pop()
            sample_PR[sample] += 1
    # Divide the counts by the number of samples to get a proportion
    sample_PR = {key: value / n for key, value in sample_PR.items()}
    # Check that the values add up to 1
    if round(sum(sample_PR.values()), 5) != 1:
        print(f'ERROR! The probabilities add up to {sum(sample_PR.values())}')
    else:
        print(
            f'sum of the page Rank files: {round(sum(sample_PR.values()),10)}')
    return sample_PR
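# Side note (illustrative, not project code): random.choices always returns a
# list, even when drawing a single item, which is why the .pop() above is
# needed to unwrap it:
#   random.choices(["a", "b"], weights=[0.9, 0.1])        # e.g. ['a']
#   random.choices(["a", "b"], weights=[0.9, 0.1]).pop()  # e.g. 'a'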
def iterate_pagerank(corpus, damping_factor):
    """
    Return PageRank values for each page by iteratively updating
    PageRank values until convergence.
    Return a dictionary where keys are page names, and values are
    their estimated PageRank value (a value between 0 and 1). All
    PageRank values should sum to 1.
    """
    # Create a dictionary for the iterations
    iterate_PR = {}
    # The number of pages in the corpus
    num_pages = len(corpus)
    # Iterate over the corpus and assign an equal starting rank to each page
    for page in corpus:
        iterate_PR[page] = 1 / num_pages
    changes = 1
    iterations = 1
    while changes >= 0.001:
        changes = 0
        # Copy the current state so updates don't override values mid-pass
        prev_state = iterate_PR.copy()
        # Iterate over the pages
        for page in iterate_PR:
            # Get the parent pages that link to this page
            parents = [link for link in corpus if page in corpus[link]]
            # The (1 - damping_factor) / N term of the PageRank formula
            first_eq = (1 - damping_factor) / num_pages
            second_eq = []
            if len(parents) != 0:
                for parent in parents:
                    # Number of links on the parent page
                    num_links = len(corpus[parent])
                    value = prev_state[parent] / num_links
                    second_eq.append(value)
            # Sum up the contributions from the parent pages
            second = sum(second_eq)
            iterate_PR[page] = first_eq + (damping_factor * second)
            # Track the largest change seen in this pass
            new_change = abs(iterate_PR[page] - prev_state[page])
            if changes < new_change:
                changes = new_change
        iterations += 1
    dictsum = sum(iterate_PR.values())
    iterate_PR = {key: value / dictsum for key, value in iterate_PR.items()}
    print(f'\nPageRank value stable after {iterations} iterations.')
    print(f' Sum of iterate_pagerank values: {round(sum(iterate_PR.values()),10)}')
    return iterate_PR
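# For reference, the update implemented above is the standard PageRank
# formula, where d is the damping factor, N the number of pages, and the sum
# runs over the pages i that link to p:
#   PR(p) = (1 - d) / N + d * sum(PR(i) / NumLinks(i))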
if __name__ == "__main__":
main()
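For completeness, the script takes the corpus directory as its single command-line argument, matching the usage message in main():

python pagerank.py corpus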