有没有一种方法可以使dict在您的代码中可调用?

时间:2020-07-30 01:20:10

标签: python dictionary pagerank

因此,对于我的一个项目,我必须创建一个类似于pageRank算法的AI来对html文件的重要性进行排名。代码在下面的错误中,当我运行代码时,出现此错误。我没有这个主意。在过去的3个小时中,我一直在查看此代码,并用Google搜索了一种解决方法。我知道,当您调用dict以确保其工作时,必须将其作为数组返回,但是在代码中,我将页面返回为[page]。我需要另一双眼睛看山雀。

 Traceback (most recent call last):
  File "pagerank.py", line 208, in <module>
    main()
  File "pagerank.py", line 14, in main
    ranks = sample_pagerank(corpus, DAMPING, SAMPLES)
  File "pagerank.py", line 119, in sample_pagerank
    new_samp_choice = transition_model(corpus, sample, damping_factor)
  File "pagerank.py", line 64, in transition_model
    num_links = len(corpus([page]))

TypeError: 'dict' object is not callable.

因此,我查看了堆积的溢出情况,并说要调用dict就是将其作为数组打印出来。但是我已经看了一百遍代码了,我不知道这是怎么回事。

import os
import random
import re
import sys
from collections import Counter
DAMPING = 0.85
SAMPLES = 10000


def main():
    if len(sys.argv) != 2:
        sys.exit("Usage: python pagerank.py corpus")
    corpus = crawl(sys.argv[1])
    ranks = sample_pagerank(corpus, DAMPING, SAMPLES)
    print(f"PageRank Results from Sampling (n = {SAMPLES})")
    for page in sorted(ranks):
        print(f"  {page}: {ranks[page]:.4f}")
    ranks = iterate_pagerank(corpus, DAMPING)
    print(f"PageRank Results from Iteration")
    for page in sorted(ranks):
        print(f"  {page}: {ranks[page]:.4f}")


def crawl(directory):
    """
    Parse a directory of HTML pages and check for links to other pages.
    Return a dictionary where each key is a page, and values are
    a list of all other pages in the corpus that are linked to by the page.
    """
    pages = dict()

    # Extract all links from HTML files
    for filename in os.listdir(directory):
        if not filename.endswith(".html"):
            continue
        with open(os.path.join(directory, filename)) as f:
            contents = f.read()
            links = re.findall(r"<a\s+(?:[^>]*?)href=\"([^\"]*)\"", contents)
            pages[filename] = set(links) - {filename}

    # Only include links to other pages in the corpus
    for filename in pages:
        pages[filename] = set(
            link for link in pages[filename]
            if link in pages
        )

    return pages


def transition_model(corpus, page, damping_factor):
    """
    Return a probability distribution over which page to visit next,
    given a current page.

    With probability `damping_factor`, choose a link at random
    linked to by `page`. With probability `1 - damping_factor`, choose
    a link at random chosen from all pages in the corpus.
    """
    page_mod = {}
    #run the number of files on the corpus
    num_files = len(corpus)
    #receive number of links from the page that was picked at random
    
    num_links = len(corpus([page]))
    
    if num_links != 0:
        #Calculate the probability
        randonm_set = (1 - damping_factor)/num_files
        #calculating the specific page realted probability
        specific_set = (1 - damping_factor)/ num_links
    else: #Calculate the probability from all pages
        randonm_set = (1 - damping_factor) / num_links
        specific_set = 0
    #iterate over the files
    for file in corpus:
        #Checking the page to see if there any other links
        if len(corpus[page])== 0:
            page_mod[file] = 1 / num_files
        else:
            if file not in corpus[page]:
                page_mod[file] = randonm_set
            else:
                page_mod[file] = specific_set + randonm_set
                  
    if round(sum(page_mod.values()) ,5) != 1:
        print(f'ERROR! The probabilites add up from {sum(page_mod.values())}')
        
    return page_mod


    
    


def sample_pagerank(corpus, damping_factor, n):
    """
    Return PageRank values for each page by sampling `n` pages
    according to transition model, starting with a page at random.

    Return a dictionary where keys are page names, and values are
    their estimated PageRank value (a value between 0 and 1). All
    PageRank values should sum to 1.
    """
    sample_PR = {}
    # Mappinga variable name to sample generated and make it equal to 0
    for page in corpus:
        sample_PR[page] = 0
    
    sample = None

    for iteration in range(n):
        if sample == None:
            # list of all the choices 
            choices = list(corpus.keys())
            # choose a choice at random 
            sample = random.choice(choices)
            sample_PR[sample] += 1
        else:
            #Get the probability based of the current sample choice
            new_samp_choice = transition_model(corpus, sample, damping_factor)
            #List of all choices
            choices = list(new_samp_choice.keys())
            # Weights for the distribution for each page and ranking up thier importance
            weights = [new_samp_choice[key] for key in choices]
            # when you run it you the random.choices method will return a list of values
            sample = random.choices(choices,weights).pop()
            sample_PR[sample] += 1
    #Divide the iterations to get an percentage
    sample_PR + {key: value/n for key, value in sample_PR.items()}
    #Check if the value sadd up to 1
    if round(sum(sample_PR.values()), 5) != 1:
        print(f'ERROR! The probabilites add up from {sum(page_mod.values())}')
    else:
        print(
            f'sum of the page Rank files: {round(sum(sample_PR.values()),10)}')
    return sample_PR



            



    


def iterate_pagerank(corpus, damping_factor):
    """
    Return PageRank values for each page by iteratively updating
    PageRank values until convergence.

    Return a dictionary where keys are page names, and values are
    their estimated PageRank value (a value between 0 and 1). All
    PageRank values should sum to 1.
    """
    #Create a dictionary for the iterations 
    iterate_PR = {}
    #The number of pages in the coprus
    num_pages = len(corpus)
    #Iterate over the copus and assign a number to each page 
    for page in corpus:
        iterate_PR[page] = 1/num_pages
    
    changes = 1
    iterations = 1

    while changes >= 0.001:
        changes = 0
        #Copy the current state of the value to make sure it doesn't overide another value
        prev_state = iterate_PR.copy()
        #Iterate over the pages
        for page in iterate_PR:
            #Get pparent pages that link to it
            parents = [link for link in corpus  if page in corpus [link]]
            #Add the damping factor/ number of links and create the iteration over the parents as a array
            first_eq = (1 - damping_factor)/ num_pages
            second_eq = []
            if len(parents) != 0:
                for parent in parents:
                    #Start the the number of links from the parent page
                    num_links = len(corpus[parent])
                    value = prev_state[parent]/ num_links
                    second_eq.append(value)
            #Start the second list to sum up the values
            second = sum(second_eq)
            iterate_PR[page] = first_eq + (damping_factor * second)
            #Calculating the change of the iteration
            new_change = abs(iterate_PR[page] - prev_state[page])
            if changes < new_change:
                changes = new_change
        iterations += 1
    dictsum = sum(iterate_PR.values())
    iterate_PR = {key: value/dictsum for key, value in iterate_PR.items()}
    print(f'\nPageRank value stable after {iterations} iterations.')
    print(f' Sum of iterate_pagerank values: {round(sum(iterate_PR.values()),10)}')
    return iterate_PR

            
            
            
            






if __name__ == "__main__":
    main()

0 个答案:

没有答案