Beautiful Soup parse functions no longer working

Asked: 2018-07-11 05:05:46

Tags: python parsing web-scraping beautifulsoup lxml

Last year I wrote a web scraper that starts from a random Wikipedia page, follows the first link in the first paragraph, scrapes the next page, and repeats the process until it reaches the Philosophy page. The script is a solution to this problem.

In my script, the crawl starts from a fixed number of random pages, navigates from each one to the Philosophy page, and records the length of the path from start to finish (from the random page to the Philosophy page). These lengths are then plotted with matplotlib.

All of the script's steps were working when I tested it early last year, but when I ran it today for the first time since then, the Beautiful Soup parsing code stopped extracting any links from the article body, or even any paragraphs to parse. I haven't changed any of the code since it last worked, and the HTML tags in the Wikipedia page DOM look the same as before, so is it possible that the Beautiful Soup library has changed, or is there some subtlety in the Wikipedia DOM that may have changed recently?
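For reference, here is a minimal reproduction of the failing parse with the crawling logic stripped out (it uses the same random-article entry point as the full script):

import requests
from bs4 import BeautifulSoup

# Minimal reproduction: fetch a random article and try to pull its paragraphs.
response = requests.get("https://en.wikipedia.org/wiki/Special:Random")
root = BeautifulSoup(response.text, "lxml")
body = root.select_one("#mw-content-text")  # main article body
print(body is not None)  # True: the container itself is still found
print(body.find_all("p", recursive=False, limit=10))  # [] on every page I try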

The code for the script is as follows:

import sys
import json
from urlparse import urljoin

import requests
from lxml.html import fromstring
from bs4 import BeautifulSoup,NavigableString, Tag
import matplotlib.pyplot as plt
import scipy
import scipy.stats

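# Python 2 idiom: reset the interpreter's default string encoding to UTF-8.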
reload(sys)
sys.setdefaultencoding('utf-8')


class Crawler:
    """ Class used to crawl wikipedia pages starting from a random article."""
    def __init__(self):
        self.base_url = "https://en.wikipedia.org"
        self.NUM_PAGES_TO_CRAWL = 2
    def get_valid_link(self, curr_response):
        """Takes an html response and returns the first link in the main body of the article."""
        curr_root = BeautifulSoup(curr_response.text,"lxml")
        first = curr_root.select_one("#mw-content-text") # locate main body
        if not first:
            return None
        par = first.find_all("p",recursive = False,limit = 10)
        print par
        heading = curr_root.select_one("#firstHeading").text
        heading = reformat_string('(',heading)
        first_paragraph_found = False
        head_tokens = tokenize(heading)

        # Find which paragraph has the first link
        i = 0
        for i in range(len(par)):
            if par[i].b is not None:
                bold = ""
                for string in par[i].find_all("b"):
                    bold += " " + string.text
                bold = reformat_string('(', bold)
                bold_tokens = tokenize(bold)
                heading_match = check_name_match(head_tokens,bold_tokens)
                if heading_match:
                    first_paragraph_found = True
                if heading_match and par[i].a:
                    break
            if par[i].a is not None:
                anchor = par[i].a.text
                if anchor:
                    anchor = reformat_string('(', anchor)
                    a_tokens = tokenize(anchor)
                    heading_match = check_name_match(head_tokens,a_tokens)
                    if heading_match:
                        break
            if first_paragraph_found and par[i].a:
                break   
            i += 1

        # if none of the paragraphs have a link and article contains only a list
        if i >= len(par)-1 and first_paragraph_found:
            u_list = first.find_all('ul')
            try:
                return u_list[0].li.a['href']
            except (IndexError, AttributeError,TypeError):
                return None
        elif i >= len(par)-1:# Reached article with no main body
            return None

        main_body_idx = i
        stack = []
        # Find the first link before or after parentheses 
        for child in par[main_body_idx].children:
            if isinstance(child,NavigableString):
                if "(" in child:
                    stack.append("(")
                if ")" in child:
                    try:
                        stack.pop()
                    except IndexError: # html malformed
                        return None

            if isinstance(child, Tag) and child.name == "a" and not stack:
                link = child['href']        
                link = reformat_string('#',link)
                try:
                    return str(link)
                except KeyError: # Reached article with no main body
                    return None

    def crawl_to_philosophy(self, start_url,session):
        """Follow the path of each url until the philosophy page is reached and return the path."""
        link_path = []
        # Get first link
        try:
            init_response = session.get(start_url)
        except requests.exceptions.RequestException as e: # bad link
            return None

        init_link = self.get_valid_link(init_response)
        if not init_link:
            return None
        link_path.append(urljoin(self.base_url, init_link))

        # Follow path of links until the philosophy page is reached
        i = 0
        while True:
            if "philosophy" in  link_path[i].lower():
                break
            try:
                curr_response = session.get(link_path[i])
            except requests.exceptions.RequestException as e: # bad link
                return None 

            curr_link = self.get_valid_link(curr_response)
            if not curr_link or "redlink" in curr_link:
                return None
            new_link = urljoin(self.base_url, curr_link)
            for i in range(len(link_path)):
                if new_link in link_path[i] : # loop found
                    return None
            link_path.append(new_link)
            i += 1
        return link_path

    def find_paths_to_philosophy(self,url):
        """Find paths starting from 500 links."""
        i = 0
        crawl_list = []
        with requests.Session() as s:
            while i < self.NUM_PAGES_TO_CRAWL:
                path = self.crawl_to_philosophy(url,s)
                if path is not None:
                    crawl_list.append(len(path))
                    i += 1
            plot_lengths(crawl_list)


def plot_lengths(lens):
    """Plot the distribution of path lengths."""
    freq = {}
    max_len = 0

    for length in lens:
        max_len = max(length,max_len)
        if length in freq:
            freq[length] += 1
        else:
            freq[length] = 1
    max_freq = max(freq.values())
    bins = range(0, max_len + 1, 2)
    plt.hist(lens,bins,histtype = 'bar',rwidth = 0.8)
    plt.xlabel('x')
    plt.ylabel('Path Lengths')
    plt.title('Distribution of path lengths')
    dist_names = ['gamma', 'beta', 'rayleigh', 'norm', 'pareto']

    for dist_name in dist_names:
        dist = getattr(scipy.stats, dist_name)
        param = dist.fit(lens)
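        # Scale the fitted pdf by the sample size so it sits on the same scale as the histogram counts.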
        pdf_fitted = dist.pdf(bins, *param[:-2], loc=param[-2], scale=param[-1]) * len(lens)
        plt.plot(bins,pdf_fitted, label=dist_name)
        plt.xlim(0,max_len)
        plt.ylim(0,max_freq)
    plt.legend(loc='upper right')
    plt.show()


# Utility functions used by Crawler class

def reformat_string(char, word):
    """Remove passed in char from a string and convert its characters to lowercase."""
    word = word.lower()
    char_idx = word.find(char)
    if char_idx != -1:
        return word[:char_idx]
    return word

def check_name_match(heading, string):
    """Determine whether or not any part of the article heading is in the string and vice versa."""
    for i in range(len(string)):
        for j in range(len(heading)):
            if heading[j] in string[i] or string[i] in heading[j]:
                return True
    return False

def tokenize(word):
    """Split the passed in 'word' on space characters and return a list of tokens."""
    tokens = []
    curr_word = ""
    for i in range(len(word)):
        if word[i] == " " and i == len(word)-1:
            tokens.append(word.strip(" "))
            return tokens
        curr_word += word[i]
        if word[i] == " " :
            tokens.append(curr_word)    
            curr_word = ""
            i+=1
        if i == len(word)-1:
            tokens.append(curr_word)    
            return tokens


if __name__ == "__main__":
    url = "https://en.wikipedia.org/wiki/Special:Random"
    crawler = Crawler()
    crawler.find_paths_to_philosophy(url)

The lines of the script that no longer seem to work are the following:

curr_root = BeautifulSoup(curr_response.text,"lxml")
first = curr_root.select_one("#mw-content-text") # locate main body
if not first:
    return None
par = first.find_all("p",recursive = False,limit = 10)
heading = curr_root.select_one("#firstHeading").text
heading = reformat_string('(',heading)
first_paragraph_found = False
head_tokens = tokenize(heading)
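
A small diagnostic along these lines should show whether the article paragraphs are still direct children of the container; if no "p" shows up in the output, the recursive=False search would explain why par comes back empty:

import requests
from bs4 import BeautifulSoup, Tag

# Diagnostic sketch: list the direct child tags of the main body container.
response = requests.get("https://en.wikipedia.org/wiki/Special:Random")
root = BeautifulSoup(response.text, "lxml")
first = root.select_one("#mw-content-text")
print([child.name for child in first.children if isinstance(child, Tag)])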

EDIT:

The code fails for any random link that is chosen, but one example of a page it fails on is: en.wikipedia.org/wiki/Modern_Greek
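
Running the same kind of check directly against that page, comparing a recursive and a non-recursive search under #mw-content-text, should tell whether the paragraphs have disappeared entirely or have just moved deeper into the tree (for example, into a wrapper div), which would point to a change in the Wikipedia DOM rather than in Beautiful Soup:

import requests
from bs4 import BeautifulSoup

# Compare direct-child vs. whole-subtree searches on the failing page.
response = requests.get("https://en.wikipedia.org/wiki/Modern_Greek")
root = BeautifulSoup(response.text, "lxml")
first = root.select_one("#mw-content-text")
print(len(first.find_all("p", recursive=False, limit=10)))  # direct children only
print(len(first.find_all("p", limit=10)))  # anywhere in the subtree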

0 Answers:

No answers yet.