Python web scraping: counting the occurrences of a list of words per page

Asked: 2016-05-19 03:58:36

Tags: python regex python-2.7 beautifulsoup frequency

I'm trying to find a set of specific words ("may", "must", etc.) on each page and add up their occurrences. This is the code I'm using:

import requests
from bs4 import BeautifulSoup, SoupStrainer
import re


def levelfour(main_url):

    pattern = re.compile(r"\bmay not\b", re.IGNORECASE)
    pattern1 = re.compile(r"\bshall\b", re.IGNORECASE)
    pattern2 = re.compile(r"\bmust\b", re.IGNORECASE)
    pattern3 = re.compile(r"\bprohibited\b", re.IGNORECASE)
    pattern4 = re.compile(r"\brequired\b", re.IGNORECASE)

    r = requests.get(main_url)
    soup = BeautifulSoup((r.content), "html.parser")
    results = soup.find('article', {'id': 'maincontent'})
    results = results.text.encode("utf-8", "ignore")

    total = 0
    total1 = 0
    total2 = 0
    total3 = 0
    total4 = 0

    m = re.findall(pattern, r.content)
    m1 = re.findall(pattern1, r.content)
    m2 = re.findall(pattern2, r.content)
    m3 = re.findall(pattern3, r.content)
    m4 = re.findall(pattern4, r.content)
    total += len(m)
    total1 += len(m1)
    total2 += len(m2)
    total3 += len(m3)
    total4 += len(m4)
    print total, total1, total2, total3, total4

########################################Sections##########################
def levelthree(item2_url):
    r = requests.get(item2_url)
    for sectionlinks in BeautifulSoup(r.content, "html.parser", parse_only=SoupStrainer('a')):
        if sectionlinks.has_attr('href'):
            if 'section' in sectionlinks['href']:
                href = "http://law.justia.com" + sectionlinks.get('href')
                levelfour(href)

########################################Chapters##########################
def leveltwo(item_url):
    r = requests.get(item_url)
    for sublinks in BeautifulSoup(r.content, "html.parser", parse_only=SoupStrainer('a')):
        if sublinks.has_attr('href'):
            if 'chapt' in sublinks['href']:
                chapterlinks = "http://law.justia.com" + sublinks.get('href')
                levelthree(chapterlinks)
                print (chapterlinks)

######################################Titles###############################
def levelone(url):
    r = requests.get(url)
    for links in BeautifulSoup(r.content, "html.parser", parse_only=SoupStrainer('a')):
        if links.has_attr('href'):
            if 'title-54' in links['href']:
                titlelinks = "http://law.justia.com" + links.get('href')
                # titlelinks = "\n" + str(titlelinks)
                leveltwo(titlelinks)
                # print (titlelinks)

###########################################################################
base_url = "http://law.justia.com/codes/idaho/2015/"
levelone(base_url)

When I print out total, total1, total2, total3, and total4, I just get zeros ([0, 0, 0, 0, 0]) instead of the actual counts. My question: how can I properly find and add up the occurrences of this set of words?

2 Answers:

Answer 0 (score: 1):

Using m = re.findall(pattern, r.content) fixed the problem.
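
For reference, a minimal sketch of that counting step on its own (Python 2, using the base URL from the question and only the "shall" pattern):

import re
import requests

pattern = re.compile(r"\bshall\b", re.IGNORECASE)
r = requests.get("http://law.justia.com/codes/idaho/2015/")
matches = re.findall(pattern, r.content)  # under Python 2, r.content is a plain str
print(len(matches))  # how many times "shall" appears in the page source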

Answer 1 (score: 0):

Using a separate variable for each phrase is a mess. Try this:

import re
from collections import Counter

counter = Counter()
text = r.content.lower()  # r is the requests response for the page
for phrase in ['may not', 'shall', 'must']:
    counter[phrase] += len(re.findall(r'\b%s\b' % phrase, text))
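
Wired into the question's levelfour, the Counter approach might look roughly like this (an untested sketch; it searches the article text the original code already extracted, and adds the question's other phrases to the list):

import re
import requests
from collections import Counter
from bs4 import BeautifulSoup

def levelfour(main_url):
    # Fetch one page, pull out the main article text, and tally each phrase.
    r = requests.get(main_url)
    soup = BeautifulSoup(r.content, "html.parser")
    article = soup.find('article', {'id': 'maincontent'})
    text = article.text.lower() if article is not None else r.text.lower()

    counter = Counter()
    for phrase in ['may not', 'shall', 'must', 'prohibited', 'required']:
        counter[phrase] += len(re.findall(r'\b%s\b' % phrase, text))
    print(counter)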