我想在每个页面中查找一组特定的单词(例如 "may"、"must" 等),并把它们出现的次数加起来。我使用的代码如下:
import requests
from bs4 import BeautifulSoup, SoupStrainer
import re
# Phrases counted on every section page. Compiled once at import time instead
# of on every call; the order here is the order of the printed/returned counts.
_LEVELFOUR_PATTERNS = (
    re.compile(r"\bmay not\b", re.IGNORECASE),
    re.compile(r"\bshall\b", re.IGNORECASE),
    re.compile(r"\bmust\b", re.IGNORECASE),
    re.compile(r"\bprohibited\b", re.IGNORECASE),
    re.compile(r"\brequired\b", re.IGNORECASE),
)


def levelfour(main_url):
    """Fetch one page and count how often each phrase of interest occurs.

    The phrases are, in order: "may not", "shall", "must", "prohibited" and
    "required", matched case-insensitively on word boundaries.

    Args:
        main_url: URL of the page to download and scan.

    Returns:
        A 5-tuple of ints, one count per phrase in the order listed above.
        The same five numbers are printed on one space-separated line.
    """
    response = requests.get(main_url)

    # Search the decoded text rather than the raw bytes: a str pattern cannot
    # be applied to a bytes object on Python 3 (TypeError), while decoded text
    # works on both Python 2 and Python 3.
    # NOTE: this is still the complete response body, HTML markup included,
    # exactly as before. The earlier lookup of <article id="maincontent"> was
    # never used for counting and crashed with AttributeError on pages that
    # lack that element, so it has been removed.
    page_text = response.text

    counts = tuple(
        len(pattern.findall(page_text)) for pattern in _LEVELFOUR_PATTERNS
    )

    # %-formatting prints the same "a b c d e" line under both the Python 2
    # print statement and the Python 3 print function.
    print("%d %d %d %d %d" % counts)
    return counts
########################################Sections##########################
def levelthree(item2_url):
    """Scan the page at *item2_url* and process every section link on it.

    Each ``<a>`` element whose ``href`` contains the substring ``'section'``
    is turned into an absolute law.justia.com URL and handed to
    ``levelfour``.

    Args:
        item2_url: URL of the page whose links are followed.
    """
    response = requests.get(item2_url)
    # Only <a> elements are parsed; everything else in the page is skipped.
    anchors = BeautifulSoup(
        response.content, "html.parser", parse_only=SoupStrainer('a')
    )
    for anchor in anchors:
        if not anchor.has_attr('href'):
            continue
        target = anchor['href']
        if 'section' not in target:
            continue
        # hrefs are treated as site-relative paths and prefixed with the host.
        levelfour("http://law.justia.com" + target)
########################################Chapters##########################
def leveltwo(item_url):
    """Scan the page at *item_url* and descend into every chapter link.

    Each ``<a>`` element whose ``href`` contains the substring ``'chapt'`` is
    turned into an absolute law.justia.com URL, passed to ``levelthree``, and
    then printed.

    Args:
        item_url: URL of the page whose links are followed.
    """
    response = requests.get(item_url)
    # Only <a> elements are parsed; everything else in the page is skipped.
    anchors = BeautifulSoup(
        response.content, "html.parser", parse_only=SoupStrainer('a')
    )
    for anchor in anchors:
        if not anchor.has_attr('href'):
            continue
        target = anchor['href']
        if 'chapt' not in target:
            continue
        chapter_url = "http://law.justia.com" + target
        levelthree(chapter_url)
        # The URL is reported only after all of its sections were processed.
        print (chapter_url)
######################################Titles###############################
def levelone(url):
    """Scan the page at *url* and descend into every ``title-54`` link.

    Each ``<a>`` element whose ``href`` contains the substring ``'title-54'``
    is turned into an absolute law.justia.com URL and passed to ``leveltwo``.

    Args:
        url: URL of the page whose links are followed.
    """
    response = requests.get(url)
    # Only <a> elements are parsed; everything else in the page is skipped.
    anchors = BeautifulSoup(
        response.content, "html.parser", parse_only=SoupStrainer('a')
    )
    for anchor in anchors:
        if not anchor.has_attr('href'):
            continue
        target = anchor['href']
        if 'title-54' not in target:
            continue
        leveltwo("http://law.justia.com" + target)
###########################################################################
# Script entry point: start the crawl at the codes/idaho/2015 index page.
# NOTE: these statements sit at module level, so the whole crawl (with its
# network requests) starts as soon as the file is executed or imported.
base_url = "http://law.justia.com/codes/idaho/2015/"
levelone(base_url)
当我打印 total、total1、total2、total3、total4 时,得到的是一个零,而不是 [0,0,0,0,0]。我的问题是:怎样才能正确地找出这组单词,并把它们的出现次数加起来?
答案 0 :(得分:1)
改用 m = re.findall(pattern, r.content) 之后,问题就解决了。
答案 1 :(得分:0)
为每个短语单独定义一个变量会让代码变得很乱。可以试试下面的写法:
from collections import Counter
# Keep one Counter keyed by phrase instead of one variable per phrase.
# The page is lower-cased once up front, so the lower-case phrases below
# match regardless of the case used in the page.
text = r.content.lower()
counter = Counter()
for phrase in ('may not', 'shall', 'must'):
    occurrences = re.findall(r'\b%s\b' % phrase, text)
    counter[phrase] += len(occurrences)