我正在使用此代码从DreamBank网站刮除梦想。它将DREAMERS字典中列出的集合的所有梦想下载到dreams目录。
import re
import requests
import json
from bs4 import BeautifulSoup
from collections import OrderedDict
DREAMBANK = "http://dreambank.net/random_sample.cgi?series={0}&min=1&max=9999&n=10000"
# Dream collections to download
DREAMERS = {
"alta": "Alta: a detailed dreamer", "angie": "Angie: age 18 & 20", "arlie": "Arlie: a middle-aged woman", "b": "Barb Sanders", "b2": "Barb Sanders #2", "b-baseline": "Barb Sanders: baseline", "bay_area_girls_456": "Bay Area girls: Grades 4-6", "bay_area_girls_789": "Bay Area girls: Grades 7-9", "bea1": "Bea 1: a high school student", "bea2": "Bea 2: a college student", "blind-f": "Blind dreamers (F)", "blind-m": "Blind dreamers (M)", "chris": "Chris: a transvestite", "chuck": "Chuck: a physical scientist", "hall_female": "College women, late 1940s", "dahlia": "Dahlia: concerns with appearance", "david": "David: teenage dreams", "vonuslar.de": "Detlev von Uslar, auf Deutsch", "dorothea": "Dorothea: 53 years of dreams", "ed": "Ed: dreams of his late wife", "edna": "Edna: a blind woman", "elizabeth": "Elizabeth: a woman in her 40s", "emmas_husband": "Emma's Husband", "emma": "Emma: 48 years of dreams", "esther": "Esther: an adolescent girl", "german-f.de": "German dreams (F)", "german-m.de": "German dreams (M)", "norms-f": "Hall/VdC Norms: Female", "norms-m": "Hall/VdC Norms: Male", "jasmine1": "Jasmine 1: middle school", "jasmine2": "Jasmine 2: high school", "jasmine3": "Jasmine 3: college 1", "jasmine4": "Jasmine 4: college 2", "jeff": "Jeff: a lucid dreamer", "joan": "Joan: a lesbian", "kenneth": "Kenneth", "lawrence": "Lawrence, a young man", "mack": "Mack: A poor recaller", "madeline1-hs": "Madeline 1: High School", "madeline2-dorms": "Madeline 2: College Dorms", "madeline3-offcampus": "Madeline 3: Off-Campus", "madeline4-postgrad": "Madeline 4: After College", "mark": "Mark: a young boy", "melissa": "Melissa: a young girl", "melora": "Melora (Melvin's wife)", "melvin": "Melvin (Melora's husband)", "merri": "Merri: an artist", "miami-home": "Miami Home-Lab: Home", "miami-lab": "Miami Home-Lab: Lab", "midwest_teens-f": "Midwest teenagers (F)", "midwest_teens-m": "Midwest teenagers (M)", "nancy": "Nancy: Caring & headstrong", "natural_scientist": "The Natural Scientist", "norman": "Norman: a child molester", "pegasus": "Pegasus: a factory worker", "peru-m": "Peruvian men", "peru-f": "Peruvian women", "phil1": "Phil 1: teens", "phil2": "Phil 2: late 20s", "phil3": "Phil 3: retirement", "physiologist": "The Physiologist", "ringo": "Ringo: from the 1960s", "bosnak": "Robert Bosnak: A dream analyst", "samantha": "Samantha: in her 20s", "seventh_graders": "Seventh grade girls", "zurich-f.de": "Swiss children, auf Deutsch (F)", "zurich-m.de": "Swiss children, auf Deutsch (M)", "toby": "Toby: A friendly party animal", "tom": "Tom: An outgoing man", "ucsc_women": "UCSC women, 1996", "vickie": "Vickie: a 10-year-old girl", "vietnam_vet": "Vietnam Vet: 1970-2008 war dreams", "vietnam_vet2": "Vietnam Vet: 2015 dreams", "wedding": "Wedding dreams", "west_coast_teens": "West Coast teenage girls" }
NUMBER_RE = r'^\#(\S+)'
HEAD_RE = r'^\((.+?)\)(\w)'
def process_dream_span(span):
text = span.text.encode('utf-8').strip()
# remove number
number_groups = re.match(NUMBER_RE, text)
number = number_groups.group(1).strip()
text = re.sub(NUMBER_RE, '', text).strip()
# remove word count
text = re.sub(r'\s*\(\d+\s+words\)\s*$', '', text, re.I)
# Split nb-space as paragraphs
text = re.sub('\\s*?(\xc2\xa0)+\\s*', '\n', text).strip()
# sep desc
head_match = re.match(HEAD_RE, text)
if head_match is None:
return OrderedDict([
('number', number),
('content', text)])
head = head_match.group(1).strip()
text = re.sub(HEAD_RE, lambda x: x.group(2), text).strip()
return OrderedDict([
('number', number),
('head', head),
('content', text)])
def process_dream_page(text):
soup = BeautifulSoup(text, 'html.parser')
dreams = soup.find_all("span")
for dream in soup.find_all("span"):
data = process_dream_span(dream)
if data is not None:
yield data
def download_dreams(dreamer):
url = DREAMBANK.format(dreamer)
r = requests.get(url)
if r.status_code != 200:
print("error getting dreams")
return None
return r.text
def collect_dreams(dreamer, desc):
text = download_dreams(dreamer)
return OrderedDict([
('dreamer', dreamer),
('description', desc),
('dreams', list(process_dream_page(text)))])
for dreamer, desc in DREAMERS.items():
dreams = collect_dreams(dreamer, desc)
if dreams is not None:
with open('dreams/' + dreamer + '.json', 'w') as out_file:
json.dump(dreams, out_file, sort_keys=False, indent=4, ensure_ascii=False)
但是,我遇到错误“ TypeError:无法在类似字节的对象上使用字符串模式”。该代码已经在使用encode('utf-8')。关于应尝试解决该错误的任何想法?
错误回溯
<ipython-input-3-49906dd4d65a> in <module>()
65
66 for dreamer, desc in DREAMERS.items():
--> 67 dreams = collect_dreams(dreamer, desc)
68 if dreams is not None:
69 with open('dreams/' + dreamer + '.json', 'w') as out_file:
<ipython-input-3-49906dd4d65a> in collect_dreams(dreamer, desc)
61 ('dreamer', dreamer),
62 ('description', desc),
--> 63 ('dreams', list(process_dream_page(text)))])
64
65
<ipython-input-3-49906dd4d65a> in process_dream_page(text)
44
45 for dream in soup.find_all('span'):
--> 46 data = process_dream_span(dream)
47 if data is not None:
48 yield data
<ipython-input-3-49906dd4d65a> in process_dream_span(span)
13 text = span.text.encode('utf-8').strip()
14 # remove number
--> 15 number_groups = re.match(NUMBER_RE, text)
16 number = number_groups.group(1).strip()
17
~/anaconda3/lib/python3.6/re.py in match(pattern, string, flags)
170 """Try to apply the pattern at the start of the string, returning
171 a match object, or None if no match was found."""
--> 172 return _compile(pattern, flags).match(string)
173
174 def fullmatch(pattern, string, flags=0):
TypeError: cannot use a string pattern on a bytes-like object