Question

我正在使用此代码从DreamBank网站刮除梦想。它将DREAMERS字典中列出的集合的所有梦想下载到dreams目录。

import re
import requests
import json
from bs4 import BeautifulSoup
from collections import OrderedDict


DREAMBANK = "http://dreambank.net/random_sample.cgi?series={0}&min=1&max=9999&n=10000"

# Dream collections to download
DREAMERS = {
"alta": "Alta: a detailed dreamer", "angie": "Angie: age 18 & 20", "arlie": "Arlie: a middle-aged woman", "b": "Barb Sanders", "b2": "Barb Sanders #2", "b-baseline": "Barb Sanders: baseline", "bay_area_girls_456": "Bay Area girls: Grades 4-6", "bay_area_girls_789": "Bay Area girls: Grades 7-9", "bea1": "Bea 1: a high school student", "bea2": "Bea 2: a college student", "blind-f": "Blind dreamers (F)", "blind-m": "Blind dreamers (M)", "chris": "Chris: a transvestite", "chuck": "Chuck: a physical scientist", "hall_female": "College women, late 1940s", "dahlia": "Dahlia: concerns with appearance", "david": "David: teenage dreams", "vonuslar.de": "Detlev von Uslar, auf Deutsch", "dorothea": "Dorothea: 53 years of dreams", "ed": "Ed: dreams of his late wife", "edna": "Edna: a blind woman", "elizabeth": "Elizabeth: a woman in her 40s", "emmas_husband": "Emma's Husband", "emma": "Emma: 48 years of dreams", "esther": "Esther: an adolescent girl", "german-f.de": "German dreams (F)", "german-m.de": "German dreams (M)", "norms-f": "Hall/VdC Norms: Female", "norms-m": "Hall/VdC Norms: Male", "jasmine1": "Jasmine 1: middle school", "jasmine2": "Jasmine 2: high school", "jasmine3": "Jasmine 3: college 1", "jasmine4": "Jasmine 4: college 2", "jeff": "Jeff: a lucid dreamer", "joan": "Joan: a lesbian", "kenneth": "Kenneth", "lawrence": "Lawrence, a young man", "mack": "Mack: A poor recaller", "madeline1-hs": "Madeline 1: High School", "madeline2-dorms": "Madeline 2: College Dorms", "madeline3-offcampus": "Madeline 3: Off-Campus", "madeline4-postgrad": "Madeline 4: After College", "mark": "Mark: a young boy", "melissa": "Melissa: a young girl", "melora": "Melora (Melvin's wife)", "melvin": "Melvin (Melora's husband)", "merri": "Merri: an artist", "miami-home": "Miami Home-Lab: Home", "miami-lab": "Miami Home-Lab: Lab", "midwest_teens-f": "Midwest teenagers (F)", "midwest_teens-m": "Midwest teenagers (M)", "nancy": "Nancy: Caring & headstrong", "natural_scientist": "The Natural Scientist", "norman": "Norman: a child molester", "pegasus": "Pegasus: a factory worker", "peru-m": "Peruvian men", "peru-f": "Peruvian women", "phil1": "Phil 1: teens", "phil2": "Phil 2: late 20s", "phil3": "Phil 3: retirement", "physiologist": "The Physiologist", "ringo": "Ringo: from the 1960s", "bosnak": "Robert Bosnak: A dream analyst", "samantha": "Samantha: in her 20s", "seventh_graders": "Seventh grade girls", "zurich-f.de": "Swiss children, auf Deutsch (F)", "zurich-m.de": "Swiss children, auf Deutsch (M)", "toby": "Toby: A friendly party animal", "tom": "Tom: An outgoing man", "ucsc_women": "UCSC women, 1996", "vickie": "Vickie: a 10-year-old girl", "vietnam_vet": "Vietnam Vet: 1970-2008 war dreams", "vietnam_vet2": "Vietnam Vet: 2015 dreams", "wedding": "Wedding dreams", "west_coast_teens": "West Coast teenage girls" }


NUMBER_RE = r'^\#(\S+)'

HEAD_RE = r'^\((.+?)\)(\w)'

def process_dream_span(span):
    text = span.text.encode('utf-8').strip()
    # remove number
    number_groups = re.match(NUMBER_RE,  text)
    number = number_groups.group(1).strip()

    text = re.sub(NUMBER_RE, '', text).strip()

    # remove word count
    text = re.sub(r'\s*\(\d+\s+words\)\s*$', '', text, re.I)

    # Split nb-space as paragraphs
    text = re.sub('\\s*?(\xc2\xa0)+\\s*', '\n', text).strip()

    # sep desc
    head_match = re.match(HEAD_RE, text)
    if head_match is None:
        return OrderedDict([
            ('number', number),
            ('content', text)])

    head = head_match.group(1).strip()
    text = re.sub(HEAD_RE, lambda x: x.group(2), text).strip()

    return OrderedDict([
        ('number', number),
        ('head', head),
        ('content', text)])


def process_dream_page(text):
    soup = BeautifulSoup(text, 'html.parser')
    dreams = soup.find_all("span")

    for dream in soup.find_all("span"):
        data = process_dream_span(dream)
        if data is not None:
            yield data



 def download_dreams(dreamer):
     url = DREAMBANK.format(dreamer)
     r = requests.get(url)
    if r.status_code != 200:
        print("error getting dreams")
        return None
    return r.text


 def collect_dreams(dreamer, desc):
     text = download_dreams(dreamer)
     return OrderedDict([
         ('dreamer', dreamer),
         ('description', desc),
         ('dreams', list(process_dream_page(text)))])


 for dreamer, desc in DREAMERS.items():
     dreams = collect_dreams(dreamer, desc)
     if dreams is not None:
         with open('dreams/' + dreamer + '.json', 'w') as out_file:
             json.dump(dreams, out_file, sort_keys=False, indent=4, ensure_ascii=False)

但是，我遇到错误“ TypeError：无法在类似字节的对象上使用字符串模式”。该代码已经在使用encode（'utf-8'）。关于应尝试解决该错误的任何想法？

错误回溯

<ipython-input-3-49906dd4d65a> in <module>()
     65 
     66 for dreamer, desc in DREAMERS.items():
 --> 67     dreams = collect_dreams(dreamer, desc)
     68     if dreams is not None:
     69         with open('dreams/' + dreamer + '.json', 'w') as out_file:

<ipython-input-3-49906dd4d65a> in collect_dreams(dreamer, desc)
    61         ('dreamer', dreamer),
    62         ('description', desc),
--> 63         ('dreams', list(process_dream_page(text)))])
    64 
    65 

<ipython-input-3-49906dd4d65a> in process_dream_page(text)
    44 
    45     for dream in soup.find_all('span'):
--> 46         data = process_dream_span(dream)
    47         if data is not None:
    48             yield data

<ipython-input-3-49906dd4d65a> in process_dream_span(span)
    13     text = span.text.encode('utf-8').strip()
    14     # remove number
--> 15     number_groups = re.match(NUMBER_RE,  text)
    16     number = number_groups.group(1).strip()
    17 

 ~/anaconda3/lib/python3.6/re.py in match(pattern, string, flags)
    170     """Try to apply the pattern at the start of the string, returning
    171     a match object, or None if no match was found."""
--> 172     return _compile(pattern, flags).match(string)
    173 
    174 def fullmatch(pattern, string, flags=0):

TypeError: cannot use a string pattern on a bytes-like object

DreamScrape错误：TypeError：无法在类似字节的对象上使用字符串模式

0 个答案: