Python BeautifulSoup web crawling: appending data to a list

Date: 2015-06-26 01:08:37

Tags: python web-scraping beautifulsoup web-crawler html-parsing

The site I am trying to scrape is http://www.boxofficemojo.com/yearly/chart/?yr=2013&p=.htm. The specific page I am focusing on right now is http://www.boxofficemojo.com/movies/?id=catchingfire.htm.

I need to get the "Foreign" gross amount (under "Total Lifetime Grosses"), but for some reason I cannot get it for all the movies through a loop, even though it works for a single link that I type in.

This is the function that gets this amount for each movie:

def getForeign(item_url):
    s = urlopen(item_url).read()
    soup = BeautifulSoup(s)
    return soup.find(text="Foreign:").find_parent("td").find_next_sibling("td").get_text(strip = True)

This is the function that loops over each link:

def spider(max_pages):
    page = 1
    while page <= max_pages:
        url = 'http://www.boxofficemojo.com/yearly/chart/?page=' + str(page) + '&view=releasedate&view2=domestic&yr=2013&p=.htm'
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text)
        for link in soup.select('td > b > font > a[href^=/movies/?]'):
            href = 'http://www.boxofficemojo.com' + link.get('href')
            details(href)
            listOfDirectors.append(getDirectors(href))
            str(listOfDirectors).replace('[','').replace(']','')
            #getActors(href)
            title = link.string
            listOfTitles.append(title)
        page += 1

I have a list called listOfForeign = [] that I want to append each movie's foreign gross amount to. The problem is that if I call getForeign(item_url) with a single full link that I type in, for example:

listOfForeign.append(getForeign("http://www.boxofficemojo.com/movies/?id=catchingfire.htm"))

and then run:

print listOfForeign

it prints out one correct amount.

But when I run the spider(max_pages) function, adding:

listOfForeign.append(getForeign(href)) 

inside the for loop and later try to print listOfForeign, I get the error:

AttributeError: 'NoneType' object has no attribute 'find_parent'

Why can't I successfully append this amount for each movie inside the spider function? In the spider(max_pages) function I get each individual movie link in the variable "href", and I am essentially doing the same thing as appending each link separately.

Full code:

import requests
from bs4 import BeautifulSoup
from urllib import urlopen
import xlwt
import csv
from tempfile import TemporaryFile

listOfTitles = []
listOfGenre = []
listOfRuntime = []
listOfRatings = []
listOfBudget = []
listOfDirectors = []
listOfActors = []
listOfForeign = []
resultFile = open("movies.csv",'wb')
wr = csv.writer(resultFile, dialect='excel')

def spider(max_pages):
    page = 1
    while page <= max_pages:
        url = 'http://www.boxofficemojo.com/yearly/chart/?page=' + str(page) + '&view=releasedate&view2=domestic&yr=2013&p=.htm'
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text)
        for link in soup.select('td > b > font > a[href^=/movies/?]'):
            href = 'http://www.boxofficemojo.com' + link.get('href')
            details(href)
            listOfForeign.append(getForeign(href))
            listOfDirectors.append(getDirectors(href))
            str(listOfDirectors).replace('[','').replace(']','')
            #getActors(href)
            title = link.string
            listOfTitles.append(title)
        page += 1


def getDirectors(item_url):
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text)
    tempDirector = []
    for director in soup.select('td > font > a[href^=/people/chart/?view=Director]'):
        tempDirector.append(str(director.string))
    return tempDirector

def getActors(item_url):
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text)
    tempActors = []
    print soup.find(text="Actors:").find_parent("tr").text[7:]



def details(href):
    response = requests.get(href)
    soup = BeautifulSoup(response.content)
    genre = soup.find(text="Genre: ").next_sibling.text
    rating = soup.find(text='MPAA Rating: ').next_sibling.text
    runtime = soup.find(text='Runtime: ').next_sibling.text
    budget = soup.find(text='Production Budget: ').next_sibling.text

    listOfGenre.append(genre)
    listOfRuntime.append(runtime)
    listOfRatings.append(rating)
    listOfBudget.append(budget)


def getForeign(item_url):
    s = urlopen(item_url).read()
    soup = BeautifulSoup(s)
    try:
        return soup.find(text="Foreign:").find_parent("td").find_next_sibling("td").get_text(strip = True)
    except AttributeError:
        return "$0"

spider(1)

print listOfForeign
wr.writerow(listOfTitles)
wr.writerow(listOfGenre)
wr.writerow(listOfRuntime)
wr.writerow(listOfRatings)
wr.writerow(listOfBudget)
for item in listOfDirectors:
    wr.writerow(item)

1 Answer:

Answer 0 (score: 2)

The code fails as soon as it encounters a movie page that has no foreign income, the movie "42", for example. You should handle such cases, for instance by catching the exception and setting the amount to $0.
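For context on why this surfaces as an AttributeError: find() returns None when the text is not present on the page, and None has no find_parent() method. A minimal standalone sketch (the markup below is made up, not taken from the real page):

from bs4 import BeautifulSoup

# Made-up markup for a movie page that has no "Foreign:" row.
html = "<table><tr><td>Domestic:</td><td>$1,000,000</td></tr></table>"
soup = BeautifulSoup(html, "html.parser")

print(soup.find(text="Foreign:"))  # prints None
# soup.find(text="Foreign:").find_parent("td")  # AttributeError: 'NoneType' object has no attribute 'find_parent'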

You are also running into differences between parsers: specify the lxml or html5lib parser explicitly (you would need lxml or html5lib installed).
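Both are third-party packages; a one-line install (assuming pip is available):

pip install lxml html5lib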

Also, why not use requests to fetch the movie page as well:

def getForeign(item_url):
    response = requests.get(item_url)
    soup = BeautifulSoup(response.content, "lxml")  # or BeautifulSoup(response.content, "html5lib")
    try:
        return soup.find(text="Foreign:").find_parent("td").find_next_sibling("td").get_text(strip = True)
    except AttributeError:
        return "$0"

As a side note, the code you have is overall getting quite complicated and slow because of the blocking nature of the script: the requests are sent sequentially, one at a time. It may be a good idea to switch to the Scrapy web-scraping framework; besides making the code faster, it would help organize it into logical groups: you would have a spider with the crawling logic inside, Item classes defining your extracted data model, and pipelines for writing the extracted data to a database, if needed, and so on. A rough sketch of what that could look like is below.
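A minimal sketch of a Scrapy version (the spider name, selectors, and field names here are illustrative and untested against the live page; it assumes a reasonably recent Scrapy release with response.follow() and get()/getall()):

import scrapy


class BoxOfficeMojoSpider(scrapy.Spider):
    name = "boxofficemojo"
    start_urls = [
        "http://www.boxofficemojo.com/yearly/chart/?page=1&view=releasedate&view2=domestic&yr=2013&p=.htm",
    ]

    def parse(self, response):
        # Follow every movie link on the yearly chart page.
        for href in response.css('td > b > font > a[href^="/movies/?"]::attr(href)').getall():
            yield response.follow(href, callback=self.parse_movie)

    def parse_movie(self, response):
        # Take the cell right after the "Foreign:" label; fall back to $0
        # when the row is missing (movies with no foreign gross).
        foreign = response.xpath(
            '//td[normalize-space(.)="Foreign:"]/following-sibling::td[1]//text()'
        ).get()
        yield {
            "title": response.css("title::text").get(),
            "foreign": (foreign or "$0").strip(),
        }

Running it with scrapy runspider spider.py -o movies.csv would write the scraped items straight to CSV, replacing the manual csv.writer bookkeeping.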