我正在尝试为我的 items.py 文件中定义的教授统计信息抓取RateMyProfessors:
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
from scrapy.item import Item, Field
class ScraperItem(Item):
# define the fields for your item here like:
numOfPages = Field() # number of pages of professors (usually 476)
firstMiddleName = Field() # first (and middle) name
lastName = Field() # last name
numOfRatings = Field() # number of ratings
overallQuality = Field() # numerical rating
averageGrade = Field() # letter grade
profile = Field() # url of professor profile
pass
这是我的 scraper_spider.py 文件:
import scrapy
from scraper.items import ScraperItem
from scrapy.contrib.spiders import Rule
from scrapy.contrib.linkextractors import LinkExtractor
class scraperSpider(scrapy.Spider):
name = "scraper"
allowed_domains = ["www.ratemyprofessors.com"]
start_urls = [
"http://www.ratemyprofessors.com/search.jsp?queryBy=teacherName&schoolName=pennsylvania+state+university"
]
rules = (
Rule(LinkExtractor(restrict_xpaths=('//a[@class="nextLink"]')),callback='parse',follow=True),
)
def parse(self, response):
# professors = []
numOfPages = int(response.xpath('((//a[@class="step"])[last()])/text()').extract()[0])
# create array of profile links
profiles = response.xpath('//li[@class="listing PROFESSOR"]/a/@href').extract()
# for each of those links
for profile in profiles:
# define item
professor = ScraperItem();
# add profile to professor
professor["profile"] = profile
# pass each page to the parse_profile() method
request = scrapy.Request("http://www.ratemyprofessors.com"+profile,
callback=self.parse_profile)
request.meta["professor"] = professor
# add professor to array of professors
yield request
def parse_profile(self, response):
professor = response.meta["professor"]
if response.xpath('//*[@class="pfname"]'):
# scrape each item from the link that was passed as an argument and add to current professor
professor["firstMiddleName"] = response.xpath('//h1[@class="profname"]/span[@class="pfname"][1]/text()').extract()
if response.xpath('//*[@class="plname"]'):
professor["lastName"] = response.xpath('//h1[@class="profname"]/span[@class="plname"]/text()').extract()
if response.xpath('//*[@class="table-toggle rating-count active"]'):
professor["numOfRatings"] = response.xpath('//div[@class="table-toggle rating-count active"]/text()').extract()
if response.xpath('//*[@class="grade"]'):
professor["overallQuality"] = response.xpath('//div[@class="breakdown-wrapper"]/div[@class="breakdown-header"][1]/div[@class="grade"]/text()').extract()
if response.xpath('//*[@class="grade"]'):
professor["averageGrade"] = response.xpath('//div[@class="breakdown-wrapper"]/div[@class="breakdown-header"][2]/div[@class="grade"]/text()').extract()
return professor
# add string to rule. linkextractor only gets "/showratings.." not "ratemyprofessors.com/showratings"
我的问题在于上面的 scraper_spider.py 文件。蜘蛛应该转到this RateMyProfessors页面并转到每个教授并获取信息,然后返回目录并获取下一个教授的信息。在页面上没有剩下的教授要抓,它应该找到下一个按钮的 href值并转到该页面并按照相同的方法。
我的刮刀能够刮掉目录第1页上的所有教授,但之后会停止,因为它不会进入下一页。
你能帮助我的刮刀成功找到并转到下一页吗?
我尝试关注this StackOverflow问题,但它太具体而无法使用。