我在尝试抓取 IMDb 时遇到了几个问题，在这里没有找到现成的答案。
我尝试使用以下代码，从 http://www.imdb.com/search/title?release_date=1950&page=1 这样的网页中抓取一些数据：
import scrapy
from tutorial.items import MovieItem, CastItem
class tutorialSpider(scrapy.Spider):
    """Crawl IMDb title-search result pages and follow each listed movie.

    For every entry on a search-result page a ``MovieItem`` is filled with
    the fields visible in the list (title, rating, ranking, release date,
    detail-page URL) and a follow-up request to the movie's main page is
    issued with the partially filled item attached in ``request.meta``.
    """

    name = "tutorial"
    allowed_domains = ["imdb.com"]
    # Highest list ranking that is still followed; entries ranked above this
    # value end the processing of the current result page.
    wanted_num = 50

    # generate start_urls dynamically
    def start_requests(self):
        """Yield one request per (year, page) search-result URL."""
        for year in range(1950, 1951):
            for page in range(1, 3):
                yield scrapy.Request('http://www.imdb.com/search/title?release_date=%s&page=%s' % (year, page))

    @staticmethod
    def _parse_rank(text):
        """Return the ranking text as an int, or None if it is not numeric.

        Surrounding whitespace, a trailing "." and thousands separators are
        tolerated, so both "7" and "1,234." are accepted.
        """
        if text is None:
            return None
        cleaned = text.strip().rstrip('.').replace(',', '')
        try:
            return int(cleaned)
        except ValueError:
            return None

    def parse(self, response):
        """Extract list entries from a search-result page.

        Yields one ``scrapy.Request`` per entry, targeting the movie's main
        page, with the partially filled ``MovieItem`` stored under
        ``request.meta['item']``. Stops at the first entry whose ranking
        exceeds ``wanted_num``.

        Fields whose node is absent from an entry (for example the rating of
        an unrated title) are stored as ``None`` instead of aborting the
        whole page.
        """
        for sel in response.xpath("//*[contains(@class,'lister-item-content')]"):
            ranking_text = sel.xpath('h3/span[@class="lister-item-index unbold text-primary"]/text()').extract_first()
            rank = self._parse_rank(ranking_text)
            # Check the cutoff before building the item and request so that
            # no work is done for entries that will be discarded anyway.
            if rank is not None and rank > self.wanted_num:
                return

            title = sel.xpath('h3/a/text()').extract_first()
            href = sel.xpath('h3/a/@href').extract_first()
            if title is None or href is None:
                # Without a link there is no detail page to follow.
                self.logger.warning("Skipping list entry without title link on %s", response.url)
                continue

            item = MovieItem()
            item['Title'] = title
            item['Rating'] = sel.xpath('div[@class="ratings-bar"]/div[@name="ir"]/strong/text()').extract_first()
            item['Ranking'] = ranking_text
            item['ReleaseDate'] = sel.xpath('h3/span[@class="lister-item-year text-muted unbold"]/text()').extract_first()
            item['MianPageUrl'] = "http://imdb.com" + href

            # NOTE(review): parseMovieDetails is not defined in the visible
            # part of this class - confirm it exists on the spider.
            request = scrapy.Request(item['MianPageUrl'], callback=self.parseMovieDetails)
            request.meta['item'] = item
            yield request
所以,这里的问题是:
感谢您的帮助!