我刚接触 Scrapy 和 Python,所以不确定问题出在哪里。我想做的基本上是遍历每一个页面,并保存每个页面上的标题。
下面是无法正常工作的代码。第一页可以正确抓取,但其余各页只输出空标题。
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from scrapy.http import Request
from metacritic.items import MetacriticItem
class MetacriticSpider(BaseSpider):
    """Spider that collects game titles from Metacritic's PS3 browse pages.

    One request is issued per page index in ``range(max_id)``; every
    response is handed to :meth:`parse`.
    """

    name = "metacritic"
    allowed_domains = ["metacritic.com"]
    # Number of listing pages to request (page indices 0 .. max_id - 1).
    max_id = 5
    # NOTE(review): start_requests() below builds its own URLs and never
    # reads this list, so it appears to be unused -- confirm before removing.
    start_urls = [
        "http://www.metacritic.com/browse/games/title/ps3?page="
        #"http://www.metacritic.com/browse/games/title/xbox360?page=0"
    ]

    def start_requests(self):
        """Yield one ``Request`` per page index, each routed to ``parse``."""
        page_url = 'http://www.metacritic.com/browse/games/title/ps3?page=%d'
        for page_no in range(self.max_id):
            yield Request(page_url % page_no, callback=self.parse)

    def parse(self, response):
        """Build one item for every ``//ol/li/div/div`` node in the response.

        The ``title`` field receives the whole list returned by
        ``extract()`` for the node's ``a/text()``, which may be empty.

        Returns:
            A list of ``MetacriticItem`` objects, one per matched node.
        """
        selector = Selector(response)
        collected = []
        for node in selector.xpath('//ol/li/div/div'):
            entry = MetacriticItem()
            entry['title'] = node.xpath('a/text()').extract()
            collected.append(entry)
        return collected
答案 0 :(得分:2)
我认为网址 'http://www.metacritic.com/browse/games/title/ps3?page=%d' % i
是错误的。
请尝试打开网址 http://www.metacritic.com/browse/games/title/ps3?page=1,
您会看到如下提示:"未找到结果"。
正确的网址似乎是 'http://www.metacritic.com/browse/games/title/ps3/%c?page=%d' % (c, i),
其中 c 是一个小写字母(示例:ex1、ex2)。
因此我把你的代码修改如下,你可以试试这段代码:
from string import ascii_lowercase
from string import lowercase

from scrapy.http import Request
from scrapy.selector import Selector
from scrapy.spider import BaseSpider

from metacritic.items import MetacriticItem
class MetacriticSpider(BaseSpider):
    """Spider that collects PS3 game titles from Metacritic's per-letter listing.

    A request is generated for every combination of a lowercase ASCII
    letter and a page index below ``max_id``; each response is handed to
    :meth:`parse`.
    """

    name = "metacritic"
    allowed_domains = ["metacritic.com"]
    # Number of pages requested per letter (page indices 0 .. max_id - 1).
    max_id = 5

    def start_requests(self):
        """Yield one ``Request`` per (letter, page index) pair."""
        url_template = (
            'http://www.metacritic.com/browse/games/title/ps3/{0}?page={1}'
        )
        # ascii_lowercase is always exactly 'a'..'z'. string.lowercase is
        # locale-dependent on Python 2 and does not exist on Python 3, so it
        # could put unexpected characters into the URL path.
        for letter in ascii_lowercase:
            for page_no in range(self.max_id):
                yield Request(url_template.format(letter, page_no),
                              callback=self.parse)

    def parse(self, response):
        """Extract the title from each product node that has link text.

        Nodes matched by ``//div[@class="product_wrap"]/div`` whose
        ``a/text()`` yields nothing are skipped, so no empty titles are
        emitted.

        Returns:
            A list of ``MetacriticItem`` objects whose ``title`` is the
            first extracted text with surrounding whitespace stripped.
        """
        selector = Selector(response)
        items = []
        for node in selector.xpath('//div[@class="product_wrap"]/div'):
            titles = node.xpath('a/text()').extract()
            if not titles:
                # Wrapper div without a link text: nothing to record.
                continue
            item = MetacriticItem()
            item['title'] = titles[0].strip()
            items.append(item)
        return items