I finally managed to get a working script, but there is one small problem: I can crawl every page and extract all the information I need, except for the first page.
Where is my mistake?
import scrapy.selector
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from Prijsvergelijking.items import PrijsvergelijkingItem


class MySpider(CrawlSpider):
    name = "coolblue"
    allowed_domains = ["tvstore.be"]
    start_urls = ["http://www.tvstore.be/category/192945/televisies.html"]

    rules = (
        Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="pagination next secondary"]',)),
             callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        products = response.xpath("//li[@class='product-list-columns--item product-list-item']")
        for product in products:
            item = PrijsvergelijkingItem()
            item["Product_ref"] = product.xpath(".//h2/a/text()").extract_first().strip()
            item["Product_price"] = product.xpath(".//strong[1]/text()").extract_first().strip().replace(",", ".").replace("-", "")
            yield item
Answer 0 (score: 0)
I wasn't looking hard enough.
I found the answer: all I had to do was rename parse_items to parse_start_url. CrawlSpider never passes the responses for start_urls to a rule's callback; it hands them to parse_start_url, which does nothing by default, so the first page was silently skipped. With the callback named parse_start_url, the same method handles both the start URL and the pages found by the pagination rule.
from scrapy.spiders import CrawlSpider, Rule
import scrapy.selector
from scrapy.linkextractors import LinkExtractor
from Prijsvergelijking.items import PrijsvergelijkingItem


class MySpider(CrawlSpider):
    name = "msh"
    allowed_domains = ["mediamarkt.be"]
    start_urls = ["http://www.mediamarkt.be/mcs/productlist/_TV,98952,452540.html?langId=-17&searchParams=&sort=&view=&page=1"]

    rules = (
        Rule(LinkExtractor(allow=(), restrict_xpaths=('//li[@class="pagination-next"]',)),
             callback="parse_start_url", follow=True),
    )

    # parse_start_url is also called for the responses from start_urls,
    # so the first page is no longer skipped.
    def parse_start_url(self, response):
        products = response.xpath("//ul[@class='products-list']/li/div")
        for product in products:
            item = PrijsvergelijkingItem()
            item["Product_price"] = product.xpath('.//aside/div/div/div/text()').extract_first().replace(",", ".").replace("-", "")
            item["Product_ref"] = product.xpath('.//div/h2/a/text()').extract_first().strip()
            yield item
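
An alternative, if you prefer to keep a descriptive callback name: leave the rule's callback as parse_items and override parse_start_url to delegate to it. This is a minimal sketch, not from the original post, reusing the parse_items method from the question:

    def parse_start_url(self, response):
        # CrawlSpider routes the responses for start_urls here instead of to
        # the rule callbacks, and the default implementation yields nothing.
        # Delegating to parse_items makes the first page yield items as well.
        return self.parse_items(response)

Either way, you can verify that the first page is now included with, for example, scrapy crawl msh -o products.csv.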