import scrapy
from scrapy import Request

# Run with: scrapy crawl jobs9 -o jobs9.csv -t csv
class JobsSpider(scrapy.Spider):
    name = "jobs9"
    allowed_domains = ["vapedonia.com"]
    start_urls = ["https://www.vapedonia.com/7-principiantes-kit-s-de-inicio-",
                  "https://www.vapedonia.com/10-cigarrillos-electronicos-",
                  "https://www.vapedonia.com/11-mods-potencia-",
                  "https://www.vapedonia.com/12-consumibles",
                  "https://www.vapedonia.com/13-baterias",
                  "https://www.vapedonia.com/23-e-liquidos",
                  "https://www.vapedonia.com/26-accesorios",
                  "https://www.vapedonia.com/31-atomizadores-reparables",
                  "https://www.vapedonia.com/175-alquimia-",
                  "https://www.vapedonia.com/284-articulos-en-liquidacion"]

    def parse(self, response):
        # One node per product card on the category page.
        products = response.xpath('//div[@class="product-container clearfix"]')
        for product in products:
            image = product.xpath('div[@class="center_block"]/a/img/@src').extract_first()
            link = product.xpath('div[@class="center_block"]/a/@href').extract_first()
            name = product.xpath('div[@class="right_block"]/p/a/text()').extract_first()
            price = product.xpath('div[@class="right_block"]/div[@class="content_price"]'
                                  '/span[@class="price"]/text()').extract_first().encode("utf-8")
            yield {'Image': image, 'Link': link, 'Name': name, 'Price': price}

        # Follow the "next" pagination link, if there is one, and parse it the same way.
        relative_next_url = response.xpath('//*[@id="pagination_next"]/a/@href').extract_first()
        if relative_next_url:
            absolute_next_url = "https://www.vapedonia.com" + relative_next_url
            yield Request(absolute_next_url, callback=self.parse)
With this code, I scrape the products of each page and of its subpages correctly; all the pages get crawled.
But if I want to scrape the whole site, I have to put the category URLs in "start_urls" by hand. The good thing would be to crawl those URLs themselves, so that the crawl becomes dynamic.
How can I mix crawling with scraping beyond simple pagination crawling?
Thanks.
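(For illustration, one way to get this dynamic crawl while keeping a plain scrapy.Spider is to start from the home page, extract the category links there, and route each one to the existing product-parsing logic. This is only a sketch: the spider name and the menu XPath are assumptions about the site's markup, not something taken from the question.)

import scrapy
from scrapy import Request

class CategoriesSpider(scrapy.Spider):
    name = "jobs_dynamic"  # hypothetical name
    allowed_domains = ["vapedonia.com"]
    start_urls = ["https://www.vapedonia.com/"]

    def parse(self, response):
        # Hypothetical selector for the category menu; adjust to the real markup.
        for href in response.xpath('//ul[@class="sf-menu"]//a/@href').extract():
            yield Request(response.urljoin(href), callback=self.parse_category)

    def parse_category(self, response):
        # Same product extraction and pagination as the original spider.
        for product in response.xpath('//div[@class="product-container clearfix"]'):
            yield {
                'Image': product.xpath('div[@class="center_block"]/a/img/@src').extract_first(),
                'Link': product.xpath('div[@class="center_block"]/a/@href').extract_first(),
                'Name': product.xpath('div[@class="right_block"]/p/a/text()').extract_first(),
                'Price': product.xpath('div[@class="right_block"]/div[@class="content_price"]'
                                       '/span[@class="price"]/text()').extract_first(),
            }
        next_href = response.xpath('//*[@id="pagination_next"]/a/@href').extract_first()
        if next_href:
            yield Request(response.urljoin(next_href), callback=self.parse_category)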
Now I have improved my code. Here is the new code:
import scrapy
from scrapy import Request
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

# Run with: scrapy crawl jobs10 -o jobs10.csv -t csv
class JobsSpider(scrapy.spiders.CrawlSpider):
    name = "jobs10"
    allowed_domains = ["vapedonia.com"]
    start_urls = ["https://www.vapedonia.com/"]

    rules = (Rule(LinkExtractor(allow=(r"https://www.vapedonia.com/\d+.*",)),
                  callback='parse_category'),)

    def parse_category(self, response):
        products = response.xpath('//div[@class="product-container clearfix"]')
        for product in products:
            image = product.xpath('div[@class="center_block"]/a/img/@src').extract_first()
            link = product.xpath('div[@class="center_block"]/a/@href').extract_first()
            name = product.xpath('div[@class="right_block"]/p/a/text()').extract_first()
            price = product.xpath('div[@class="right_block"]/div[@class="content_price"]'
                                  '/span[@class="price"]/text()').extract_first().encode("utf-8")
            yield {'Image': image, 'Link': link, 'Name': name, 'Price': price}
The changes I made are the following:
1- I import CrawlSpider, Rule and LinkExtractor:
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
2- The JobsSpider class no longer inherits from "scrapy.Spider". It now inherits from scrapy.spiders.CrawlSpider (imported in the previous step).
3- "start_urls" is no longer a static list of URLs; we just take the domain, so
start_urls = ["https://www.vapedonia.com/7-principiantes-kit-s-de-inicio-",
"https://www.vapedonia.com/10-cigarrillos-electronicos-",
"https://www.vapedonia.com/11-mods-potencia-",
"https://www.vapedonia.com/12-consumibles",
"https://www.vapedonia.com/13-baterias",
"https://www.vapedonia.com/23-e-liquidos",
"https://www.vapedonia.com/26-accesorios",
"https://www.vapedonia.com/31-atomizadores-reparables",
"https://www.vapedonia.com/175-alquimia-",
"https://www.vapedonia.com/284-articulos-en-liquidacion"]
is replaced by
start_urls = ["https://www.vapedonia.com/"]
4- We set up the rule:
rules = (Rule(LinkExtractor(allow=(r"https://www.vapedonia.com/\d+.*",)), callback='parse_category'), )
The callback is no longer called "parse" but "parse_category": CrawlSpider uses the parse method internally to implement its rule logic, so a rule callback must have a different name.
5- The previous pagination crawling disappears, so the following code goes away:
relative_next_url = response.xpath('//*[@id="pagination_next"]/a/@href').extract_first()
absolute_next_url = "https://www.vapedonia.com" + str(relative_next_url)
yield Request(absolute_next_url, callback=self.parse)
So, as I see it, and it seems quite logical, the pagination-crawling process is replaced by the URL-crawling process.
But... it doesn't work; even the "Price" field that worked with encode("utf-8") no longer works.
Answer 0 (score: 1)
In this case you need to use a CrawlSpider with rules. Below is a simple translation of your scraper:
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class JobsSpider(CrawlSpider):
    name = "jobs9"
    allowed_domains = ["vapedonia.com"]
    start_urls = ["https://www.vapedonia.com"]

    rules = (Rule(LinkExtractor(allow=(r"https://www.vapedonia.com/\d+.*",)),
                  callback='parse_category'),)

    def parse_category(self, response):
        products = response.xpath('//div[@class="product-container clearfix"]')
        for product in products:
            image = product.xpath('div[@class="center_block"]/a/img/@src').extract_first()
            link = product.xpath('div[@class="center_block"]/a/@href').extract_first()
            name = product.xpath('div[@class="right_block"]/p/a/text()').extract_first()
            price = product.xpath('div[@class="right_block"]/div[@class="content_price"]'
                                  '/span[@class="price"]/text()').extract_first().encode("utf-8")
            yield {'Image': image, 'Link': link, 'Name': name, 'Price': price}
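(Two hedged additions to that translation, not part of the original answer. First, when a Rule has a callback, follow defaults to False, so links found on the category pages themselves, such as pagination links, are never followed; passing follow=True keeps the crawl going, assuming the paginated URLs also match the allow pattern. Second, extract_first() returns None whenever an XPath matches nothing, and None.encode("utf-8") raises AttributeError, which is one plausible reason the Price field broke. A guarded sketch; the spider name is hypothetical:)

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class GuardedJobsSpider(CrawlSpider):
    name = "jobs9_guarded"  # hypothetical name
    allowed_domains = ["vapedonia.com"]
    start_urls = ["https://www.vapedonia.com"]

    # follow=True: keep extracting links from matched pages so that
    # paginated category pages are reached as well.
    rules = (Rule(LinkExtractor(allow=(r"https://www.vapedonia.com/\d+.*",)),
                  callback='parse_category', follow=True),)

    def parse_category(self, response):
        for product in response.xpath('//div[@class="product-container clearfix"]'):
            # default='' prevents .encode() from failing on None when a
            # product card has no price node.
            price = product.xpath('div[@class="right_block"]/div[@class="content_price"]'
                                  '/span[@class="price"]/text()').extract_first(default='')
            yield {
                'Image': product.xpath('div[@class="center_block"]/a/img/@src').extract_first(),
                'Link': product.xpath('div[@class="center_block"]/a/@href').extract_first(),
                'Name': product.xpath('div[@class="right_block"]/p/a/text()').extract_first(),
                'Price': price.strip().encode("utf-8") if price else None,
            }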
Check out the different spiders available in the Scrapy documentation.
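(A quick way to verify what links the LinkExtractor rule actually picks up is scrapy shell; a sketch, launched as shown in the first comment line:)

# scrapy shell "https://www.vapedonia.com/"
from scrapy.linkextractors import LinkExtractor

le = LinkExtractor(allow=(r"https://www.vapedonia.com/\d+.*",))
# 'response' is provided by the shell for the fetched page.
for link in le.extract_links(response)[:10]:
    print(link.url)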