我想抓取整个产品类别,但爬虫似乎只能正常运行到某一页,之后就停止了。
这是我的代码:
import scrapy
from Demo.items import DemoItem
class ProductSpider(scrapy.Spider):
    """Scrape manufacturer and part number from a paginated search listing.

    Starts at the first results page of the category given in
    ``start_urls`` and keeps following the "Next" link for as long as the
    page contains one.
    """

    name = 'black1'
    start_urls = ['https://octopart.com/search?category_ids=4215&start=0']

    def parse(self, response):
        """Yield one ``DemoItem`` per product card, then the next-page request.

        Args:
            response: The downloaded search-results page.

        Yields:
            ``DemoItem`` objects with ``product_name`` and ``product_code``
            set, followed by at most one ``scrapy.Request`` for the next
            results page.
        """
        cards = response.xpath(
            "//div[@class='serp-card-header media']/div[@class='media-body']"
        )
        for product in cards:
            name = product.xpath(
                ".//a/span[@class='part-card-manufacturer']/text()"
            ).extract()
            ver = product.xpath(
                ".//a/span[@class='part-card-mpn']/text()"
            ).extract()

            # Build a fresh item for every product. Re-using one instance
            # across yields means every consumer that keeps a reference
            # sees its fields overwritten by the following product.
            item = DemoItem()
            item['product_name'] = ''.join(name).strip()
            item['product_code'] = ''.join(ver).strip()
            yield item

        next_page = response.xpath(
            "//a[contains(text(), 'Next')]/@href"
        ).extract_first()
        if next_page is None:
            self.logger.info('No "Next" link on %s; pagination ends here.',
                             response.url)
            return

        next_page_link = response.urljoin(next_page)
        # Use the spider logger rather than the Python-2-only print
        # statement so the URL shows up in the normal crawl log.
        self.logger.debug('Following next page: %s', next_page_link)
        # NOTE(review): a non-2xx answer to this request (e.g. 403) is
        # presumably filtered out before reaching this callback under
        # Scrapy's default settings, which silently ends the crawl --
        # confirm against the project's HTTPERROR_* settings.
        yield scrapy.Request(url=next_page_link, callback=self.parse)
结果:
https://octopart.com/search?category_ids=4215&start=200
2019-03-06 13:51:46 [scrapy.core.engine] DEBUG: Crawled (403) <GET https://octopart.com/search?category_ids=4215&start=200> (referer: https://octopart.com/search?category_ids=4215&start=190)
2019-03-06 13:51:46 [scrapy.spidermiddlewares.httperror] INFO: Ignoring response <403 https://octopart.com/search?category_ids=4215&start=200>: HTTP status code is not handled or not allowed