你好,我写了这个蜘蛛(spider)来抓取 start_urls 上的新产品。但是,我不知道如何编写另一个回调来解析新发现的产品页面,并在解析完成后回到正常的 parse 回调,继续抓取新产品。下面是我现在的代码:
def parse(self, response):
    """Parse the product listing page.

    Emits one Request per product (carrying the partially-filled item via
    ``meta``), then re-queues the listing page itself so newly added
    products keep being picked up.
    """
    products = Selector(response).xpath(
        '//div[@class="browsing-product-list"]//figure[contains(@class,"browsing-product-item")]')
    for product in products:
        item = StartItem()
        item['name'] = product.xpath('.//a/figcaption/p[2]/text()').extract()[0]
        item['link'] = product.xpath('.//meta[3]/@content').extract()[0]
        # Follow the product link; the item travels to parseProduct in meta.
        ru = scrapy.Request(url=response.urljoin(item['link']), callback=self.parseProduct)
        ru.meta['item'] = item
        yield ru
    # BUG FIX: re-request the listing page ONCE, after the loop. The
    # original yielded this request inside the loop, scheduling a duplicate
    # crawl of the start URL for every single product on the page.
    yield Request(StartURL, callback=self.parse, dont_filter=True, priority=70)

def parseProduct(self, response):
    """Extract the image URLs on a product page and emit the finished item."""
    item = response.meta['item']
    imageUrls = response.xpath('id("img")/option/text()').extract()
    item['image_urls'] = imageUrls
    yield item
所以,任何帮助都将不胜感激。下面是新的编辑内容:
# BUG FIX: the constant was defined as `Superurl` but referenced below as
# `SuperURL`, which raises NameError at class-definition time.
SuperURL = "https://www.ssense.com/en-us/men/sneakers"

class SuperSpider(Spider):
    """Crawl the sneaker listing, follow each product page, and keep
    re-requesting the listing to pick up newly added products."""

    name = "SuperSpider"
    # NOTE: the original had `allowded_domains = ["randomtester.com"]`.
    # The misspelling meant Scrapy ignored it; fixing only the spelling
    # would make the offsite middleware drop every ssense.com request,
    # so the attribute is removed entirely (as in the working answer).
    start_urls = [SuperURL]

    def __init__(self, *args, **kwargs):
        # BUG FIX: chain to Spider.__init__ so Scrapy's own setup
        # (name/start_urls handling) still runs.
        super().__init__(*args, **kwargs)
        logging.critical("starting superspider.")

    def parse(self, response):
        """Emit one product Request per listing entry, then re-queue the listing."""
        products = Selector(response).xpath('//div[@class="browsing-product-list"]//figure[contains(@class,"browsing-product-item")]')
        for product in products:
            item = SuperItem()
            item['name'] = product.xpath('.//a/figcaption/p[2]/text()').extract()[0]
            item['link'] = product.xpath('.//meta[3]/@content').extract()[0]
            # Follow the product link; the item rides along in meta.
            ru = scrapy.Request(url=response.urljoin(item['link']), callback=self.parseProduct)
            ru.meta['item'] = item
            yield ru
        # Re-crawl the listing once per page, after all products are queued.
        yield Request(SuperURL, callback=self.parse, dont_filter=True, priority=70)

    def parseProduct(self, response):
        """Fill in image URLs from the product page and emit the item."""
        item = response.meta['item']
        imageUrls = response.xpath('id("size")/option/text()').extract()
        item['image_urls'] = imageUrls
        yield item
答案 0(得分:0):
我拿了你的代码,创建了一个无需 Scrapy 项目即可独立运行的脚本。它运行起来没有问题,所以我不清楚你遇到的是什么错误。目前它甚至能下载图片;而你的版本之所以失败,是因为取到了错误的数据。
from scrapy import Spider, Request
from scrapy.selector import Selector
import logging
import json
SuperURL = "https://www.ssense.com/en-us/men/sneakers"

class SuperSpider(Spider):
    """Standalone spider: scrapes the sneaker listing, follows each product
    page, and pulls image URLs out of the page's ``window.INITIAL_STATE``
    JSON blob embedded in a <script> tag."""

    name = "SuperSpider"
    start_urls = [SuperURL]

    def __init__(self, *args, **kwargs):
        # Chain to Spider.__init__ so Scrapy's own setup still runs.
        super().__init__(*args, **kwargs)
        logging.critical("starting superspider.")

    def parse(self, response):
        """Queue one product Request per listing entry, then re-queue the listing."""
        products = Selector(response).xpath('//div[@class="browsing-product-list"]//figure[contains(@class,"browsing-product-item")]')
        for product in products:
            # Plain dict instead of a project Item so the script runs standalone.
            item = {}
            item['name'] = product.xpath('.//a/figcaption/p[2]/text()').extract()[0]
            item['link'] = product.xpath('.//meta[3]/@content').extract()[0]
            ru = Request(url=response.urljoin(item['link']), callback=self.parseProduct)
            ru.meta['item'] = item
            yield ru
        # Re-crawl the listing once per page, after all products are queued.
        yield Request(SuperURL, callback=self.parse, dont_filter=True, priority=70)

    def parseProduct(self, response):
        """Parse the product page's INITIAL_STATE JSON and emit the item."""
        item = response.meta['item']
        marker = 'window.INITIAL_STATE='
        all_scripts = response.xpath('//script/text()').extract()
        for script in all_scripts:
            if marker in script:
                # ROBUSTNESS FIX: slice from the marker's actual position
                # instead of the hard-coded script[21:], which silently
                # produces garbage if the script has any leading content.
                payload = script[script.index(marker) + len(marker):]
                images = json.loads(payload)["products"]["current"]["images"]
                # Substitute the placeholder with concrete CDN transform params.
                item['image_urls'] = [x.replace('__IMAGE_PARAMS__', 'b_white,c_lpad,g_center,h_960,w_960/c_scale,h_680/f_auto,dpr_1.0') for x in images]
                yield item
# --- standalone runner: no Scrapy project needed; items land in `output.csv` ---
from scrapy.crawler import CrawlerProcess

settings = {
    'USER_AGENT': 'Mozilla/5.0',
    # export scraped items to a file (CSV here; JSON and XML also work)
    'FEED_FORMAT': 'csv',
    'FEED_URI': 'output.csv',
    # enable the images pipeline so every `image_urls` entry is downloaded
    # and converted to JPG; requires items shaped like {'image_urls': [...]}
    'ITEM_PIPELINES': {'scrapy.pipelines.images.ImagesPipeline': 1},
    'IMAGES_STORE': '.',
}

process = CrawlerProcess(settings)
process.crawl(SuperSpider)
process.start()  # blocks until the crawl finishes
我是在 Python 3.6.2、Scrapy 1.4.0 和 Linux Mint 18.2 上测试的:
import sys
import scrapy

# Report interpreter and library versions for reproducibility.
for label, value in (('Python:', sys.version), ('Scrapy:', scrapy.__version__)):
    print(label, value)