简介
我需要让我的爬虫抓取某些产品链接页面上的“其他人也购买了”商品。这个页面的结构让我很困惑:其中有 class 为 "open-on-mobile" 和 "inner generated" 之类的 div,它们分别代表什么?
目标
除了“其他人也购买了”这一部分以外,我已经抓取到了所需的全部重要信息。尝试了数小时之后,我决定先在这里提问,以免浪费更多时间、变得更加沮丧。
HTML构建
我的代码
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import DuifcsvItem
import csv
class DuifSpider(scrapy.Spider):
    """Scrape product details from duif.nl pages listed in a CSV file.

    The start URLs are read from the ``Link`` column of
    ``duifonlylinks.csv`` while the class body is executed, so that file
    must exist in the working directory when the module is imported.
    """

    name = "duif"
    allowed_domains = ['duif.nl']
    # Scrapy's setting for the exported column order is FEED_EXPORT_FIELDS.
    # The previous key ("FIELD_EXPORT_FIELDS") is not a Scrapy setting, so
    # the intended column order was silently ignored.
    custom_settings = {'FEED_EXPORT_FIELDS': ['SKU', 'Title', 'Title_small', 'NL_PL_PC', 'Description']}
    # newline="" is what the csv module documentation recommends for files
    # handed to csv readers.
    with open("duifonlylinks.csv", "r", newline="") as f:
        reader = csv.DictReader(f)
        start_urls = [items['Link'] for items in reader]
    # NOTE(review): `rules` is a CrawlSpider feature; this class derives from
    # scrapy.Spider, so it appears to have no effect here. Confirm before
    # removing it or switching the base class.
    rules = (
        Rule(LinkExtractor(), callback='parse'),
    )

    def parse(self, response):
        """Build one ``DuifcsvItem`` from a product page.

        Args:
            response: The Scrapy response for one of ``start_urls``.

        Yields:
            A ``DuifcsvItem`` with the link, SKU, titles, the ``NL_PL_PC``
            list and the description paragraphs found on the page.
        """
        card = response.xpath('//div[@class="heading"]')
        if not card:
            print('No productlink', response.url)
        items = DuifcsvItem()
        items['Link'] = response.url
        # .get() returns None when the node is missing; only strip real text
        # so a page without a SKU no longer raises AttributeError.
        sku = response.xpath('//p[@class="desc"]/text()').get()
        items['SKU'] = sku.strip() if sku is not None else None
        items['Title'] = response.xpath('//h1[@class="product-title"]/text()').get()
        items['Title_small'] = response.xpath('//div[@class="left"]/p/text()').get()
        items['NL_PL_PC'] = response.xpath('//div[@class="desc"]/ul/li/em/text()').getall()
        items['Description'] = response.xpath('//div[@class="item"]/p/text()').getall()
        yield items
实际网页:https://www.duif.nl/product/pot-seal-matt-finish-light-pink-large
如果可以使用xpath访问此href,那么效果会很好
我已经尝试过的XPATH
>>> response.xpath('//div[@class="title"]/h3/text()').get()
>>> response.xpath('//div[@class="inner generated"]/div//h3/text()').get()
>>> response.xpath('//div[@class="wrap-products"]/div/div/a/@href').get()
>>> response.xpath('/div[@class="description"]/div/h3/text()').get()
>>> response.xpath('//div[@class="open-on-mobile"]/div/div/div/a/@href').get()
>>> response.xpath('//div[@class="product cross-square white"]/a/@href').get()
>>> response.xpath('//a[@class="product-link"]').get()
>>> response.xpath('//a[@class="product-link"]').getall()
答案 0 :(得分:1)
您可以在 HTML 的以下部分中找到“其他人也购买了”商品的 ID(请参见其中的 createCrossSellItems 调用):
<script>
$(function () {
createUpsellItems("885034747 | 885034800 | 885034900 |")
createCrossSellItems("885034347 | 480010600 | 480010700 | 010046700 | 500061967 | 480011000 |")
})
</script>
但是,把所有这些商品的详细信息添加到主商品中会有些棘手。首先,您需要确定如何保存这种一对多的信息:可以使用单个字段(例如 OtherAlsoBought),在其中保存类似 JSON 的结构;也可以使用多个字段,例如 OtherAlsoBought_Product_1_Title、OtherAlsoBought_Product_1_Link、OtherAlsoBought_Product_2_Title、OtherAlsoBought_Product_2_Link 等。
收集这些详细信息的一种可行方法是:先把所有商品 ID 保存到一个数组中,然后每次只 yield 一个 ID 对应的请求(对 https://www.duif.nl/api/v2/catalog/product?itemcode=885034347_Parent 发送简单的 GET 请求,并带上 Referer 标头,应该就可以正常工作),同时通过 meta 或 cb_kwargs 传递剩余的商品 ID 数组,以便取出下一个 ID。当然,您还需要在每个请求中传递主 item(把当前商品的详细信息添加进去,并在最后 yield 全部内容)。
更新:您需要在以下代码中补充自己所需的字段:
import scrapy
import json
import re
class DuifSpider(scrapy.Spider):
    """Scrape a duif.nl product page plus its "Others also bought" products.

    The cross-sell product IDs are taken from the inline
    ``createCrossSellItems("...")`` script call on the product page. They are
    then requested one at a time from the catalog API. The main item and the
    list of remaining IDs travel along in ``meta``, and the main item is
    yielded once no IDs are left.
    """

    name = "duif"
    start_urls = ['https://www.duif.nl/product/pot-seal-matt-finish-light-pink-large']

    def parse(self, response):
        """Start the main item and kick off the cross-sell request chain.

        Args:
            response: The Scrapy response for a product page.

        Yields:
            The request for the first cross-sell product, or the main item
            itself when the page lists no cross-sell IDs.
        """
        item = {}
        item['title'] = response.xpath('//h1[@class="product-title"]/text()').get()
        item['url'] = response.url
        item['cross_sell'] = []
        # default='' keeps re.findall from receiving None (TypeError) when the
        # page has no createCrossSellItems("...") call or its argument is empty.
        cross_sell_items_raw = response.xpath(
            '//script[contains(., "createCrossSellItems(")]/text()'
        ).re_first(r'createCrossSellItems\("([^"]+)', default='')
        cross_sell_items = re.findall(r"\d+", cross_sell_items_raw)
        yield self._next_request_or_item(item, cross_sell_items, response.url)

    def parse_cross_sell(self, response):
        """Add one cross-sell product to the main item and continue the chain.

        Args:
            response: The catalog API response; ``response.meta`` carries the
                ``item``, ``referer`` and ``cross_sell_items`` keys.

        Yields:
            The request for the next cross-sell product, or the finished main
            item when no IDs are left.
        """
        main_item = response.meta["item"]
        cross_sell_items = response.meta["cross_sell_items"]
        try:
            product = json.loads(response.text)["_embedded"]["products"][0]
            current_cross_sell_item = {
                'title': product["name"],
                'url': product["url"],
                'description': product["description"],
            }
        except (ValueError, KeyError, IndexError, TypeError):
            # A malformed or empty payload for one product should not discard
            # the main item: skip this product and keep going.
            self.logger.warning("Unexpected cross-sell payload from %s", response.url)
        else:
            main_item['cross_sell'].append(current_cross_sell_item)
        yield self._next_request_or_item(main_item, cross_sell_items, response.meta['referer'])

    def _cross_sell_failed(self, failure):
        """Continue the chain when a cross-sell request itself fails."""
        meta = failure.request.meta
        self.logger.warning("Cross-sell request failed: %s", failure.request.url)
        yield self._next_request_or_item(meta["item"], meta["cross_sell_items"], meta["referer"])

    def _next_request_or_item(self, main_item, cross_sell_items, referer):
        """Return the request for the next cross-sell ID, or the finished item.

        Pops the first ID from ``cross_sell_items`` (mutating the list) and
        builds the API request for it. Returns ``main_item`` when the list is
        empty.
        """
        if not cross_sell_items:
            # No more cross-sell items to process, save output.
            return main_item
        cross_sell_item_id = cross_sell_items.pop(0)
        return scrapy.Request(
            f"https://www.duif.nl/api/v2/catalog/product?itemcode={cross_sell_item_id}_Parent",
            headers={
                'referer': referer,
                'Content-type': 'application/json',
                'Authorization': 'bearer null',
                'Accept': '*/*',
            },
            callback=self.parse_cross_sell,
            errback=self._cross_sell_failed,
            meta={
                'item': main_item,
                'referer': referer,
                'cross_sell_items': cross_sell_items,
            },
        )