Below is the code I use to scrape product information. There are many products on each page; I scrape all of them and then move on to the next page. The problem is that Scrapy only picks up the first product on the page instead of iterating over all of the products on the page. Where am I going wrong?
import re
import time
import sys
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from scrapy.http import Request
import parsedatetime
from datetime import datetime
from airline_sentiment.items import *
from airline_sentiment.spiders.crawlerhelper import *
class TripAdvisorRestaurantBaseSpider(BaseSpider):
name = "shoebuy"
allowed_domains = ["shoebuy.com"]
base_uri = "http://www.shoebuy.com"
start_urls = [
base_uri + "/womens-leather-boots/category_2493?cm_sp=cat-_-d_womensboots_tiles_b1_leather-_-092216"
]
def parse(self, response):
sel = Selector(response)
snode_airline = sel.xpath('//*[starts-with(@class, "pt_grid")]/div[starts-with(@class, "pt_product\")]')
for snode_restaurant in snode_airline:
tripadvisor_item = AirlineSentimentItem()
tripadvisor_item['url'] = self.base_uri + clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "pt_info")]/a/@href'))
tripadvisor_item['name'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "pt_info")]/a/span[@class="pt_title"]/text()'))
tripadvisor_item['price'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "pt_prices")]/span[@class="pt_price"]/text()'))
tripadvisor_item['discount'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "pt_prices")]/div[@class="pt_discount"]/span[@class="pt_percent_off"]/text()'))
tripadvisor_item['orig_price'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "pt_prices")]/div[@class="pt_discount"]/span[@class="pt_price_orig"]/text()'))
tripadvisor_item['stars'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//*[@class="bv-rating-ratio"]/span/span[3]/text()'))
tripadvisor_item['reviews'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "bv-inline-rating-container")]/dl/dd[2]/span/text()'))
yield Request(url=tripadvisor_item['url'], meta={'tripadvisor_item': tripadvisor_item}, callback=self.parse_fetch_review)
next_page_url = clean_parsed_string(get_parsed_string(sel, '//div[@class="paging"]/a[@class="next"]/@href'))
if next_page_url and len(next_page_url) > 0:
yield Request(url=self.base_uri + next_page_url, meta={'tripadvisor_item': tripadvisor_item}, callback=self.parse_next_page)
def parse_next_page(self, response):
sel = Selector(response)
snode_airline = sel.xpath('//*[starts-with(@class, "pt_grid")]/div[starts-with(@class, "pt_product")]')
for snode_restaurant in snode_airline:
tripadvisor_item = AirlineSentimentItem()
tripadvisor_item['url'] = self.base_uri + clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "pt_info")]/a/@href'))
tripadvisor_item['name'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "pt_info")]/a/span[@class="pt_title"]/text()'))
tripadvisor_item['price'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "pt_prices")]/span[@class="pt_price"]/text()'))
tripadvisor_item['discount'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "pt_prices")]/div[@class="pt_discount"]/span[@class="pt_percent_off"]/text()'))
tripadvisor_item['orig_price'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "pt_prices")]/div[@class="pt_discount"]/span[@class="pt_price_orig"]/text()'))
tripadvisor_item['stars'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//*[@class="bv-rating-ratio"]/span/span[3]/text()'))
tripadvisor_item['reviews'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "bv-inline-rating-container")]/dl/dd[2]/span/text()'))
yield Request(url=tripadvisor_item['url'], meta={'tripadvisor_item': tripadvisor_item}, callback=self.parse_fetch_review)
next_page_url = clean_parsed_string(get_parsed_string(sel, '//div[@class="paging"]/a[@class="next"]/@href'))
if next_page_url and len(next_page_url) > 0:
yield Request(url=self.base_uri + next_page_url, meta={'tripadvisor_item': tripadvisor_item}, callback=self.parse_next_page)
def parse_fetch_review(self, response):
tripadvisor_item = response.meta['tripadvisor_item']
sel = Selector(response)
snode_reviews = sel.xpath('//*[starts-with(@class, "product_info_wrapper")]')
for snode_review in snode_reviews:
tripadvisor_item['img'] = self.base_uri + clean_parsed_string(get_parsed_string(snode_review, '//div[starts-with(@class,"large_thumb")]/img/@src'))
tripadvisor_item['desc'] = clean_parsed_string(get_parsed_string(snode_review, '//*[starts-with(@class,"product_information")]/div[1]/span/text()'))
tripadvisor_item['brand'] = clean_parsed_string(get_parsed_string(snode_review, '//div[starts-with(@class,"seo_module")]/h3/text()'))
yield tripadvisor_item
Answer 0 (score: 0)
This is the offending line:
tripadvisor_item['url'] = self.base_uri + clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "pt_info")]/a/@href'))
The XPath should start with a dot (as in .//div) so that it is evaluated relative to the current node:

.//div[starts-with(@class, "pt_info")]/a/@href

Because your XPaths are not relative to the node you are iterating over (they are missing the '.' notation), every item gets the first product link on the page as its url. Scrapy also filters duplicate urls automatically, so all the later review requests are dropped and you end up with just the first item.

Tl;dr: just add a '.' in front of //div in the XPaths inside the loops.
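As a minimal sketch (reusing the question's AirlineSentimentItem, get_parsed_string and clean_parsed_string helpers, and showing only a few of the fields), the parse loop with node-relative XPaths would look roughly like this:

    def parse(self, response):
        sel = Selector(response)
        # Select every product tile on the page.
        products = sel.xpath('//*[starts-with(@class, "pt_grid")]/div[starts-with(@class, "pt_product")]')

        for product in products:
            item = AirlineSentimentItem()
            # The leading '.' anchors each query to the current product node
            # instead of restarting the search from the document root.
            item['url'] = self.base_uri + clean_parsed_string(
                get_parsed_string(product, './/div[starts-with(@class, "pt_info")]/a/@href'))
            item['name'] = clean_parsed_string(
                get_parsed_string(product, './/div[starts-with(@class, "pt_info")]/a/span[@class="pt_title"]/text()'))
            item['price'] = clean_parsed_string(
                get_parsed_string(product, './/div[starts-with(@class, "pt_prices")]/span[@class="pt_price"]/text()'))
            yield Request(url=item['url'], meta={'tripadvisor_item': item}, callback=self.parse_fetch_review)

The same change applies to the XPaths inside the loops of parse_next_page and parse_fetch_review.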