如何从此站点抓取物品的位置编号？
网站: http://books.toscrape.com/
请查看此屏幕截图。
# -*- coding: utf-8 -*-
import scrapy
class ToscrapeSpider(scrapy.Spider):
    """Scrape book title, price and 1-based list position from books.toscrape.com."""

    name = 'toscrape'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/']

    def parse(self, response):
        """Yield one {'Title', 'Price', 'Position'} dict per book on the page.

        Position is the book's 1-based index within the listing, produced
        by enumerate() — no manual counter needed.
        """
        lists = response.css('li.col-xs-6')
        for position, lis in enumerate(lists, start=1):
            title = lis.xpath('.//h3//@title').extract_first()
            # BUG FIX: './/[@class="price_color"]' is invalid XPath — a node
            # test is required before the predicate. The price sits in a
            # <p class="price_color"> element.
            price = lis.xpath('.//p[@class="price_color"]//text()').extract_first()
            yield {
                'Title': title,
                'Price': price,
                'Position': position,
            }
        # Pagination intentionally left commented out, as in the original question.
        # next = response.xpath('//*[@class="next"]//@href').extract_first()
        # next = response.urljoin(next)
        # if next:
        #     yield scrapy.Request(next)
答案 0 :(得分:0)
尝试使用 enumerate 循环遍历内容，这将解决问题。我记得大概是这样的用法。
答案 1 :(得分:0)
您可以简单地使用类变量来跟踪位置,如下所示:
import scrapy
class ToscrapeSpider(scrapy.Spider):
    """Spider that numbers every scraped book with a counter kept on the spider.

    Because the counter lives on the instance, it keeps growing as
    pagination requests are followed, giving a site-wide position.
    """

    name = 'toscrape'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/']
    # Running counter, shared across every page this spider visits.
    position = 0

    def parse(self, response):
        """Emit title/price/position for each product cell, then follow pagination."""
        for cell in response.css('li.col-xs-6'):
            book_title = cell.xpath('.//h3//@title').extract_first()
            book_price = cell.xpath('.//p[@class="price_color"]//text()').extract_first()
            self.position += 1
            yield {
                'Title': book_title,
                'Price': book_price,
                'Position': self.position,
            }
        # Follow the "next" link until the listing runs out.
        next_href = response.xpath('//li[@class="next"]/a/@href').extract_first()
        next_href = response.urljoin(next_href)
        if next_href:
            yield scrapy.Request(next_href)
然后运行：
scrapy runspider myspider.py -o out.json
生成的 out.json 文件包含：
[
{"Title": "A Light in the Attic", "Price": "\u00a351.77", "Position": 1},
{"Title": "Tipping the Velvet", "Price": "\u00a353.74", "Position": 2},
{"Title": "Soumission", "Price": "\u00a350.10", "Position": 3},
{"Title": "Sharp Objects", "Price": "\u00a347.82", "Position": 4},
{"Title": "Sapiens: A Brief History of Humankind", "Price": "\u00a354.23", "Position": 5},
{"Title": "The Requiem Red", "Price": "\u00a322.65", "Position": 6},
{"Title": "The Dirty Little Secrets of Getting Your Dream Job", "Price": "\u00a333.34", "Position": 7},
{"Title": "The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull", "Price": "\u00a317.93", "Position": 8},
{"Title": "The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics", "Price": "\u00a322.60", "Position": 9},
{"Title": "The Black Maria", "Price": "\u00a352.15", "Position": 10},
{"Title": "Starving Hearts (Triangular Trade Trilogy, #1)", "Price": "\u00a313.99", "Position": 11},
{"Title": "Shakespeare's Sonnets", "Price": "\u00a320.66", "Position": 12},
{"Title": "Set Me Free", "Price": "\u00a317.46", "Position": 13},
{"Title": "Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)", "Price": "\u00a352.29", "Position": 14},
{"Title": "Rip it Up and Start Again", "Price": "\u00a335.02", "Position": 15},
{"Title": "Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991", "Price": "\u00a357.25", "Position": 16},
{"Title": "Olio", "Price": "\u00a323.88", "Position": 17},
{"Title": "Mesaerion: The Best Science Fiction Stories 1800-1849", "Price": "\u00a337.59", "Position": 18},
{"Title": "Libertarianism for Beginners", "Price": "\u00a351.33", "Position": 19},
{"Title": "It's Only the Himalayas", "Price": "\u00a345.17", "Position": 20}
]
答案 2 :(得分:0)
import scrapy
class ToscrapeSpider(scrapy.Spider):
    """Spider that carries a cumulative product count through request meta,
    so each book's position is global across all listing pages."""

    name = 'toscrape'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/']

    def parse(self, response):
        """Yield a detail dict per product; follow pagination with the running total."""
        seen_so_far = response.meta.get('products_count', 0)
        pods = response.xpath('//article[@class="product_pod"]')
        for offset, pod in enumerate(pods):
            image_box = pod.xpath('.//div[@class="image_container"]')
            detail_url = image_box.xpath('.//a/@href').extract_first()
            thumb = image_box.xpath('.//img/@src').extract_first()
            book_name = pod.xpath('.//h3/a/@title').extract_first()
            # The star rating is encoded in the class attribute,
            # e.g. "star-rating Three" -> "Three".
            stars = pod.xpath('.//p[contains(@class, "star-rating")]/@class').extract_first()
            if stars:
                stars = stars.replace('star-rating', '').strip()
            # Availability text is spread over whitespace-heavy text nodes;
            # keep the first non-empty fragment after cleanup.
            raw_stock = pod.xpath('.//p[@class="instock availability"]//text()').extract()
            cleaned = [t.replace('\n', '').replace('\t', '').strip() for t in raw_stock]
            cleaned = [t for t in cleaned if t]
            stock = cleaned[0] if cleaned else cleaned
            yield {
                'position': seen_so_far + offset + 1,
                'name': book_name,
                'availability': stock,
                'price': pod.xpath('.//p[@class="price_color"]/text()').extract_first(),
                'ratings': stars,
                'image': thumb,
                'pdp_url': detail_url,
            }
        next_page = response.xpath('//li[@class="next"]/a/@href').extract_first()
        if next_page:
            # Pass the updated total so the next page continues the numbering.
            yield response.follow(next_page, meta={'products_count': seen_so_far + len(pods)})
答案 3 :(得分:0)
您能检查一下这段代码吗？
如何在下面这段 selenium + scrapy 代码中应用您的方法？
from time import sleep
from scrapy import Spider
from selenium import webdriver
from scrapy.selector import Selector
from scrapy.http import Request
from selenium.common.exceptions import NoSuchElementException
class ToscrapeSpider(Spider):
    """Selenium-driven spider: Chrome walks the listing pages while scrapy
    Requests are scheduled for each book's detail page, carrying the book's
    overall position in request meta.
    """

    name = 'toscrape'
    allowed_domains = ['books.toscrape.com']
    # start_urls unused: all requests are produced manually in start_requests().

    def start_requests(self):
        """Drive Chrome through every listing page, yielding one detail
        Request per book with a cumulative 'position' in meta.

        BUG FIX: the original restarted the position at 1 on every page
        (position = i + 1 per page); a running `seen` counter now makes the
        numbering global, matching the other answers. The page-scraping loop,
        previously duplicated for the first page and the pagination loop, is
        written once.
        """
        self.driver = webdriver.Chrome()
        self.driver.get('http://books.toscrape.com/')
        seen = 0  # books already emitted on previous pages
        while True:
            sel = Selector(text=self.driver.page_source)
            cells = sel.css('li.col-xs-6')
            for i, cell in enumerate(cells, start=1):
                href = cell.xpath('.//h3//a//@href').extract_first()
                url = "http://books.toscrape.com/catalogue/" + href
                yield Request(url, meta={'position': seen + i}, callback=self.parse_page)
            seen += len(cells)
            try:
                next_page = self.driver.find_element_by_xpath('//*[@class="next"]//a')
            except NoSuchElementException:
                # No "next" link on the last page: clean up the browser and stop.
                self.logger.info('No more pages to load.')
                self.driver.quit()
                break
            self.logger.info('Sleeping for 10 seconds.')
            next_page.click()

    def parse_page(self, response):
        """Extract the book title from its detail page and attach the
        position forwarded through request meta."""
        title = response.xpath('//h1//text()').extract_first()
        yield {
            'Title': title,
            'Position': response.meta['position'],
        }