我正在尝试抓取多个页面的数据。我已经写好了一个可以从单个页面抓取数据的爬虫,但是它在抓取完第一页之后就突然结束了。
包含 parse 函数和 scrape 函数的完整文件 Scraper.py:
# -*- coding: utf-8 -*-
import scrapy
import csv
import os
from scrapy.selector import Selector
from scrapy import Request
class Proddduct(scrapy.Item):
    """Container for one product listing scraped from a results page.

    Fields:
        price: list of text nodes taken from the product's price element.
        description: list of text nodes taken from the product's <strong> tags.
        link: first href found inside the product block, or None.
        content: declared but not populated by the spider code in this file.
        phonenumber: value filled in by the spider's ``get_information``
            callback.
    """

    price = scrapy.Field()
    description = scrapy.Field()
    link = scrapy.Field()
    content = scrapy.Field()
    # The spider's get_information callback assigns item['phonenumber'];
    # scrapy.Item raises KeyError when an undeclared field is assigned, so the
    # field has to be declared here.
    phonenumber = scrapy.Field()
class LapadaScraperSpider(scrapy.Spider):
    """Crawl the paginated search results and yield one item per product.

    ``parse`` emits the products found on each results page and then follows
    the "next" pagination link until a page no longer has one.
    """

    name = 'lapada_scraper2'
    # allowed_domains expects bare domain names, not URLs. An entry that
    # carries a scheme ("http://...") never matches the host of a request, so
    # the offsite filter drops the next-page request and the crawl ends after
    # the first page.
    allowed_domains = ['lapada.org']
    start_urls = ['https://lapada.org/art-and-antiques/?search=antique']

    def parse(self, response):
        """Yield the items on this results page, then request the next page.

        Args:
            response: the response for one search-results page.

        Yields:
            Every item produced by ``scrape`` for this page, followed by at
            most one ``scrapy.Request`` for the next page (handled by this
            same method).
        """
        for item in self.scrape(response):
            yield item

        next_page_url = response.xpath("//ul/li[@class='next']//a/@href").get()
        if next_page_url:
            # The href may be relative; Request needs an absolute URL.
            next_page_url = response.urljoin(next_page_url)
            self.logger.info("Found url: %s", next_page_url)
            yield scrapy.Request(url=next_page_url, callback=self.parse)

    def scrape(self, response):
        """Extract one ``Proddduct`` per product block on the page.

        Args:
            response: the response for one search-results page.

        Yields:
            ``Proddduct`` items with ``description`` and ``price`` set to the
            lists of matching text nodes and ``link`` set to the first href
            inside the block (None when the block has no link).
        """
        # The XPath expressions are the same for every product, so they are
        # defined once instead of on each loop iteration.
        XPATH_PRODUCT_DESCRIPTION = ".//strong/text()"
        XPATH_PRODUCT_PRICE = ".//div[@class='price']/text()"
        XPATH_PRODUCT_LINK = ".//a/@href"

        # The response exposes .xpath() directly; wrapping it in a Selector
        # is unnecessary.
        for product in response.xpath("//div[@class='content']"):
            item = Proddduct()
            item['description'] = product.xpath(XPATH_PRODUCT_DESCRIPTION).extract()
            item['price'] = product.xpath(XPATH_PRODUCT_PRICE).extract()
            item['link'] = product.xpath(XPATH_PRODUCT_LINK).extract_first()
            yield item

    def get_information(self, response):
        """Attach a phone number to the item carried in ``response.meta``.

        No request in this class uses this method as its callback. It reads
        the item from ``response.meta['item']``, so the request that triggers
        it must supply that meta key.

        Args:
            response: a response whose ``meta`` contains an ``'item'`` entry.

        Yields:
            The same item with ``phonenumber`` set.
        """
        item = response.meta['item']
        # Hard-coded value (presumably a placeholder). The item class must
        # declare a `phonenumber` field: scrapy.Item raises KeyError when an
        # undeclared field is assigned.
        item['phonenumber'] = "12345"
        yield item
如何抓取所有页面上的全部条目?
谢谢
答案 0 :(得分:2)
// NOTE(review): this snippet configures a CKEditor instance and looks unrelated
// to the Scrapy question above; confirm it belongs in this answer.
// Registers a style set named 'myStylesComboBox' containing two <span> styles
// that differ only in the CSS class they apply ('box1' / 'box2').
将 CKEDITOR.stylesSet.add('myStylesComboBox',[{
name: 'my span style',
element: 'span',
attributes: {
'class': 'box1',
}
},
{
name: 'my span2 style',
element: 'span',
attributes: {
'class': 'box2'
}
}]);
// CSS rules for the two classes above; passed to the editor as `contentsCss`
// below. The rules are identical except for the background colour.
var ContentsCss = [
'span.box1{padding:10px;border-radius:8px;background-color:#6950ab;color: #ffffff!important;display:inline-block}',
'span.box2{padding:10px;border-radius:8px;background-color:#6770ab;color: #ffffff!important;display:inline-block}'];
// Replaces the element identified by 'editor1' with an editor that uses the
// style set registered above and the CSS rules in ContentsCss.
CKEDITOR.replace( 'editor1',{
stylesSet: 'myStylesComboBox',
contentsCss: ContentsCss
} );
更改为 allowed_domains = ['lapada.org'],即去掉原来 ['http://www.lapada.org'] 中的协议前缀——allowed_domains 只接受域名,不接受完整的 URL,否则后续页面的请求会被当作站外请求过滤掉。