I want to use Scrapy to crawl a website that lists my product categories. I'm new to Scrapy and have only spent today getting my head around it, but while I've got the gist of simple scrapes, I'm now trying to scrape the URLs and follow them for further scraping, and it seems I'm missing something.

Someone answered with a fix for my code; here is the latest version after another day of learning Scrapy, but it still doesn't scan recursively. It seems to just traverse all the pages without ever getting into parsing the items.

It never seems to enter the else statement and run:

yield scrapy.Request(url=response.url, callback=self.parse_item)

I can debug it and confirm that the item parses correctly if I force it to output an item without the loop.

If I change the following:
if product_pages:
    for product_url in product_pages:
        product_url2 = str(self.base_url + product_url)
        self.log("Queued up: %s" % product_url2)
        yield scrapy.Request(url=product_url2, callback=self.parse_product_pages)
else:
    yield scrapy.Request(url=response.url, callback=self.parse_item)
to:
if product_pages:
    for product_url in product_pages:
        product_url2 = str(self.base_url + product_url)
        self.log("Queued up: %s" % product_url2)
        yield scrapy.Request(url=product_url2, callback=self.parse_item)
else:
    yield scrapy.Request(url=response.url, callback=self.parse_product_pages)
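Worth knowing here: Scrapy's scheduler runs every request through a duplicate filter, and response.url has by definition already been fetched, so a yield scrapy.Request(url=response.url, ...) back to the current page is silently dropped by the default RFPDupeFilter. A minimal sketch of the two usual workarounds; the spider name and XPath below are just placeholders borrowed from the code in this question:

import scrapy


class SketchSpider(scrapy.Spider):
    # Hypothetical minimal spider, only to illustrate the two workarounds.
    name = 'sketch'
    start_urls = ['http://www.ybracing.com/karting/']

    def parse(self, response):
        product_pages = response.xpath("//li/div/div/h3/a/@href").extract()
        if product_pages:
            for product_url in product_pages:
                yield scrapy.Request(url=response.urljoin(product_url),
                                     callback=self.parse)
        else:
            # Workaround 1: the page is already downloaded, so hand the
            # existing response to parse_item directly -- no new request.
            for item in self.parse_item(response):
                yield item
            # Workaround 2 (alternative): re-request the URL but tell the
            # scheduler to skip the duplicate filter.
            # yield scrapy.Request(url=response.url,
            #                      callback=self.parse_item,
            #                      dont_filter=True)

    def parse_item(self, response):
        yield {'url': response.url}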
Here is my code (I'm using Python 2.7):
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
from ybscrape.items import Product
from scrapy.linkextractors import LinkExtractor
from scrapy.linkextractors.sgml import SgmlLinkExtractor


class ybracingSpider(CrawlSpider):
    name = 'ybscrape2'
    download_delay = 0.75

    def __init__(self, *args, **kwargs):
        super(ybracingSpider, self).__init__(*args, **kwargs)
        self.allowed_domains = ['http://www.ybracing.com/', 'www.ybracing.com', 'www.esellepro.com']
        self.base_url = 'http://www.ybracing.com'
        self.start_urls = ['http://www.ybracing.com/karting/']

    def parse_start_url(self, response):
        category = response.xpath("//h2/a/@href").extract()
        # loop over the category pages: take each product link and append the show-all-pages query
        for product in category:
            all_pages = '?itemsperpage=99999'
            category_url = str(self.base_url + product + all_pages)
            self.log("Queued up: %s" % category_url)
            yield scrapy.Request(url=category_url, callback=self.parse_product_pages)

    def parse_product_pages(self, response):
        product_pages = response.xpath("//li/div/div/h3/a/@href").extract()
        #print("debug pause")
        #print(product_pages)
        #wait = input("PRESS ENTER TO CONTINUE.")
        #print("continue")
        if product_pages:
            for product_url in product_pages:
                product_url2 = str(self.base_url + product_url)
                self.log("Queued up: %s" % product_url2)
                yield scrapy.Request(url=product_url2, callback=self.parse_product_pages)
        else:
            yield scrapy.Request(url=response.url, callback=self.parse_item)

    def parse_item(self, response):
        item = Product()
        item['description'] = response.xpath("//div[@id='Tabbed-Container-Details']/div[2]/div/text()").extract()
        item['product_title'] = response.xpath("//h3[@class='Product-Heading']/text()").extract()
        item['price'] = response.xpath("//div[@id='Product-Price']/text()").extract()
        # extracted but currently unused
        table_rows = response.xpath("//table[@id='SpecificationTab']/tr[*]/td[1]//text()").extract()
        yield item
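One aside on the spider above: allowed_domains expects bare hostnames, so entries carrying a scheme or trailing slash such as 'http://www.ybracing.com/' will not match anything when the offsite middleware checks request hosts against the list. A corrected sketch:

from scrapy.spiders import CrawlSpider


class ybracingSpider(CrawlSpider):
    name = 'ybscrape2'
    # Hostnames only: no scheme, no path. The offsite middleware also
    # matches subdomains, so 'ybracing.com' covers www.ybracing.com.
    allowed_domains = ['ybracing.com', 'esellepro.com']
    start_urls = ['http://www.ybracing.com/karting/']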
My items.py:
from scrapy.item import Item, Field


class Product(Item):
    product_title = Field()
    description = Field()
    price = Field()
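Note that .extract() returns a list of strings, so each field above will be stored as a list. If scalar values are wanted, a small helper along these lines (hypothetical, not part of the original post) can normalise them:

def first_text(response, xpath_expr):
    # Return the first matching text node, stripped, or None if no match.
    values = response.xpath(xpath_expr).extract()
    return values[0].strip() if values else None

# e.g. inside parse_item:
#   item['product_title'] = first_text(response, "//h3[@class='Product-Heading']/text()")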
What I'm expecting my code to do, step by step:
Answer 0 (score: 0)
I've made some changes to your code here, and it's working now:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
from demo.items import DemoItem
from scrapy.linkextractors import LinkExtractor
from scrapy.linkextractors.sgml import SgmlLinkExtractor


class DemoSpider(CrawlSpider):
    name = 'ybracing2'

    def __init__(self, *args, **kwargs):
        super(DemoSpider, self).__init__(*args, **kwargs)
        self.allowed_domains = ['http://www.ybracing.com/', 'www.ybracing.com', 'www.esellepro.com']
        self.base_url = 'http://www.ybracing.com'
        self.start_urls = ['http://www.ybracing.com/racewear/']

    def parse_start_url(self, response):
        category = response.xpath("//h2/a/@href").extract()
        # loop over the category pages: take each product link and append the show-all-pages query
        for product in category:
            all_pages = '?itemsperpage=99999'
            category_url = str(self.base_url + product + all_pages)
            self.log("Queued up: %s" % category_url)
            yield scrapy.Request(url=category_url, callback=self.parse_product_pages)

    def parse_product_pages(self, response):
        product_pages = response.xpath("//div[@class='Product']/a/@href").extract()
        if product_pages:
            for product_url in product_pages:
                product_url2 = str(self.base_url + product_url)
                self.log("Queued up: %s" % product_url2)
                yield scrapy.Request(url=product_url2, callback=self.parse_item)
        else:
            yield scrapy.Request(url=response.url, callback=self.parse_product_pages)

    def parse_item(self, response):
        item = DemoItem()
        dirty_data = {}
        item['product_title'] = response.xpath("//h3[@class='Product-Heading']/text()").extract()
        item['price'] = response.xpath("//div[@id='Product-Price']/text()").extract()
        item['description'] = response.xpath("//div[@id='Tabbed-Container-Details']/div[2]/div/text()").extract()
        #image['product_image'] =
        #for variable in dirty_data.keys():
        #    if dirty_data[variable]:
        #        if variable == 'price':
        #            item[variable] = float(''.join(dirty_data[variable]).strip().replace('$', '').replace(',', ''))
        #        else:
        #            item[variable] = ''.join(dirty_data[variable]).strip()
        yield item
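For completeness, the commented-out cleanup could work along these lines once dirty_data is populated with the raw .extract() lists; this is only a sketch and assumes prices come back as strings like '$1,299.00':

def clean_item(dirty_data, item):
    # Join text fragments, strip whitespace, and coerce the price to float.
    for key, fragments in dirty_data.items():
        if not fragments:
            continue
        text = ''.join(fragments).strip()
        if key == 'price':
            item[key] = float(text.replace('$', '').replace(',', ''))
        else:
            item[key] = text
    return item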