我写了一些 Scrapy 爬虫,但这个让我很头疼。我正在抓取一家水龙头公司(faucet company)的网站。布局非常简单:列表页面上的缩略图会带你进入详情页面,我为每个条目下载了大量信息,但很难获得下一页的 URL。我试了好几次,遇到的问题是:我的爬虫无法在 a 标签的超链接上找到 href 属性。它能找到其他属性,如 data-bind,但找不到 href。这看起来很直接,因此让我很沮丧。
在下面的代码中,爬虫将 amazonlist 打印为:
[<Selector xpath='//div[@id="product-list"]//div[@id="displayedProducts"]//div[@data-bind="attr: { id: \'prd\' + ModelName().replace(\'-\', \'\'), \'data-sortorder\': SortOrder, \'class\': \'product \' + ModelName().replace(\'-\', \'\') }"]//div[@class="finishes"]//div[@data-bind="attr: { \'class\': \'finishSwatch \' + FacetCode() }"]' data=u'<div data-bind="attr: { \'class\': \'finish'>]
而 pages 打印为 []
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from homedepot.items import LowesItem
from homedepot.items import SwatchcolorItem
class SattySpider(scrapy.Spider):
    """Crawl pfisterfaucets.com kitchen-faucet listings and product detail pages.

    NOTE(review): ``rules`` is only honoured by ``CrawlSpider``; on a plain
    ``scrapy.Spider`` subclass it is silently ignored, so the LinkExtractor
    below never fires. Kept unchanged for backward compatibility, but either
    switch the base class to ``CrawlSpider`` (and rename ``parse``, which
    CrawlSpider reserves) or delete ``rules``.
    """

    name = "satty-pfister"
    allowed_domains = ["pfisterfaucets.com"]
    start_urls = [
        "http://www.pfisterfaucets.com/kitchen/category/kitchen-faucets",
    ]
    # Ignored unless the base class is CrawlSpider -- see class docstring.
    rules = (
        Rule(LinkExtractor(allow=r'pfisterfaucets\.com/[A-Z][a-zA-Z_/]+$'),
             'parse_category', follow=True),
    )

    def parse(self, response):
        """Parse one category listing page.

        Yields a Request per product swatch (``parse_productdetail``) and a
        Request per pagination link (recursively back into ``parse``).

        NOTE(review): the ``data-bind`` attributes matched below are
        Knockout.js templates.  The ``href``/``class`` values they describe
        are generated client-side by JavaScript, so the raw HTML Scrapy
        downloads contains the ``data-bind`` attribute but NOT the rendered
        ``href`` -- which is why ``.//a//@href`` comes back empty here.
        Rendering (e.g. scrapy-splash) or the site's JSON/API endpoint is
        needed to get those links.
        """
        swatches = response.xpath('//div[@id="product-list"]//div[@id="displayedProducts"]//div[@data-bind="attr: { id: \'prd\' + ModelName().replace(\'-\', \'\'), \'data-sortorder\': SortOrder, \'class\': \'product \' + ModelName().replace(\'-\', \'\') }"]//div[@class="finishes"]//div[@data-bind="attr: { \'class\': \'finishSwatch \' + FacetCode() }"]')
        self.logger.debug('swatches: %r', swatches)

        # Empty on the raw HTML -- see NOTE above about Knockout data-bind.
        detail_urls = swatches.xpath('.//a//@href').extract()
        self.logger.debug('detail urls: %r', detail_urls)
        for detail_url in detail_urls:
            # urljoin() handles both relative and absolute hrefs.
            yield scrapy.Request(response.urljoin(detail_url),
                                 callback=self.parse_productdetail)

        # Follow pagination links back into this same callback.
        page_links = response.xpath(
            '//div[@class="float-right pagination-pages"]//div//a//@href').extract()
        for page_link in page_links:
            yield scrapy.Request(response.urljoin(page_link), callback=self.parse)

    def parse_productdetail(self, response):
        """Parse a product detail page; yield one LowesItem per tab image."""
        for img in response.xpath(
                '//div[contains(@class, "tabs-navigation")]'
                '//ul[@class="tabs"]//li//img'):
            # BUG FIX: <img> elements have no href attribute -- the original
            # ``.//@href`` always extracted [].  The image URL lives in @src.
            # urljoin() makes the URLs absolute, as the files pipeline needs.
            img_urls = [response.urljoin(u) for u in img.xpath('.//@src').extract()]
            names = img.xpath('.//@alt').extract()
            self.logger.debug('image urls: %r', img_urls)
            # BUG FIX: file_urls expects a flat list of URL strings; the
            # original passed [imgurl], a list nested inside a list.
            yield LowesItem(prod=names, file_urls=img_urls)