我一直在编写基于 Scrapy 的网络抓取工具,已经有几周了。它们似乎都能按预期工作,我已经成为一名 Scrapy 粉丝。但在过去几天里,我最新的 Scrapy 爬虫拒绝抓取亚马逊网站:我既没有得到任何结果,也没有得到任何错误代码。我甚至尝试过 scrapy shell,它同样没有返回任何结果。我怀疑问题出在 XPath 或 CSS 表达式中,但我无法弄明白。任何帮助都将不胜感激。
这是我的蜘蛛的样子。我的代码并没有打印出 xxxxx(即 parse 回调似乎根本没有被执行)。

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from amazon.items import LowesItem
from amazon.items import SwatchcolorItem
class SattySpider(scrapy.Spider):
    """Spider that crawls an Amazon "bathroom faucets" search listing.

    For each result it builds a ``LowesItem`` (with a per-result swatch-color
    map), follows the product-detail link, and fills in model/spec fields on
    the detail page before yielding the item.
    """

    name = "faucets"
    allowed_domains = ["amazon.com"]
    start_urls = [
        "https://www.amazon.com/s?ie=UTF8&page=1&rh=n%3A228013%2Ck%3Abathroom%20faucets"
    ]

    # NOTE(review): `rules` is only honored by CrawlSpider.  This class
    # subclasses scrapy.Spider, so these rules are silently ignored.  If
    # rule-based link extraction is wanted, inherit from CrawlSpider instead
    # and do NOT override parse() (CrawlSpider uses parse() internally).
    rules = (
        Rule(LinkExtractor(allow=r'amazon\.com/[A-Z][a-zA-Z_/]+$'),
             'parse_category', follow=True,
             ),
    )

    # Spec-table label -> LowesItem field name.  Replaces a long elif chain.
    _SPEC_FIELDS = {
        'Faucet Type': 'faucettype',
        'Number of Faucet Handles': 'numofhandles',
        'ADA Compliant': 'ada',
        'Built-In Water Filter': 'builtinwaterfilter',
        'Mounting Location': 'mountingloc',
        'Color/Finish Family': 'color',
        'Manufacturer Color/Finish': 'manufacturercolor',
        'Collection Name': 'collection',
        'Soap or Lotion Dispenser': 'soapdispenser',
        'Spout Height (Inches)': 'spoutheight',
        'Max Flow Rate': 'maxflowrate',
    }

    def parse(self, response):
        """Parse one search-results page; yield detail-page requests and
        a request for the next results page."""
        print('xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx')
        # NOTE(review): if this never prints, the page was most likely never
        # fetched (robots.txt / anti-bot response), not a selector problem —
        # check the crawl log for 503s or "Forbidden by robots.txt".
        for sel in response.css(
                "li.s-result-item.celwidget.s-hidden-sponsored-item > "
                "div.s-item-container > div > div > a::attr(href)"):
            prod_desc = sel.xpath(
                './/div[@class="s-item-container"]'
                '//div[@class="a-row a-spacing-none"]'
                '//a[@title]').extract()
            print(prod_desc)
            produrls = sel.xpath('.//@data-producturl').extract()
            img_urls = sel.xpath('.//@data-productimg').extract()

            # Collect the swatch colors (index -> alt text) for this result.
            lowi = LowesItem()
            lowi['swatcharray'] = {}
            swatch_sel = sel.xpath(
                './/div[@class="product-container js-product-container"]'
                '//a//div[@class="pvs pvs-options-height v-spacing-small"]'
                '//ul/li')
            for idx, swatch in enumerate(swatch_sel):
                lowi['swatcharray'][idx] = swatch.xpath('.//img//@alt').extract()

            for idx1, img_url in enumerate(img_urls):
                # BUG FIX: the original referenced an undefined name
                # `prod[idx1]` (NameError on the first result); the intended
                # list is `prod_desc`.  Also guard against the parallel lists
                # being shorter than img_urls instead of raising IndexError.
                if idx1 >= len(produrls) or idx1 >= len(prod_desc):
                    break
                url_prod_det = response.urljoin(produrls[idx1])
                yield scrapy.Request(
                    url_prod_det,
                    meta={'lowes': LowesItem(prod=prod_desc[idx1],
                                             swatcharray=lowi['swatcharray'],
                                             file_urls=['http:' + img_url])},
                    callback=self.parse_productdetail)

        # Pagination: follow the "next page" link, re-entering this callback.
        # (renamed from `next`, which shadowed the builtin)
        for next_href in response.css(
                "div.grid-parent.v-spacing-extra-large > nav > ul > "
                "li.page-next > a::attr(href)"):
            url_next = response.urljoin(next_href.extract())
            print(" url_next : " + url_next)
            yield scrapy.Request(url_next, callback=self.parse)

    def parse_productdetail(self, response):
        """Fill model and spec-table fields into the LowesItem carried in
        ``response.meta['lowes']`` and yield it (once per model block)."""
        print('Testing....')
        for model in response.xpath(
                '//div[@class="pd-numbers grid-50 tablet-grid-100"]'
                '//p[@class="secondary-text small-type"]'):
            modelname = model.xpath('./text()').extract()
            lowesItem = response.meta['lowes']
            # NOTE(review): assumes the model number is the second text node
            # of the <p>; verify against the live page markup.
            lowesItem['model'] = modelname[1]
            lowesItem['category'] = 'default'
            lowesItem['subcategory'] = 'default'
            lowesItem['vendor'] = 'Lowes'
            for namevals in response.xpath(
                    '//div[@id="collapseSpecs"]//div[@class="panel-body"]'
                    '//div[@class="grid-100 grid-parent"]//div[@class="grid-50"]'
                    '//table[@class="table full-width no-borders"]//tbody//tr'):
                names = namevals.xpath('.//th/text()').extract()
                vals = namevals.xpath('.//td//span/text()').extract()
                # Map the row's <th> label onto the matching item field via
                # the lookup table; skip rows with no value cell instead of
                # raising IndexError.
                for label in names:
                    field = self._SPEC_FIELDS.get(label)
                    if field and vals:
                        lowesItem[field] = vals[0]
                        break
            yield lowesItem