I'm having a hard time getting my scraper to work. I took the initial example code from @alecxe here (selenium with scrapy for dynamic page) and managed to get some results, but although the scraper appears to run (we can watch it simulate clicks on the Next button), it closes after a second without printing or collecting any items.
Here is the code:
from scrapy.spider import BaseSpider
from selenium import webdriver

class product_spiderItem(scrapy.Item):
    title = scrapy.Field()
    price = scrapy.Field()
    pass

class ProductSpider(BaseSpider):
    name = "product_spider"
    allowed_domains = ['ebay.com']
    start_urls = ['http://www.ebay.com/sch/i.html?_odkw=books&_osacat=0&_trksid=p2045573.m570.l1313.TR0.TRC0.Xpython&_nkw=python&_sacat=0&_from=R40']

    def __init__(self):
        self.driver = webdriver.Firefox()

    def parse(self, response):
        self.driver.get(response.url)

        while True:
            next = self.driver.find_element_by_xpath('//td[@class="pagn-next"]/a')
            try:
                next.click()

                # get the data and write it to scrapy items
                response = TextResponse(url=response.url, body=self.driver.page_source, encoding='utf-8')
                print response.url
                for prod in response.xpath('//ul[@id="GalleryViewInner"]/li/div/div'):
                    item = product_spiderItem()
                    item['title'] = prod.xpath('.//div[@class="gvtitle"]/h3/a/text()').extract()[0]
                    item['price'] = prid.xpath('.//div[@class="prices"]/span[@class="bold"]/text()').extract()[0]
                    print item['price']
                    yield item
            except:
                break

        self.driver.close()
I'm running scrapy crawl product_scraper -o products.json to store the results. What am I missing?

Answer 0 (score: 2)

While trying to figure out what was wrong with your code, I made some edits and came up with the following (tested) code, which should get you closer to your goal:
import scrapy
from selenium import webdriver

class product_spiderItem(scrapy.Item):
    title = scrapy.Field()
    price = scrapy.Field()
    pass

class ProductSpider(scrapy.Spider):
    name = "product_spider"
    allowed_domains = ['ebay.com']
    start_urls = ['http://www.ebay.com/sch/i.html?_odkw=books&_osacat=0&_trksid=p2045573.m570.l1313.TR0.TRC0.Xpython&_nkw=python&_sacat=0&_from=R40']

    def __init__(self):
        self.driver = webdriver.Firefox()

    def parse(self, response):
        self.driver.get(response.url)

        while True:
            sel = scrapy.Selector(text=self.driver.page_source)
            for prod in sel.xpath('//ul[@id="GalleryViewInner"]/li/div/div'):
                item = product_spiderItem()
                item['title'] = prod.xpath('.//div[@class="gvtitle"]/h3/a/text()').extract()
                item['price'] = prod.xpath('.//div[@class="prices"]//span[@class=" bold"]/text()').extract()
                yield item

            next = self.driver.find_element_by_xpath('//td[@class="pagn-next"]/a')
            try:
                next.click()
            except:
                break

    def closed(self, reason):
        self.driver.close()
Try this code and see if it works better. The main edits: scrapy is now imported at the top, and the undefined prid name is gone. Each page is parsed with scrapy.Selector(text=self.driver.page_source) instead of the TextResponse that was never imported, and the parsing happens before clicking Next, so the first results page is no longer skipped. Only the next.click() call sits inside the try/except, so the loop breaks cleanly once there is no next page. Finally, the driver is shut down in the spider's closed() callback, which Scrapy invokes when the spider finishes.
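If pages still come back empty, one more thing to check: next.click() returns before the new results have rendered, so self.driver.page_source may still hold the previous page when it is read. Below is a minimal sketch of an explicit wait to place at the top of the while loop; the 10-second timeout and the use of the GalleryViewInner id as the readiness signal are assumptions for illustration, not part of the tested code above.

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Inside parse(), before reading self.driver.page_source: poll until the
# results gallery is present in the DOM. Times out after 10 seconds (an
# arbitrary choice), raising TimeoutException if it never appears.
WebDriverWait(self.driver, 10).until(
    EC.presence_of_element_located((By.ID, 'GalleryViewInner'))
)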