我写了一只蜘蛛,但每当我运行这只蜘蛛时,我都会收到这个错误:
Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/twisted/internet/base.py", line 824, in runUntilCurrent
call.func(*call.args, **call.kw)
File "/usr/local/lib/python2.7/dist-packages/twisted/internet/task.py", line 607, in _tick
taskObj._oneWorkUnit()
File "/usr/local/lib/python2.7/dist-packages/twisted/internet/task.py", line 484, in _oneWorkUnit
result = next(self._iterator)
File "/usr/local/lib/python2.7/dist-packages/scrapy/utils/defer.py", line 57, in <genexpr>
work = (callable(elem, *args, **named) for elem in iterable)
--- <exception caught here> ---
File "/usr/local/lib/python2.7/dist-packages/scrapy/utils/defer.py", line 96, in iter_errback
yield it.next()
File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/spidermiddleware/offsite.py", line 28, in process_spider_output
for x in result:
File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/spidermiddleware/referer.py", line 22, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/spidermiddleware/urllength.py", line 33, in <genexpr>
return (r for r in result or () if _filter(r))
File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/spidermiddleware/depth.py", line 50, in <genexpr>
return (r for r in result or () if _filter(r))
File "/home/vaibhav/scrapyprog/comparison/eScraperInterface/eScraper/spiders/streetstylestoreSpider.py", line 38, in parse
item['productURL'] = site.select('.//a/@href').extract()
exceptions.AttributeError: 'unicode' object has no attribute 'select'
我的代码是:
from scrapy.http import Request
from eScraper.items import EscraperItem
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider
#------------------------------------------------------------------------------
class ESpider(CrawlSpider):
name = "streetstylestoreSpider"
allowed_domains = ["streetstylestore.com"]
start_urls = [
"http://streetstylestore.com/index.php?id_category=16&controller=category",
"http://streetstylestore.com/index.php?id_category=46&controller=category",
"http://streetstylestore.com/index.php?id_category=51&controller=category",
"http://streetstylestore.com/index.php?id_category=61&controller=category",
"http://streetstylestore.com/index.php?id_category=4&controller=category"
]
def parse(self, response):
items = []
hxs = HtmlXPathSelector(response)
sites = hxs.select('//ul[@id="product_list"]/li').extract()
for site in sites:
item = EscraperItem()
item['currency'] = 'INR'
item['productSite'] = ["http://streetstylestore.com"]
item['productURL'] = site.select('.//a/@href').extract()
item['productImage'] = site.select('.//a/img/@src').extract()
item['productTitle'] = site.select('.//a/@title').extract()
productMRP = [i.strip().split('Rs')[-1].replace(',','') for i in hxs.select('.//div[@class="price_container"]//span[@class="old_price"]/text()').extract()]
productPrice = [i.strip().split('Rs')[-1].replace(',','') for i in hxs.select('.//div[@class="price_container"]//p[@class="price"]/text()').extract()]
item['productPrice'] = productMRP + productPrice
items.append(item)
secondURL = item['productURL'][0]
request = Request(secondURL,callback=self.parsePage2)
request.meta['item'] = item
yield request
def parsePage2(self, response):
temp = []
item = response.meta['item']
hxs = HtmlXPathSelector(response)
availability = [i for i in hxs.select('//div[@class="details"]/p/text()').extract() if 'In Stock ' in i]
if availability:
item['availability'] = True
else:
item['availability'] = False
hasVariants = hxs.select('//div[@class="attribute_list"]').extract()
if hasVariants:
item['hasVariants'] = True
else:
item['hasVariants'] = False
category = hxs.select('//div[@class="breadcrumb"]/a/text()').extract()
if category:
productCategory = [category[0]]
if len(category) >= 1:
productSubCategory = [category[1]]
else:
productSubCategory = ['']
else:
productCategory = ['']
productSubCategory = ['']
item['productCategory'] = productCategory
item['productSubCategory'] = productSubCategory
for i in hxs.select('//div[@id="thumbs_list"]/ul/li/a/img/@src').extract():
temp.append(i.replace("medium","large"))
item['productDesc'] = " ".join([i for i in hxs.select('//div[@id="short_description_content"]/p/text()').extract()])
item['productImage'] = item['productImage'] + hxs.select('//div[@id="thumbs_list"]/ul/li/a/img/@src').extract() + hxs.select('//div[@id="thumbs_list"]/ul/li/a/@href').extract() + temp
item['image_urls'] = list(set(item['productImage']))
return item
有人可以告诉我我的代码有什么问题......
答案 0 :(得分:7)
请勿在{{1}} - .extract()
上存储您sites
上存储的内容extract()
,但您还不想要文字。此...
sites = hxs.select('//ul[@id="product_list"]/li').extract()
......应该是这样的:
sites = hxs.select('//ul[@id="product_list"]/li')