# How to access Spider command-line arguments (passed with `scrapy crawl -a ...`)
# from the parse() function of a Scrapy spider.  (Original note was in Chinese.)
import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import BaseSpider
from scrapy.http import Request
import string
import xlrd, xlwt
import time
import json
class Myspider(BaseSpider):
    """Crawl tolexo.com medical-supplies listing pages and write product
    details (breadcrumb categories, title, MRP, sale price, image link and
    the specification table) into an Excel workbook via xlwt.

    Command-line spider arguments (``scrapy crawl doctor -a pageno=1 -a
    excelsheetname=out.xls``) arrive in ``__init__`` and are stored on the
    instance so that ``parse`` can read them later.
    """

    name = "doctor"
    allowed_domain = ["tolexo.org"]
    # Listing-page URL template; %d is the 1-based page number.
    BASE_URL = ("http://www.tolexo.com/medical-supplies.html"
                "?dir=asc&limit=96&mode=grid&order=bestsellers&p=%d")

    def __init__(self, pageno='', excelsheetname='', *args, **kwargs):
        """Store spider arguments and prepare the output workbook.

        :param pageno: first listing page to fetch (spider argument).
        :param excelsheetname: path of the .xls file to write (spider argument).
        """
        super(Myspider, self).__init__(*args, **kwargs)
        self.page = int(pageno) if pageno else 1
        self.excelname = excelsheetname
        # BUG FIX: the original built ["...p=%d", pageno] -- a two-element
        # list whose first entry still contained the literal "%d".  Format
        # the page number into the URL instead.
        self.start_urls = [self.BASE_URL % self.page]
        # BUG FIX: workbook/sheet/styles must be bound to self -- parse()
        # and parse_dir_contents() access them as attributes, but the
        # original left them as __init__ locals (AttributeError later).
        self.workbook = xlwt.Workbook()
        self.sheet = self.workbook.add_sheet('Sheet1')
        self.style = xlwt.easyxf('font : bold 1')
        self.style2 = xlwt.easyxf('font :bold 0')
        headers = ["category", "sub-category1", "sub-category2",
                   "Title", "MRP", "Sale-price", "Image-link"]
        widths = [30, 30, 30, 30, 20, 20, 60]
        for col, (header, width) in enumerate(zip(headers, widths)):
            self.sheet.write(0, col, header, self.style)
            self.sheet.col(col).width = 256 * (width + 1)
        # Output-sheet bookkeeping.
        self.rows = 0             # last product row written
        self.cols = 7             # next free column for spec headers
        self.specifications = {}  # spec header text -> column index
        self.rowsbreak = 0        # row/col/url reached when the crawl stopped
        self.colsbreak = 0
        self.urlbreak = ""

    def parse(self, response):
        """Schedule one request per product on this listing page, save the
        workbook, then queue the next listing page (up to page 260)."""
        for href in response.xpath("//li[@class='fav-item item']/a/@href"):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_dir_contents)
        # Persist what has been collected so far.
        self.workbook.save(self.excelname)
        # BUG FIX: the original had a dangling "self.page." statement and
        # tested an undefined local "page", so the counter never advanced
        # and pagination could not work.
        self.page += 1
        if self.page < 260:
            yield Request(
                url=self.BASE_URL % self.page,
                headers={"Referer": self.BASE_URL % 1,
                         "X-Requested-With": "XMLHttpRequest"},
                callback=self.parse,
                dont_filter=True)

    def parse_dir_contents(self, response):
        """Extract one product detail page into the next row of the sheet."""
        import re
        self.rows += 1
        hxs = HtmlXPathSelector(response)
        # Breadcrumb trail gives category / sub-category names.
        crumbs = hxs.select("//div [@class='col-sm-12 a-left']"
                            "/ul [@typeof='BreadcrumbList']/li/a")
        cat = [c.strip() for c in crumbs.select('text()').extract()]
        if "Home" in cat:  # guard: original crashed when "Home" was absent
            cat.remove("Home")
        category = cat[0] if cat else '-'
        subcat1 = cat[1] if len(cat) > 1 else '-'
        subcat2 = cat[2] if len(cat) > 2 else '-'
        title = hxs.select("//div[@class='product-name']/h1/text()").extract()
        titt = title[0] if title else '-'
        mpri = hxs.select("//div[@class='mprice strike']/span")
        if not mpri:
            mpri = hxs.select("//div[@class='mprice strike clearfix']/span")
        mrp = mpri.select('text()').extract()
        saleprice = hxs.select("//span [@itemprop='price']/text()").extract()
        mrpp = mrp[0] if mrp else "-"
        sp = saleprice[0] if saleprice else "-"
        img = hxs.select("//div[@class='gallery-img']"
                         "/img/@data-img-src").extract()
        # BUG FIX: the original's fallback assigned `img = "-"` instead of
        # `imgg = "-"`, leaving `imgg` undefined; the resulting NameError
        # was silently swallowed by a bare except around sheet.write().
        imgg = img[0] if img else "-"
        pro = hxs.select("//table[@class='product-spec']//td").extract()
        pro1 = hxs.select("//table[@class='product-spec']//th").extract()
        # Strip markup; <td> cells may carry a trailing "View ..." widget.
        tag_re = re.compile('<[^>]*>')
        pro_des = [tag_re.sub('', p).split("View", 1)[0].strip() for p in pro]
        pro_sep = [tag_re.sub('', p).strip() for p in pro1]
        print(category + "--->" + subcat1 + "----->" + subcat2 + "----->"
              + titt + "----->" + mrpp + "---->" + sp)
        print(pro_sep)
        # Fixed columns 0-6; best-effort like the original (xlwt raises on
        # rewriting a cell), but do not swallow arbitrary errors silently.
        try:
            fixed = [category, subcat1, subcat2, titt, mrpp, sp, imgg]
            for col, value in enumerate(fixed):
                self.sheet.write(self.rows, col, value, self.style2)
        except Exception as exc:
            print(exc)
        # Dynamic spec columns: reuse the column for a known header,
        # otherwise append a new header column at self.cols.
        for head, value in zip(pro_sep, pro_des):
            try:
                if head in self.specifications:
                    self.sheet.write(self.rows, self.specifications[head],
                                     value, self.style2)
                else:
                    self.specifications[head] = self.cols
                    self.sheet.write(0, self.cols, head, self.style)
                    self.sheet.write(self.rows, self.cols, value, self.style2)
                    self.cols += 1
            except Exception as exc:
                print(exc)
        # Remember how far we got, for debugging / resuming.
        self.rowsbreak = self.rows
        self.colsbreak = self.cols
        self.urlbreak = str(response)