# How to access Spider command-line arguments (passed with `scrapy crawl -a ...`)
# from the parse() function of a Scrapy spider.  (Original note was in Chinese.)
import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import BaseSpider
from scrapy.http import Request
import string
import xlrd, xlwt
import time
import json
class Myspider(BaseSpider):
    """Crawl tolexo.com medical-supplies listing pages and write product
    details (breadcrumb categories, title, MRP, sale price, image link and
    the specification table) into an Excel workbook via xlwt.

    Command-line spider arguments (``scrapy crawl doctor -a pageno=1 -a
    excelsheetname=out.xls``) arrive in ``__init__`` and are stored on the
    instance so that ``parse`` can read them later.
    """

    name = "doctor"
    allowed_domain = ["tolexo.org"]
    # Listing-page URL template; %d is the 1-based page number.
    BASE_URL = ("http://www.tolexo.com/medical-supplies.html"
                "?dir=asc&limit=96&mode=grid&order=bestsellers&p=%d")

    def __init__(self, pageno='', excelsheetname='', *args, **kwargs):
        """Store spider arguments and prepare the output workbook.

        :param pageno: first listing page to fetch (spider argument).
        :param excelsheetname: path of the .xls file to write (spider argument).
        """
        super(Myspider, self).__init__(*args, **kwargs)
        self.page = int(pageno) if pageno else 1
        self.excelname = excelsheetname
        # BUG FIX: the original built ["...p=%d", pageno] -- a two-element
        # list whose first entry still contained the literal "%d".  Format
        # the page number into the URL instead.
        self.start_urls = [self.BASE_URL % self.page]
        # BUG FIX: workbook/sheet/styles must be bound to self -- parse()
        # and parse_dir_contents() access them as attributes, but the
        # original left them as __init__ locals (AttributeError later).
        self.workbook = xlwt.Workbook()
        self.sheet = self.workbook.add_sheet('Sheet1')
        self.style = xlwt.easyxf('font : bold 1')
        self.style2 = xlwt.easyxf('font :bold 0')
        headers = ["category", "sub-category1", "sub-category2",
                   "Title", "MRP", "Sale-price", "Image-link"]
        widths = [30, 30, 30, 30, 20, 20, 60]
        for col, (header, width) in enumerate(zip(headers, widths)):
            self.sheet.write(0, col, header, self.style)
            self.sheet.col(col).width = 256 * (width + 1)
        # Output-sheet bookkeeping.
        self.rows = 0             # last product row written
        self.cols = 7             # next free column for spec headers
        self.specifications = {}  # spec header text -> column index
        self.rowsbreak = 0        # row/col/url reached when the crawl stopped
        self.colsbreak = 0
        self.urlbreak = ""

    def parse(self, response):
        """Schedule one request per product on this listing page, save the
        workbook, then queue the next listing page (up to page 260)."""
        for href in response.xpath("//li[@class='fav-item item']/a/@href"):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_dir_contents)
        # Persist what has been collected so far.
        self.workbook.save(self.excelname)
        # BUG FIX: the original had a dangling "self.page." statement and
        # tested an undefined local "page", so the counter never advanced
        # and pagination could not work.
        self.page += 1
        if self.page < 260:
            yield Request(
                url=self.BASE_URL % self.page,
                headers={"Referer": self.BASE_URL % 1,
                         "X-Requested-With": "XMLHttpRequest"},
                callback=self.parse,
                dont_filter=True)

    def parse_dir_contents(self, response):
        """Extract one product detail page into the next row of the sheet."""
        import re
        self.rows += 1
        hxs = HtmlXPathSelector(response)
        # Breadcrumb trail gives category / sub-category names.
        crumbs = hxs.select("//div [@class='col-sm-12 a-left']"
                            "/ul [@typeof='BreadcrumbList']/li/a")
        cat = [c.strip() for c in crumbs.select('text()').extract()]
        if "Home" in cat:  # guard: original crashed when "Home" was absent
            cat.remove("Home")
        category = cat[0] if cat else '-'
        subcat1 = cat[1] if len(cat) > 1 else '-'
        subcat2 = cat[2] if len(cat) > 2 else '-'
        title = hxs.select("//div[@class='product-name']/h1/text()").extract()
        titt = title[0] if title else '-'
        mpri = hxs.select("//div[@class='mprice strike']/span")
        if not mpri:
            mpri = hxs.select("//div[@class='mprice strike clearfix']/span")
        mrp = mpri.select('text()').extract()
        saleprice = hxs.select("//span [@itemprop='price']/text()").extract()
        mrpp = mrp[0] if mrp else "-"
        sp = saleprice[0] if saleprice else "-"
        img = hxs.select("//div[@class='gallery-img']"
                         "/img/@data-img-src").extract()
        # BUG FIX: the original's fallback assigned `img = "-"` instead of
        # `imgg = "-"`, leaving `imgg` undefined; the resulting NameError
        # was silently swallowed by a bare except around sheet.write().
        imgg = img[0] if img else "-"
        pro = hxs.select("//table[@class='product-spec']//td").extract()
        pro1 = hxs.select("//table[@class='product-spec']//th").extract()
        # Strip markup; <td> cells may carry a trailing "View ..." widget.
        tag_re = re.compile('<[^>]*>')
        pro_des = [tag_re.sub('', p).split("View", 1)[0].strip() for p in pro]
        pro_sep = [tag_re.sub('', p).strip() for p in pro1]
        print(category + "--->" + subcat1 + "----->" + subcat2 + "----->"
              + titt + "----->" + mrpp + "---->" + sp)
        print(pro_sep)
        # Fixed columns 0-6; best-effort like the original (xlwt raises on
        # rewriting a cell), but do not swallow arbitrary errors silently.
        try:
            fixed = [category, subcat1, subcat2, titt, mrpp, sp, imgg]
            for col, value in enumerate(fixed):
                self.sheet.write(self.rows, col, value, self.style2)
        except Exception as exc:
            print(exc)
        # Dynamic spec columns: reuse the column for a known header,
        # otherwise append a new header column at self.cols.
        for head, value in zip(pro_sep, pro_des):
            try:
                if head in self.specifications:
                    self.sheet.write(self.rows, self.specifications[head],
                                     value, self.style2)
                else:
                    self.specifications[head] = self.cols
                    self.sheet.write(0, self.cols, head, self.style)
                    self.sheet.write(self.rows, self.cols, value, self.style2)
                    self.cols += 1
            except Exception as exc:
                print(exc)
        # Remember how far we got, for debugging / resuming.
        self.rowsbreak = self.rows
        self.colsbreak = self.cols
        self.urlbreak = str(response)