My Scrapy project "drills down" from listing pages to retrieve data about the listed items, potentially several levels deep. Each listing page can carry many listed items, each with a handful of different sub-pages/links. I am collecting each item's details (and storing them in a single CSV file) from: the listing page, the page that listing links to (the "more details" page), and yet another page - say, the manufacturer's original listing for that item.
Because I am building a CSV file, it would be very useful to get each item's data onto one row before my parsing moves on to the next item. I could do that easily if I could just launch a Request the moment I need it, while I am still writing that item's CSV row on the listing page where it appears. I would simply "drill down" however many levels I need, using a different parse function for each level, holding on to the single item the whole time until I have the complete CSV row it needs.
Instead of it being that easy, it looks like I will have to rewrite the CSV file for every item at every level, because I cannot get Scrapy to give me the item's "more details" response until I have exited the entire parse function for the listing page. By then the end of my CSV file is no longer at the item being processed, so I need a unique field on every row to look each item up again at every level, rewrite the file, and so on.
Understand, I don't know in advance which callback level will be the last one for any particular item; that is determined item by item, and some items don't even have "deeper" levels. My only idea so far is a single recursive callback function that handles every callback level. But is that how the rest of you do this, or does Scrapy have some kind of "request and wait for response" mechanism, or anything similar? I don't want to install an SQL database on my laptop - I have never set one up before.
Thanks!!!
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.exporter import CsvItemExporter
import csv
from meow.items import meowItem, meowPage
from scrapy.http import Request
import os
from mmap import mmap

class meowlistpage(Spider):
    name = "melist"
    prefixhref = 'http://www.meow.com'
    # add '2_p/', '3_p/', or '4_p/', etc. to get to meow's other pages
    start_urls = [prefixhref+"/homes/for_sale/CO/house,mobile,land_type/10_rid/3000-30000_price/11-117_mp/800000-8000000_lot/lot_sort/46.377254,-96.82251,30.845647,-114.312744_rect/5_zm/1_p/1_rs/"]
    print 'Retrieving first page...'

    def parse(self, response):
        print 'First page retrieved'
        name = "melist"
        prefixhref = 'http://www.meow.com'
        csvfilename = 'C:\\Python27\\My scripts\\meow\\'+name+'.csv'
        csvfile = open(csvfilename, 'w')
        csvfile.close()
        hxs = Selector(response)
        page_tags = hxs.xpath("//div[@id='search-results']/article")
        for page_tags in page_tags:
            item = meowItem()
            item['ad_link'] = prefixhref+str(page_tags.xpath(".//div[1]/dl[2]/dt[1]/span[1]/span[1]/a/@href").extract())[3:-2]
            idendplace = str(item['ad_link']).index('_zpid')-12
            backhashstr = str(item['ad_link'])[idendplace:]
            idstartplace = backhashstr.index('/')+1
            idendplace = len(backhashstr)-backhashstr.index('_zpid')
            item['zpid'] = str(backhashstr)[idstartplace:-idendplace]
            item['sale_sold'] = str(page_tags.xpath(".//div[1]/dl[1]/dt[1]/@class").extract())[8:-17]  # "recentlySold" or "forSale"
            item['prop_price'] = str(page_tags.xpath(".//div[1]/dl[1]/dt[2]/strong/text()").extract())[3:-2]
            if (str(item['sale_sold']) == 'recentlySold'):
                item['prop_price'] = str(item['prop_price'])+str(page_tags.xpath(".//div[1]/dl[1]/dt[1]/strong/text()").extract())[3:-2]
            try:
                dollrsgn = item['prop_price'].index('$')
                item['prop_price'] = str(item['prop_price'])[dollrsgn:]
            except:
                pass
            item['ad_title'] = str(page_tags.xpath(".//div[1]/dl[2]/dt[1]/span[1]/span[1]/a/@title").extract())[3:-2]
            prop_latitude1 = page_tags.xpath("@latitude").extract()
            item['prop_latitude'] = str(prop_latitude1)[3:-8]+'.'+str(prop_latitude1)[5:-2]
            prop_longitude1 = page_tags.xpath("@longitude").extract()
            item['prop_longitude'] = str(prop_longitude1)[3:-8]+'.'+str(prop_longitude1)[7:-2]
            item['prop_address'] = str(page_tags.xpath(".//div[1]/dl[2]/dt[1]/span[1]/span[1]/a/span[1]/text()").extract())[3:-2]+', '+str(page_tags.xpath(".//div[1]/dl[2]/dt[1]/span[1]/span[1]/a/span[2]/text()").extract())[3:-2]+', '+str(page_tags.xpath(".//div[1]/dl[2]/dt[1]/span[1]/span[1]/a/span[3]/text()").extract())[3:-2]+' '+str(page_tags.xpath(".//div[1]/dl[2]/dt[1]/span[1]/span[1]/a/span[4]/text()").extract())[3:-2]
            mightmentionacres = str(page_tags.xpath(".//div[1]/dl[2]/dt[2]/text()").extract())[3:-2]+' | '+str(page_tags.xpath(".//div[1]/dl[2]/dt[2]/text()").extract())[3:-2]+' | '+str(page_tags.xpath(".//div[1]/dl[2]/dt[1]/span[1]/span[1]/a/@title").extract())[3:-2]+' | '  # +str()[3:-2]  # this last segment comes from the full ad
            item['prop_acres'] = mightmentionacres
            # Here is where I'm talking about
            yield Request(str(item['ad_link']), meta={'csvfilename': csvfilename, 'item': item}, dont_filter=True, callback=self.getthispage)
            # By this point, I wanted all the callback[s] to have executed, but they don't -
            # Scrapy waits to launch them until after this function completes
            csvfile = open(csvfilename, 'ab')
            outwriter = csv.writer(csvfile, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)
            outwriter.writerow([item['zpid'], item['sale_sold'], item['prop_price'], item['ad_title'],
                                item['prop_address'], item['prop_latitude'],
                                item['prop_longitude'], item['prop_acres'],
                                item['ad_link'], item['parcelnum'], item['lot_width']])
            csvfile.close()
        # retrieve href of next page of ads
        next_results_pg = 1
        page_tags = hxs.xpath("//div[@id='list-container']/div[@id='search-pagination-wrapper-2']/ul[1]")
        while (str(page_tags.xpath(".//li["+str(next_results_pg)+"]/@class").extract())[3:-2] != 'current'):
            next_results_pg += 1
            if (next_results_pg > 80):
                break
        next_results_pg += 1  # ;item['next_results_pg'] = next_results_pg
        if (str(page_tags.xpath(".//li["+str(next_results_pg)+"]/@class").extract())[3:-2] == 'next'):
            return
        next_results_pg_href = prefixhref+str(page_tags.xpath(".//li["+str(next_results_pg)+"]/a/@href").extract())[3:-2]
        if (next_results_pg_href != prefixhref):  # need to also avoid launching pages otherwise not desired
            page = meowPage()
            page['next_results_pg_href'] = next_results_pg_href
            print 'Retrieving page '+next_results_pg_href
            # yield Request(next_results_pg_href, dont_filter=True, callback=self.parse)
        return
        # if (item['next_results_pg_href'] == prefixhref):
        #     print 'No results pages found after this one, next_results_pg='+str(next_results_pg)
        # else:
        #     print 'Next page to parse after this one is '+str(item['next_results_pg_href'])

    def getthispage(self, response):
        # Even though the yield statement was used,
        # nothing here really gets executed until
        # the first parse function resumes and
        # then finishes completely.
        return
Answer 0 (score: 0)
I rearranged your spider code to make the "item in meta" pattern clearer (I hope):
from scrapy.spider import Spider
from scrapy.selector import Selector
from meow.items import meowItem, meowPage
from scrapy.http import Request
import urlparse
import pprint

class meowlistpage(Spider):
    name = "melist"
    allowed_domains = ['meow.com']
    start_urls = ['http://www.meow.com'
                  '/homes/for_sale/CO'
                  '/10_rid/42.382894,-101.931152,35.496456,-109.171143_rect/6_zm/']

    def parse(self, response):
        self.log('First page retrieved')
        selector = Selector(response)

        # loop over the subroutine results, yielding each;
        # could be an Item or a Request
        for result in self.parse_page_articles(response, selector):
            yield result

        # look for next pages (except the one marked as "current")
        for next_page in selector.xpath("""
                //div[@id="search-pagination-wrapper-2"]
                    /ul/li[not(@class="current")]/a/@href""").extract():
            yield Request(urlparse.urljoin(response.url, next_page))

    def getthispage(self, response):
        selector = Selector(response)
        item = response.meta["item"]
        #self.log("in %s (getthispage):\nitem to complete\n%r" % (
        #    response.url, pprint.pformat(item)))
        #
        # continue extracting data from this page and store it in item
        #item["some_other_field"] = "info that was missing in listing page"
        #
        # and when done, do:
        yield item

    def parse_page_articles(self, response, selector):
        page_tags = selector.xpath("//div[@id='search-results']/article")
        for article in page_tags:
            item = meowItem()
            item['ad_link'] = urlparse.urljoin(response.url,
                article.xpath(".//div[1]/dl[2]/dt[1]/span[1]/span[1]/a/@href").extract()[0]).encode('utf-8')
            idendplace = item['ad_link'].index('_zpid')-12
            backhashstr = item['ad_link'][idendplace:]
            idstartplace = backhashstr.index('/') + 1
            idendplace = len(backhashstr)-backhashstr.index('_zpid')
            item['zpid'] = backhashstr[idstartplace:-idendplace]
            item['sale_sold'] = article.xpath(".//div[1]/dl[1]/dt[1]/@class").extract()[0][8:-17]  # "recentlySold" or "forSale"
            item['prop_price'] = article.xpath(".//div[1]/dl[1]/dt[2]/strong/text()").extract()[0][3:-2]
            if (item['sale_sold'] == 'recentlySold'):
                item['prop_price'] = item['prop_price'] + article.xpath(".//div[1]/dl[1]/dt[1]/strong/text()").extract()[0][3:-2]
            try:
                dollrsgn = item['prop_price'].index('$')
                item['prop_price'] = item['prop_price'][dollrsgn:]
            except:
                pass
            item['ad_title'] = article.xpath(".//div[1]/dl[2]/dt[1]/span[1]/span[1]/a/@title").extract()[0][3:-2]
            prop_latitude1 = article.xpath("@latitude").extract()
            item['prop_latitude'] = (str(prop_latitude1)[3:-8]
                                     + '.' + str(prop_latitude1)[5:-2])
            prop_longitude1 = article.xpath("@longitude").extract()
            item['prop_longitude'] = (
                str(prop_longitude1)[3:-8]
                + '.' + str(prop_longitude1)[7:-2])
            item['prop_address'] = (
                str(article.xpath(".//div[1]/dl[2]/dt[1]/span[1]/span[1]/a/span[1]/text()").extract())[3:-2]
                + ', ' + str(article.xpath(".//div[1]/dl[2]/dt[1]/span[1]/span[1]/a/span[2]/text()").extract())[3:-2]
                + ', ' + str(article.xpath(".//div[1]/dl[2]/dt[1]/span[1]/span[1]/a/span[3]/text()").extract())[3:-2]
                + ' ' + str(article.xpath(".//div[1]/dl[2]/dt[1]/span[1]/span[1]/a/span[4]/text()").extract())[3:-2])
            mightmentionacres = (
                str(article.xpath(".//div[1]/dl[2]/dt[2]/text()").extract())[3:-2]
                + ' | '
                + str(article.xpath(".//div[1]/dl[2]/dt[2]/text()").extract())[3:-2]
                + ' | '
                + str(article.xpath(".//div[1]/dl[2]/dt[1]/span[1]/span[1]/a/@title").extract())[3:-2]
                + ' | '
                #+ str()[3:-2]  # this last segment comes from the full ad
                )
            item['prop_acres'] = mightmentionacres

            yield Request(item['ad_link'], meta={'item': item},
                          dont_filter=True, callback=self.getthispage)
Calling your spider with scrapy crawl melist -o melist_items.csv -t csv should give you your items in CSV format.
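For reference, the meowItem used above needs a Field for everything the spider assigns. A minimal items.py sketch, assuming nothing beyond the field names the spider actually uses (add any deeper-level fields such as parcelnum or lot_width from the original spider if your detail pages fill them in), could look like this:

# items.py (sketch; field names mirror what the spider assigns)
from scrapy.item import Item, Field

class meowItem(Item):
    zpid = Field()
    sale_sold = Field()
    prop_price = Field()
    ad_title = Field()
    prop_address = Field()
    prop_latitude = Field()
    prop_longitude = Field()
    prop_acres = Field()
    ad_link = Field()
    # fields filled in on deeper pages (e.g. parcelnum, lot_width) go here too

class meowPage(Item):
    next_results_pg_href = Field()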
Answer 1 (score: 0)
My solution uses the standard sqlite3 module that ships with Python 2.7:
# items.py contents:
#from scrapy.item import Item, Field
#class TrackItemScrapeItem(Item):
#    # define the fields for your item here like:
#    f_1 = Field()
#    f_2 = Field()
#    sale_sold = Field()
#    price = Field()
#    item_ad_link = Field()
#    # ...and so on
#
# <EOF>

# TrackItemScrapespider.py contents:
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.exporter import CsvItemExporter
import csv
import sys
from zillow.items import TrackItemScrapeItem
from scrapy.http import Request
import os
import sqlite3  # gives scrapy's asynchronous callbacks a global storage place for each item's scraped info
import time

class TrackItemScraper(Spider):
    name = "buyitemslist"
    start_urls = ['http://www.buythisandthat.com']

    # build the CREATE TABLE column list from the item's fields
    tablecolumns = ""
    prikeyfldname = 'f_1'
    for field in getattr(TrackItemScrapeItem, 'fields'):
        # Just realize the order of these fields in the database has no relation to the
        # order in items.py, nor is it consistent each time
        if (field == prikeyfldname):
            tablecolumns += str(field)+' TEXT PRIMARY KEY NOT NULL, '
        else:
            tablecolumns += str(field)+' TEXT, '
    tablecolumns = tablecolumns[:-2]

    con = None
    con = sqlite3.connect(name+".db")
    tablename = 'CrawlResults'
    if (con == None):
        print "SQL database not getting opened by sqlite3 ! (Is there room for the file ?)"
        sys.exit(1)
    cur = con.cursor()
    try:
        cur.execute('select * from '+tablename)
        cur.execute('PRAGMA table_info('+tablename+')')
        data = cur.fetchall()
        for d in data:
            print d[0], d[1], d[2]
        cur.execute('select * from '+tablename)
        print '\n'+str(cur.fetchall())
    except:
        cur.execute('DROP TABLE IF EXISTS '+tablename)
        cur.execute('CREATE TABLE '+tablename+' ('+tablecolumns+')')
    if (raw_input('\n\n Do you want to delete the previous '+name+'.CSV file?').capitalize() == 'Y'):
        csvfile = name+'.csv'
        with open(csvfile, 'w') as csv_file:
            csv_file.close()
    if (raw_input('\n\n Do you want to save the results from the previous run to a new '+name+'.CSV file?').capitalize() == 'Y'):
        csvfile = name+'.csv'
        with open(csvfile, 'w') as csv_file:
            csv_writer = csv.writer(csv_file)
            csv_writer.writerow([i[0] for i in cur.description])
            cur.execute('select * from '+tablename)
            csv_writer.writerows(cur)
            csv_file.close()
    if (raw_input('\n\n Do you want to clear out previous results from memory now to start clean? Answer no ONLY if you haven\'t added new fields!').capitalize() == 'Y'):
        cur.execute('DROP TABLE IF EXISTS '+tablename)
        cur.execute('CREATE TABLE '+tablename+' ('+tablecolumns+')')

    instancesrunning = 1  # start at 1 because a yield is about to happen implicitly for an asynchronous instance
    # seccntdwn = 25  # failsafe counter; can be used with the next-to-last (for example) parse instance
    #                 # to tolerate an instance collision decrementing the instancesrunning counter

    def parse(self, response):  # recursive, but the first entry won't have meta args
        try:
            pageschema = response.meta['pageschema']
        except:  # entered function without meta args, pageschema 0
            pageschema = 0
        hxs = Selector(response)
        if (pageschema == 0):  # top level pages
            ad_list = hxs.xpath("//xpath_to_ad_list")
            # page_tags = ''
            for item_ad in ad_list:
                item = TrackItemScrapeItem()
                # parse code for the prikeyfldname field must go in here instead of these comment lines
                #item[prikeyfldname] = item_ad.xpath(".//div[whatever...
                # for this example, item['sale_sold'] and item['price'] will need parsing code in here as well
                con = None
                con = sqlite3.connect(self.name+".db")
                with sqlite3.connect(self.name+".db") as con:
                    cur = con.cursor()
                    replacevar = False
                    try:
                        cmd = 'INSERT INTO ' + str(self.tablename) + ' (' + str(self.prikeyfldname) + ') VALUES (\'' + str(item[self.prikeyfldname]) + '\')'
                        cur.execute(cmd)
                        print str(cmd)  # won't see this unless the insert succeeded
                        con.commit()
                        # replacevar is for any fields of this item whose old contents you might want to keep,
                        # appending the current info only when this item was not new in this run
                        replacevar = True
                        self.storthis(str(item[self.prikeyfldname]), 'sale_sold', str(item['sale_sold']), replace=replacevar)
                    except:  # option example: if we want to keep the old price, move it into sale_sold when sale_sold has changed
                        cmd = 'UPDATE ' + self.tablename + ' SET sale_sold = \'' + str(item['sale_sold']) + ', last retrieved advertised price was \' || (SELECT price) WHERE ' + str(self.prikeyfldname) + ' = ' + str(item[self.prikeyfldname]) + ' AND sale_sold <> \'' + str(item['sale_sold']) + '\''
                        print str(cmd)  # prints even if the SQL update fails, for troubleshooting
                        cur.execute(cmd)
                        con.commit()
                    # now the storthis call for the price field, because we already parsed it
                    self.storthis(str(item[self.prikeyfldname]), 'price', item['price'], replace=True)
                    #
                    # remainder of the parses and stores for this item go in here, saving the ad_link for the next yield/Request
                    #
                self.instancesrunning += 1
                yield Request(str(item['ad_link']), meta={'item': item, 'pageschema': 1}, dont_filter=True, callback=self.parse)
            # code in here to determine the link for the next page of the list of ads
            if (final_page_of_list_is_complete):  # placeholder condition
                self.savetofileiflastinstance()  # the way to exit all callback instances
                return
            # parse the link to the next page in here if not done already
            time.sleep(6)  # being nice to the host
            self.instancesrunning += 1
            yield Request(next_results_pg_href, dont_filter=True, callback=self.parse)  # don't need meta for list pages
        elif (pageschema == 1):  # we are to parse using the 2nd schema
            item = response.meta['item']
            page_tags = hxs.xpath("//as you need for this schema")
            # parse and store in here; just remember that in this example we didn't pass along in meta
            # whether this item is new or not this run.
            # For every link on this page that needs to be scraped (adjusting pageschema for each type):
            #     self.instancesrunning += 1
            #     yield Request(str(item['ad_link']), meta={'item': item, 'pageschema': as_needed}, dont_filter=True, callback=self.parse)
        elif (pageschema == 2):
            pass  # same general code in here as for schema 1
        elif (pageschema == 3):
            pass  # same general code in here as for schema 1
        elif (pageschema == 4):
            pass  # same general code in here as for schema 1
        self.savetofileiflastinstance()
        return

    def storthis(self, uniquefieldvalue, storfieldname, storfieldvalue, replace):
        # check for nulls etc. in here, if desired
        con = None
        con = sqlite3.connect(self.name+".db")
        if (replace == False):
            cmd = 'UPDATE '+str(self.tablename)+' SET '+storfieldname+' = (SELECT '+storfieldname+') || \''+storfieldvalue+'\' WHERE '+self.prikeyfldname+'=\''+uniquefieldvalue+'\''
        else:
            cmd = 'UPDATE '+str(self.tablename)+' SET '+storfieldname+'=\''+storfieldvalue+'\' where '+self.prikeyfldname+'=\''+uniquefieldvalue+'\''
        print str(cmd)
        try:
            with con:
                cur = con.cursor()
                cur.execute(cmd)
                con.commit()
        except:
            pass  # we don't want to return with an unexpected error
        return

    def savetofileiflastinstance(self):
        # instancesrunningpre = self.instancesrunning  # if utilizing the failsafe instance counting
        self.instancesrunning -= 1
        if (self.instancesrunning > 0):
            pass  # or failsafe counter code in here,
            # if concerned that this might still be the last instance should a collision ever occur
            # while decrementing self.instancesrunning: wait seccntdwn seconds for another instance
            # to indicate its presence by a change in the counter, then
            # if ():
            #     self.savetofileiflastinstance()
        else:  # this was the last instance, write the csv file
            con = None
            con = sqlite3.connect(self.name+".db")
            cur = con.cursor()
            csvfile = self.name+'.csv'
            with open(csvfile, 'a') as csv_file:
                cur.execute('select * from '+self.tablename)
                csv_writer = csv.writer(csv_file)
                csv_writer.writerow([i[0] for i in cur.description])
                csv_writer.writerows(cur)
                csv_file.close()
            print '\n\nWrote the results to the '+self.name+'.CSV file. Make sure this is the LAST line of output from this script! If it isn\'t, rewrite the source coding for instance tracking.\n\n'
            con.commit()
        return