Scrapy: how to handle a 4-level-deep spider when none of the requests work

Asked: 2014-05-07 07:18:49

Tags: python-2.7 scrapy

I'm having a problem with the parse_page1 and parse_page2 functions. Everything works correctly up through parse_page1, but parse_page2 only returns one row, which means that function is called only once. I need it to run for every item that parse_page1 handles, so please help me with this; I would really appreciate it. Here is what I tried:

import re
import json

from scrapy.http import Request
from scrapy.contrib.spiders import CrawlSpider
# (abctem is the item class used below; its import is not shown in the post)


class HuluSpider(CrawlSpider):
    name = "abc"  # name of the spider

    allowed_domains = ["abc.com"]  # the main domain the spider may crawl
    # the search page that seeds the crawl
    start_urls = ["http://example.com/browse/search?keyword=&alphabet=All&fam"]

    def parse(self, response):  # first level: title, episode count, URL and year per show block
        data = response.body
        data = data.split('data-show-id=')
        data = data[1:4]  # only the first three show blocks
        item = abctem()
        # print data
        for s in data:

            # item['title'] = block.select('.//a/text()').extract()[0]
            m = re.search(r'ine-height:1.5em[^>]+>\s*([^<]+)', s)
            if m is None:
                item['TITLE'] = ""
            else:
                item['TITLE'] = m.group(1)
            d = re.search(r'(\d+\s*episodes)', s)
            if d is None:
                item['EPISODE_COUNT'] = ""
            else:
                item['EPISODE_COUNT'] = d.group(1)
            k = re.search(r'href=\s*(http://example.com[^>]+)', s)
            try:
                item['REQUEST_URL'] = k.group(1)
                url = k.group(1)
            except AttributeError:  # k is None when no link matched in this block
                pass
            l = re.search(r'ine-height:1.5em[^>]+>[^>]+>[^>]+>[^>]+>[^\(]+\(\s*([^\)]+)', s)
            if l is None:
                item['RELEASE_YEAR'] = ""
            else:
                item['RELEASE_YEAR'] = l.group(1)
            # if k==None:
            try:
                callback = lambda response: self.parse_page1(response, url)
                yield Request(url, callback=callback, meta=dict(item=item))
            except NameError:  # url was never set for this block
                pass

    def parse_page1(self, response, url):  # second level: pull the API token and pgid out of the show page
        sc = response.body
        item = response.meta['item']
        m = re.search(r"API_DONUT\s*=\s*'([^']+)", sc)
        api_dnt = m.group(1) if m is not None else None
        m = re.search(r'contentPgid\s*=\s*([^;]+);', sc)
        pgid = m.group(1) if m is not None else None
        m = re.search(r'"version"\s*:\s*"([^"]+)', sc)
        version = m.group(1) if m is not None else None
        id = url.split('/')[-1]
        item['ID'] = id
        url1 = ("http://example.com/mozart/v1.h2o/canonical/" + str(id)
                + "?_user_pgid=1&_content_pgid=" + str(pgid)
                + "&_device_id=1&region=us&locale=en&language=en"
                + "&include_pages=1&access_token=" + api_dnt)
        yield Request(url1, meta=dict(item=item),
                      callback=lambda response: self.parse_page2(response, pgid, api_dnt))
        # yield item
    def parse_page2(self, response, pgid, api_dnt):  # third level: show-level fields from the JSON API
        sc1 = json.loads(response.body)
        # pgid=test.split("//")[1]
        # api_dnt=test.split("//")[-1]
        item = response.meta['item']
        item['CLIP_COUNT'] = sc1['data'][0]['show']['clips_count']
        print '----------------------------------------'
        print sc1['data'][0]['show']['clips_count']
        print '----------------------------------------'
        item['GENRE'] = sc1['data'][0]['show']['genre']
        item['DESCRIPTION'] = sc1['data'][0]['show']['description']
        item['USER_RATING_AVERAGE'] = round(float(sc1['data'][0]['show']['rating']), 1)
        id = sc1['data'][0]['show']['id']
        seasons_count = sc1['data'][0]['show']['seasons_count']
        # url1="http://example.com/tempo/v1/h2o/show/"+version+"?_user_pgid=1&_content_pgid="+pgid+"&_device_id=1&region=us&locale=en&language=en&name=1-800-missing&scope_name=h2o%3Aus&logged_in=false&treatments=reco%3Acontrol%2Cweb%3Asmart_start_ctrl%2Cweb_guid_only%3Acontrol&access_token="+api_dnt
        url2 = ("http://example.com/mozart/v1.h2o/shows/" + str(id)
                + "/seasons?_user_pgid=1&_content_pgid=" + str(pgid)
                + "&_device_id=1&region=us&locale=en&language=en"
                + "&video_type=episode&sort=seasons_and_release&free_only=0"
                + "&show_id=" + str(id) + "&include_nonbrowseable=1&access_token=" + api_dnt)
        # callback=functools.partial(self.parse_page3, meta=dict(item=item),seasons_count)
        callback = lambda response: self.parse_page3(response, seasons_count)
        # yield Request(url2, meta=dict(item=item), callback=callback)
        yield item
    # def parse_page3(self, response, seasons_count):  # fourth level: per-season episode counts
    #     sc3 = json.loads(response.body)
    #     item = response.meta['item']
    #     for inc in xrange(0, int(seasons_count)):
    #         item['SEASON'] = sc3['seasons'][inc]['season_number']
    #         print "--------------------------------------------"
    #         print sc3['seasons'][inc]['season_number']
    #         print '---------------------------------------------'
    #         item['EPISODE_COUNT'] = sc3['seasons'][inc]['count']
    #         yield item
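
For what it's worth, a minimal sketch of the two patterns in parse() that typically produce exactly this symptom, assuming Scrapy's default duplicate-request filter is enabled (the extract_url helper is a hypothetical stand-in, not a real function). The lambda `lambda response: self.parse_page1(response, url)` looks up `url` only when the callback finally runs, after the loop has finished, so every parse_page1 call receives the last url, computes the same `id`, and builds an identical `url1`; the duplicate filter then schedules only one of those requests, which is consistent with parse_page2 firing exactly once. Likewise, the single `item = abctem()` created before the loop is one shared object mutated by every iteration.

import functools

from scrapy.http import Request

# inside the spider class:
def parse(self, response):
    data = response.body.split('data-show-id=')[1:4]
    for s in data:
        # Create a fresh item per block. One item built before the loop is
        # the same object in every request's meta, so later iterations
        # overwrite the fields of earlier ones.
        item = abctem()
        # ... fill in item['TITLE'], item['EPISODE_COUNT'], etc. as before ...
        url = extract_url(s)  # hypothetical stand-in for the href regex above

        # Bind the current url at creation time. functools.partial freezes
        # the value now; a default argument does the same thing:
        #     callback = lambda response, url=url: self.parse_page1(response, url)
        callback = functools.partial(self.parse_page1, url=url)
        yield Request(url, callback=callback, meta=dict(item=item))

If the duplicate filter is indeed dropping the second-level requests, passing dont_filter=True to the Request yielded in parse_page1 should also make parse_page2 run once per show, which would confirm the diagnosis; the commented-out functools.partial line in parse_page2 is the same binding technique. Separately, CrawlSpider uses parse() internally for its rule handling, so a spider that defines its own parse() normally extends the plain Spider class (BaseSpider in older Scrapy releases) instead.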

0 Answers:

No answers yet