I am running into a problem with my parse_page1 and parse_page2 functions. Everything works correctly up through parse_page1, but parse_page2 only returns a single row, which means it is only being called once. I need it to run for every item that comes out of parse_page1, so any help would be greatly appreciated. Here is what I have tried:
import re
import json

from scrapy.spiders import CrawlSpider
from scrapy.http import Request
# abctem (the Item subclass) is imported from the project's items module

class HuluSpider(CrawlSpider):
    name = "abc"                   ## name of the spider
    allowed_domains = ["abc.com"]  ## main domain
    ## listing URL the crawl starts from
    start_urls = ["http://example.com/browse/search?keyword=&alphabet=All&fam"]
    def parse(self, response):   ### This block generates the URL and id for each show
        data = response.body
        data = data.split('data-show-id=')
        data = data[1:4]
        item = abctem()
        # print data
        for s in data:
            # item['title'] = block.select('.//a/text()').extract()[0]
            m = re.search('ine-height:1.5em[^>]+>\s*([^<]+)', s)
            if m == None:
                item['TITLE'] = ""
            else:
                item['TITLE'] = re.search('ine-height:1.5em[^>]+>\s*([^<]+)', s).group(1)
            d = re.search('(\d+\s*episodes)', s)
            if d == None:
                item['EPISODE_COUNT'] = ""
            else:
                item['EPISODE_COUNT'] = re.search('(\d+\s*episodes)', s).group(1)
            k = re.search('href=\s*(http://example.com[^>]+)', s)
            try:
                item['REQUEST_URL'] = re.search('href=\s*(http://example.com[^>]+)', s).group(1)
                url = re.search('href=\s*(http://example.com[^>]+)', s).group(1)
            except:
                pass
            l = re.search('ine-height:1.5em[^>]+>[^>]+>[^>]+>[^>]+>[^\(]+([^\)]+)', s)
            if l == None:
                item['RELEASE_YEAR'] = ""
            else:
                item['RELEASE_YEAR'] = re.search('ine-height:1.5em[^>]+>[^>]+>[^>]+>[^>]+>[^\(]+\(\s*([^\)]+)', s).group(1)
            # if k==None:
            try:
                callback = lambda response: self.parse_page1(response, url)
                yield Request(url, callback=callback, meta=dict(item=item))
            except:
                pass
    def parse_page1(self, response, url):
        sc = response.body
        item = response.meta['item']
        url_lst = []
        if re.search('API_DONUT\s*=\s*\'([^\']+)', sc) == None:
            api_dnt = None
        else:
            api_dnt = re.search('API_DONUT\s*=\s*\'([^\']+)', sc).group(1)
        if re.search('contentPgid\s*=\s*([^;]+);', sc) == None:
            pgid = None
        else:
            pgid = re.search('contentPgid\s*=\s*([^;]+);', sc).group(1)
        if re.search('"version"\s*:\s*"([^"]+)', sc) == None:
            version = None
        else:
            version = re.search('"version"\s*:\s*"([^"]+)', sc).group(1)
        id = url.split('/')[-1]
        item['ID'] = id
        url1 = "http://example.com/mozart/v1.h2o/canonical/" + str(id) + "?_user_pgid=1&_content_pgid=" + str(pgid) + "&_device_id=1&region=us&locale=en&language=en&include_pages=1&access_token=" + api_dnt
        yield Request(url1, meta=dict(item=item), callback=lambda response: self.parse_page2(response, pgid, api_dnt))
        # yield item
    def parse_page2(self, response, pgid, api_dnt):   ### This block generates the season and episode output
        sc1 = json.loads(response.body)
        # pgid=test.split("//")[1]
        # api_dnt=test.split("//")[-1]
        item = response.meta['item']
        item['CLIP_COUNT'] = sc1['data'][0]['show']['clips_count']
        print '----------------------------------------'
        print sc1['data'][0]['show']['clips_count']
        print '----------------------------------------'
        item['GENRE'] = sc1['data'][0]['show']['genre']
        item['DESCRIPTION'] = sc1['data'][0]['show']['description']
        item['USER_RATING_AVERAGE'] = round(float(sc1['data'][0]['show']['rating']), 1)
        id = sc1['data'][0]['show']['id']
        seasons_count = sc1['data'][0]['show']['seasons_count']
        # url1="http://example.com/tempo/v1/h2o/show/"+version+"?_user_pgid=1&_content_pgid="+pgid+"&_device_id=1&region=us&locale=en&language=en&name=1-800-missing&scope_name=h2o%3Aus&logged_in=false&treatments=reco%3Acontrol%2Cweb%3Asmart_start_ctrl%2Cweb_guid_only%3Acontrol&access_token="+api_dnt
        url2 = "http://example.com/mozart/v1.h2o/shows/" + str(id) + "/seasons?_user_pgid=1&_content_pgid=" + str(pgid) + "&_device_id=1&region=us&locale=en&language=en&video_type=episode&sort=seasons_and_release&free_only=0&show_id=" + str(id) + "&include_nonbrowseable=1&access_token=" + api_dnt
        # callback=functools.partial(self.parse_page3, meta=dict(item=item),seasons_count)
        callback = lambda response: self.parse_page3(response, seasons_count)
        # yield Request(url2,meta=dict(item=item),callback=callback)
        yield item
    # def parse_page3(self, response, seasons_count):   ### This block generates the season and episode output
    #     sc3 = json.loads(response.body)
    #     item = response.meta['item']
    #     for inc in xrange(0, int(seasons_count)):
    #         item['SEASON'] = sc3['seasons'][inc]['season_number']
    #         print "--------------------------------------------"
    #         print sc3['seasons'][inc]['season_number']
    #         print '---------------------------------------------'
    #         item['EPISODE_COUNT'] = sc3['seasons'][inc]['count']
    #         yield item
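For reference, this is a stripped-down sketch of the request chain I am trying to get working. The URLs, selectors, and field names below are placeholders rather than my real code; it is only meant to show the intended flow, where every request yielded from parse carries its own item via meta through parse_page1 and on to parse_page2:

from scrapy.spiders import CrawlSpider
from scrapy.http import Request

class ChainSketchSpider(CrawlSpider):
    # placeholder spider: only illustrates the intended callback chain
    name = "chain_sketch"
    start_urls = ["http://example.com/browse/search?keyword="]

    def parse(self, response):
        # one request per link on the listing page, each carrying its own item
        for url in response.xpath('//a/@href').extract():
            item = {'REQUEST_URL': url}
            yield Request(url, callback=self.parse_page1, meta={'item': item})

    def parse_page1(self, response):
        item = response.meta['item']
        item['ID'] = response.url.split('/')[-1]
        # chain a second request for the same item
        yield Request("http://example.com/mozart/v1.h2o/canonical/" + item['ID'],
                      callback=self.parse_page2, meta={'item': item})

    def parse_page2(self, response):
        # expected to run once for every request yielded from parse
        item = response.meta['item']
        item['DESCRIPTION'] = response.body[:50]
        yield item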