import itertools
import re

from scrapy.selector import Selector
from scrapy.spiders import CrawlSpider
from myproject.items import ItemnameItem  # adjust the import path to your project


class MySpiderSpider(CrawlSpider):
    name = 'example.com'
    allowed_domains = ['example.com']

    def start_requests(self):
        makes_list_all = []  # still empty at this point
        for page, make in itertools.product(range(1, 21), makes_list_all):
            yield self.make_requests_from_url(
                'https://www.example.com/?page={page}&mmm={make}||'.format(page=page, make=make))
        # Do something
This is where I want to collect all of the IDs and store them in the makes_list_all list.
    def parse_item(self, response):
        html = Selector(response)
        products = html.xpath('//body')
        items = []
        for product in products:
            item = ItemnameItem()
            # Get IDs
            try:
                ids_step_one = r'"searchFilters":{"makes":\[(.*?)}]'
                ids_step_two = r'{"id":(.*?),"name":"'
                current_page_ids = product.xpath('//script[@id="initial-state"]')[0].re(ids_step_one)[0]
                current_page_ids = re.findall(ids_step_two, current_page_ids)
                print('current_page_ids:')
                print(current_page_ids)  # [u'9', u'13', u'29', u'47', u'16338', u'52', u'54', u'57', u'60', u'64', u'65', u'74']
            except IndexError:
                print('-= IndexError =-')
                current_page_ids = ''
            except KeyError:
                print('-= KeyError =-')
                current_page_ids = ''
            items.append(item)
        return items
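
As a side note, the two regular expressions can be checked outside the spider. Below is a minimal sketch; the sample string is a hypothetical stand-in for the real initial-state script, shaped only so that it matches the two patterns above:

    import re

    ids_step_one = r'"searchFilters":{"makes":\[(.*?)}]'
    ids_step_two = r'{"id":(.*?),"name":"'

    # Hypothetical stand-in for the page's initial-state JSON (not the real data).
    sample = '"searchFilters":{"makes":[{"id":9,"name":"Make A"},{"id":13,"name":"Make B"}]}'

    # Step one cuts out the "makes" blob; step two pulls each id out of it.
    makes_blob = re.search(ids_step_one, sample).group(1)
    print(re.findall(ids_step_two, makes_blob))  # prints ['9', '13']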
Example:

current_page_ids:
[u'9', u'13', u'29', u'47', u'16338', u'52', u'54', u'57', u'60', u'64', u'65', u'74']

My question is: how can I get these IDs into start_requests? I need to append the values from current_page_ids to makes_list_all before the URLs can be built and crawled.
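
One common Scrapy pattern for this chicken-and-egg situation (a sketch using assumed names, not the original code) is to issue a single seed request, extract the IDs in its callback, and only then yield the page/make combinations from that callback instead of from start_requests:

    import itertools
    import re

    import scrapy


    class MakesSpider(scrapy.Spider):
        """Hypothetical sketch: fetch the IDs first, then schedule the real pages."""

        name = 'example.com.makes'  # assumed name, distinct from the original spider
        allowed_domains = ['example.com']
        start_urls = ['https://www.example.com/']

        def parse(self, response):
            # Pull the makes blob out of the initial-state script, as in parse_item.
            blob = response.xpath('//script[@id="initial-state"]').re_first(
                r'"searchFilters":{"makes":\[(.*?)}]') or ''
            makes_list_all = re.findall(r'{"id":(.*?),"name":"', blob)
            # Now that the IDs exist, generate the page/make combinations.
            for page, make in itertools.product(range(1, 21), makes_list_all):
                url = 'https://www.example.com/?page={page}&mmm={make}||'.format(
                    page=page, make=make)
                yield scrapy.Request(url, callback=self.parse_item)

        def parse_item(self, response):
            # Parse each results page here, as in the original parse_item.
            pass

The trade-off is that start_requests only issues the seed request; anything that depends on scraped data is scheduled from a callback, which is how Scrapy expects chained requests to be built.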