Scrapy: iterate start_requests with IDs from the response

Date: 2019-12-14 12:38:29

Tags: python scrapy

import itertools

from scrapy.spiders import CrawlSpider


class MySpiderSpider(CrawlSpider):
    name = 'example.com'
    allowed_domains = ["example.com"]

    def start_requests(self):
        makes_list_all = []  # empty for now -- the make IDs are only known after a response is parsed
        for page, make in itertools.product(range(1, 21), makes_list_all):
            yield self.make_requests_from_url(
                'https://www.example.com/?page={page}&mmm={make}||'.format(page=page, make=make))


/* Do something */ 

Here I extract all the IDs from the response and want to set them as the makes_list_all list.

    def parse_item(self, response):
        # Needs "import re" and "from scrapy.selector import Selector" at module level.
        html = Selector(response)
        products = html.xpath('//body')
        items = []
        for product in products:
            item = ItemnameItem()
            # Get the make IDs out of the embedded initial-state JSON.
            try:
                ids_step_one = r'"searchFilters":{"makes":\[(.*?)}]'
                ids_step_two = r'{"id":(.*?),"name":"'
                current_page_ids = product.xpath('//script[@id="initial-state"]')[0].re(ids_step_one)[0]
                current_page_ids = re.findall(ids_step_two, current_page_ids)
                print('current_page_ids :')
                print(current_page_ids)  # [u'9', u'13', u'29', u'47', u'16338', u'52', u'54', u'57', u'60', u'64', u'65', u'74']
            except IndexError:
                print('-= IndexError =-')
                current_page_ids = ''
            except KeyError:
                print('-= KeyError =-')
                current_page_ids = ''
            items.append(item)
        return items

Example:

current_page_ids:

[u'9', u'13', u'29', u'47', u'16338', u'52', u'54', u'57', u'60', u'64', u'65', u'74']

My question is: how can I get these IDs into start_requests?

I need to append current_page_ids to makes_list_all so that the spider starts parsing URLs like these (one possible approach is sketched after the list):

  1. example.com/?page=1&mmm=9,
  2. example.com/?page=1&mmm=13, etc. (from the list)
  3. example.com/?page=2&mmm=9,
  4. example.com/?page=2&mmm=13 ...
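
One way to wire this up, assuming only the details shown above (the 1-20 page range, the ?page=...&mmm=... URL pattern, the initial-state regexes and the parse_item callback) and nothing else about the real site: skip the hard-coded makes_list_all in start_requests, fetch a single listing page first, extract the make IDs in its callback, and only then yield the page/make combinations. A minimal, untested sketch (it switches to a plain scrapy.Spider with start_urls purely for brevity):

import itertools
import re

import scrapy


class MySpiderSpider(scrapy.Spider):
    name = 'example.com'
    allowed_domains = ['example.com']
    # Fetch one listing page first; the make IDs live in its initial-state JSON.
    start_urls = ['https://www.example.com/']

    def parse(self, response):
        # Same extraction as in parse_item above, applied to the seed response.
        state = response.xpath('//script[@id="initial-state"]/text()').get('')
        block = re.search(r'"searchFilters":{"makes":\[(.*?)}]', state)
        makes_list_all = re.findall(r'{"id":(.*?),"name":"', block.group(1)) if block else []

        # Every page/make combination is known at this point, so the requests can be
        # yielded from this callback instead of from start_requests.
        for page, make in itertools.product(range(1, 21), makes_list_all):
            url = 'https://www.example.com/?page={page}&mmm={make}'.format(page=page, make=make)
            yield scrapy.Request(url, callback=self.parse_item)

Yielding the requests from a callback avoids the chicken-and-egg problem: start_requests runs before any response exists, so it can never see IDs that only appear in a downloaded page.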

0 answers:

No answers yet.