Scrapy output file - how to run all yield requests recursively

Date: 2016-12-06 05:15:07

Tags: python json scrapy yield scrapy-spider

So I have a Scrapy spider as follows:

import scrapy

class CoursesSpider(scrapy.Spider):
    name = "courses"
    start_urls = [
        'http://example.com'
    ]

    def parse(self, response):
        for subject in response.css('subject'):

            subject_name = subject.css('subject::text').extract_first().strip()
            subject_link = subject.css('subject::attr(href)').extract_first().strip()
            subject_id = subject.css('subject::attr(id)').extract_first().strip()

            if subject_link is not None:
                subject_data = scrapy.Request(subject_link, callback=self.parse_course)

            yield {
                'subject_name': subject_name,
                'subject_link': subject_link,
                'subject_id': subject_id,
                'subject_data': subject_data,
            }

    def parse_course(self, response):

        subject_id = response.css('::attr(id)').extract_first().strip()

        for course in response.css('course'):

            course_name = course.css('course::text').extract_first().strip()
            course_link = course.css('course::attr(href)').extract_first().strip()
            course_id = course.css('course::attr(id)').extract_first().strip()

            if course_link is not None:
                course_data = scrapy.Request(course_link, callback=self.parse_class)

            yield {
                'course_name': course_name,
                'course_link': course_link,
                'course_id': subject_id + " " + course_id,
                'course_data': course_data,
            }

    def parse_class(self, response):

        course_id = response.css('::attr(id)').extract_first().strip()

        for section in response.css('section'):
            section_name = section.css('section::text').extract_first().strip()
            section_link = section.css('section::attr(href)').extract_first().strip()

            yield {
                'section_name': section_name,
                'section_link': section_link,
                'course_id': course_id,
            }

I would like to get an output JSON file with a tree structure like this:

{"subject_id": "...", "subject_name": "...", "subject_link": "...", "subject_data": 
  {"course_id": "...", "course_link": "...", "course_name": "...", "course_data": 
    {"course_id": "...", "section_link": "...", "section_name": "..."}
  }
}

But I only get this:

{"subject_id": "...", "subject_data": "<Request GET http://example.com/something>", "subject_name": "...", "subject_link": "..."}

As far as I can tell, this is because the yielded requests have not been executed yet. How would I invoke the equivalent of "scrapy crawl courses -o courses.json" in a way that fully follows all the requests? If that is not possible out of the box, how could I do it myself? Could I later import the JSON in a Python file and somehow run http://example.com/something and the following requests?

I know it's a lot of code, but it should clarify things. Thanks for your help!

1 Answer:

Answer 0 (score: 2)

I see two approaches:

1. Build the data incrementally, using the Request.meta dict to pass data along to each callback. See Passing additional data to callback functions.
2. Use something like scrapy-inline-requests (to be tested).

Method 1.

      import scrapy

      class CoursesSpider(scrapy.Spider):
          name = "courses"
          start_urls = [
              'http://example.com'
          ]
      
          def parse(self, response):
              for subject in response.css('subject'):
      
                  subject_name = subject.css('subject::text').extract_first().strip()
                  subject_link = subject.css('subject::attr(href)').extract_first().strip()
                  subject_id = subject.css('subject::attr(id)').extract_first().strip()
      
                  if subject_link is not None:
                      subject_data = scrapy.Request(subject_link, callback=self.parse_course)
      
                  # build a dict with the info we have so far
                  subject_info = {
                      'subject_name': subject_name,
                      'subject_link': subject_link,
                      'subject_id': subject_id,
                  }
                  # add this to the Request's meta dict
                  subject_data.meta['subject_info'] = subject_info
      
                  # ask Scrapy to fetch additional data
                  yield subject_data
      
          def parse_course(self, response):
      
              # get back the data that was passed previously
              subject_info = response.request.meta['subject_info']
      
              subject_id = response.css('::attr(id)').extract_first().strip()
      
              for course in response.css('course'):
      
                  course_name = course.css('course::text').extract_first().strip()
                  course_link = course.css('course::attr(href)').extract_first().strip()
                  course_id = course.css('course::attr(id)').extract_first().strip()
      
                  if course_link is not None:
                      course_data = scrapy.Request(course_link, callback=self.parse_class)
      
                  # build a dict with the data in this page
                  # + the data scraped previously
                  course_info = {
                      'course_name': course_name,
                      'course_link': course_link,
                      'course_id': subject_id + " " + course_id,
                      'subject_info': subject_info,
                  }
      
                  # pass that data to the next callback
                  course_data.meta['course_info'] = course_info
      
                  # fetch the class page
                  yield course_data
      
          def parse_class(self, response):
      
              # get course data from previous callbacks
              course_info = response.request.meta['course_info']
      
              course_id = response.css('::attr(id)').extract_first().strip()
      
              for section in response.css('section'):
                  section_name = section.css('section::text').extract_first().strip()
                  section_link = section.css('section::attr(href)').extract_first().strip()
      
                  yield {
                      'section_name': section_name,
                      'section_link': section_link,
                      'course_id': course_id,
                      'course_info': course_info
                  }
      

So you do not get subjects containing courses, which themselves contain sections; instead, the structure is inverted: each section item carries the information on the course it belongs to, which in turn carries the information on the subject it is under.
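If you do want the nested tree from the question, you can rebuild it from this flat output with a short post-processing script. Here is a minimal sketch, assuming the spider was run as "scrapy crawl courses -o sections.json"; the file names here are only examples:

    import json

    # Minimal post-processing sketch (file names are hypothetical):
    # rebuild the nested subject -> course -> section tree from the
    # flat section items produced by Method 1.
    with open('sections.json') as f:
        items = json.load(f)

    subjects = {}
    for item in items:
        course_info = item.pop('course_info')
        subject_info = course_info.pop('subject_info')

        # group courses under their subject, and sections under their course
        subject = subjects.setdefault(subject_info['subject_id'],
                                      dict(subject_info, subject_data={}))
        course = subject['subject_data'].setdefault(course_info['course_id'],
                                                    dict(course_info, course_data=[]))
        course['course_data'].append(item)

    # turn the id-keyed course dicts back into lists for the final tree
    tree = [dict(s, subject_data=list(s['subject_data'].values()))
            for s in subjects.values()]

    with open('courses_tree.json', 'w') as f:
        json.dump(tree, f, indent=2)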

Method 2. (Warning: I have not tested this in practice, but it may work.)

      import scrapy
      from inline_requests import inline_requests
      
      class CoursesSpider(scrapy.Spider):
          name = "courses"
          start_urls = [
              'http://example.com'
          ]
      
          # this decorator is important
          @inline_requests
          def parse(self, response):
      
              for subject in response.css('subject'):
      
                  subject_name = subject.css('subject::text').extract_first().strip()
                  subject_link = subject.css('subject::attr(href)').extract_first().strip()
                  subject_id = subject.css('subject::attr(id)').extract_first().strip()
      
                  # this list will collect information on courses for this subject
                  subject_data = []
      
                  if subject_link is not None:
                      try:
                          # you ask scrapy to fetch the page
                          # but you do not set a callback
                          subject_response = yield scrapy.Request(subject_link)
                          # and you get a Response to work on when it's fetched,
                          # without going through a callback
      
                          subject_id = subject_response.css('::attr(id)').extract_first().strip()
      
                          for course in subject_response.css('course'):
      
                              course_name = course.css('course::text').extract_first().strip()
                              course_link = course.css('course::attr(href)').extract_first().strip()
                              course_id = course.css('course::attr(id)').extract_first().strip()
      
                              # this list will collect information on sections for this course
                              course_data = []
                              if course_link is not None:
                                  try:
                                      # same thing here, you ask Scrapy to fetch a Response
                                      course_response = yield scrapy.Request(course_link)
      
                                      course_id = course_response.css('::attr(id)').extract_first().strip()
      
                                      for section in course_response.css('section'):
                                          section_name = section.css('section::text').extract_first().strip()
                                          section_link = section.css('section::attr(href)').extract_first().strip()
      
                                          # add each section item
                                          course_data.append(
                                              {
                                                  'section_name': section_name,
                                                  'section_link': section_link,
                                                  'course_id': course_id,
                                              }
                                          )
      
                                  except:
                                      raise
      
                              # add each course item
                              subject_data.append(
                                  {
                                      'course_name': course_name,
                                      'course_link': course_link,
                                      'course_id': subject_id + " " + course_id,
                                      'course_data': course_data,
                                  }
                              )
      
                      except:
                          raise
      
      
                  yield {
                      'subject_name': subject_name,
                      'subject_link': subject_link,
                      'subject_id': subject_id,
                      'subject_data': subject_data,
                  }
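
With this approach, each subject item already embeds its courses and sections, so the "scrapy crawl courses -o courses.json" command from the question should write the tree structure directly. Note that scrapy-inline-requests is a third-party package, not part of Scrapy itself; it should be available on PyPI (e.g. pip install scrapy-inline-requests).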