在scrapy中创建嵌套项

时间:2016-06-14 15:40:15

标签: scrapy

我是scrapy的新手,并且正在尝试使用以下格式抓取网站以获取内容:

{
    "book": {
        "title": "xyz,",
        "chapters": {
            "title": "xyz",
            "link": "xyz",
            "articles": [
                {
                "article_name": "xyz",
                "article_content": "xyz",
                },
                {
                "article_name": "xyz",
                "article_content": "xyz",
                }
            ]
        }
    }
}

我的蜘蛛在下面。我还没有能够更新章节["文章"]字段,使其等于请求调用的结果。

有没有人有任何帮助我的指示?

import time
import scrapy

from selenium import webdriver

class BookItem(scrapy.Item):
    title = scrapy.Field()
    chapters = scrapy.Field()

class ChapterItem(scrapy.Item):
    title = scrapy.Field()
    link = scrapy.Field()
    articles = scrapy.Field()

class ArticleItem(scrapy.Item):
    article_name = scrapy.Field()
    article_content = scrapy.Field()


class MySpider(scrapy.Spider):
    name = "book"
    allowed_domains = ["example.com"]
    start_urls = ["https://example.com"]

    def __init__(self):
        self.driver = webdriver.Chrome()

    # GO THRU DIFFERENT BOOKS
    def parse(self, response):


        self.driver.get(response.url)

        books = response.css("ul.book-list li a::text").extract()

        for book in books[0:1]:
            book_list = BookItem()
            book_list["title"] = book.strip()

            # CLICK BOOK LINK
            el = self.driver.find_element_by_xpath("//a[text()='" + book.strip() + "']")


            if el:
                el.click()

            time.sleep(5)

            el = self.driver.find_element_by_xpath("//a[contains(@class,'list-view')]")
            if el:
                el.click()

            time.sleep(5)

            s1 = scrapy.Selector(text=self.driver.page_source)

            # GET CHAPTERS
            chapters = s1.css("#list div.grid__item li a")

            output = []

            for chapter in chapters[0:2]:
                item = ChapterItem()
                item["title"] = chapter.xpath('./text()').extract()[0].strip()
                item["link"] = chapter.xpath("./@href").extract()[0].strip()
                item["articles"] = ""

                yield scrapy.Request(item["link"], meta={'item': item}, callback=self.parse_chapter)

                output.append(dict(item))

            book_list["chapters"] = output
            yield book_list


        self.driver.close()

    def parse_chapter(self, response):

        item = response.meta['item']
        item['address'] = response.css('div.chapter-detail p.mv0::text').extract()[0].strip()

        article_item = ArticleItem()
        article_selector = response.css('ul.article-data')
        articles = []
        for article in article_selector:
            article_item['article_name'] = article.css('li.col01 a::text').extract()[0].strip()
            article_item['article_content'] = article.css('li.col02::text').extract()[0].strip()

            articles.append(dict(article_item))

        item['article'] = articles

        yield item

0 个答案:

没有答案