I'm new to Scrapy and I'm trying to scrape a website so that the content comes out in the following format:
{
    "book": {
        "title": "xyz",
        "chapters": {
            "title": "xyz",
            "link": "xyz",
            "articles": [
                {
                    "article_name": "xyz",
                    "article_content": "xyz"
                },
                {
                    "article_name": "xyz",
                    "article_content": "xyz"
                }
            ]
        }
    }
}
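Strictly speaking, since a book has several chapters and a chapter has several articles, I expect "chapters" to end up as a list as well; in plain Python the record I'm after would look roughly like this:

book = {
    "title": "xyz",
    "chapters": [
        {
            "title": "xyz",
            "link": "xyz",
            "articles": [
                {"article_name": "xyz", "article_content": "xyz"},
                {"article_name": "xyz", "article_content": "xyz"},
            ],
        },
    ],
}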
My spider is below. I haven't been able to update the chapter's "articles" field so that it ends up equal to the result of the Request call.
Does anyone have any pointers for me?
import time

import scrapy
from selenium import webdriver


class BookItem(scrapy.Item):
    title = scrapy.Field()
    chapters = scrapy.Field()


class ChapterItem(scrapy.Item):
    title = scrapy.Field()
    link = scrapy.Field()
    articles = scrapy.Field()


class ArticleItem(scrapy.Item):
    article_name = scrapy.Field()
    article_content = scrapy.Field()


class MySpider(scrapy.Spider):
    name = "book"
    allowed_domains = ["example.com"]
    start_urls = ["https://example.com"]

    def __init__(self):
        self.driver = webdriver.Chrome()

    # GO THRU DIFFERENT BOOKS
    def parse(self, response):
        self.driver.get(response.url)
        books = response.css("ul.book-list li a::text").extract()
        for book in books[0:1]:
            book_list = BookItem()
            book_list["title"] = book.strip()

            # CLICK BOOK LINK
            el = self.driver.find_element_by_xpath("//a[text()='" + book.strip() + "']")
            if el:
                el.click()
                time.sleep(5)

            el = self.driver.find_element_by_xpath("//a[contains(@class,'list-view')]")
            if el:
                el.click()
                time.sleep(5)

            s1 = scrapy.Selector(text=self.driver.page_source)

            # GET CHAPTERS
            chapters = s1.css("#list div.grid__item li a")
            output = []
            for chapter in chapters[0:2]:
                item = ChapterItem()
                item["title"] = chapter.xpath('./text()').extract()[0].strip()
                item["link"] = chapter.xpath("./@href").extract()[0].strip()
                item["articles"] = ""
                yield scrapy.Request(item["link"], meta={'item': item}, callback=self.parse_chapter)
                output.append(dict(item))

            book_list["chapters"] = output
            yield book_list

        self.driver.close()

    def parse_chapter(self, response):
        item = response.meta['item']
        item['address'] = response.css('div.chapter-detail p.mv0::text').extract()[0].strip()

        article_item = ArticleItem()
        article_selector = response.css('ul.article-data')
        articles = []
        for article in article_selector:
            article_item['article_name'] = article.css('li.col01 a::text').extract()[0].strip()
            article_item['article_content'] = article.css('li.col02::text').extract()[0].strip()
            articles.append(dict(article_item))

        item['article'] = articles
        yield item
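My understanding is that yield scrapy.Request(...) only schedules the request, so output.append(dict(item)) runs before parse_chapter has filled anything in; I also notice ChapterItem declares articles but the callback writes address and article, which probably doesn't help. The rough shape I'm considering is below (untested, selectors copied from above, with the book kept as a plain dict so every chapter callback can append to the same object), but I don't know if this is the idiomatic Scrapy way:

    def parse(self, response):
        # ... Selenium navigation exactly as above ...
        s1 = scrapy.Selector(text=self.driver.page_source)

        # plain dict so the same object can be shared across callbacks
        book = {"title": book_title, "chapters": []}  # book_title = the <a> text from the book list

        chapters = s1.css("#list div.grid__item li a")[0:2]
        for chapter in chapters:
            item = ChapterItem()
            item["title"] = chapter.xpath("./text()").extract_first("").strip()
            item["link"] = chapter.xpath("./@href").extract_first("").strip()
            item["articles"] = []
            # every request carries the same `book` dict plus the total
            # number of chapters it is waiting for
            yield scrapy.Request(
                item["link"],
                callback=self.parse_chapter,
                meta={"chapter": item, "book": book, "total": len(chapters)},
            )

    def parse_chapter(self, response):
        chapter = response.meta["chapter"]
        book = response.meta["book"]

        articles = []
        for row in response.css("ul.article-data"):
            article = ArticleItem()
            article["article_name"] = row.css("li.col01 a::text").extract_first("").strip()
            article["article_content"] = row.css("li.col02::text").extract_first("").strip()
            articles.append(dict(article))

        chapter["articles"] = articles
        book["chapters"].append(dict(chapter))

        # only emit the book once the last chapter callback has reported back
        if len(book["chapters"]) == response.meta["total"]:
            yield book

I realise this never yields the book if any chapter request fails, which is one of the things I'm unsure about.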