I have the functionality to extract articles from a single page, but I can't navigate to the next page to scrape all of the pages:
Here is what I have tried:
import scrapy
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor


class MedicalSpider(scrapy.Spider):
    name = 'medical'
    # allowed_domains = ['https://blogs.webmd.com/diabetes/default.htm']
    allowed_domains = ['blogs.webmd.com']  # Only the domain, not the URL
    start_urls = ['https://blogs.webmd.com/diabetes/default.htm']

    def parse(self, response):
        article_links = response.css('.posts-list-post-content a ::attr(href)')
        print(article_links)
        for link in article_links:
            url = link.get()
            if url:
                yield response.follow(url=url, callback=self.parse_article)

    def parse_article(self, response):
        headline = response.css('.blog-header-container h1::text').get()
        article_sections = response.css('.article-body .article-page section p::text')
        body = ""
        for article_sections in article_sections:
            body += article_sections.get() + "\n"
        yield {
            'headline': headline,
            'body': body
        }

        # url_apnd = "https://blogs.webmd.com/diabetes"
        next_page = response.css('.next a ::attr(href)').get()
        print(next_page)
        # print("URL " + response.urljoin(next_page))
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)
Please help me navigate to the next page correctly.
Answer 0 (score: 1):
Since the next-page button lives on the listing page (the one in start_urls), you need to move the next-page logic into the parse function.
import scrapy
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor


class MedicalSpider(scrapy.Spider):
    name = 'medical'
    allowed_domains = ['blogs.webmd.com']  # Only the domain, not the URL
    start_urls = ['https://blogs.webmd.com/diabetes/default.htm']

    def parse(self, response):
        # Queue every article found on the current listing page.
        article_links = response.css('.posts-list-post-content a ::attr(href)')
        for link in article_links:
            url = link.get()
            if url:
                yield response.follow(url=url, callback=self.parse_article)

        # The pagination link is on the listing page, so follow it here,
        # not in parse_article.
        next_page = response.css('.next a ::attr(href)').get()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

    def parse_article(self, response):
        headline = response.css('.blog-header-container h1::text').get()
        article_sections = response.css('.article-body .article-page section p::text')
        body = ""
        for section in article_sections:
            body += section.get() + "\n"
        yield {
            'headline': headline,
            'body': body
        }
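
As an aside, both snippets import Rule and LinkExtractor but never use them. If you would rather let Scrapy follow the links declaratively, the same crawl can be expressed as a CrawlSpider. The sketch below is untested against the live site and assumes the same CSS hooks as your code (.next for pagination, .posts-list-post-content for article links); the spider name medical_crawl is made up for illustration.

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class MedicalCrawlSpider(CrawlSpider):
    name = 'medical_crawl'  # hypothetical name, rename as needed
    allowed_domains = ['blogs.webmd.com']
    start_urls = ['https://blogs.webmd.com/diabetes/default.htm']

    rules = (
        # Follow pagination links; no callback, so these pages are
        # only crawled for more links.
        Rule(LinkExtractor(restrict_css='.next')),
        # Extract article links and hand each article page to parse_article.
        Rule(LinkExtractor(restrict_css='.posts-list-post-content'),
             callback='parse_article'),
    )

    # Note: a CrawlSpider must not override parse(), so the item
    # callback gets its own name.
    def parse_article(self, response):
        headline = response.css('.blog-header-container h1::text').get()
        body = "\n".join(
            response.css('.article-body .article-page section p::text').getall()
        )
        yield {'headline': headline, 'body': body}

Either spider is run the same way, e.g. scrapy crawl medical -o articles.json to dump the scraped items to a JSON file.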