我有一个密码。
import scrapy
import requests
class cvbankas(scrapy.Spider):
name ='bankas'
allowed_domains =['cvbankas.lt']
start_urls = ['https://www.cvbankas.lt/']
def parse(self,response):
job_position_tag = response.css("h3.list_h3::text").extract()
city_tag = response.css("span.list_city::text").extract()
company_tag = response.css("span.dib.mt5::text").extract()
salary_tag = response.css("span.salary_amount::text").extract()
for item in zip(job_position_tag,city_tag,company_tag,salary_tag):
scraped_info={
'company':company_tag,
'city': city_tag,
'position': job_position_tag,
'salary': salary_tag,
}
yield scraped_info
next_page = response.css('li > a::attr(href)').extract_first()
if next_page:
next_page = response.urljoin(next_page)
yield scrapy.Request(url = next_page, callback = self.parse)
我不知道为什么它只能刮3页
Output marked in red is only 3 pages of 88
分页问题在哪里?
答案 0 :(得分:1)
您的选择器正在找到他可以找到的第一个<a>
标签,即语言<a>
标签。您正在更改语言而不是页面。
import scrapy
import requests
class cvbankas(scrapy.Spider):
name ='bankas'
allowed_domains =['cvbankas.lt']
start_urls = ['https://www.cvbankas.lt/']
def parse(self,response):
job_position_tag = response.css("h3.list_h3::text").extract()
city_tag = response.css("span.list_city::text").extract()
company_tag = response.css("span.dib.mt5::text").extract()
salary_tag = response.css("span.salary_amount::text").extract()
for item in zip(job_position_tag,city_tag,company_tag,salary_tag):
scraped_info={
'company':company_tag,
'city': city_tag,
'position': job_position_tag,
'salary': salary_tag,
}
yield scraped_info
next_page = response.xpath('//a[@class="prev_next"]/@href').extract()[-1]
if next_page:
next_page = response.urljoin(next_page)
yield scrapy.Request(url = next_page, callback = self.parse)
答案 1 :(得分:0)
我似乎您要抓取的网站使用的网址格式为uri?page=x
一个简单的循环来替换x可以解决您的问题。