对不起,这很愚蠢,但是我在使刮板工作时遇到了问题,我不知道该在哪里。
单独的xpath项目可以工作,但总体而言,spider不会产生任何结果。
有人可以看看这个,让我知道我在想什么吗?我是一个完整的初学者,如果这很傻,我深表歉意。
import scrapy
from ..items import IndeedItem
class JobsSpider(scrapy.Spider):
name = "jobs"
start_urls = [
'https://www.indeed.com/jobs?q=remote&l=United+States',
]
def parse(self, response):
items = project111Item()
#all_rows = response.xpath('.//*[@data-tn-component="organicJob"]')
for response in response.xpath('.//*[@data-tn-component="organicJob"]'):
item = project111Item()
item['job_title'] = response.xpath('.//a[@data-tn-element="jobTitle"]/@title[1]').extract(),
item['company'] = response.xpath(".//span[@class='company']//a/text()").extract(),
item['salary'] = response.xpath(".//span[@class='company']//a/text()").extract(),
item['location'] = response.xpath('.//span[@class="location accessible-contrast-color-location"]/text()').extract(),
item['link'] = response.xpath(".//h2[@class='title']//a/@href").extract()
#if link:
#item['link'] = 'https://www.indeed.com' + link
yield item
# follow pagination link
next_page_url = response.css('#resultsCol > nav > div > ul > li > a::attr(href)').extract_first()
if next_page_url:
next_page_url = response.urljoin(next_page_url)
yield scrapy.Request(url=next_page_url, callback=self.parse)
答案 0 :(得分:1)
我认为主要问题是您在for循环中的命名。尝试将for response in response.xpath(...)
更改为其他内容。以下可能有效:
def parse(self, response):
for job in response.xpath('.//*[@data-tn-component="organicJob"]'):
item = IndeedItem()
item['job_title'] = job.xpath('.//a[@data-tn-element="jobTitle"]/@title[1]').extract(),
item['company'] = job.xpath(".//span[@class='company']//a/text()").extract(),
item['salary'] = job.xpath(".//span[@class='company']//a/text()").extract(),
item['location'] = job.xpath('.//span[@class="location accessible-contrast-color-location"]/text()').extract(),
item['link'] = job.xpath(".//h2[@class='title']//a/@href").extract()
yield item
next_page_url = response.css('#resultsCol > nav > div > ul > li > a::attr(href)').extract_first()
if next_page_url:
next_page_url = response.urljoin(next_page_url)
yield scrapy.Request(url=next_page_url, callback=self.parse)