我修改了一个旧脚本并使其运行。但是,旧值未将任何内容输出到JSON。我是报废的新手。我正在练习scraping确实。另外,我将如何拉出我正在搜索的“远程”关键字以列为“作业类型”。我也不确定我是否有正确的网址和规则。谢谢
我知道脚本可以运行,但是我需要响应CSS或response.xpath的帮助。我可以找到所有的xpath值,但是有些不起作用。 xpathing“ jobtitle”我得到了一堆代码/就像onmouse click一样。提供代码。
class IndeedSpider(CrawlSpider):
name = "indeed"
allowed_domains = ["indeed.com"]
start_urls = [
"https://www.indeed.com/jobs?q=remote&l=",
]
rules = (
Rule(LinkExtractor(allow=('/jobs.q=linux&l=remote&l$','q=linux&l=remote&sort=l&start=[0-9]+$',),deny=('/my/mysearches', '/preferences', '/advanced_search','/my/myjobs')), callback='parse_item', follow=True),
)
def parse_next_site(self, response):
item = response.request.meta['item']
item['source_url'] = response.url
item['source_page_body'] = response.body
item['crawl_timestamp'] = time.strftime('%Y-%m-%d %H:%M:%S')
def parse_item(self, response):
self.log('\n Crawling %s\n' % response.url)
hxs = Selector(response)
sites = hxs.select("//div[@class='row ' or @class='row lastRow']")
#sites = hxs.select("//div[@class='row ']")
items = []
for site in sites:
item = IndeedItem(company='none')
# Producing output with onmouse click. etc. Gets title as well.
item['job_title'] = site.select("//a[contains(concat(' ', normalize-space(@class), ' '),' jobtitle ')]").extract()
# link not working
link_url= site.select('h2/a/@href').extract()
item['link_url'] = link_url
item['crawl_url'] = response.url
item['location'] = site.select("//span[contains(concat(' ', normalize-space(@class), ' '),' location ')]/text()").extract()
# salary returns ''
item['salary'] = site.select("//span[contains(concat(' ', normalize-space(@class), ' '),' salaryText ')]").extract()
# Not all entries have a company. got a lot of , '\n
if site.select("//span[contains(concat(' ', normalize-space(@class), ' '),' company ')]/text()").extract() == []:
item['company'] = [u'']
else:
item['company'] = site.select("//span[contains(concat(' ', normalize-space(@class), ' '),' company ')]/text()").extract()
# Summary seems to work
item['summary'] = site.select("//div[contains(concat(' ', normalize-space(@class), ' '),' summary ')]").extract()
item['source'] = site.select("table/tr/td/span[@class='source']/text()").extract()
item['found_date'] = site.select("table/tr/td/span[@class='date']/text()").extract()
#item['source_url'] = self.get_source(link_url)
request = Request("http://www.indeed.com" + item['link_url'][0], callback=self.parse_next_site)
request.meta['item'] = item
yield request
items.append(item)
return
SPIDER=IndeedSpider()
也许有人可以测试现有代码以查看一些输出,并告诉我该怎么做才能修复不起作用。确实可以帮助我继续前进,弄清自己在做错什么,并更好地了解这些事情的工作原理。再次感谢。
答案 0 :(得分:1)
当使用xpath遍历草率选择器时,使用'.//myxpath'使用实际路径,您可以在此处查看代码示例 希望对您有帮助:)
from scrapy.spiders import CrawlSpider
from scrapy.http import Request, Response
from scrapy.linkextractors import LinkExtractor
import time
class IndeedSpider(CrawlSpider):
name = "indeed"
allowed_domains = ["indeed.com"]
start_urls = [
"https://www.indeed.com/jobs?q=remote&l=",
]
def start_requests(self):
for link in IndeedSpider.start_urls:
yield Request(url=link, callback=self.parse_site)
def parse_site(self, response: Response):
extracted_links = LinkExtractor(
allow=['/jobs.q=linux&l=remote&l$', 'q=linux&l=remote&sort=l&start=[0-9]+$'],
deny=['/my/mysearches', '/preferences', '/advanced_search', '/my/myjobs']) \
.extract_links(response)
for link in extracted_links:
yield Request(url=link.url, callback=self.parse_item)
def parse_item(self, response: Response):
self.log('\n Crawling %s\n' % response.url)
sites = response.xpath("//div[@class='row ' or @class='row lastRow']")
# sites = hxs.select("//div[@class='row ']")
items = []
for site in sites:
item = IndeedItem(company='none')
# Producing output with onmouse click. etc. Gets title as well.
# when Iterating over selectors use .// to use a relative xpath
item['job_title'] = site.xpath(".//a[has-class('jobtitle')]").get()
# link not working
link_url = site.xpath('.//h2/a/@href').get()
item['link_url'] = link_url
item['crawl_url'] = response.url
item['location'] = site.xpath(".//span[has-class('location')]/text()").get()
# salary returns ''
item['salary'] = site.xpath(".//span[has-class('salaryText')]").get()
# Not all entries have a company. got a lot of , '\n
if not site.xpath(".//span[has-class('company')]/text()").getall():
item['company'] = [u'']
else:
item['company'] = site.xpath(".//span[has-class('company')/text()").get()
# Summary seems to work
item['summary'] = site.xpath("//div[has-class('summary')]").get()
item['source'] = site.xpath(".//table/tr/td/span[@class='source']/text()").get()
item['found_date'] = site.xpath(".//table/tr/td/span[@class='date']/text()").get()
# item['source_url'] = self.get_source(link_url)
request = Request("http://www.indeed.com" + item['link_url'][0], callback=self.parse_next_site)
request.meta['item'] = item
yield request
items.append(item)
def parse_next_site(self, response: Response):
item = response.request.meta['item']
item['source_url'] = response.url
item['source_page_body'] = response.body
item['crawl_timestamp'] = time.strftime('%Y-%m-%d %H:%M:%S')