Scrapy JSON输出-值为空-尝试XPath

时间:2019-10-03 00:23:59

标签: python-3.x scrapy

我修改了一个旧脚本并使其运行。但是,旧值未将任何内容输出到JSON。我是报废的新手。我正在练习scraping确实。另外,我将如何拉出我正在搜索的“远程”关键字以列为“作业类型”。我也不确定我是否有正确的网址和规则。谢谢

我知道脚本可以运行,但是我需要响应CSS或response.xpath的帮助。我可以找到所有的xpath值,但是有些不起作用。 xpathing“ jobtitle”我得到了一堆代码/就像onmouse click一样。提供代码。

class IndeedSpider(CrawlSpider):
name = "indeed"
allowed_domains = ["indeed.com"]
start_urls = [
    "https://www.indeed.com/jobs?q=remote&l=",
]


rules = ( 
    Rule(LinkExtractor(allow=('/jobs.q=linux&l=remote&l$','q=linux&l=remote&sort=l&start=[0-9]+$',),deny=('/my/mysearches', '/preferences', '/advanced_search','/my/myjobs')), callback='parse_item', follow=True),


    )

def parse_next_site(self, response):
    item = response.request.meta['item'] 
    item['source_url'] = response.url
    item['source_page_body'] = response.body
    item['crawl_timestamp'] =  time.strftime('%Y-%m-%d %H:%M:%S')



    def parse_item(self, response):
    self.log('\n Crawling  %s\n' % response.url)
    hxs = Selector(response)
    sites = hxs.select("//div[@class='row ' or @class='row lastRow']")
    #sites = hxs.select("//div[@class='row ']")
    items = []
    for site in sites:
        item = IndeedItem(company='none')
        # Producing output with onmouse click. etc. Gets title as well.
        item['job_title'] = site.select("//a[contains(concat(' ', normalize-space(@class), ' '),' jobtitle ')]").extract()
        # link not working
        link_url= site.select('h2/a/@href').extract()
        item['link_url'] = link_url
        item['crawl_url'] = response.url
        item['location'] = site.select("//span[contains(concat(' ', normalize-space(@class), ' '),' location ')]/text()").extract()
        # salary returns ''
        item['salary'] = site.select("//span[contains(concat(' ', normalize-space(@class), ' '),' salaryText ')]").extract()
        # Not all entries have a company. got a lot of , '\n
        if  site.select("//span[contains(concat(' ', normalize-space(@class), ' '),' company ')]/text()").extract() == []:
            item['company'] = [u'']
        else:
            item['company'] = site.select("//span[contains(concat(' ', normalize-space(@class), ' '),' company ')]/text()").extract()
                    # Summary seems to work
            item['summary'] = site.select("//div[contains(concat(' ', normalize-space(@class), ' '),' summary ')]").extract()
        item['source'] = site.select("table/tr/td/span[@class='source']/text()").extract()
        item['found_date'] = site.select("table/tr/td/span[@class='date']/text()").extract()
        #item['source_url'] = self.get_source(link_url)
        request = Request("http://www.indeed.com" +   item['link_url'][0], callback=self.parse_next_site)
    request.meta['item'] = item
    yield request
    items.append(item)
    return


SPIDER=IndeedSpider()

也许有人可以测试现有代码以查看一些输出,并告诉我该怎么做才能修复不起作用。确实可以帮助我继续前进,弄清自己在做错什么,并更好地了解这些事情的工作原理。再次感谢。

1 个答案:

答案 0 :(得分:1)

当使用xpath遍历草率选择器时,使用'.//myxpath'使用实际路径,您可以在此处查看代码示例 希望对您有帮助:)

from scrapy.spiders import CrawlSpider
from scrapy.http import Request, Response
from scrapy.linkextractors import LinkExtractor
import time


class IndeedSpider(CrawlSpider):
    name = "indeed"
    allowed_domains = ["indeed.com"]
    start_urls = [
        "https://www.indeed.com/jobs?q=remote&l=",
    ]

    def start_requests(self):
        for link in IndeedSpider.start_urls:
            yield Request(url=link, callback=self.parse_site)

    def parse_site(self, response: Response):
        extracted_links = LinkExtractor(
            allow=['/jobs.q=linux&l=remote&l$', 'q=linux&l=remote&sort=l&start=[0-9]+$'],
            deny=['/my/mysearches', '/preferences', '/advanced_search', '/my/myjobs']) \
            .extract_links(response)

        for link in extracted_links:
            yield Request(url=link.url, callback=self.parse_item)

    def parse_item(self, response: Response):
        self.log('\n Crawling  %s\n' % response.url)
        sites = response.xpath("//div[@class='row ' or @class='row lastRow']")
        # sites = hxs.select("//div[@class='row ']")
        items = []
        for site in sites:
            item = IndeedItem(company='none')
            # Producing output with onmouse click. etc. Gets title as well.

            # when Iterating over selectors use .// to use a relative xpath
            item['job_title'] = site.xpath(".//a[has-class('jobtitle')]").get()
            # link not working
            link_url = site.xpath('.//h2/a/@href').get()
            item['link_url'] = link_url
            item['crawl_url'] = response.url
            item['location'] = site.xpath(".//span[has-class('location')]/text()").get()
            # salary returns ''
            item['salary'] = site.xpath(".//span[has-class('salaryText')]").get()
            # Not all entries have a company. got a lot of , '\n
            if not site.xpath(".//span[has-class('company')]/text()").getall():
                item['company'] = [u'']
            else:
                item['company'] = site.xpath(".//span[has-class('company')/text()").get()
                # Summary seems to work
                item['summary'] = site.xpath("//div[has-class('summary')]").get()
            item['source'] = site.xpath(".//table/tr/td/span[@class='source']/text()").get()
            item['found_date'] = site.xpath(".//table/tr/td/span[@class='date']/text()").get()
            # item['source_url'] = self.get_source(link_url)
            request = Request("http://www.indeed.com" + item['link_url'][0], callback=self.parse_next_site)
            request.meta['item'] = item
            yield request
            items.append(item)

    def parse_next_site(self, response: Response):
        item = response.request.meta['item']
        item['source_url'] = response.url
        item['source_page_body'] = response.body
        item['crawl_timestamp'] = time.strftime('%Y-%m-%d %H:%M:%S')