# -*- coding: utf-8 -*-

Question

我面临此页面的问题：

https://www.ouedkniss.com/op%C3%A9rateur-sur-machine-bejaia-boudjellil-algerie-offres-d-emploi-d19820393

我想抓取以下元素：

雇主：SARL UFMATP AZIEZ ET ASSOCIES

Poste：Opérateur sur machine

但是当我使用xpath查找元素时，它看不到它们并跳转到其他元素

        'Titre': response.xpath("normalize-space(//h1[@id='Title']/text())").get(),

        'Boss': response.xpath("//*[@id='Employeur']/span/text()").get(),

此脚本能返回“Titre”的值，但无法返回“Boss”的值。我检查过页面中是否存在 iframe，但并没有。任何帮助都将不胜感激。

import scrapy
import requests
from scrapy.http import Request
from urllib.parse import urljoin
from scrapy import Selector
from sidahmed.items import sidahmedItem
from scrapy.pipelines.files import FilesPipeline
from scrapy.http import HtmlResponse
from scrapy.utils.markup import remove_tags
# NOTE(review): Scrapy only honours ``custom_settings`` when it is declared as
# a class attribute of the spider; as a module-level name this dict is not
# picked up automatically -- confirm whether it should live on the spider.
custom_settings = {
    'ITEM_PIPELINES': {
        'sidahmed.pipelines.MyImagesPipeline': 1,
    },
}

# Pagination bounds. ``number_of_pages`` is only referenced from
# commented-out code in this file and ``starting_number`` is not referenced
# at all, so neither affects the crawl as written.
starting_number = 1
number_of_pages = 10
class sidahmed(scrapy.Spider):
    """Crawl ouedkniss.com job listings and yield one dict per announcement.

    ``parse`` handles the listing page(s) in ``start_urls`` and schedules a
    request for every "details" link; ``parse_annonces`` handles each detail
    page and yields the scraped fields.
    """

    name = "sidahmed"
    allowed_domains = ["ouedkniss.com"]
    # range(1) produces a single URL ending in "/0": only the first listing
    # page is crawled.
    start_urls = [
        "https://www.ouedkniss.com/emploi_demandes/industrie-production/{}".format(i)
        for i in range(1)
    ]

    def __init__(self, *args, **kwargs):
        """Initialise the spider.

        Positional and keyword arguments are forwarded to ``scrapy.Spider``
        so that spider arguments (``scrapy crawl sidahmed -a key=value``) do
        not raise ``TypeError`` and the base-class initialisation runs.
        """
        super().__init__(*args, **kwargs)
        # Page counter; not read by any active code in this class.
        self.page_number = 1

    def parse(self, response):
        """Yield a request for every announcement linked from a listing page.

        Args:
            response: the downloaded listing page.

        Yields:
            scrapy.Request: one per ``<a class="button button_details">``
            link, routed to ``parse_annonces``.
        """
        hrefs = response.xpath(
            "//a[@class = 'button button_details']/@href"
        ).getall()

        for href in hrefs:
            # hrefs may be relative; resolve them against the page URL.
            yield scrapy.Request(
                urljoin(response.url, href),
                callback=self.parse_annonces,
            )

    def parse_annonces(self, response):
        """Yield the scraped fields of one announcement detail page.

        Args:
            response: the downloaded announcement page.

        Yields:
            dict: keys ``'Titre'`` and ``'Boss'``. One dict is produced per
            ``<div id="annonce">`` found; nothing is yielded when the page
            has no such element.
        """
        # The field XPaths are absolute (document-wide), so the matched
        # <div id="annonce"> only gates whether an item is emitted.
        for _annonce in response.xpath("//div[@id='annonce']"):
            yield {
                # normalize-space() trims and collapses whitespace; it gives
                # an empty string, not None, when the <h1> is missing.
                'Titre': response.xpath(
                    "normalize-space(//h1[@id='Title']/text())"
                ).get(),
                # None when no <span> text exists under the element whose
                # id is "Employeur" in the downloaded HTML.
                'Boss': response.xpath(
                    "//*[@id='Employeur']/span/text()"
                ).get(),
            }

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

    import scrapy
    from scrapy.item import Item

    class sidahmedItem(Item):
        """Declares the fields an announcement item may carry.

        Every attribute below is a plain ``scrapy.Field()`` with no metadata.
        """

        # Image handling -- presumably consumed by the images pipeline
        # enabled in the project settings; confirm against pipelines.py.
        image_urls = scrapy.Field()
        images = scrapy.Field()

        # Announcement attributes.
        Titre = scrapy.Field()
        Ville = scrapy.Field()
        Carburant = scrapy.Field()
        Boss = scrapy.Field()
        Prix = scrapy.Field()
        Statut = scrapy.Field()
        Description = scrapy.Field()
        Tel = scrapy.Field()
        index = scrapy.Field()

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for sidahmed project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

# Name of the Scrapy project / bot.
BOT_NAME = "sidahmed"

# Package that holds the project's spiders, and the package in which
# `scrapy genspider` creates new ones.
SPIDER_MODULES = ["sidahmed.spiders"]
NEWSPIDER_MODULE = "sidahmed.spiders"

# Only MyImagesPipeline is enabled. MyFilesPipeline and CustomImagesPipeline
# were earlier alternatives and are intentionally left out.
ITEM_PIPELINES = {
    "sidahmed.pipelines.MyImagesPipeline": 1,
}

# Directory used as the images store (relative path).
IMAGES_STORE = "./images"

# robots.txt rules are NOT obeyed by this project.
ROBOTSTXT_OBEY = False

XPath 找不到元素，也不返回任何值

# -*- coding: utf-8 -*-

0 个答案: