Scrapy不会进入解析

时间:2016-01-27 00:54:12

标签: xpath scrapy scrapy-spider

我是scrapy的新手,我正在尝试获取这些页面的出版物。前3个函数工作(发布存储在数据库中),但似乎第四个没有运行(parse_web3,我做错了?

class crawler(scrapy.Spider):
   name = "crawler"

   start_urls = ["http://www.sac.org.ar/argentine-cardiology-journal-archive",
                "http://rinfi.fi.mdp.edu.ar/xmlui/recent-submissions",
                "http://road.issn.org/issn_search?afs:query=&show-adv=0&afs:replies=100#.VqaLtl4oDtR",
                "http://www.intechopen.com/books"]



   def parse(self, response):

           for href in response.css("a::attr('href')"):
               print href
               url = response.urljoin(href.extract())
               yield scrapy.Request(url, callback=self.parse_web0)
               url2 = "http://rinfi.fi.mdp.edu.ar/xmlui/recent-submissions"
               yield scrapy.Request(url2,callback=self.parse_web1)
               url3 = "http://road.issn.org/issn_search?afs:query=&show-adv=0&afs:replies=100#.VqaLtl4oDtR"
               yield scrapy.Request(url3,callback=self.parse_web2)
               url4 = "http://www.intechopen.com/books"
               yield scrapy.Request(url4,callback=self.parse_web3)


   def parse_web0(self, response):

       i=0
       for sel in response.xpath("//ul[@class='d3s-revista']"):
           publicaciones = sel.xpath("//li[contains(p, 'Scientific Letters')]/p[@class='d3s-titulo-post']/text()").extract()
           autores = sel.xpath("//li[contains(p, 'Scientific Letters')]/p[@class='d3s-titulo-autores']/text()").extract()
           links = sel.xpath("////li[contains(p, 'Scientific Letters')]/a[contains(@href,'.pdf')]/@href").extract()
           if i == 0:
               o=0
               while o != len(publicaciones):
                   publicacion = PublicacionItem()
                   publicacion['titulo_publicacion'] = publicaciones[o]
                   publicacion['anio_publicacion'] = response.xpath("//p[@class='d3s-titulo-numero']/text()").re(r'\d\d\d\d')[0].strip()
                   publicacion['isbn'] = response.xpath("//div[@id='d3s-page-content']/div/div/div/text()").re(r'\d\d\d\d-\d\d\d\d')[0].strip()
                   publicacion['nombre_autor'] = autores[o]
                   publicacion['url_link'] = links[o]
                   yield publicacion
                  o+=1
               i+=1 #este funciona
       #url = "http://rinfi.fi.mdp.edu.ar/xmlui/recent-submissions"
       #yield scrapy.Request(url,callback=self.parse_web1)

   def parse_web1(self, response):

       i=0
       for sel in response.xpath("//div[@id='ds-main']/div[@id='ds-content-wrapper']/div[@id='ds-content']/div[@id='ds-body']"):
           publicaciones = sel.xpath("//div[@id='ds-body']/div[@id='aspect_discovery_recentSubmissions_RecentSubmissionTransformer_div_main-recent-submissions']/div[@id='aspect_discovery_recentSubmissions_RecentSubmissionTransformer_div_recent-submissions']/ul[@class='ds-artifact-list']/li[@class='ds-artifact-item odd' or @class='ds-artifact-item even']/div[@class='artifact-description']/div[@class='artifact-title']/a/text()").extract()
           #publicaciones = sel.xpath("//div[@id='ds-body']/div[@id='aspect_discovery_recentSubmissions_RecentSubmissionTransformer_div_main-recent-submissions']/div[@id='aspect_discovery_recentSubmissions_RecentSubmissionTransformer_div_recent-submissions']/ul[@class='ds-artifact-list']/li[@class='ds-artifact-item even']/div[@class='artifact-description']/div[@class='artifact-title']/a/text()").extract()
           autores = sel.xpath("//div[@id='ds-main']/div[@id='ds-content-wrapper']/div[@id='ds-content']/div[@id='ds-body']/div[@id='aspect_discovery_recentSubmissions_RecentSubmissionTransformer_div_main-recent-submissions']/div[@id='aspect_discovery_recentSubmissions_RecentSubmissionTransformer_div_recent-submissions']/ul[@class='ds-artifact-list']/li[@class='ds-artifact-item odd']/div[@class='artifact-description']/div[@class='artifact-info']/span[@class='author']/span/text()").extract()
           links = sel.xpath("//div[@id='ds-main']/div[@id='ds-content-wrapper']/div[@id='ds-content']/div[@id='ds-body']/div[@id='aspect_discovery_recentSubmissions_RecentSubmissionTransformer_div_main-recent-submissions']/div[@id='aspect_discovery_recentSubmissions_RecentSubmissionTransformer_div_recent-submissions']/ul[@class='ds-artifact-list']/li[@class='ds-artifact-item odd']/div[@class='artifact-description']/div[@class='artifact-title']/a/@href").extract()
           if i == 0:
               o=0
               while o != len(publicaciones):
                   publicacion = PublicacionItem()
                   publicacion['titulo_publicacion'] = publicaciones[o]
                   publicacion['anio_publicacion'] = (response.xpath("//div[@id='ds-main']/div[@id='ds-content-wrapper']/div[@id='ds-content']/div[@id='ds-body']/div[@id='aspect_discovery_recentSubmissions_RecentSubmissionTransformer_div_main-recent-submissions']/div[@id='aspect_discovery_recentSubmissions_RecentSubmissionTransformer_div_recent-submissions']/ul[@class='ds-artifact-list']/li[@class='ds-artifact-item odd']/div[@class='artifact-description']/div[@class='artifact-info']/span[@class='publisher-date']/span[@class='date']/text()").re(r'\d\d\d\d')[0].strip())
                   publicacion['isbn'] = "ISBN dentro del PDF"
                   publicacion['nombre_autor'] = autores[o]
                   publicacion['url_link'] = "http://rinfi.fi.mdp.edu.ar"+ links[o]
                   yield publicacion
                   o+=1
               i+=1
       #url = "http://road.issn.org/issn_search?afs%3Aquery=ciencia#.VqZ3v14oDtQ"
       #yield scrapy.Request(url,callback=self.parse_web2)
       #url = "http://rinfi.fi.mdp.edu.ar/xmlui/recent-submissions?offset=20"
       #yield scrapy.Request(url,callback=self.parse_web2)      

   def parse_web2(self, response):
       i=0
       for sel in response.xpath("//div[@class='page-container']/div[@class='page']/div[@id='main-content']/div[@class='main-content-inside']/div[@class='region-content']"):
           publicaciones = sel.xpath("//div[@class='page-container']/div[@class='page']/div[@id='main-content']/div[@class='main-content-inside']/div[@class='region-content']/div[@class='issn-search']/div[@class='search-results']/div[@class='search-result type-journals']/div[@class='search-result-title']/a/text()").extract()
           #publicaciones = sel.xpath("//div[@id='ds-body']/div[@id='aspect_discovery_recentSubmissions_RecentSubmissionTransformer_div_main-recent-submissions']/div[@id='aspect_discovery_recentSubmissions_RecentSubmissionTransformer_div_recent-submissions']/ul[@class='ds-artifact-list']/li[@class='ds-artifact-item even']/div[@class='artifact-description']/div[@class='artifact-title']/a/text()").extract()
           autores = sel.xpath("//div[@class='page-container']/div[@class='page']/div[@id='main-content']/div[@class='main-content-inside']/div[@class='region-content']/div[@class='issn-search']/div[@class='search-results']/div[@class='search-result type-journals']/div[@class='search-result-publisher']/text()").extract()
           links = sel.xpath("//div[@class='page-container']/div[@class='page']/div[@id='main-content']/div[@class='main-content-inside']/div[@class='region-content']/div[@class='issn-search']/div[@class='search-results']/div[@class='search-result type-journals']/div[@class='search-result-title']/a/@href").extract()
           if i == 0:
               o=0
               while o != len(publicaciones):
                   publicacion = PublicacionItem()
                   publicacion['titulo_publicacion'] = publicaciones[o]
                   publicacion['anio_publicacion'] = (response.xpath("//div[@class='page-container']/div[@class='page']/div[@id='main-content']/div[@class='main-content-inside']/div[@class='region-content']/div[@class='issn-search']/div[@class='search-results']/div[@class='search-result type-journals']/div[@class='search-result-registration_year']").re(r'\d\d\d\d')[0].strip())
                   publicacion['isbn'] = response.xpath("//div[@class='page-container']/div[@class='page']/div[@id='main-content']/div[@class='main-content-inside']/div[@class='region-content']/div[@class='issn-search']/div[@class='search-results']/div[@class='search-result type-journals']/div[@class='search-result-issn']").re(r'\d\d\d\d-\d\d\d\d')[0].strip()
                   publicacion['nombre_autor'] = autores[o]
                   publicacion['url_link'] = links[o]
                   yield publicacion
                   o+=1
               i+=1


   def parse_web3(self, response): #Books
       i=0
       for sel in response.xpath("//div[@id='sizer']/div[@id='content']/div[@class='grid']/div[@class='main-content']/div[@id='tc']/div"):
           publicaciones = sel.xpath("//div[@id='sizer']/div[@id='content']/div[@class='grid']/div[@class='main-content']/div[@id='tc']/div/ul[@class='book-listing entity-listing']/li/dl/dt/a/text()").extract() #publicacion
           #publicaciones = sel.xpath("//div[@id='ds-body']/div[@id='aspect_discovery_recentSubmissions_RecentSubmissionTransformer_div_main-recent-submissions']/div[@id='aspect_discovery_recentSubmissions_RecentSubmissionTransformer_div_recent-submissions']/ul[@class='ds-artifact-list']/li[@class='ds-artifact-item even']/div[@class='artifact-description']/div[@class='artifact-title']/a/text()").extract()
           #autores = sel.xpath("//div[@class='page-container']/div[@class='page']/div[@id='main-content']/div[@class='main-content-inside']/div[@class='region-content']/div[@class='issn-search']/div[@class='search-results']/div[@class='search-result type-journals']/div[@class='search-result-publisher']/text()").extract()
           links = sel.xpath("//div[@id='sizer']/div[@id='content']/div[@class='grid']/div[@class='main-content']/div[@id='tc']/div/ul[@class='book-listing entity-listing']/li/dl/dt/a/@href").extract() #links
           if i == 0:
               o=0
               while o != len(publicaciones):
                   publicacion = PublicacionItem()
                   publicacion['titulo_publicacion'] = publicaciones[o]
                   publicacion['anio_publicacion'] = (response.xpath("substring-after(//div[@id='sizer']/div[@id='content']/div[@class='grid']/div[@class='main-content']/div[@id='tc']/div/ul[@class='book-listing entity-listing']/li/dl/dd[@class='meta']/text()[count(preceding-sibling::br) = 2], ', ')").re(r'\d\d\d\d')[0].strip()) #Fecha, ultimos cuatro digitos.
                   publicacion['isbn'] = response.xpath("substring-after(//div[@id='sizer']/div[@id='content']/div[@class='grid']/div[@class='main-content']/div[@id='tc']/div/ul[@class='book-listing entity-listing']/li/dl/dd[@class='meta']/text()[count(preceding-sibling::br) = 1],'ISBN')").re(r'\d\d\d-\d\d\d-\d\d-\d\d\d\d-\d')[0].strip() #ISBN 
                   publicacion['nombre_autor'] = sel.xpath("substring-after(//div[@id='sizer']/div[@id='content']/div[@class='grid']/div[@class='main-content']/div[@id='tc']/div/ul[@class='book-listing entity-listing']/li/dl/dd[@class='meta']/text()[count(preceding-sibling::br) = 0],'Editor')").extract() #Autor
                   publicacion['url_link'] = links[o]
                   print(publicacion['titulo_publicacion'])
                   raw_input()
                   yield publicacion
                   o+=1
               i+=1 

PS:对不起我的英语,我来自阿根廷

1 个答案:

答案 0 :(得分:0)

问题是您正在制作重复的请求,请检查您的日志应该在那里列出:

{p} http://www.intechopen.com/books列在start_urls列表中,这意味着蜘蛛开始工作,对该网址发出请求,您可以使用parse方法获取start_urls方法的默认回调方法{1}}请求)。

现在在parse方法中,您再次发出相同的请求(访问相同的网址)

url4 = "http://www.intechopen.com/books"
yield scrapy.Request(url4,callback=self.parse_web3)

默认情况下会被scrapy过滤。

如果您想控制蜘蛛应该从哪些请求开始,并且各自的回调使用start_requests方法。