I'm new to Scrapy and I'm trying to scrape the publications from these pages. The first three functions work (the publications are stored in the database), but the fourth one (parse_web3) doesn't seem to run. What am I doing wrong?
import scrapy
# Assumption: PublicacionItem lives in the project's items module; adjust the import path to your project.
from myproject.items import PublicacionItem


class crawler(scrapy.Spider):
    name = "crawler"
    start_urls = ["http://www.sac.org.ar/argentine-cardiology-journal-archive",
                  "http://rinfi.fi.mdp.edu.ar/xmlui/recent-submissions",
                  "http://road.issn.org/issn_search?afs:query=&show-adv=0&afs:replies=100#.VqaLtl4oDtR",
                  "http://www.intechopen.com/books"]

    def parse(self, response):
        for href in response.css("a::attr('href')"):
            print href
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_web0)
        url2 = "http://rinfi.fi.mdp.edu.ar/xmlui/recent-submissions"
        yield scrapy.Request(url2, callback=self.parse_web1)
        url3 = "http://road.issn.org/issn_search?afs:query=&show-adv=0&afs:replies=100#.VqaLtl4oDtR"
        yield scrapy.Request(url3, callback=self.parse_web2)
        url4 = "http://www.intechopen.com/books"
        yield scrapy.Request(url4, callback=self.parse_web3)

    def parse_web0(self, response):
        i = 0
        for sel in response.xpath("//ul[@class='d3s-revista']"):
            publicaciones = sel.xpath("//li[contains(p, 'Scientific Letters')]/p[@class='d3s-titulo-post']/text()").extract()
            autores = sel.xpath("//li[contains(p, 'Scientific Letters')]/p[@class='d3s-titulo-autores']/text()").extract()
            links = sel.xpath("//li[contains(p, 'Scientific Letters')]/a[contains(@href,'.pdf')]/@href").extract()
            if i == 0:
                o = 0
                while o != len(publicaciones):
                    publicacion = PublicacionItem()
                    publicacion['titulo_publicacion'] = publicaciones[o]
                    publicacion['anio_publicacion'] = response.xpath("//p[@class='d3s-titulo-numero']/text()").re(r'\d\d\d\d')[0].strip()
                    publicacion['isbn'] = response.xpath("//div[@id='d3s-page-content']/div/div/div/text()").re(r'\d\d\d\d-\d\d\d\d')[0].strip()
                    publicacion['nombre_autor'] = autores[o]
                    publicacion['url_link'] = links[o]
                    yield publicacion
                    o += 1
            i += 1  # this one works
        #url = "http://rinfi.fi.mdp.edu.ar/xmlui/recent-submissions"
        #yield scrapy.Request(url, callback=self.parse_web1)

    def parse_web1(self, response):
        i = 0
        for sel in response.xpath("//div[@id='ds-main']/div[@id='ds-content-wrapper']/div[@id='ds-content']/div[@id='ds-body']"):
            publicaciones = sel.xpath("//div[@id='ds-body']/div[@id='aspect_discovery_recentSubmissions_RecentSubmissionTransformer_div_main-recent-submissions']/div[@id='aspect_discovery_recentSubmissions_RecentSubmissionTransformer_div_recent-submissions']/ul[@class='ds-artifact-list']/li[@class='ds-artifact-item odd' or @class='ds-artifact-item even']/div[@class='artifact-description']/div[@class='artifact-title']/a/text()").extract()
            #publicaciones = sel.xpath("//div[@id='ds-body']/div[@id='aspect_discovery_recentSubmissions_RecentSubmissionTransformer_div_main-recent-submissions']/div[@id='aspect_discovery_recentSubmissions_RecentSubmissionTransformer_div_recent-submissions']/ul[@class='ds-artifact-list']/li[@class='ds-artifact-item even']/div[@class='artifact-description']/div[@class='artifact-title']/a/text()").extract()
            autores = sel.xpath("//div[@id='ds-main']/div[@id='ds-content-wrapper']/div[@id='ds-content']/div[@id='ds-body']/div[@id='aspect_discovery_recentSubmissions_RecentSubmissionTransformer_div_main-recent-submissions']/div[@id='aspect_discovery_recentSubmissions_RecentSubmissionTransformer_div_recent-submissions']/ul[@class='ds-artifact-list']/li[@class='ds-artifact-item odd']/div[@class='artifact-description']/div[@class='artifact-info']/span[@class='author']/span/text()").extract()
            links = sel.xpath("//div[@id='ds-main']/div[@id='ds-content-wrapper']/div[@id='ds-content']/div[@id='ds-body']/div[@id='aspect_discovery_recentSubmissions_RecentSubmissionTransformer_div_main-recent-submissions']/div[@id='aspect_discovery_recentSubmissions_RecentSubmissionTransformer_div_recent-submissions']/ul[@class='ds-artifact-list']/li[@class='ds-artifact-item odd']/div[@class='artifact-description']/div[@class='artifact-title']/a/@href").extract()
            if i == 0:
                o = 0
                while o != len(publicaciones):
                    publicacion = PublicacionItem()
                    publicacion['titulo_publicacion'] = publicaciones[o]
                    publicacion['anio_publicacion'] = (response.xpath("//div[@id='ds-main']/div[@id='ds-content-wrapper']/div[@id='ds-content']/div[@id='ds-body']/div[@id='aspect_discovery_recentSubmissions_RecentSubmissionTransformer_div_main-recent-submissions']/div[@id='aspect_discovery_recentSubmissions_RecentSubmissionTransformer_div_recent-submissions']/ul[@class='ds-artifact-list']/li[@class='ds-artifact-item odd']/div[@class='artifact-description']/div[@class='artifact-info']/span[@class='publisher-date']/span[@class='date']/text()").re(r'\d\d\d\d')[0].strip())
                    publicacion['isbn'] = "ISBN dentro del PDF"
                    publicacion['nombre_autor'] = autores[o]
                    publicacion['url_link'] = "http://rinfi.fi.mdp.edu.ar" + links[o]
                    yield publicacion
                    o += 1
            i += 1
        #url = "http://road.issn.org/issn_search?afs%3Aquery=ciencia#.VqZ3v14oDtQ"
        #yield scrapy.Request(url, callback=self.parse_web2)
        #url = "http://rinfi.fi.mdp.edu.ar/xmlui/recent-submissions?offset=20"
        #yield scrapy.Request(url, callback=self.parse_web2)

    def parse_web2(self, response):
        i = 0
        for sel in response.xpath("//div[@class='page-container']/div[@class='page']/div[@id='main-content']/div[@class='main-content-inside']/div[@class='region-content']"):
            publicaciones = sel.xpath("//div[@class='page-container']/div[@class='page']/div[@id='main-content']/div[@class='main-content-inside']/div[@class='region-content']/div[@class='issn-search']/div[@class='search-results']/div[@class='search-result type-journals']/div[@class='search-result-title']/a/text()").extract()
            autores = sel.xpath("//div[@class='page-container']/div[@class='page']/div[@id='main-content']/div[@class='main-content-inside']/div[@class='region-content']/div[@class='issn-search']/div[@class='search-results']/div[@class='search-result type-journals']/div[@class='search-result-publisher']/text()").extract()
            links = sel.xpath("//div[@class='page-container']/div[@class='page']/div[@id='main-content']/div[@class='main-content-inside']/div[@class='region-content']/div[@class='issn-search']/div[@class='search-results']/div[@class='search-result type-journals']/div[@class='search-result-title']/a/@href").extract()
            if i == 0:
                o = 0
                while o != len(publicaciones):
                    publicacion = PublicacionItem()
                    publicacion['titulo_publicacion'] = publicaciones[o]
                    publicacion['anio_publicacion'] = (response.xpath("//div[@class='page-container']/div[@class='page']/div[@id='main-content']/div[@class='main-content-inside']/div[@class='region-content']/div[@class='issn-search']/div[@class='search-results']/div[@class='search-result type-journals']/div[@class='search-result-registration_year']").re(r'\d\d\d\d')[0].strip())
                    publicacion['isbn'] = response.xpath("//div[@class='page-container']/div[@class='page']/div[@id='main-content']/div[@class='main-content-inside']/div[@class='region-content']/div[@class='issn-search']/div[@class='search-results']/div[@class='search-result type-journals']/div[@class='search-result-issn']").re(r'\d\d\d\d-\d\d\d\d')[0].strip()
                    publicacion['nombre_autor'] = autores[o]
                    publicacion['url_link'] = links[o]
                    yield publicacion
                    o += 1
            i += 1

    def parse_web3(self, response):  # Books
        i = 0
        for sel in response.xpath("//div[@id='sizer']/div[@id='content']/div[@class='grid']/div[@class='main-content']/div[@id='tc']/div"):
            publicaciones = sel.xpath("//div[@id='sizer']/div[@id='content']/div[@class='grid']/div[@class='main-content']/div[@id='tc']/div/ul[@class='book-listing entity-listing']/li/dl/dt/a/text()").extract()  # publications
            links = sel.xpath("//div[@id='sizer']/div[@id='content']/div[@class='grid']/div[@class='main-content']/div[@id='tc']/div/ul[@class='book-listing entity-listing']/li/dl/dt/a/@href").extract()  # links
            if i == 0:
                o = 0
                while o != len(publicaciones):
                    publicacion = PublicacionItem()
                    publicacion['titulo_publicacion'] = publicaciones[o]
                    publicacion['anio_publicacion'] = (response.xpath("substring-after(//div[@id='sizer']/div[@id='content']/div[@class='grid']/div[@class='main-content']/div[@id='tc']/div/ul[@class='book-listing entity-listing']/li/dl/dd[@class='meta']/text()[count(preceding-sibling::br) = 2], ', ')").re(r'\d\d\d\d')[0].strip())  # date: last four digits
                    publicacion['isbn'] = response.xpath("substring-after(//div[@id='sizer']/div[@id='content']/div[@class='grid']/div[@class='main-content']/div[@id='tc']/div/ul[@class='book-listing entity-listing']/li/dl/dd[@class='meta']/text()[count(preceding-sibling::br) = 1],'ISBN')").re(r'\d\d\d-\d\d\d-\d\d-\d\d\d\d-\d')[0].strip()  # ISBN
                    publicacion['nombre_autor'] = sel.xpath("substring-after(//div[@id='sizer']/div[@id='content']/div[@class='grid']/div[@class='main-content']/div[@id='tc']/div/ul[@class='book-listing entity-listing']/li/dl/dd[@class='meta']/text()[count(preceding-sibling::br) = 0],'Editor')").extract()  # author
                    publicacion['url_link'] = links[o]
                    print(publicacion['titulo_publicacion'])
                    raw_input()  # pause for debugging (Python 2)
                    yield publicacion
                    o += 1
            i += 1
PS: sorry for my English, I'm from Argentina.
Answer (score: 0)
The problem is that you are making a duplicate request; check your logs, it should be stated there.

http://www.intechopen.com/books is listed in start_urls, which means the spider starts working by making a request to that URL, handled by the parse method (the default callback for start_urls requests).

Now, inside the parse method, you issue the very same request again (to the same URL):
url4 = "http://www.intechopen.com/books"
yield scrapy.Request(url4,callback=self.parse_web3)
By default, Scrapy's duplicate-request filter drops this second request, which is why parse_web3 never runs.
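If you just need that one request to go through, a quick workaround (a sketch, not the cleanest fix) is to bypass the duplicate filter for it; dont_filter is a standard argument of scrapy.Request:

url4 = "http://www.intechopen.com/books"
# dont_filter=True tells Scrapy's dupefilter to allow this request
# even though the same URL was already crawled via start_urls.
yield scrapy.Request(url4, callback=self.parse_web3, dont_filter=True)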
If you want to control which requests the spider starts from, and which callback handles each one, use the start_requests method instead of start_urls.
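Here is a minimal sketch of that approach, assuming you keep the existing parse_web* callbacks (when start_requests is defined, Scrapy uses it instead of start_urls, so each URL is requested exactly once):

def start_requests(self):
    # One request per site, each routed straight to its own callback.
    yield scrapy.Request("http://www.sac.org.ar/argentine-cardiology-journal-archive",
                         callback=self.parse)  # parse then fans out to parse_web0 per link
    yield scrapy.Request("http://rinfi.fi.mdp.edu.ar/xmlui/recent-submissions",
                         callback=self.parse_web1)
    yield scrapy.Request("http://road.issn.org/issn_search?afs:query=&show-adv=0&afs:replies=100#.VqaLtl4oDtR",
                         callback=self.parse_web2)
    yield scrapy.Request("http://www.intechopen.com/books",
                         callback=self.parse_web3)

With this in place, the url2/url3/url4 requests at the end of your parse method can be removed, since every page already gets requested once with the right callback.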