The goal is to write a crawler that can:
1) Retrieve the URLs linked in the table on this page: http://cordis.europa.eu/fp7/security/projects_en.html
2) Follow the AJAX calls made from each of those URLs to find the final ("AJAX") URL that holds the data I want to scrape
3) Scrape the final pages identified by those AJAX URLs.
So far I have written two spiders with Scrapy:
1) The first one retrieves the URLs from the links on the start page. Here is the code:
from scrapy.spider import Spider
from scrapy.selector import HtmlXPathSelector
from cordis.items import CordisItem
class MySpider(Spider):
    name = "Cordis1"
    allowed_domains = ["cordis.europa.eu"]
    start_urls = ["http://cordis.europa.eu/fp7/security/projects_en.html"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("//p")
        items = []
        for title in titles:
            item = CordisItem()
            item["link"] = title.select("//ul/li/span/a/@href").extract()
            items.append(item)
        return items
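For reference, cordis/items.py is not shown here; a minimal sketch of what it presumably looks like, assuming it only declares the link field used above:
# cordis/items.py -- assumed minimal item definition (not shown in the original post)
from scrapy.item import Item, Field

class CordisItem(Item):
    link = Field()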
2) The second one scrapes the data from the "AJAX" URLs. Here is the code:
from scrapy.spider import Spider
from scrapy.selector import Selector
class EssaiSpider(Spider):
    name = "aze"
    allowed_domains = ["cordis.europa.eu"]
    start_urls = ["http://cordis.europa.eu/projects/index.cfm?fuseaction=app.csa&action=read&xslt-template=projects/xsl/projectdet_en.xslt&rcn=95607",
                  "http://cordis.europa.eu/projects/index.cfm?fuseaction=app.csa&action=read&xslt-template=projects/xsl/projectdet_en.xslt&rcn=93528"]
    def parse(self, response):
        sel = Selector(response)
        acronym = sel.xpath("//*[@class='projttl']/h1/text()").extract()
        short_desc = sel.xpath("//*[@class='projttl']/h2/text()").extract()
        start = sel.xpath("//*[@class='projdates']/b[1]/following::text()[1]").extract()
        end = sel.xpath("//*[@class='projdates']/b[2]/following::text()[1]").extract()
        long_desc = sel.xpath("//*[@class='tech']/p/text()").extract()
        cost = sel.xpath("//*[@class='box-left']/b[3]/following::text()[1]").extract()
        contrib = sel.xpath("//*[@class='box-left']/b[4]/following::text()[1]").extract()
        type = sel.xpath("//*[@class='box-right']/p[3]/br/following::text()[1]").extract()
        sujet = sel.xpath("//*[@id='subjects']/h2/following::text()[1]").extract()
        coord = sel.xpath("//*[@class='projcoord']/div[1]/div[1]/text()").extract()
        coord_nat = sel.xpath("//*[@class='projcoord']/div[1]/div[2]/text()").extract()
        # participant blocks are numbered part1 ... part40 in the page markup,
        # so collect them in a loop instead of one variable per participant
        participants = []
        participants_nat = []
        for i in range(1, 41):
            participants.append(sel.xpath("//*[@id='part%d']/div[1]/div[1]/text()" % i).extract())
            participants_nat.append(sel.xpath("//*[@id='part%d']/div[1]/div[2]/text()" % i).extract())
        print acronym, short_desc, start, end, long_desc, cost, contrib, type, sujet, coord, coord_nat, participants, participants_nat
I can retrieve what, for lack of a better term, I call the "AJAX" URLs manually, by using Netbug to filter the XHR requests for each of the URLs produced by the first spider. I then feed those "AJAX" URLs to the second spider.
But is it possible to retrieve those "AJAX" URLs automatically?
More generally, how can I write a single crawl spider that performs all three of the operations above?
Answer (score: 1)
Yes, it is possible to retrieve those URLs automatically, but you have to figure out which URL the AJAX-loaded content comes from. Here is a short tutorial.
1. Do your research
In the Chrome console, if you open the Network tab and filter by XHR requests, there is an "Initiator" field. On the right it points to the JavaScript file containing the code responsible for generating the request, and the console shows the line from which the request is issued.
In your case, the relevant piece of code is in the file jquery-projects.js, line 415; the line says something like this:
$.ajax({
    async: true,
    type: 'GET',
    url: URL,
As you can see, there is a URL variable here. You need to find where that variable is built, a few lines above:
var URL = '/projects/index.cfm?fuseaction=app.csa'; // production
switch(type) {
    ...
    case 'doc':
        URL += '&action=read&xslt-template=projects/xsl/projectdet_' + I18n.locale + '.xslt&rcn=' + me.ref;
        break;
}
So the URL is generated by taking that base URL, appending a string starting with &action, and then two variables, I18n.locale and me.ref. Keep in mind that this URL is relative, so you will also need to prepend the site's base URL.
I18n.locale turns out to be just the string "_en"; so where does me.ref come from?
Ctrl+F again in the console's Sources tab and you will find this line of jQuery:
// record reference
me.ref = $("#PrjSrch>input[name='REF']").val();
It turns out that the page behind each of those URLs contains a hidden form, and each time a request is generated the value for me.ref is taken from that REF field.
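Putting those pieces together, the final AJAX URL is assembled roughly like this (a minimal sketch; the rcn value 95607 is just an example taken from the question's start_urls):
# rough illustration of how the final AJAX URL is put together
base = "http://cordis.europa.eu"                      # site root, needed because the URL in the JS is relative
relative = "/projects/index.cfm?fuseaction=app.csa"   # value of the URL variable
rcn = "95607"                                         # value of me.ref, read from the hidden REF input (example)
ajax_url = base + relative + "&action=read&xslt-template=projects/xsl/projectdet_en.xslt&rcn=" + rcn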
Now you just have to apply this knowledge in your Scrapy project.
2. Use your knowledge in the Scrapy spider
At this point you know what to do. You start from the URL listing all the projects, extract all the links, issue a request for each of them, then extract the AJAX reference from the content returned by each of those requests, and generate a request for the AJAX URL built from it.
from scrapy.selector import Selector
from scrapy.spider import Spider
from scrapy.http import Request
from eu.items import EuItem
from urlparse import urljoin
class CordisSpider(Spider):
    name = 'cordis'
    start_urls = ['http://cordis.europa.eu/fp7/security/projects_en.html']
    base_url = "http://cordis.europa.eu/projects/"
    # template string for ajax request based on what we know from investigating webpage
    base_ajax_url = "http://cordis.europa.eu/projects/index.cfm?fuseaction=app.csa&action=read&xslt-template=projects/xsl/projectdet_en.xslt&rcn=%s"

    def parse(self, response):
        """
        Extract project links from start_url, for each generate GET request,
        and then assign a function self.get_ajax_content to handle response.
        """
        hxs = Selector(response)
        links = hxs.xpath("//ul/li/span/a/@href").extract()
        for link in links:
            link = urljoin(self.base_url, link)
            yield Request(url=link, callback=self.get_ajax_content)

    def get_ajax_content(self, response):
        """
        Extract AJAX link and make a GET request
        for the desired content, assign callback
        to handle response from this request.
        """
        hxs = Selector(response)
        # xpath analogy of the jQuery line we've seen
        ajax_ref = hxs.xpath('//form[@id="PrjSrch"]//input[@name="REF"]/@value').extract()
        ajax_ref = "".join(ajax_ref)
        ajax_url = self.base_ajax_url % (ajax_ref,)
        yield Request(url=ajax_url, callback=self.parse_items)

    def parse_items(self, response):
        """
        Response here should contain content
        normally loaded asynchronously with AJAX.
        """
        xhs = Selector(response)
        # you can do your processing here
        title = xhs.xpath("//div[@class='projttl']//text()").extract()
        i = EuItem()
        i["title"] = title
        return i
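The eu.items module is not shown; a minimal sketch of it, assuming EuItem only declares the title field used above:
# eu/items.py -- assumed minimal item definition (not part of the original answer)
from scrapy.item import Item, Field

class EuItem(Item):
    title = Field()
You can then run the spider with, for example, scrapy crawl cordis -o projects.json to collect the items through a feed export.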