我想为此页面上的每个链接搜索后面的页面详细信息页面。
我可以在此页面上获取所有信息:PAGE
但是,我想在详细信息页面上获取所有信息,但href链接看起来像这样,例如:
href="javascript:subOpen('9ca8ed0fae15d43dc1257e7300345b99')"
以下是使用ImportHTML
功能获取概述的示例电子表格。
有关如何获取详细信息页面的任何建议吗?
更新
我实现了以下方法:
function doGet(e){
var base = 'http://www.ediktsdatei.justiz.gv.at/edikte/ex/exedi3.nsf/'
var feed = UrlFetchApp.fetch(base + 'suche?OpenForm&subf=e&query=%28%5BVKat%5D%3DEH%20%7C%20%5BVKat%5D%3DZH%20%7C%20%5BVKat%5D%3DMH%20%7C%20%5BVKat%5D%3DMW%20%7C%20%5BVKat%5D%3DMSH%20%7C%20%5BVKat%5D%3DGGH%20%7C%20%5BVKat%5D%3DRH%20%7C%20%5BVKat%5D%3DHAN%20%7C%20%5BVKat%5D%3DWE%20%7C%20%5BVKat%5D%3DEW%20%7C%20%5BVKat%5D%3DMAI%20%7C%20%5BVKat%5D%3DDTW%20%7C%20%5BVKat%5D%3DDGW%20%7C%20%5BVKat%5D%3DGA%20%7C%20%5BVKat%5D%3DGW%20%7C%20%5BVKat%5D%3DUL%20%7C%20%5BVKat%5D%3DBBL%20%7C%20%5BVKat%5D%3DLF%20%7C%20%5BVKat%5D%3DGL%20%7C%20%5BVKat%5D%3DSE%20%7C%20%5BVKat%5D%3DSO%29%20AND%20%5BBL%5D%3D0').getContentText();
var d = document.createElement('div'); //assuming you can do this
d.innerHTML = feed;//make the text a dom structure
var arr = d.getElementsByTagName('a') //iterate over the page links
var response = "";
for(var i = 0;i<arr.length;i++){
var atr = arr[i].getAttribute('onclick');
if(atr) atr = atr.match(/subOpen\((.*?)\)/) //if onclick calls subOpen
if(atr && atr.length > 1){ //get the id
var detail = UrlFetchApp.fetch(base + '0/'+atr[1]).getContentText();
response += detail//process the relevant part of the content and append to the reposnse text
}
}
return ContentService.createTextOutput(response);
}
但是,运行方法时出错:
ReferenceError: "document" is not defined. (line 6, file "")
document
的对象是什么?
我已使用网络应用更新了Google Spreadsheet。
感谢您的回复!
答案 0 :(得分:6)
您可以使用Firebug来检查页面内容和javascript。例如,您可以发现subOpen实际上是xmlhttp01.js中声明的subOpenXML的别名。
function subOpenXML(unid) {/*open found doc from search view*/
if (waiting) return alert(bittewar);
var wState = dynDoc.getElementById('windowState');
wState.value = 'H';/*httpreq pending*/
var last = '';
if (unid==docLinks[0]) {last += '&f=1'; thisdocnum = 1;}
if (unid==docLinks[docLinks.length-1]) {
last += '&l=1';
thisdocnum = docLinks.length;
} else {
for (var i=1;i<docLinks.length-1;i++)
if (unid==docLinks[i]) {thisdocnum = i+1; break;}
}
var url = unid + html_delim + 'OpenDocument'+last + '&bm=2';
httpreq.open('GET', // &rand=' + Math.random();
/*'/edikte/test/ex/exedi31.nsf/0/'+*/ '0/'+url, true);
httpreq.onreadystatechange=onreadystatechange;
// httpreq.setRequestHeader('Accept','text/xml');
httpreq.send(null);
waiting = true;
title2src = firstTextChild(dynDoc.getElementById('title2')).nodeValue;
}
因此,在复制函数源并在firebug的控制台选项卡中修改它之后,在http调用之前添加console.log(url)
,如下所示:
var url = unid + html_delim + 'OpenDocument'+last + '&bm=2';
console.log(url)
httpreq.open('GET', // &rand=' + Math.random();
/*'/edikte/test/ex/exedi31.nsf/0/'+*/ '0/'+url, true);
您可以在firebug的Console选项卡中执行函数声明,并使用修改后的源覆盖subOpen。 然后在链接中单击将显示被调用的URL由作为参数传递的id组成,前缀为&#39; 0 /&#39;,因此在您发布的示例中,它将是GET:
http://www.ediktsdatei.justiz.gv.at/edikte/ex/exedi3.nsf/0/1fd2313c2e0095bfc1257e49004170ca?OpenDocument&f=1&bm=2
您还可以通过打开firebug中的“网络”标签并单击链接来验证这一点。
因此,为了抓住您需要
的详细信息页面在firebug的网络选项卡中查看请求响应显示,您可能需要进行类似的解析才能真正获得所显示的内容,但我还没有深入了解它。
<强>更新强>
importHTML函数不适合您想要的那种抓取。 Google的HTML或Content服务更适合这种情况。您需要创建web app并实施doGet
功能:
function doGet(e){
var base = 'http://www.ediktsdatei.justiz.gv.at/edikte/ex/exedi3.nsf/'
var feed = UrlFetchApp.fetch(base + 'suche?OpenForm&subf=e&query=%28%5BVKat%5D%3DEH%20%7C%20%5BVKat%5D%3DZH%20%7C%20%5BVKat%5D%3DMH%20%7C%20%5BVKat%5D%3DMW%20%7C%20%5BVKat%5D%3DMSH%20%7C%20%5BVKat%5D%3DGGH%20%7C%20%5BVKat%5D%3DRH%20%7C%20%5BVKat%5D%3DHAN%20%7C%20%5BVKat%5D%3DWE%20%7C%20%5BVKat%5D%3DEW%20%7C%20%5BVKat%5D%3DMAI%20%7C%20%5BVKat%5D%3DDTW%20%7C%20%5BVKat%5D%3DDGW%20%7C%20%5BVKat%5D%3DGA%20%7C%20%5BVKat%5D%3DGW%20%7C%20%5BVKat%5D%3DUL%20%7C%20%5BVKat%5D%3DBBL%20%7C%20%5BVKat%5D%3DLF%20%7C%20%5BVKat%5D%3DGL%20%7C%20%5BVKat%5D%3DSE%20%7C%20%5BVKat%5D%3DSO%29%20AND%20%5BBL%5D%3D0').getContentText();
var response = "";
var match = feed.match(/subOpen\('.*?'\)/g)
if(match){
for(var i = 0; i < match.length;i++){
var m = match[i].match(/\('(.*)'\)/);
if(m && m.length > 1){
var detailText = UrlFetchApp.fetch(base + '0/'+m[1]);
response += //dosomething with detail text
//and concatenate in the response
}
}
}
return ContentService.createTextOutput(response);
}