从网站抓取表格,其中的链接是 javascript:subOpen 形式的 href

时间:2015-07-22 08:05:41

标签: javascript google-sheets

我想抓取此页面上每个链接背后的详细信息页面。

我可以在此页面上获取所有信息:PAGE

但是,我想在详细信息页面上获取所有信息,但href链接看起来像这样,例如:

href="javascript:subOpen('9ca8ed0fae15d43dc1257e7300345b99')"

以下是使用ImportHTML功能获取概述的示例电子表格。

Google Spreadsheet

有关如何获取详细信息页面的任何建议吗?

更新

我实现了以下方法:

/**
 * Web-app GET handler: downloads the search overview page, looks for <a>
 * elements whose onclick attribute calls subOpen(...), fetches the detail
 * page for each captured id and returns the concatenated detail pages as
 * text output.
 *
 * NOTE: this version builds a DOM tree through `document`; where `document`
 * is not defined, the createElement call throws a ReferenceError.
 * NOTE(review): the links on the page are written as
 * href="javascript:subOpen('...')", so reading the onclick attribute may find
 * nothing - confirm against the actual page markup.
 *
 * @param e  request event object (not used in the body)
 * @returns  the value of ContentService.createTextOutput(response)
 */
function doGet(e){
  var base = 'http://www.ediktsdatei.justiz.gv.at/edikte/ex/exedi3.nsf/'
  var feed =  UrlFetchApp.fetch(base + 'suche?OpenForm&subf=e&query=%28%5BVKat%5D%3DEH%20%7C%20%5BVKat%5D%3DZH%20%7C%20%5BVKat%5D%3DMH%20%7C%20%5BVKat%5D%3DMW%20%7C%20%5BVKat%5D%3DMSH%20%7C%20%5BVKat%5D%3DGGH%20%7C%20%5BVKat%5D%3DRH%20%7C%20%5BVKat%5D%3DHAN%20%7C%20%5BVKat%5D%3DWE%20%7C%20%5BVKat%5D%3DEW%20%7C%20%5BVKat%5D%3DMAI%20%7C%20%5BVKat%5D%3DDTW%20%7C%20%5BVKat%5D%3DDGW%20%7C%20%5BVKat%5D%3DGA%20%7C%20%5BVKat%5D%3DGW%20%7C%20%5BVKat%5D%3DUL%20%7C%20%5BVKat%5D%3DBBL%20%7C%20%5BVKat%5D%3DLF%20%7C%20%5BVKat%5D%3DGL%20%7C%20%5BVKat%5D%3DSE%20%7C%20%5BVKat%5D%3DSO%29%20AND%20%5BBL%5D%3D0').getContentText();

       var d = document.createElement('div'); //assuming you can do this
       d.innerHTML = feed;//turn the fetched text into a DOM structure
       var arr = d.getElementsByTagName('a') //all links on the page; iterated below
       var response = "";
       for(var i = 0;i<arr.length;i++){
         var atr = arr[i].getAttribute('onclick');
         if(atr) atr = atr.match(/subOpen\((.*?)\)/) //if onclick calls subOpen; group 1 is the raw argument text between the parentheses
         if(atr && atr.length > 1){ //get the id
            var detail = UrlFetchApp.fetch(base + '0/'+atr[1]).getContentText();
            response += detail//process the relevant part of the content and append to the response text
         }
        }      
       return ContentService.createTextOutput(response);
}

但是,运行方法时出错:

ReferenceError: "document" is not defined. (line 6, file "")

document的对象是什么?

我已使用网络应用更新了Google Spreadsheet

感谢您的回复!

1 个答案:

答案 0 :(得分:6)

您可以使用Firebug来检查页面内容和javascript。例如,您可以发现subOpen实际上是xmlhttp01.js中声明的subOpenXML的别名。

/**
 * Opens a document found in the search view by sending an asynchronous GET
 * request for it.
 *
 * Reads names defined outside this function (waiting, bittewar, dynDoc,
 * docLinks, html_delim, httpreq, onreadystatechange, firstTextChild) and
 * assigns thisdocnum, waiting and title2src.
 *
 * @param unid  document id; compared against the entries of docLinks and
 *              used as the first part of the requested URL
 */
function subOpenXML(unid) {/*open found doc from search view*/
 // waiting is set to true after send() below; while it is set, only show the
 // bittewar message and do nothing else.
 if (waiting) return alert(bittewar);
 var wState = dynDoc.getElementById('windowState');
 wState.value = 'H';/*httpreq pending*/
 var last = '';
 // Append &f=1 when unid is the first entry of docLinks and &l=1 when it is
 // the last one; thisdocnum receives the 1-based position of the entry.
 if (unid==docLinks[0]) {last += '&f=1'; thisdocnum = 1;}
 if (unid==docLinks[docLinks.length-1]) {
  last += '&l=1';
  thisdocnum = docLinks.length;
 } else {
  // Not the last entry: scan the entries between the first and the last.
  for (var i=1;i<docLinks.length-1;i++)
   if (unid==docLinks[i]) {thisdocnum = i+1; break;}
 }
 // Relative URL: <unid><html_delim>OpenDocument[&f=1][&l=1]&bm=2
 var url = unid + html_delim + 'OpenDocument'+last + '&bm=2';
 // The request goes to '0/' + url; the third argument (true) makes it asynchronous.
 httpreq.open('GET',    // &rand=' + Math.random();
  /*'/edikte/test/ex/exedi31.nsf/0/'+*/ '0/'+url, true);
 httpreq.onreadystatechange=onreadystatechange;
// httpreq.setRequestHeader('Accept','text/xml');
 httpreq.send(null);
 waiting = true;
 // Store the node value of the first text child of the 'title2' element.
 title2src = firstTextChild(dynDoc.getElementById('title2')).nodeValue;
}

因此,在复制函数源并在firebug的控制台选项卡中修改它之后,在http调用之前添加console.log(url),如下所示:

 var url = unid + html_delim + 'OpenDocument'+last + '&bm=2';
 // Added for debugging: print the relative URL before the request is opened.
 console.log(url)
 httpreq.open('GET',    // &rand=' + Math.random();
  /*'/edikte/test/ex/exedi31.nsf/0/'+*/ '0/'+url, true);

您可以在firebug的Console选项卡中执行该函数声明,用修改后的源代码覆盖subOpen。然后点击链接,就会看到被调用的URL由作为参数传入的id加上前缀 '0/' 组成,因此在您发布的示例中,它将是如下的GET请求:

http://www.ediktsdatei.justiz.gv.at/edikte/ex/exedi3.nsf/0/1fd2313c2e0095bfc1257e49004170ca?OpenDocument&f=1&bm=2

您还可以通过打开firebug中的“网络”标签并单击链接来验证这一点。

因此,要抓取详细信息页面,您需要:
  1. 解析传递给subOpen的id
  2. 请求由 '0/' 加上该id组成的URL
  3. 解析请求响应
  4. 在firebug的网络选项卡中查看请求响应显示,您可能需要进行类似的解析才能真正获得所显示的内容,但我还没有深入了解它。

    **更新**:importHTML函数不适合您想要的那种抓取。Google的HTMLContent服务更适合这种情况。您需要创建一个web app并实现doGet函数:

    /**
     * Web-app GET handler: downloads the search overview page, extracts every
     * document id passed to subOpen('<id>') in the raw HTML, fetches the detail
     * page for each id and returns all detail pages concatenated as text output.
     *
     * Works on the response text with regular expressions only, so no DOM
     * (`document`) is required.
     *
     * @param {Object} e  request event object (not used in the body)
     * @returns the value of ContentService.createTextOutput() holding the
     *          concatenated detail page contents (empty when no id was found)
     */
    function doGet(e){
      var base = 'http://www.ediktsdatei.justiz.gv.at/edikte/ex/exedi3.nsf/';
      var feed = UrlFetchApp.fetch(base + 'suche?OpenForm&subf=e&query=%28%5BVKat%5D%3DEH%20%7C%20%5BVKat%5D%3DZH%20%7C%20%5BVKat%5D%3DMH%20%7C%20%5BVKat%5D%3DMW%20%7C%20%5BVKat%5D%3DMSH%20%7C%20%5BVKat%5D%3DGGH%20%7C%20%5BVKat%5D%3DRH%20%7C%20%5BVKat%5D%3DHAN%20%7C%20%5BVKat%5D%3DWE%20%7C%20%5BVKat%5D%3DEW%20%7C%20%5BVKat%5D%3DMAI%20%7C%20%5BVKat%5D%3DDTW%20%7C%20%5BVKat%5D%3DDGW%20%7C%20%5BVKat%5D%3DGA%20%7C%20%5BVKat%5D%3DGW%20%7C%20%5BVKat%5D%3DUL%20%7C%20%5BVKat%5D%3DBBL%20%7C%20%5BVKat%5D%3DLF%20%7C%20%5BVKat%5D%3DGL%20%7C%20%5BVKat%5D%3DSE%20%7C%20%5BVKat%5D%3DSO%29%20AND%20%5BBL%5D%3D0').getContentText();
      var response = "";
      // All subOpen('<id>') calls in the page; match() yields null when there are none.
      var match = feed.match(/subOpen\('.*?'\)/g);
      if(match){
        for(var i = 0; i < match.length; i++){
          // Group 1 is the id between the quotes.
          var m = match[i].match(/\('(.*)'\)/);
          if(m && m.length > 1){
            // fetch() returns a response object; getContentText() is needed to
            // obtain the page body as a string before appending it.
            var detailText = UrlFetchApp.fetch(base + '0/' + m[1]).getContentText();
            // Append the raw detail page; extract the relevant part here if
            // only a portion of the page is wanted.
            response += detailText;
          }
        }
      }
      return ContentService.createTextOutput(response);
    }