使用 Google Apps 脚本登录外部网站以执行网页抓取

时间:2021-07-10 12:00:01

标签: authentication google-apps-script cookies urlfetch

我正在尝试使用 Google Apps 脚本 (GAS) 登录外部网站 https://login.test-aankoop.be/SignIn?wa=wsignin1.0&wtrealm=eur://euroconsumers.pro.flinesc.nl-be/。目标是验证和执行网络抓取。 我已经完成了研究,编写了代码并在下面概述了一些信息。

一般步骤

  1. GET:使用 UrlFetchApp.fetch(url) 方法请求该网址,并从响应标头中取出(多个)Set-Cookie 标头(请参阅下文的“GET 原始响应标头”)。
  2. POST:使用 UrlFetchApp.fetch(url, parameters) 方法的高级参数提交表单。高级参数包括:
  • headers:其中包含一个与浏览器请求标头中形式相同的 cookie(请参阅下文的“POST 原始请求标头”以及 GAS 代码中的 GetConstructedCookie(responseSetCookie))。
  • payload:表单数据。请注意,表单数据还包含键 __RequestVerificationToken,其值由 GAS 代码中的 GetReqVerTokenFromHtml(response) 方法提取。
  3. GET:再次获取该网址并分析返回的 HTML(判断登录是否成功)。

问题

在浏览器中,使用无效凭据登录会导致显示 div .error-panel。 但是,当我运行 GAS 代码时,页面中不存在 div .error-panel。此外,网页的标题是“Fout”(英文 = 错误,另见 GAS 输出)意味着身份验证有问题。有人可以帮助我吗?我的 GAS 代码中遗漏了什么?

在浏览器中,无效凭据会导致警告 div .error-panel

enter image description here

HTML(来自 Firefox Web 开发者工具)

<div class="error-panel">De gebruikersnaam waarmee je probeert aan te melden is bij ons onbekend. Je moet eerst <a href="...')">een account aanmaken</a> voor je kunt aanmelden.</div>

GAS

代码

/**
 * Attempts to sign in to login.test-aankoop.be and logs whether the server
 * answered with the "invalid credentials" error panel.
 *
 * Flow:
 *   1. GET the login page to obtain the Set-Cookie headers and the
 *      anti-forgery token embedded in the form.
 *   2. POST the login form, replaying the cookies and the token.
 *   3. Parse the returned HTML and look for a `.error-panel` element.
 *
 * All results are written to the Apps Script log; nothing is returned.
 * Any exception raised along the way is caught and logged.
 */
function login() {
  const url = "https://login.test-aankoop.be/SignIn?wa=wsignin1.0&wtrealm=eur://euroconsumers.pro.flinesc.nl-be/"; //URL with login form (Dutch)

  try {
    //1. fetch the login page (make a GET)
    Logger.log("make a GET for %s", url);
    const loginPage = UrlFetchApp.fetch(url);

    // getAllHeaders() returns an attribute/value map of the response headers;
    // headers that have multiple values are returned as arrays.
    const responseSetCookie = loginPage.getAllHeaders()['Set-Cookie'];
    Logger.log("Response headers - raw Set-Cookie \n %s", responseSetCookie);

    const verificationToken = GetReqVerTokenFromHtml(loginPage);
    if (!verificationToken) {
      // Posting without the token cannot succeed; fail early with a clear message.
      throw new Error("__RequestVerificationToken not found in the login form");
    }

    // The form submits "RememberMe" twice (true, then false), exactly as the
    // browser's raw request shows. An object literal cannot hold a repeated key
    // (the last one silently wins), so the body is built as an explicit
    // application/x-www-form-urlencoded string from ordered name/value pairs.
    const formFields = [
      ["__RequestVerificationToken", verificationToken],
      ["Identification", "bob@example.com"], //not a real e-mail address
      ["Password", "ThePassword"], //not a real password
      ["RememberMe", "true"],
      ["RememberMe", "false"]
    ];
    const payload = formFields
      .map(([name, value]) => encodeURIComponent(name) + "=" + encodeURIComponent(value))
      .join("&");

    const headers = {
      "Accept" : "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
      "Accept-Encoding": "gzip, deflate, br",
      "Accept-Language": "en-US,en;q=0.5",
      "Connection": "keep-alive",
      "Cookie": GetConstructedCookie(responseSetCookie), //include the constructed cookie in the header for the POST
      "Origin": "https://login.test-aankoop.be",
      "Referer": "https://login.test-aankoop.be/SignIn?wa=wsignin1.0&wtrealm=eur%3A%2F%2Feuroconsumers.pro.flinesc.nl-be%2F",
      "TE": "Trailers",
      "Upgrade-Insecure-Requests":"1",
      "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0"
    };

    // No explicit contentType: UrlFetchApp defaults to
    // application/x-www-form-urlencoded, which matches the string payload.
    const parameters = {
      "headers": headers,
      "method" : "post",
      "payload": payload
    };

    //2. submit the form (make a POST)
    Logger.log("make a POST for %s", url);
    const loginResult = UrlFetchApp.fetch(url, parameters); //Note: even with invalid credentials the server returns status 200

    //3. a div with class 'error-panel' appears when user tries to authenticate with invalid credentials
    const $ = Cheerio.load(loginResult.getContentText()); // local binding, not an implicit global
    if ($('.error-panel').html() == null) {
      Logger.log("No error panel found!");
      Logger.log("Title of page is %s", $("title").text()); //page title is fout ("fout" means Error in English)
    } else {
      Logger.log("OK error panel found");
    }
  } catch (ex) {
    Logger.log("something went wrong...\n %s", ex);
  }
}

/**
 * Reads the anti-forgery token out of the login form contained in a fetched page.
 *
 * The response body is parsed with Cheerio, the form with class
 * `form-horizontal` is located, and the value of its
 * `__RequestVerificationToken` input is logged and returned.
 *
 * @param {Object} response - object exposing getContentText(), as returned by UrlFetchApp.fetch.
 * @returns {*} whatever Cheerio's val() yields for the token input.
 */
function GetReqVerTokenFromHtml(response) {
    const page = Cheerio.load(response.getContentText());
    const loginForm = page("form.form-horizontal");
    const tokenInput = loginForm.find("input[name='__RequestVerificationToken']");
    const token = tokenInput.val();

    Logger.log("the __RequestVerificationToken of the form is %s \t", token);
    return token;
}

/**
 * Builds the value of a `Cookie` request header from `Set-Cookie` response
 * header values.
 *
 * Only the leading `name=value` pair of each Set-Cookie entry is kept
 * (attributes such as path/HttpOnly are dropped), duplicates are removed while
 * preserving first-seen order, and `ecpolicy=1` is appended twice to mirror the
 * Cookie header captured from the browser's POST request.
 *
 * @param {string[]|string|null|undefined} responseSetCookie - the 'Set-Cookie'
 *   entry of HTTPResponse.getAllHeaders(): an array when the header occurs
 *   several times, a single string when it occurs once, or undefined/null when
 *   the response set no cookies.
 * @returns {string} cookie pairs joined with "; ".
 */
function GetConstructedCookie(responseSetCookie) {
  // Normalise the three possible shapes to an array so a lone string is not
  // iterated character by character and a missing header does not throw.
  let setCookieHeaders;
  if (Array.isArray(responseSetCookie)) {
    setCookieHeaders = responseSetCookie;
  } else if (responseSetCookie == null || responseSetCookie === "") {
    setCookieHeaders = [];
  } else {
    setCookieHeaders = [responseSetCookie];
  }

  // Split on ';' alone and trim: attributes are not guaranteed to be separated by "; ".
  const cookiePairs = setCookieHeaders
    .map((header) => String(header).split(';')[0].trim())
    .filter((pair) => pair !== "");

  // A Set keeps only unique pairs, in insertion order.
  const uniquePairs = Array.from(new Set(cookiePairs));
  uniquePairs.push("ecpolicy=1", "ecpolicy=1");
  const constructedCookie = uniquePairs.join("; ");

  // Colon placed before the value so the logged cookie does not appear to end in ':'.
  Logger.log("Constructed cookie: \n %s", constructedCookie);

  return constructedCookie;
}

输出

make a GET for https://login.test-aankoop.be/SignIn?wa=wsignin1.0&wtrealm=eur://euroconsumers.pro.flinesc.nl-be/
Response headers - raw Set-Cookie 
 [ASP.NET_SessionId=a3sk22fbb0kbunu1dlbrmabj; path=/; HttpOnly; SameSite=None; Secure, ASP.NET_SessionId=a3sk22fbb0kbunu1dlbrmabj; path=/; HttpOnly; SameSite=None; Secure, beanContextCookie=latestMessage=d2E9d3NpZ25pbjEuMCZ3dHJlYWxtPWV1ciUzYSUyZiUyZmV1cm9jb25zdW1lcnMucHJvLmZsaW5lc2MubmwtYmUlMmY; path=/; HttpOnly; SameSite=None; Secure, __RequestVerificationToken=gKk6_0AFD9R9rXPAUsRh0LqDrG1-7JlrmHPAiBUX-wz0ojhKaacF3Yt9NFZvWggyv7ysv6cm4XGkbbKB6kFrMmRr1FgIEfKqup6_AD_luX41; path=/; HttpOnly; SameSite=None; Secure]
the __RequestVerificationToken of the form is 08bqWPQ995Sbm1qlqJCp8a1qkV-pvzfSDnUTnVEEg-M6NhQmcpNV_XXizKlCKsiKmyMrTpdb2xuW7witkILktmYsLqPNHIFeSAfQrS64qWk1
Constructed cookie 
 ASP.NET_SessionId=a3sk22fbb0kbunu1dlbrmabj; beanContextCookie=latestMessage=d2E9d3NpZ25pbjEuMCZ3dHJlYWxtPWV1ciUzYSUyZiUyZmV1cm9jb25zdW1lcnMucHJvLmZsaW5lc2MubmwtYmUlMmY; __RequestVerificationToken=gKk6_0AFD9R9rXPAUsRh0LqDrG1-7JlrmHPAiBUX-wz0ojhKaacF3Yt9NFZvWggyv7ysv6cm4XGkbbKB6kFrMmRr1FgIEfKqup6_AD_luX41; ecpolicy=1; ecpolicy=1:
make a POST for https://login.test-aankoop.be/SignIn?wa=wsignin1.0&wtrealm=eur://euroconsumers.pro.flinesc.nl-be/
No error panel found!
Title of page is Fout

标头

GET 原始响应标头

HTTP/2 200 OK
date: Sat, 10 Jul 2021 06:28:15 GMT
content-type: text/html; charset=utf-8
cache-control: private
vary: Accept-Encoding
p3p: CP="NONE"
set-cookie: ASP.NET_SessionId=...; path=/; HttpOnly; SameSite=None; Secure
ASP.NET_SessionId=...; path=/; HttpOnly; SameSite=None; Secure
beanContextCookie=latestMessage=...; path=/; HttpOnly; SameSite=None; Secure
__RequestVerificationToken=...; path=/; HttpOnly; SameSite=None; Secure
x-aspnetmvc-version: 5.2
x-frame-options: ALLOW-FROM https://www.test-aankoop.be/ https://*.test-aankoop.be
content-security-policy: upgrade-insecure-requests; frame-ancestors https://www.test-aankoop.be/ https://*.test-aankoop.be http://*.conseur.org
x-aspnet-version: 4.0.30319
request-context: appId=cid-v1:40f5f67e-1270-480e-b3cc-f108255e2977
access-control-expose-headers: Request-Context
x-powered-by: ASP.NET
cf-cache-status: DYNAMIC
expect-ct: max-age=604800, report-uri="https://report-uri.cloudflare.com/cdn-cgi/beacon/expect-ct"
strict-transport-security: max-age=15552000; includeSubDomains; preload
x-content-type-options: nosniff
server: cloudflare
cf-ray: 66c7bb19aa842dd6-BRU
content-encoding: br
alt-svc: h3-27=":443"; ma=86400, h3-28=":443"; ma=86400, h3-29=":443"; ma=86400, h3=":443"; ma=86400
X-Firefox-Spdy: h2

POST 原始请求标头

POST /SignIn?wa=wsignin1.0&wtrealm=eur%3A%2F%2Feuroconsumers.pro.flinesc.nl-be%2F%22 HTTP/1.1
Cookie: ASP.NET_SessionId=...; beanContextCookie=latestMessage=...; __RequestVerificationToken=...; ecpolicy=1; ecpolicy=1
Host: login.test-aankoop.be
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
Accept-Language: en-US,en;q=0.5
Accept-Encoding: gzip, deflate, br
Content-Type: application/x-www-form-urlencoded
Content-Length: 212
Origin: https://login.test-aankoop.be
DNT: 1
Connection: keep-alive
Referer: https://login.test-aankoop.be/SignIn?wa=wsignin1.0&wtrealm=eur://euroconsumers.pro.flinesc.nl-be/%22
Upgrade-Insecure-Requests: 1

表单

HTML 代码段

<form action="/SignIn?wa=wsignin1.0&amp;wtrealm=eur%3A%2F%2Feuroconsumers.pro.flinesc.nl-be%2F%22" class="form-horizontal" method="post">
<input name="__RequestVerificationToken" type="hidden" value="...">
<input name="Identification" type="text" > <!-- username or e-mail address-->
<input name="Password" type="password">
<input checked="checked" name="RememberMe" type="checkbox" value="true">
<input type="submit" class="btn" id="LoginButton">
</form>

原始数据请求

__RequestVerificationToken=...&Identification=...&Password=...&RememberMe=true&RememberMe=false

0 个答案:

没有答案
相关问题