结构化JSON项目的正则表达式匹配

时间:2019-05-06 14:21:00

标签: javascript json regex google-apps-script google-sheets

我遇到一个问题,我试图获取一个长的多行字符串,在正则表达式提取元素之前,使用一堆正则表达式替换使其成为(接近但并非完美)JSON格式。

该字符串是针对某个域名查询得到的原始Whois数据,我正在尝试从中提取到期日期等信息。(Whois有很多细节我暂时不予考虑,例如1200多个不同的顶级域名(TLD)各有自己的Whois服务器,返回的格式也各不相同,等等。)

我的函数在处理单个域名时似乎工作正常,但是当我在for循环中调用该函数时,得到的结果非常不稳定。我不确定这是与正则表达式的某种限制有关,还是与Apps Script中的某些机制有关,或者是我自己哪里写错了。

初始处理后的示例字符串:

(var newString =)

  

{"DomainName":"WANITAMALAS.COM","RegistryDomainID":"2115163838_DOMAIN_COM-VRSN","RegistrarWHOISServer":"whois.PublicDomainRegistry.com","RegistrarURL":"www.publicdomainregistry.com","UpdatedDate":"2019-04-14T10:02:49Z","CreationDate":"2017-04-17T15:35:40Z","RegistryExpiryDate":"2020-04-17T15:35:40Z","Registrar":"PDRLtd.dbaPublicDomainRegistry.com","RegistrarIANAID":"303","RegistrarAbuseContactEmail":"abuse-contact@publicdomainregistry.com","RegistrarAbuseContactPhone":"+1.2013775952","DomainStatus":"clientTransferProhibitedicann.orgepp#clientTransferProhibited","NameServer1":"BAYAN.NS.CLOUDFLARE.COM","NameServer2":"LORNA.NS.CLOUDFLARE.COM","DNSSEC":"unsigned","URLoftheICANNWhoisInaccuracyComplaintForm":"www.icann.orgwicf",">>>Lastupdateofwhoisdatabase":"2019-05-06T13:55:16Z<<<","FormoreinformationonWhoisstatuscodespleasevisiticann.orgepp","NOTICE":"Theexpirationdatedisplayedinthisrecordisthedatethe","registrar'ssponsorshipofthedomainnameregistrationintheregistryis","currentlysettoexpire.Thisdatedoesnotnecessarilyreflecttheexpiration","dateofthedomainnameregistrant'sagreementwiththesponsoring","registrar.Usersmayconsultthesponsoringregistrar'sWhoisdatabaseto","viewtheregistrar'sreporteddateofexpirationforthisregistration.","TERMSOFUSE":"YouarenotauthorizedtoaccessorqueryourWhois","databasethroughtheuseofelectronicprocessesthatarehigh-volumeand","automatedexceptasreasonablynecessarytoregisterdomainnamesor","modifyexistingregistrations;theDatainVeriSignGlobalRegistry","Services'(\"VeriSign\")WhoisdatabaseisprovidedbyVeriSignfor","informationpurposesonlyandtoassistpersonsinobtaininginformation","aboutorrelatedtoadomainnameregistrationrecord.VeriSigndoesnot","guaranteeitsaccuracy.BysubmittingaWhoisqueryyouagreetoabide","bythefollowingtermsofuse":"YouagreethatyoumayusethisDataonly","forlawfulpurposesandthatundernocircumstanceswillyouusethisData","to":"(1)allowenableorotherwisesupportthetransmissionofmass","unsolicitedcommercialadvertisingorsolicitationsviae-mailtelephone","orfacsimile;or(2)enablehighvolumeautomatedelectronicprocesses","thatapplytoVeriSign(oritscomputersystems).Thecompilation","repackagingdisseminationorotheruseofthisDataisexpressly","prohibitedwithoutthepriorwrittenconsentofVeriSign.Youagreenotto","useelectronicprocessesthatareautomatedandhigh-volumetoaccessor","querytheWhoisdatabaseexceptasreasonablynecessarytoregister","domainnamesormodifyexistingregistrations.VeriSignreservestheright","torestrictyouraccesstotheWhoisdatabaseinitssolediscretiontoensure","operationalstability.VeriSignmayrestrictorterminateyouraccesstothe","Whoisdatabaseforfailuretoabidebythesetermsofuse.VeriSign","reservestherighttomodifythesetermsatanytime.","TheRegistrydatabasecontainsONLY.COM.NET.EDUdomainsand","Registrars.",""}

我的函数:

/**
 * Runs the Whois lookup once for every entry in `urlsList`.
 *
 * `urlsList` is not defined in this snippet; it is presumably produced by the
 * elided spreadsheet code marked by the first placeholder comment (TODO confirm).
 * Each lookup is called with the status "-", which whoisDataLookup treats as
 * "no error from an earlier step".
 */
function processAllDomains() {
    //Bunch of Sheets stuff here...//
    // The increment needs an operand: a bare `++` is a syntax error.
    for (var i = 0; i < urlsList.length; ++i) {
        whoisDataLookup(urlsList[i], "-"); // Second argument "Status" can be an error from a previous step in the process
    }
    //Do stuff//
}
/**
 * Fetches the Whois record for one domain and flattens it into a row of values.
 *
 * Row layout (always 13 entries):
 *   [domain, status, lookup Date, raw whois string, pseudo-JSON string,
 *    Registrar, RegistrarURL, UpdatedDate, CreationDate, RegistryExpiryDate,
 *    NameServer1, NameServer2, DNSSEC]
 * Any value that could not be determined is the placeholder "-".
 *
 * @param {string} domain - Domain name; appended to the Whois service base URL.
 * @param {string} status - "-" when no earlier step failed. Any other value is
 *     treated as an existing error: no fetch happens and the rest of the row is "-".
 * @returns {Array} The row described above. If the clean-up step throws, the
 *     status slot (index 1) is replaced with "WhoisLookupError".
 */
function whoisDataLookup(domain, status) {
    var whoisData = [domain, status, new Date()];
    var CAYKwhoIsServer = "https://myHerokuWhoisApp.com/";

    // One pattern per output column, in column order. Capture group 1 is the value.
    var regexChecks = [
        /(?:Registrar|RegistryDomainID|SponsoringRegistrar)":"(.*?)",/gim,
        /(?:RegistrarWHOISServer|RegistrarURL)":"(.*?)",/gim,
        /(?:UpdatedDate|Lastupdatedon)":"(.*?)",/gim,
        /(?:CreationDate|created|RegistrationTime)":"(.*?)",/gim,
        /(?:RegistryExpiryDate|paid-till|ExpirationTime)":"(.*?)",/gim,
        /NameServer1":"(.*?)",/gim,
        /NameServer2":"(.*?)",/gim,
        /DNSSEC":"(.*?)",/gim
    ];

    var fetchURL = CAYKwhoIsServer + domain;
    Logger.log('fetchURL is: ' + fetchURL);

    if (status !== "-") { // If not null mark, then an error exists, so null the whole row
        // +2 covers the raw-string and pseudo-JSON columns that precede the metrics.
        for (var d = 0; d < regexChecks.length + 2; ++d) {
            whoisData.push("-");
        }
        return whoisData;
    }

    // just a url fetch, but saves the fetched content to Google drive, then if the same url is
    // requested in future, pull the drive content, instead of fetching it from the web again.
    var whoisFetch = UrlFetchAppCacheToDrive(fetchURL, domain + "--whois");
    var whoisJSON = JSON.parse(whoisFetch);

    // the whois pulls other stuff, only the result is the actual whois string
    var whoisString = JSON.stringify(whoisJSON["result"]);
    if (whoisString === undefined) { // JSON.stringify(undefined) yields undefined, not a string
        whoisString = "-";
    }

    whoisData.push(whoisString);

    var newString;
    try {
        whoisString = whoisString.replace(/ /g, '');
        whoisString = whoisString.replace(/\,/g, '');

        if (whoisString.match("WhoisdServerVersion")) {
            whoisString = whoisString.replace(/.*WhoisdServerVersion/, '"WhoisdServerVersion');
            whoisString = whoisString.replace(/\(.*\)/, '');
        }

        // Start a bunch of whois string cleanup, working towards a flat "key":"value" list.
        // NOTE(review): this pattern has no capture group, so '$1' is inserted literally — confirm intent.
        whoisString = whoisString.replace(/.*\(\w\:\)/, '$1');
        whoisString = whoisString.replace(/\%/g, '');
        whoisString = whoisString.replace(/\\r/gi, '');
        // Three passes, each collapsing pairs of escaped newlines into one.
        for (var pass = 0; pass < 3; ++pass) {
            whoisString = whoisString.replace(/\\n\\n/g, '\\n');
        }
        whoisString = whoisString.replace(/\\n$/, '');
        whoisString = whoisString.replace(/https?:\/\//gi, '');
        whoisString = whoisString.replace(/\//gi, '');
        // temporarily replace some COLON (:), to be sure we do not split up the string based on
        // IPV6 ip addresses, which sometimes appear in whois string.
        whoisString = whoisString.replace(/(\d\d)\:/gi, "$1qqqqqqqqqq");
        // Number the name-server keys: the first occurrence becomes NameServer1, the next
        // seven unnumbered occurrences become NameServer2..NameServer8.
        whoisString = whoisString.replace(/NameServers?:/i, 'NameServer1:');
        for (var ns = 2; ns <= 8; ++ns) {
            whoisString = whoisString.replace(/NameServer:/i, 'NameServer' + ns + ':');
        }
        whoisString = whoisString.replace(/:/g, '":"');
        whoisString = whoisString.replace(/":"\\n/g, '":"');
        whoisString = whoisString.replace(/qqqqqqqqqq/gi, ":");
        whoisString = whoisString.replace(/\\n/gi, '","');

        newString = "{" + whoisString + "}";
    } catch (err) {
        whoisData.splice(1, 1, "WhoisLookupError");
    }

    whoisData.push(newString ? newString : '-');

    for (var k = 0; k < regexChecks.length; ++k) {
        Utilities.sleep(50);
        var regexCheck = regexChecks[k];
        // A /g regex resumes exec() from its lastIndex. If the same RegExp object is ever
        // reused (for example by an engine that shares regex literals between calls), the
        // search would start mid-string and miss values that are present. Always start at 0.
        regexCheck.lastIndex = 0;
        var metric = newString ? regexCheck.exec(newString) : null;

        if (metric !== null) {
            Logger.log(regexCheck + ' metric found is: ' + metric[1]);
            whoisData.push(metric[1]);
        } else {
            Logger.log(regexCheck + ' metric not found');
            whoisData.push('-');
        }
    }
    return whoisData;
}

Example Output showing erratic regex matching, when values exist and when regex101.com shows the regex should have matched.

这是单个正则表达式匹配的示例,其结果被记录为undefined:(我并不真正担心多次匹配的情况,目前只需要先得到一些可用的结果即可) Here is an example of a single regex match, which logged as undefined

1 个答案:

答案 0 :(得分:1)

我建议继续朝着把它变成有效JSON的方向努力。您提供的字符串已经基本是有效的JSON,只有一种反复出现的模式例外:

一些长字符串文字似乎被分成几个用逗号分隔的字符串文字。如果您只需将这些序列包装在方括号中,它们就会形成有效的数组。

这是如何工作的:

// The string you provided in the question.
// It must stay on ONE line: a raw line break inside this template literal would put an
// unescaped newline into a JSON string value, and JSON.parse rejects that.
const newString = `{"DomainName":"WANITAMALAS.COM","RegistryDomainID":"2115163838_DOMAIN_COM-VRSN","RegistrarWHOISServer":"whois.PublicDomainRegistry.com","RegistrarURL":"www.publicdomainregistry.com","UpdatedDate":"2019-04-14T10:02:49Z","CreationDate":"2017-04-17T15:35:40Z","RegistryExpiryDate":"2020-04-17T15:35:40Z","Registrar":"PDRLtd.dbaPublicDomainRegistry.com","RegistrarIANAID":"303","RegistrarAbuseContactEmail":"abuse-contact@publicdomainregistry.com","RegistrarAbuseContactPhone":"+1.2013775952","DomainStatus":"clientTransferProhibitedicann.orgepp#clientTransferProhibited","NameServer1":"BAYAN.NS.CLOUDFLARE.COM","NameServer2":"LORNA.NS.CLOUDFLARE.COM","DNSSEC":"unsigned","URLoftheICANNWhoisInaccuracyComplaintForm":"www.icann.orgwicf",">>>Lastupdateofwhoisdatabase":"2019-05-06T13:55:16Z<<<","FormoreinformationonWhoisstatuscodespleasevisiticann.orgepp","NOTICE":"Theexpirationdatedisplayedinthisrecordisthedatethe","registrar'ssponsorshipofthedomainnameregistrationintheregistryis","currentlysettoexpire.Thisdatedoesnotnecessarilyreflecttheexpiration","dateofthedomainnameregistrant'sagreementwiththesponsoring","registrar.Usersmayconsultthesponsoringregistrar'sWhoisdatabaseto","viewtheregistrar'sreporteddateofexpirationforthisregistration.","TERMSOFUSE":"YouarenotauthorizedtoaccessorqueryourWhois","databasethroughtheuseofelectronicprocessesthatarehigh-volumeand","automatedexceptasreasonablynecessarytoregisterdomainnamesor","modifyexistingregistrations;theDatainVeriSignGlobalRegistry","Services'(\\\"VeriSign\\\")WhoisdatabaseisprovidedbyVeriSignfor","informationpurposesonlyandtoassistpersonsinobtaininginformation","aboutorrelatedtoadomainnameregistrationrecord.VeriSigndoesnot","guaranteeitsaccuracy.BysubmittingaWhoisqueryyouagreetoabide","bythefollowingtermsofuse":"YouagreethatyoumayusethisDataonly","forlawfulpurposesandthatundernocircumstanceswillyouusethisData","to":"(1)allowenableorotherwisesupportthetransmissionofmass","unsolicitedcommercialadvertisingorsolicitationsviae-mailtelephone","orfacsimile;or(2)enablehighvolumeautomatedelectronicprocesses","thatapplytoVeriSign(oritscomputersystems).Thecompilation","repackagingdisseminationorotheruseofthisDataisexpressly","prohibitedwithoutthepriorwrittenconsentofVeriSign.Youagreenotto","useelectronicprocessesthatareautomatedandhigh-volumetoaccessor","querytheWhoisdatabaseexceptasreasonablynecessarytoregister","domainnamesormodifyexistingregistrations.VeriSignreservestheright","torestrictyouraccesstotheWhoisdatabaseinitssolediscretiontoensure","operationalstability.VeriSignmayrestrictorterminateyouraccesstothe","Whoisdatabaseforfailuretoabidebythesetermsofuse.VeriSign","reservestherighttomodifythesetermsatanytime.","TheRegistrydatabasecontainsONLY.COM.NET.EDUdomainsand","Registrars.",""}`;

// Turn the series of string literals into arrays of strings:
// after a `":`, capture a string literal followed by one or more comma-separated string
// literals, stopping before any literal that is itself followed by `:` (that one is the
// next key), and wrap the captured run in square brackets.
const json = newString.replace(/":\s*("(?:\\.|[^"])*"(?:,\s*"(?:\\.|[^"])*")+)(?!\s*:)/g, '": [$1]');

// Parse
const obj = JSON.parse(json);

console.log(obj);

某些键/值看起来有些奇怪,因为它们似乎只是已删除空白的较长文本的一部分。如果对(未知)原始字符串进行不同的处理,可能会获得更好的结果。但是,该对象至少可以让您轻松提取所需的内容:

// Print individual fields of the parsed object, looked up by key.
["DomainName", "UpdatedDate"].forEach(function (key) {
    console.log(obj[key]);
});
// ...