Javascript正则表达式忽略第一个捕获组

时间:2015-12-20 15:31:15

标签: javascript regex

之前我问过一个类似的问题,当时以为已经得到了正确答案,但后来发现我捕获到了一些不该捕获的字符串。

我正在尝试解析一个大文本文件并使用正则表达式拉出某些元素。我正在为我的网站使用Node,所以我在Javascript中这样做。

在下面的示例中,我尝试匹配 10 个带有逗号和句点的数字字符串。在第一个示例中,模式本身匹配正确,但多捕获了两个不需要的字符串(我只想要以“4 0000 ....”开头的行末尾的数字)。

https://regex101.com/r/nO8nM1/8

在此示例中,我匹配字符串的正确实例,但我无法忽略第一个捕获组,因此包含其他字符和空格。

https://regex101.com/r/uB6hE4/1

正则表达式:

/(\d+,\d+.\d+)(?=")|(\d+,\d+,\d+.\d+)(?=")/gm

示例数据:

                  23205        - Grants Current-County Operatin                        4,425,327.00"

"    4   0000047387         Central Equatoria State          1003-1478 Sta Hosp Oper Oct                   85,784.00"
"    4   0000047442         EASTERN EQUATORIA ST             1003-1479 Sta Hosp Oper Oct                   93,137.00"
"    4   0000047485         JONGLEI STATE                    1003-1519 Sta Hosp Oper Oct                  144,608.00"
"    4   0000047501         Lakes State                      1003-1482 Sta Hosp Oper Oct                   93,137.00"
"    4   0000047528         Unity State                      1003-1484 Sta Hosp Oper Oct                   75,980.00"
"    4   0000047532         Northern Bahr-el State           1003-1483 Sta Hosp Oper Oct                   58,824.00"
"    4   0000047615         Western E State                  1003-1488 Sta Hosp Oper Oct                   93,137.00"
"    4   0000047638         Warap State                      1003-1486 Sta Hosp Oper Oct                   51,471.00"
"    4   0000047680         Upper Nile State                 1003-1485 Capitation                  102,941.00"
"    4   0000047703         Western BG State                 1003-1487 Sta Hosp Oper Oct                   34,314.00"
                                                                                             ----------------------
"        Total For Period          4                                                                      833,333.00"
 ----------------------------------------------------------------------------------------------------------------------------
 Fiscal Year        2015/16                               Republic Of South Sudan                         Date     2015/11/20
 Period                   5                                                                               Time       12:58:40
                                                  FreeBalance Financial Management System                 Page              7
 ----------------------------------------------------------------------------------------------------------------------------
                                                            Vendor Analysis Report

                                                              1091 Health (MOH)
  Prd   Voucher #          Vendor Name                      Description                          Amount
  ---   ----------------   ------------------------------   -----------------------------    ----------------------
                                                                                             ----------------------
"  

(\d+,\d+,\d+.\d+)(?=")

正则表达式2:

/(?:\s\w{3}\s+|Capitation\s+)(\d+,\d+.\d+)(?=")|(?:\s\w{3}\s+|Capitation\s+)(\d+,\d+,\d+.\d+)(?=")/gm

在我的代码中,如果这些值存在,我会把它们推入一个对象数组。我曾尝试只推入我想要的那个捕获组,但结果只推入了匹配结果中对应索引的那一项。

我尝试了 ?:、?=、?! 的几种不同组合,想忽略第二个链接中的第一个分组,但都无济于事。我觉得解决方案应该相当简单,但我就是找不到。有人知道我哪里做错了吗?

我的代码:

/**
 * Change handler for an <input type="file">.
 *
 * Reads the first selected file as text, cuts it into sub-sections, extracts
 * the transfer code, voucher numbers, vendors, descriptions and amounts from
 * each sub-section, converts the amounts to numbers and logs the result.
 *
 * @param {Event} event - change event; `event.target.files[0]` is the file to read.
 */
var openFile = function(event) {
  var input = event.target;
  var reader = new FileReader();

  /**
   * Global-regex matcher that returns, for every match, the first capture
   * group that participated (or the whole match when no group did).
   *
   * `String.prototype.match` with the `g` flag returns whole matches only, so
   * text consumed by a non-capturing `(?:...)` prefix would leak into the
   * result; reading the groups from `exec` avoids that.
   *
   * @param {string} text - text to scan.
   * @param {RegExp} regex - regular expression with the `g` flag set.
   * @returns {Array<string>|null} captured strings, or null when nothing
   *   matched (mirrors `String.prototype.match`).
   */
  function matchCaptured(text, regex) {
    var results = [];
    var found;
    var value;
    var g;
    regex.lastIndex = 0; // exec() is stateful on /g regexes
    while ((found = regex.exec(text)) !== null) {
      // A zero-length match would otherwise loop forever.
      if (found.index === regex.lastIndex) regex.lastIndex++;
      value = found[0];
      for (g = 1; g < found.length; g++) {
        if (found[g] !== undefined) {
          value = found[g];
          break;
        }
      }
      results.push(value);
    }
    return results.length ? results : null;
  }

  /**
   * Recursively removes null/undefined entries in place.
   *
   * Arrays are walked backwards by numeric index so that splicing does not
   * skip elements, and so that the non-index properties of a regex match
   * array (`index`, `input`, `groups`) are never treated as elements.
   *
   * @param {Object|Array} obj - structure to clean.
   */
  function removeNulls(obj) {
    var k;
    if (obj instanceof Array) {
      for (k = obj.length - 1; k >= 0; k--) {
        if (obj[k] === null || obj[k] === undefined) obj.splice(k, 1);
        else if (typeof obj[k] === "object") removeNulls(obj[k]);
      }
      return;
    }
    Object.keys(obj).forEach(function(key) {
      if (obj[key] === null || obj[key] === undefined) delete obj[key];
      else if (typeof obj[key] === "object") removeNulls(obj[key]);
    });
  }

  reader.onload = function() {
    var section = reader.result;
    var transferArray = [];
    var subSectionRegex = /   Total([\s\S]*?)Total|^\s+\d{4,5}([\s\S]*?)Total F/gm;
    var transferCodeRegex = /[0-9]{4,5}/;
    var voucherNumberRegex = /([0-9]{7,10}[\S])(?=\s+)/g;
    var vendorRegex = /(?!\d{10})(\S+\s\S+(\s\S+)?)(?=\s+100)|(?!\d{10})(\S+(\s\S+)?)(?=\s+100)/gm;
    var descriptionRegex = /(?!\d{10})(\S+\s\S+(\s\S+)?)(?=\s+100)|(?!\d{10})(\S+(\s\S+)?)(?=\s+100)|(?!\d{10})(\S+\s(\s\S+)?)(?=\s+100)/g;
    var amountRegex = /(?:\s\w{3}\s+|Capitation\s+)(\d+,\d+.\d+)(?=")|(?:\s\w{3}\s+|Capitation\s+)(\d+,\d+,\d+.\d+)(?=")/gm;
    var oneLineAmountRegex = /(\d+,\d+,\d+.\d+)|\d+,\d+.\d+/g;
    var oneLineDescRegex = / - (\D+)|- \d+(\D+)/gm;
    // Plain object: the result is a keyed record, not a list.
    var allData = {};

    // match() returns null when nothing matches; fall back to an empty list.
    var subSection = (section.match(subSectionRegex) || []).filter(Boolean);

    // Stores the text following the first "Date " as a Date (null if absent).
    function extractDate() {
      var found = section.match(/Date (.*)/);
      allData["uploadDate"] = found ? new Date(found[1].trim()) : null;
    }

    // Stores the first token following the first "Period " as an integer (null if absent).
    function extractPeriod() {
      var found = section.match(/Period (.*)/);
      allData["period"] = found ? parseInt(found[1].trim().split(" ")[0], 10) : null;
    }

    // Builds one transferArray entry per sub-section.
    function extractDetails() {
      var i;
      var part;
      var transferCode;
      var voucherNumber;
      var vendor;
      var description;
      var total;
      for (i = 0; i < subSection.length; i++) {
        part = subSection[i];
        transferCode = part.match(transferCodeRegex);
        voucherNumber = part.match(voucherNumberRegex);
        vendor = part.match(vendorRegex);
        description = part.match(descriptionRegex);
        // Capture groups only, so the "Oct"/"Capitation" prefix is not included.
        total = matchCaptured(part, amountRegex);
        if (transferCode && voucherNumber && vendor && description && total) {
          transferArray.push({
            "transferCode": transferCode,
            "details": [{
              "voucherNumber": voucherNumber,
              "vendor": vendor,
              "description": description,
              "total": total
            }]
          });
        } else {
          // Fallback for sub-sections that do not have the multi-line voucher layout.
          transferArray.push({
            "transferCode": transferCode,
            "details": [{
              "voucherNumber": voucherNumber,
              "description": part.match(oneLineDescRegex),
              "total": part.match(oneLineAmountRegex)
            }]
          });
        }
      }
    }

    // Converts amount strings to numbers and trims descriptions, in place.
    function cleanData() {
      transferArray.forEach(function(entry) {
        entry.details.forEach(function(detail) {
          var j;
          // Either list may have been dropped by removeNulls when nothing matched.
          if (detail.total instanceof Array) {
            for (j = 0; j < detail.total.length; j++) {
              detail.total[j] = parseFloat(detail.total[j].replace(/,/g, ""));
            }
          }
          if (detail.description instanceof Array) {
            for (j = 0; j < detail.description.length; j++) {
              detail.description[j] = detail.description[j].toString().trim();
            }
          }
        });
      });
    }

    extractDate();
    extractPeriod();
    // The details must exist before they can be pruned and cleaned.
    extractDetails();
    removeNulls(transferArray);
    cleanData();

    // Adds detailed data to the result record.
    allData["section"] = transferArray;

    console.log(JSON.stringify(transferArray, null, 2));
    console.log(allData);
  };

  reader.readAsText(input.files[0]);
};

0 个答案:

没有答案