我正在尝试解析一个很大的文本文件,并用正则表达式提取其中的某些元素。我的网站使用 Node,所以我用 JavaScript 来实现。
在下面的示例中,我想匹配 10 个带逗号和小数点的数字字符串。在第一个示例中,我匹配到了正确的模式,但多捕获了两个不需要的字符串(我只想要以“4 0000 ....”开头的那些行末尾的数字)。
https://regex101.com/r/nO8nM1/8
在下面这个示例中,我匹配到了正确的字符串,但无法忽略第一个分组,因此匹配结果中包含了多余的字符和空格。
https://regex101.com/r/uB6hE4/1
正则表达式:
/(\d+,\d+.\d+)(?=")|(\d+,\d+,\d+.\d+)(?=")/gm
示例数据:
23205 - Grants Current-County Operatin 4,425,327.00"
" 4 0000047387 Central Equatoria State 1003-1478 Sta Hosp Oper Oct 85,784.00"
" 4 0000047442 EASTERN EQUATORIA ST 1003-1479 Sta Hosp Oper Oct 93,137.00"
" 4 0000047485 JONGLEI STATE 1003-1519 Sta Hosp Oper Oct 144,608.00"
" 4 0000047501 Lakes State 1003-1482 Sta Hosp Oper Oct 93,137.00"
" 4 0000047528 Unity State 1003-1484 Sta Hosp Oper Oct 75,980.00"
" 4 0000047532 Northern Bahr-el State 1003-1483 Sta Hosp Oper Oct 58,824.00"
" 4 0000047615 Western E State 1003-1488 Sta Hosp Oper Oct 93,137.00"
" 4 0000047638 Warap State 1003-1486 Sta Hosp Oper Oct 51,471.00"
" 4 0000047680 Upper Nile State 1003-1485 Capitation 102,941.00"
" 4 0000047703 Western BG State 1003-1487 Sta Hosp Oper Oct 34,314.00"
----------------------
" Total For Period 4 833,333.00"
----------------------------------------------------------------------------------------------------------------------------
Fiscal Year 2015/16 Republic Of South Sudan Date 2015/11/20
Period 5 Time 12:58:40
FreeBalance Financial Management System Page 7
----------------------------------------------------------------------------------------------------------------------------
Vendor Analysis Report
1091 Health (MOH)
Prd Voucher # Vendor Name Description Amount
--- ---------------- ------------------------------ ----------------------------- ----------------------
----------------------
"
(\d+,\d+,\d+.\d+)(?=")
正则表达式2:
/(?:\s\w{3}\s+|Capitation\s+)(\d+,\d+.\d+)(?=")|(?:\s\w{3}\s+|Capitation\s+)(\d+,\d+,\d+.\d+)(?=")/gm
在我的代码中,如果这些值存在,我会把它们推入一个对象数组。我尝试只推入我想要的那个捕获组,但结果只推入了匹配结果中某个索引位置的那一项。
我尝试了 `?:`、`?=` 和 `?!` 的多种组合,想忽略第二个链接中的第一个分组,但都没有成功。我觉得解决办法应该相当简单,但就是想不出来。有人知道我哪里做错了吗?
我的代码:
var openFile = function(event) {
var input = event.target;
var reader = new FileReader();
reader.onload = function() {
var text = reader.result;
// console.log(text.substring(0, 999999999999999));
var section = text.substring(0, 9999999999999999);
var subSection = [];
console.log(typeof subSection);
var masterArray = new Object();
var uploadDate = "";
var period = "";
var transferArray = [];
var subSectionRegex = / Total([\s\S]*?)Total|^\s+\d{4,5}([\s\S]*?)Total F/gm;
var transferCodeRegex = /[0-9]{4,5}/;
var voucherNumberRegex = /([0-9]{7,10}[\S])(?=\s+)/g;
var vendorRegex = /(?!\d{10})(\S+\s\S+(\s\S+)?)(?=\s+100)|(?!\d{10})(\S+(\s\S+)?)(?=\s+100)/gm;
var descriptionRegex = /(?!\d{10})(\S+\s\S+(\s\S+)?)(?=\s+100)|(?!\d{10})(\S+(\s\S+)?)(?=\s+100)|(?!\d{10})(\S+\s(\s\S+)?)(?=\s+100)/g;
// var descriptionRegex = /(\d{4}-\d{4})(\D+)*\s\D/g;
var amountRegex = /(?:\s\w{3}\s+|Capitation\s+)(\d+,\d+.\d+)(?=")|(?:\s\w{3}\s+|Capitation\s+)(\d+,\d+,\d+.\d+)(?=")/gm;
// var amountRegex = /(\d+,\d+.\d+)(?=")|(\d+,\d+,\d+.\d+)(?=")/gm;
// var amountRegex = /\w\s{10,20}(\d+(?:,\d{3})*\.\d+)/gm;
var oneLineAmountRegex = /(\d+,\d+,\d+.\d+)|\d+,\d+.\d+/g;
var oneLineDescRegex = / - (\D+)|- \d+(\D+)/gm;
var allData = [{}];
console.log('section: ' + typeof section);
subSection = section.match(subSectionRegex);
subSection = subSection.filter(Boolean);
console.log(typeof subSection);
function extractDate() {
uploadDate = section.match(/Date (.*)/)[1].trim();
uploadDate = new Date(uploadDate);
allData["uploadDate"] = uploadDate;
}
extractDate();
// console.log(allData.uploadDate);
function extractPeriod() {
period = section.match(/Period (.*)/)[1].trim();
period = period.split(" ");
period = period[0];
period = parseInt(period);
// console.log("period: " + period);
allData["period"] = period;
}
extractPeriod();
// console.log(allData.period);
function extractDetails() {
for(var i = 0; i < subSection.length; i++) {
if(subSection[i].match(transferCodeRegex) && subSection[i].match(voucherNumberRegex) && subSection[i].match(vendorRegex) && subSection[i].match(descriptionRegex) && subSection[i].match(amountRegex)) {
transferArray.push({
"transferCode": subSection[i].match(transferCodeRegex),
"details": [{
"voucherNumber": subSection[i].match(voucherNumberRegex),
"vendor": subSection[i].match(vendorRegex),
"description": subSection[i].match(descriptionRegex),
"total": subSection[i].match(amountRegex)
}]
})
} else {
transferArray.push({
"transferCode": subSection[i].match(transferCodeRegex),
"details": [{
"voucherNumber": subSection[i].match(voucherNumberRegex),
"description": subSection[i].match(oneLineDescRegex),
"total": subSection[i].match(oneLineAmountRegex)
}]
})
}
}
}
function removeNulls(obj) {
var isArray = obj instanceof Array;
for(var k in obj) {
console.log('k: ' + k);
if(obj[k] === null || obj[k] === undefined) isArray ? obj.splice(k, 1) : delete obj[k];
else if (typeof obj[k] === "object") removeNulls(obj[k]);
}
}
removeNulls(transferArray);
console.log(transferArray);
console.log(JSON.stringify(transferArray, null, 2))
function cleanData() {
transferArray.forEach(function(e) {
console.log(e)
e.details.forEach(function(evt) {
console.log(evt)
console.log(evt.amount)
console.log(evt.description)
for(i = 0; i < evt.amount.length; i++) {
// evt.amount[i] = evt.amount[i].toString();
// evt.amount[i] = evt.amount[i].replace(/^[a-zA-Z]\s+/g, '');
evt.amount[i] = parseFloat(evt.amount[i].replace(/\,/g, ""));
}
for(i = 0; i < evt.description.length; i++) {
evt.description[i] = evt.description[i].toString();
evt.description[i] = evt.description[i].trim();
}
return(evt);
})
// console.log(evt.amount);
// console.log(evt.description);
});
}
cleanData();
console.log(transferArray);
console.log(transferArray);
//adds detailed data to allData array
allData["section"] = transferArray;
extractDetails();
console.log(allData);
function pushArrayToObject() {
}
};
reader.readAsText(input.files[0]);
};