我有一个我需要解析的CSV文件类型。以下是我需要考虑的条件(缺少列标题,引号内的换行符,缺少数据等):
ID,NAME,TITLE,DESCRIPTION,,
PRO1234,"JOHN SMITH",ENGINEER,"JOHN HAS BEEN WORKING
HARD ON BEING A GOOD
SERVENT."
PRO1235,"KEITH SMITH",ENGINEER,"keith has been working
hard on being a good
servent."
PRO1235,"KENNY SMITH",,"keith has been working
hard on being a good
servent."
PRO1235,"RICK SMITH",,,
您会注意到说明中有换行符以及新行数据的换行符。
我已经编写了这个正则表达式来查找引号OUTSIDE的换行符,并且效果很好here
代码,使用Node.js:
var fs = require('fs');
function parseCSV(filename){
var rx = new RegExp(/\n(?=([^"\\]*(\\.|"([^"\\]*\\.)*[^"\\]*"))*[^"]*$)/g);
var strFile = fs.readFileSync(filename).toString();
console.log("line feed count via match: " + strFile.match(rx).length);
var csv = strFile.split(rx);
console.log("csv length: " + csv.length);
console.log("csv items ###############################");
csv.forEach(function(e,i,a){
console.log("item e: " + e);
});
}
当我运行此功能时,您会看到换行计数(匹配找到的换行)正确, 4 。但是,当使用与String.split()相同的ret时,它会返回 17 ,结果数组不稳定:
line feed count via match: 4
csv length: 17
csv items ###############################
item e: ID,NAME,TITLE,DESCRIPTION,,
item e:
PRO1235,"RICK SMITH"
item e: "RICK SMITH"
item e: undefined
item e: PRO1234,"JOHN SMITH",ENGINEER,"JOHN HAS BEEN WORKING
HARD ON BEING A GOOD
SERVENT."
item e:
PRO1235,"RICK SMITH"
item e: "RICK SMITH"
item e: undefined
item e: PRO1235,"KEITH SMITH",ENGINEER,"keith has been working
hard on being a good
servent."
item e:
PRO1235,"RICK SMITH"
item e: "RICK SMITH"
item e: undefined
item e: PRO1235,"KENNY SMITH",,"keith has been working
hard on being a good
servent."
item e: PRO1235,"RICK SMITH"
item e: "RICK SMITH"
item e: undefined
item e: PRO1235,"RICK SMITH",,,
分手时我做错了什么?我的想法是,如果我能识别出与match()完美匹配的4个换行符,那么相同的regEx应该提供“拆分”字符串的位置。
答案 0 :(得分:1)
你有太多的捕获组。拆分将在分割字符串时返回捕获的组。 请考虑以下简单示例:
var simpleString = "111aaa222bbb";
var regxNoCaptureGroup = /\d+/;
var regxWithCaptureGroup = /(\d+)/;
var regxWithNoncapturingGroup = /(?:\d+)/;
simpleString.split(regxNoCaptureGroup); //["", "aaa", "bbb"]
simpleString.split(regxWithNoncapturingGroup); //same as above
simpleString.split(regxWithCaptureGroup); //["", "111", "aaa", "222", "bbb"] - includes captured groups
您在捕获组中拥有捕获组。请记住,拆分将找到该组,并将其删除以找到拆分部分,因此拆分数字(如第一个示例中)将仅返回字母。 在您的情况下,它将删除捕获的任何内容。 对于捕获组,它会在结果中返回它们 - 所以如果你计划在正则表达式中使用split,你应该建立一个只捕获所需内容的好的正则表达式。
答案 1 :(得分:0)
感谢anubhava的回答,这很好用:
var $ = jQuery = require('jquery');
var csv = require('./jquery.csv-0.71.min.js');
var fs = require('fs');
var strFile = fs.readFileSync("./data/TestData.csv").toString();
var obj = $.csv.toObjects(strFile);
var str = JSON.stringify(obj, null, 4);
console.log("str: " + str);
谁不喜欢新的轮子?
在我的辩护中,我在节点上尝试了3个工具,并且所有3个工具都是针对原始条件下最简单的情况和文件编写的。