请澄清一下,我专门在寻找一个RegEx,它将:
在换行符上分割...除非换行符用双引号引起来。
如果换行符位于双引号内,它将把外层双引号内转义的双引号(`""`)转换为单引号。
我有一个看起来像这样的数据网格。
复制并粘贴后,即为结果文本:
Data
Data Data
Data Data Data
Data Data Data"
Data Data "Da
ta"
Data Data "Da
ta"""
Data Data Data""
Data Data """Da
ta"""
Data Data """Da
ta"""
由于单元格内部包含换行符会引发一些奇怪的行为(例如文本中出现成对的双引号 `""`),导致文本变得有些奇怪。我希望能够将这些文本粘贴到textarea中,然后即使存在上述奇怪的行为,也可以在HTML表格中重新创建原始网格。
我已经找到并稍微修改了下面这段代码,我认为它已经很接近了,但我认为其中的RegEx并不正确,因此我还把另一个答案中的RegEx作为备选项加了进去(我已将其注释掉,因为它会导致“内存不足”异常):
/**
 * Splits text into pieces: runs of characters that contain neither a line
 * feed nor a double quote, and the inner text of double-quoted sections
 * (which may themselves contain line feeds).
 *
 * @param {string} string - Text to split.
 * @returns {string[]} The pieces in the order they occur. A quoted section
 *   contributes its content without the surrounding quotes; an empty quoted
 *   section ("") contributes an empty string.
 */
function splitOnNewlineExceptInDoubleQuotes(string) {
  // First alternative: an unquoted run. Second alternative: a quoted section
  // whose inner text is captured in group 1. The pattern has no letters or
  // anchors, so the former "i" and "m" flags had no effect and were dropped.
  const pieceRegexp = /[^\n"]+|"([^"]*)"/g;
  // Lookahead-based alternative kept for reference; it stays disabled because
  // it was reported to raise an "out of memory" exception:
  //var myRegexp = /(\n)(?=(?:[^\"]|\"[^\"]*\")*$)/m;
  const pieces = [];
  let match;
  // Each exec() call returns the next match, or null once the text is exhausted.
  while ((match = pieceRegexp.exec(string)) !== null) {
    // Group 1 is undefined only when the unquoted alternative matched. Compare
    // against undefined (not truthiness) so that an empty quoted section
    // yields '' instead of falling back to the raw match text '""'.
    pieces.push(match[1] !== undefined ? match[1] : match[0]);
  }
  return pieces;
}
因此,我认为使用正则表达式(与完整状态机相对)是可能的,但是我不太确定该怎么做。
答案 0 :(得分:3)
这里是一个正则表达式,它会逐一匹配源文本中的每个组成部分,并把它们放入带编号的捕获组中:
这将处理单行数据或一次处理所有行。
还可以处理 CRLF(`\r\n`)和 LF(`\n`)两种行尾。
// Token pattern. Capture groups: 1 = tab, 2 = line break (\n or \r\n),
// 3 = body of a double-quoted section, 4 = run of text without tab/CR/LF.
/(?:(\t)|(\r?\n)|"((?:[^"]+|"")*)"|([^\t\r\n]+))/
在这里,我们使用捕获的组来指示要做什么。
这将在控制台中输出行的数组。
// Sample clipboard text; rows are joined with CRLF.
const str =
'Data ' + "\r\n" +
'Data Data ' + "\r\n" +
'Data Data Data' + "\r\n" +
'Data Data Data"' + "\r\n" +
'Data Data "Da' + "\r\n" +
'ta"' + "\r\n" +
'Data Data "Da' + "\r\n" +
'ta"""' + "\r\n" +
'Data Data Data""' + "\r\n" +
'Data Data """Da' + "\r\n" +
'ta"""' + "\r\n" +
'Data Data """Da' + "\r\n" +
'' + "\r\n" +
'ta"""';

/**
 * Parses tab-separated text into rows of cells.
 *
 * Token pattern, one alternative per capture group:
 *   1. (\t)                      a tab, i.e. a cell separator
 *   2. (\r?\n)                   a line break (LF or CRLF), i.e. a row end
 *   3. "([^"]*(?:""[^"]*)*)"     a double-quoted cell; the body may contain
 *                                tabs, line breaks and doubled quotes ("")
 *   4. ([^\t\r\n]+)              a run of any other text (an unquoted cell)
 *
 * Alternative 3 is written as an unrolled loop. It accepts the same text as
 * (?:[^"]+|"")* but cannot backtrack exponentially when a cell opens a quote
 * that is never closed.
 *
 * @param {string} text - Text to parse.
 * @returns {string[][]} One array per row. In quoted cells each doubled quote
 *   ("") is replaced with a single quote ('). A blank line yields an empty row.
 */
function parseClipboardGrid(text) {
  const tokenRegexp = /(\t)|(\r?\n)|"([^"]*(?:""[^"]*)*)"|([^\t\r\n]+)/g;
  const rows = [];
  let row = [];
  // True while the most recent token was a tab, i.e. a cell is still owed.
  let prevTab = false;
  let match;
  while ((match = tokenRegexp.exec(text)) !== null) {
    // Groups are compared against undefined rather than tested for truthiness:
    // an empty quoted cell ("") captures '' and must still produce a cell.
    if (match[4] !== undefined) {
      // Unquoted data
      row.push(match[4]);
      prevTab = false;
    } else if (match[3] !== undefined) {
      // Quoted data (replace escaped double quotes with single)
      row.push(match[3].replace(/""/g, "'"));
      prevTab = false;
    } else if (match[1] !== undefined) {
      // Tab separator. Two tabs in a row, or a tab at the very start of a row,
      // mean that the cell before this tab is empty.
      if (prevTab || row.length === 0) {
        row.push('');
      }
      prevTab = true;
    } else {
      // End of the row; a trailing tab means the last cell is empty.
      if (prevTab) {
        row.push('');
      }
      prevTab = false;
      rows.push(row);
      row = [];
    }
  }
  // Handles a missing line break at the end of the text.
  if (row.length) {
    if (prevTab) {
      row.push('');
    }
    rows.push(row);
  }
  return rows;
}

const rows = parseClipboardGrid(str);
console.log('rows', rows);
答案 1 :(得分:2)
虽然可以使用非常复杂的正则表达式定义解析器(尽管我什至不相信这种语法是可能的),但使用解析器生成器来进行解析将更加容易和可维护。以更易读的格式定义语法。
使用PEG.js,您可以为纯Excel表格式定义以下简单语法:
// Table: one or more rows separated by '\n'; yields an array of rows.
Table
= row: Row '\n' table: Table { return [row, ...table] }
/ row: Row { return [row] }
// Row: one or more cells separated by '\t'; yields an array of cell values.
Row
= cell: Cell '\t' row: Row { return [cell, ...row] }
/ cell: Cell { return [cell] }
// Cell: a double-quoted Value (the surrounding quotes are dropped) or, when
// that does not match, the raw text up to the next tab or line feed (may be empty).
Cell
= '"' value: Value '"' { return value }
/ $ [^\t\n]*
// Value: the text matched by Escaped with every doubled quote ("") collapsed
// to a single double quote.
Value
= escaped: $ Escaped
{ return escaped.replace(/""/g, '"') }
// Escaped: one or more runs of characters other than '"' and tab, or doubled
// quotes. The trailing predicate accepts the match only if it contains a line feed.
Escaped
= multiline: $ ([^"\t]+ / '""')+
& { return multiline.includes('\n') }
以下是您输入的演示:
// Minified, generated parser for the Excel table grammar, exposed as the
// global `excelTableParser`. `excelTableParser.parse(text)` returns an array
// of rows (each an array of cell strings) and throws when the whole input
// cannot be parsed.
// The code is kept on a single line on purpose: a line break directly after a
// `return` keyword triggers automatic semicolon insertion, which made the
// quoted-cell action return undefined instead of the unescaped cell text.
// `globalThis` is the same object as `window` in browsers and also lets the
// parser load in other JavaScript runtimes.
globalThis.excelTableParser=function(){"use strict";function n(r,t,e,u){this.message=r,this.r=t,this.t=e,this.e=u,this.name="SyntaxError","function"==typeof Error.captureStackTrace&&Error.captureStackTrace(this,n)}return function(n,r){function t(){this.constructor=n}t.prototype=r.prototype,n.prototype=new t}(n,Error),n.u=function(n,r){var t={o:function(n){return'"'+u(n.i)+'"'},f:function(n){var r,t="";for(r=0;r<n.c.length;r++)t+=n.c[r]instanceof Array?o(n.c[r][0])+"-"+o(n.c[r][1]):o(n.c[r]);return"["+(n.s?"^":"")+t+"]"},a:function(n){return"any character"},l:function(n){return"end of input"},x:function(n){return n.description}};function e(n){return n.charCodeAt(0).toString(16).toUpperCase()}function u(n){return n.replace(/\\/g,"\\\\").replace(/"/g,'\\"').replace(/\0/g,"\\0").replace(/\t/g,"\\t").replace(/\n/g,"\\n").replace(/\r/g,"\\r").replace(/[\x00-\x0F]/g,function(n){return"\\x0"+e(n)}).replace(/[\x10-\x1F\x7F-\x9F]/g,function(n){return"\\x"+e(n)})}function o(n){return n.replace(/\\/g,"\\\\").replace(/\]/g,"\\]").replace(/\^/g,"\\^").replace(/-/g,"\\-").replace(/\0/g,"\\0").replace(/\t/g,"\\t").replace(/\n/g,"\\n").replace(/\r/g,"\\r").replace(/[\x00-\x0F]/g,function(n){return"\\x0"+e(n)}).replace(/[\x10-\x1F\x7F-\x9F]/g,function(n){return"\\x"+e(n)})}return"Expected "+function(n){var r,e,u,o=new Array(n.length);for(r=0;r<n.length;r++)o[r]=(u=n[r],t[u.g](u));if(o.sort(),o.length>0){for(r=1,e=1;r<o.length;r++)o[r-1]!==o[r]&&(o[e]=o[r],e++);o.length=e}switch(o.length){case 1:return o[0];case 2:return o[0]+" or "+o[1];default:return o.slice(0,-1).join(", ")+", or "+o[o.length-1]}}(n)+" but "+function(n){return n?'"'+u(n)+'"':"end of input"}(r)+" found."},{v:n,parse:function(r,t){t=void 0!==t?t:{};var e,u={},o={d:I},i=I,f="\n",c=q("\n",!1),s=function(n,r){return[n,...r]},a=function(n){return[n]},l="\t",x=q("\t",!1),g=function(n,r){return[n,...r]},v=function(n){return[n]},d='"',h=q('"',!1),p=function(n){return n},y=/^[^\t\n]/,w=z(["\t","\n"],!0,!1),F=function(n){return n.replace(/""/g,'"')},E=/^[^"\t]/,m=z(['"',"\t"],!0,!1),P='""',C=q('""',!1),b=function(n){return n.includes("\n")},A=0,S=[{h:1,p:1}],R=0,T=[],j=0,k={};if("startRule"in t){if(!(t.y in o))throw new Error("Can't start parsing from rule \""+t.y+'".');i=o[t.y]}function q(n,r){return{g:"literal",i:n,ignoreCase:r}}function z(n,r,t){return{g:"class",c:n,s:r,ignoreCase:t}}function B(n){var t,e=S[n];if(e)return e;for(t=n-1;!S[t];)t--;for(e={h:(e=S[t]).h,p:e.p};t<n;)10===r.charCodeAt(t)?(e.h++,e.p=1):e.p++,t++;return S[n]=e,e}function D(n,r){var t=B(n),e=B(r);return{w:{F:n,h:t.h,p:t.p},l:{F:r,h:e.h,p:e.p}}}function G(n){A<R||(A>R&&(R=A,T=[]),T.push(n))}function H(r,t,e){return new n(n.u(r,t),r,t,e)}function I(){var n,t,e,o,i=5*A+0,l=k[i];return l?(A=l.m,l.P):(n=A,(t=J())!==u?(10===r.charCodeAt(A)?(e=f,A++):(e=u,0===j&&G(c)),e!==u&&(o=I())!==u?n=t=s(t,o):(A=n,n=u)):(A=n,n=u),n===u&&(n=A,(t=J())!==u&&(t=a(t)),n=t),k[i]={m:A,P:n},n)}function J(){var n,t,e,o,i=5*A+1,f=k[i];return f?(A=f.m,f.P):(n=A,(t=K())!==u?(9===r.charCodeAt(A)?(e=l,A++):(e=u,0===j&&G(x)),e!==u&&(o=J())!==u?n=t=g(t,o):(A=n,n=u)):(A=n,n=u),n===u&&(n=A,(t=K())!==u&&(t=v(t)),n=t),k[i]={m:A,P:n},n)}function K(){var n,t,e,o,i=5*A+2,f=k[i];if(f)return A=f.m,f.P;if(n=A,34===r.charCodeAt(A)?(t=d,A++):(t=u,0===j&&G(h)),t!==u&&(e=function(){var n,t,e,o=5*A+3,i=k[o];return i?(A=i.m,i.P):(n=A,t=A,(t=(e=function(){var n,t,e,o,i,f=5*A+4,c=k[f];if(c)return A=c.m,c.P;if(n=A,t=A,e=[],o=[],E.test(r.charAt(A))?(i=r.charAt(A),A++):(i=u,0===j&&G(m)),i!==u)for(;i!==u;)o.push(i),E.test(r.charAt(A))?(i=r.charAt(A),A++):(i=u,0===j&&G(m));else o=u;if(o===u&&(r.substr(A,2)===P?(o=P,A+=2):(o=u,0===j&&G(C))),o!==u)for(;o!==u;){if(e.push(o),o=[],E.test(r.charAt(A))?(i=r.charAt(A),A++):(i=u,0===j&&G(m)),i!==u)for(;i!==u;)o.push(i),E.test(r.charAt(A))?(i=r.charAt(A),A++):(i=u,0===j&&G(m));else o=u;o===u&&(r.substr(A,2)===P?(o=P,A+=2):(o=u,0===j&&G(C)))}else e=u;return(t=e!==u?r.substring(t,A):e)!==u&&(e=(e=b(t))?void 0:u)!==u?n=t=[t,e]:(A=n,n=u),k[f]={m:A,P:n},n}())!==u?r.substring(t,A):e)!==u&&(t=F(t)),n=t,k[o]={m:A,P:n},n)}())!==u?(34===r.charCodeAt(A)?(o=d,A++):(o=u,0===j&&G(h)),o!==u?n=t=p(e):(A=n,n=u)):(A=n,n=u),n===u){for(n=A,t=[],y.test(r.charAt(A))?(e=r.charAt(A),A++):(e=u,0===j&&G(w));e!==u;)t.push(e),y.test(r.charAt(A))?(e=r.charAt(A),A++):(e=u,0===j&&G(w));n=t!==u?r.substring(n,A):t}return k[i]={m:A,P:n},n}if((e=i())!==u&&A===r.length)return e;throw e!==u&&A<r.length&&G({g:"end"}),H(T,R<r.length?r.charAt(R):null,R<r.length?D(R,R+1):D(R,R))}}}();
// Demo: feed the sample text from the question to the generated parser and
// print the resulting rows.
const sampleTable = `Data
Data Data
Data Data Data
Data Data Data"
Data Data "Da
ta"
Data Data "Da
ta"""
Data Data Data""
Data Data """Da
ta"""
Data Data """Da
ta"""`;
const parsedTable = excelTableParser.parse(sampleTable);
console.log(parsedTable);
答案 2 :(得分:2)
这是一个正则表达式解决方案,它将文本拆分为多个单元格(包括带有换行符的单元格)。它不能解决所有复杂问题,但现在您可以分别处理每个单元格,因此应该更容易解析。
警告:正如Patrick Roberts在评论中指出的那样,这仅在多行单元格仅在最后一列中找到时有效。
// Sample text from the question.
const input = `Data
Data Data
Data Data Data
Data Data Data"
Data Data "Da
ta"
Data Data "Da
ta"""
Data Data Data""
Data Data """Da
ta"""
Data Data """Da
ta"""`;

/**
 * Splits tab-separated text into rows of cells while keeping line feeds that
 * sit inside a quoted segment at the end of a row.
 *
 * Quotes are left in place; as the answer notes, multi-line cells are only
 * handled when they are in the last column.
 *
 * @param {string} text - Text to split.
 * @returns {string[][]} One array of raw cell strings per row.
 */
function splitIntoCells(text) {
  const marked = (text + '\n')
    // Turn each row-ending line feed into '\r'. A run of quoted segments
    // directly in front of the line feed is captured and re-emitted, so line
    // feeds inside those segments are not converted.
    .replace(/(("[^"]*")*)\n/g, '$1\r')
    // Remove only the trailing row terminators. String.prototype.trim takes
    // no argument and would also strip leading/trailing tabs and spaces,
    // losing empty first or last cells.
    .replace(/\r+$/, '');
  return marked.split('\r').map((row) => row.split('\t'));
}

const cells = splitIntoCells(input);
console.log(cells);
答案 3 :(得分:1)
我无法提出一个健壮的正则表达式解决方案,但这是一个可行的解决方案。
注意:我稍微改变了输入以测试不在最后一列中的多行单元格。
// Sample text; compared with the question it also contains a multi-line cell
// that is not in the last column.
const input = `Data
Data Data
"Da
ta" Data Data
Data Data Data"
Data Data "Da
ta"
Data Data "Da
ta"""
Data Data Data""
Data Data """Da
ta"""
Data Data """Da
ta"""`;

/**
 * Splits text on tabs and line feeds and re-joins the pieces that belong to
 * one quoted cell.
 *
 * A piece that starts with a double quote keeps absorbing the following
 * pieces (joined with '\n') until it holds an even number of quotes. A cell
 * wrapped in quotes loses the outer pair and has doubled quotes ("")
 * collapsed to one.
 *
 * @param {string} text - Text to parse.
 * @returns {string[]} Flat list of cell values in reading order.
 */
function parseCells(text) {
  const cells = [];
  let pending = '';
  for (const part of text.split(/[\t\n]/)) {
    pending += part + '\n';
    const quoteCount = (pending.match(/"/g) || []).length;
    if (!pending.startsWith('"') || quoteCount % 2 === 0) {
      // Drop exactly the separator appended above. String.prototype.trim takes
      // no argument and would strip every surrounding whitespace character.
      let cell = pending.slice(0, -1);
      if (cell.startsWith('"') && cell.endsWith('"')) {
        cell = cell.slice(1, -1).replace(/""/g, '"');
      }
      cells.push(cell);
      pending = '';
    }
  }
  // An opening quote that is never closed leaves text behind; keep it as a
  // raw cell instead of silently discarding it.
  if (pending !== '') {
    cells.push(pending.slice(0, -1));
  }
  return cells;
}

/**
 * Groups a flat list of cells into rows of a fixed width.
 *
 * @param {string[]} cells - Flat list of cell values.
 * @param {number} columnCount - Number of cells per row.
 * @returns {string[][]} Complete rows only; cells left over after the last
 *   complete row are not included.
 */
function groupIntoRows(cells, columnCount) {
  const rows = [];
  let row = [];
  for (const cell of cells) {
    row.push(cell);
    if (row.length === columnCount) {
      rows.push(row);
      row = [];
    }
  }
  return rows;
}

/**
 * Appends one <tr> per row and one <td> per cell to a table element.
 *
 * Cell text is inserted as text nodes, so markup characters in the data are
 * shown literally, and every line feed in a cell becomes a <br> element.
 *
 * @param {object} tableElem - Element that receives the rows.
 * @param {string[][]} rows - Rows to render.
 * @param {object} [doc=document] - Object providing createElement and
 *   createTextNode.
 */
function renderTable(tableElem, rows, doc = document) {
  for (const row of rows) {
    const trElem = doc.createElement('tr');
    for (const cell of row) {
      const tdElem = doc.createElement('td');
      cell.split('\n').forEach((line, index) => {
        if (index > 0) {
          tdElem.appendChild(doc.createElement('br'));
        }
        tdElem.appendChild(doc.createTextNode(line));
      });
      trElem.appendChild(tdElem);
    }
    tableElem.appendChild(trElem);
  }
}

// The column count is taken from the number of tabs in the first line.
const columnCount = (input.split('\n')[0].match(/\t/g) || []).length + 1;
//parse input into cells and undo wonkiness
const cells = parseCells(input);
//rearrange cells into rows
const rows = groupIntoRows(cells, columnCount);
//display results in table (only where a DOM is available)
if (typeof document !== 'undefined') {
  renderTable(document.getElementById('table'), rows);
}
<!-- Markup for the demo: the script looks up the element with id "table" and
     appends the generated rows to it; the style draws collapsed 1px borders. -->
<style>
table, th, td {
border: 1px solid black;
border-collapse: collapse;
padding: 2px;
}
</style>
<table id="table"></table>